├── .gitignore ├── .nojekyll ├── 01-shell-intro.qmd ├── 02-filedir.qmd ├── 03-filework.qmd ├── 04-pipes.qmd ├── 05-git.qmd ├── 06-concepts.qmd ├── 09-R-packages.qmd ├── 10-spatial-R.qmd ├── 11-NLP-R.qmd ├── 11-NLP-R_cache └── revealjs │ ├── __packages │ ├── unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.RData │ ├── unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdb │ ├── unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdx │ ├── unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.RData │ ├── unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdb │ ├── unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdx │ ├── unnamed-chunk-12_18410f94565577aaf66decee3574d410.RData │ ├── unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdb │ ├── unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdx │ ├── unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.RData │ ├── unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdb │ ├── unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdx │ ├── unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.RData │ ├── unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdb │ ├── unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdx │ ├── unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.RData │ ├── unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdb │ ├── unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdx │ ├── unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.RData │ ├── unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdb │ ├── unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdx │ ├── unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.RData │ ├── unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdb │ ├── unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdx │ ├── unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.RData │ ├── unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdb │ ├── unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdx │ ├── unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.RData │ ├── 
unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdb │ ├── unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdx │ ├── unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.RData │ ├── unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdb │ ├── unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdx │ ├── unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.RData │ ├── unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdb │ ├── unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdx │ ├── unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.RData │ ├── unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdb │ ├── unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdx │ ├── unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.RData │ ├── unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdb │ ├── unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdx │ ├── unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.RData │ ├── unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdb │ ├── unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdx │ ├── unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.RData │ ├── unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdb │ ├── unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdx │ ├── unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.RData │ ├── unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdb │ ├── unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdx │ ├── unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.RData │ ├── unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdb │ ├── unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdx │ ├── unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.RData │ ├── unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdb │ ├── unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdx │ ├── unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.RData │ ├── unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdb │ ├── unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdx │ ├── unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.RData │ ├── 
unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdb │ └── unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdx ├── LICENSE ├── R └── sayhello.R ├── README.md ├── ScPoProgramming.Rproj ├── _extensions └── metropolis-theme │ ├── _extension.yml │ └── metropolis.scss ├── _quarto.yml ├── about.qmd ├── custom.scss ├── data ├── brexit.csv └── shell-lesson-data.zip ├── images ├── 02_datum_fig.png ├── PHD.png ├── ScPo-logo.png ├── Tux.png ├── bad.gif ├── distance1.png ├── distance2.png ├── filesystem-challenge.odg ├── filesystem-challenge.svg ├── filesystem.svg ├── find-file-tree.odg ├── find-file-tree.svg ├── git-images │ └── full.png ├── git-staging-CDG.jpeg ├── git-staging-area.svg ├── git_staging.svg ├── home-directories.svg ├── homedir.odg ├── nano-screenshot.png ├── phd101212s.png ├── redirects-and-pipes.svg ├── removed-that.png ├── seine.png ├── seine2.png ├── seine3.png ├── seine4.png ├── seine5.png ├── shell_command_syntax.svg ├── shell_script_for_loop_flow_chart.svg ├── standard-filesystem-hierarchy.svg ├── toypackage.png ├── vector_lonlat.png ├── vector_lonlatglobe.png ├── vector_lonlatparis.png ├── vector_projected.png ├── vector_projectedparis.png └── which-version.png ├── index.qmd ├── scripts ├── 01-shell-intro.sh ├── _tidy_tasks.Rmd ├── geotask.R └── lon-lat-geocomp.r └── styles.css /.gitignore: -------------------------------------------------------------------------------- 1 | /.quarto/ 2 | /_site/ 3 | .Rproj.user 4 | .Rhistory 5 | .Rdata 6 | .httr-oauth 7 | .DS_Store 8 | /data/shapefiles/ 9 | .quarto 10 | -------------------------------------------------------------------------------- /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/.nojekyll -------------------------------------------------------------------------------- /01-shell-intro.qmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "The Unix Shell" 3 | format: 4 | revealjs: 5 | chalkboard: false 6 | logo: /images/ScPo-logo.png 7 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 8 | incremental: false 9 | code-line-numbers: false 10 | highlight-style: github 11 | author: Florian Oswald and Software Carpentry 12 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 13 | date: today 14 | date-format: "D MMMM, YYYY" 15 | --- 16 | 17 | ## Intro 18 | 19 | ::::: {.callout-note} 20 | # Question 21 | 22 | * What is a command shell and why would I use one? 23 | ::::: 24 | 25 | ::::: {.callout-tip} 26 | # Objectives 27 | - Explain how the shell relates to the keyboard, the screen, the operating system, and users' programs. 28 | - Explain when and why command-line interfaces should be used instead of graphical interfaces. 29 | :::::: 30 | 31 | --- 32 | 33 | 34 | ## Do You GUI? 35 | 36 | ::: {.callout-tip} 37 | 38 | # What's a GUI 39 | A *Graphical User Interface* (GUI) lets the user interact by clicking with a mouse and using menus. 40 | ::: 41 | 42 | . . . 43 | 44 | * I love 😍 a good Graphical User Interface (GUI) 45 | 46 | . . . 47 | 48 | * But. Bad things can happen. 49 | 50 | . . . 51 | 52 | * ☠️ 53 | 54 | --- 55 | 56 | ## {background-image="/images/bad.gif" background-size="cover"} 57 | 58 | --- 59 | 60 | ## Bad. 61 | 62 | ::::: {.columns} 63 | ::::: {.column width="50%"} 64 | ::: {.callout-warning} 65 | 66 | # No More than 65,536 Rows 67 | 68 | * Public Health England [missed nearly 16,000 covid cases](https://www.influentialsoftware.com/how-an-nhs-test-and-trace-excel-error-lost-16000-covid-19-cases/) 69 | * They used an `.xls` document to collect data. 70 | ::: 71 | ::::: 72 | 73 | 74 | ::::: {.column width="50%"} 75 | ::: {.callout-warning} 76 | 77 | # No Growth with High Debt? 
78 | 79 | * [Reinhart and Rogoff controversy](https://theconversation.com/the-reinhart-rogoff-error-or-how-not-to-excel-at-economics-13646).^[Both are eminent researchers and we do *not* imply misconduct.] 80 | * Inadvertently did not select all relevant countries on spreadsheet. 81 | ::: 82 | ::::: 83 | 84 | ::::: 85 | 86 | 87 | --- 88 | 89 | ## Or Do you CLI? 90 | 91 | ::: {.callout-tip} 92 | 93 | # What's a CLI 94 | A *Command Line Interface* (CLI) allows interaction via (text) commands. 95 | ::: 96 | 97 | 98 | * CLIs can collect commands somewhere - *reproducible* 99 | * But one has to learn a *language*. 100 | * They are great for *long, repetitive tasks*. 101 | * It's often the **only** way to interact with high-performance computing. 👉 show [DANTE](https://www.ipgp.fr/en/details-du-cluster-dante/) 102 | 103 | 104 | --- 105 | 106 | ## The Shell {.smaller} 107 | 108 | * The *shell* is a program where we can type in commands and get output. 109 | * We often use very simple programs - good for just one thing. 110 | * There is tremendous power in *combining* those little programs. 111 | * It's a *platform* approach to an Operating System. 112 | 113 | ::: {.callout-tip} 114 | # Unix is a Platform 115 | 116 | A protocol and many small programs that interact with each other according to the rules. 117 | ::: 118 | 119 | ![lewing@isc.tamu.edu Larry Ewing and The GIMP, Attribution, via Wikimedia Commons](/images/Tux.png) 120 | 121 | 122 | --- 123 | 124 | ## Go! 🚀 125 | 126 | 1. Open your terminal! (`GitBash` on Windows) 127 | 2. You should see something like 128 | ```bash 129 | $ 130 | ``` 131 | which is called the **prompt**. 132 | 133 | 3. You **don't have to type** the `$`! 134 | 4. Next to it, you see a cursor. 
135 | 136 | 137 | --- 138 | 139 | ## First command: `ls` 140 | 141 | * type `ls` and hit enter 142 | * you see something like this as output: 143 | 144 | ``` 145 | Desktop Downloads Movies Pictures 146 | Documents Library Music Public 147 | ``` 148 | 149 | * By default, the terminal opens in your home directory. 150 | * `ls` *lists* the content of that directory. 151 | 152 | --- 153 | 154 | ## First Error! 155 | 156 | ::: {.callout-caution} 157 | * If you mistype a command, or look for a program that is not installed, you get an error. Like: 158 | 159 | ``` 160 | bash-3.2$ ks 161 | ``` 162 | 163 | ``` 164 | bash: ks: command not found 165 | ``` 166 | ::: 167 | 168 | * Look for a spelling mistake (it's `ls` not `ks`) 169 | * Or install the required program. 170 | 171 | 172 | ## Nelle's Pipeline: A Typical Problem {.smaller} 173 | 174 | * Nelle Nemo is a marine biologist. 🌊 🐡 175 | * Just sampled gelatinous marine life in the 176 | [Great Pacific Garbage Patch](http://en.wikipedia.org/wiki/Great_Pacific_Garbage_Patch). 177 | * From 1520 samples she obtained measures of the relative abundance of 300 proteins. 178 | * Her supervisor, *Professor Jones*, handed over to her a program called `goostats.sh`. 179 | * `goostats.sh` needs to be run on each of the 1520 samples. 180 | * Paper needs to be ready by the end of the month. 181 | 182 | --- 183 | 184 | ## {background-image="/images/PHD.png" background-size="50%" background-position="center"} 185 | 186 | 187 | --- 188 | 189 | ## Battle Plan 190 | 191 | * Using a GUI to run `goostats.sh`, Nelle will have to use her mouse to select and open a file 1520 times. 192 | * If `goostats.sh` takes 30 secs to run, this will take more than 12 hours of Nelle's *active time*. 193 | * With the help of the shell, Nelle could make her computer go through that list of files instead. 194 | * **Bonus** : she will have a working pipeline, that can be re-run each time she wants to add data or reproduce previous output. 
195 | 196 | --- 197 | 198 | ## What Does Nelle Need {.smaller} 199 | 200 | Nelle needs to learn how to do the following things: 201 | 202 | - navigate to a file/directory 203 | - create a file/directory 204 | - check the length of a file 205 | - chain commands together 206 | - retrieve a set of files 207 | - iterate over files 208 | - run a shell script containing her pipeline 209 | 210 | And we will be right next to her. 🙂 211 | 212 | 213 | --- 214 | 215 | ## 216 | 217 | ::: {.callout-tip} 218 | 219 | # Key Points 220 | 221 | - A shell is a program whose primary purpose is to read commands and run other programs. 222 | - This lesson uses Bash, the default shell in many implementations of Unix. 223 | - Programs can be run in Bash by entering commands at the command-line prompt. 224 | - The shell's main advantages are its high action-to-keystroke ratio, its support for automating repetitive tasks, and its capacity to access networked machines. 225 | - The shell's main disadvantages are its primarily textual nature and how cryptic its commands and operation can be. 226 | - ChatGPT can help you write Bash scripts, but it is still fundamental to know the basics. 
227 | 228 | ::: 229 | 230 | 231 | 232 | 233 | 234 | -------------------------------------------------------------------------------- /02-filedir.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Navigating Files and Directories" 3 | format: 4 | revealjs: 5 | theme: _extensions/metropolis-theme/metropolis.scss 6 | chalkboard: true 7 | logo: /images/ScPo-logo.png 8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 9 | incremental: false 10 | code-line-numbers: false 11 | highlight-style: github 12 | author: Florian Oswald and The Software Carpentry 13 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 14 | date: today 15 | date-format: "D MMMM, YYYY" 16 | --- 17 | 18 | 19 | ## Intro 20 | 21 | ::::: {.callout-note} 22 | # Questions 23 | 24 | - How can I move around on my computer? 25 | - How can I see what files and directories I have? 26 | - How can I specify the location of a file or directory on my computer? 27 | ::::: 28 | 29 | ::::: {.callout-tip} 30 | # Objectives 31 | - Explain the similarities and differences between a file and a directory. 32 | - Translate an absolute path into a relative path and vice versa. 33 | - Construct absolute and relative paths that identify specific files and directories. 34 | - Use options and arguments to change the behaviour of a shell command. 35 | - Demonstrate the use of tab completion and explain its advantages. 36 | :::::: 37 | 38 | 39 | --- 40 | 41 | ## The File System {.smaller} 42 | 43 | * The file system organizes data into files and directories on your computer. 44 | * Let's start by finding out where we are, using the `pwd` command - *print working directory*. 
45 | 46 | ```bash 47 | $ pwd 48 | ``` 49 | 50 | with output: 51 | 52 | ``` 53 | /Users/nelle 54 | ``` 55 | 56 | ::: {.callout-note} 57 | 58 | # Home Directory Variations 59 | 60 | * Linux: `/home/nelle` 61 | * Windows: `C:\Documents and Settings\nelle` 62 | 63 | If `pwd` does not return your home directory, you may need to navigate there first with `cd`. 64 | ::: 65 | 66 | --- 67 | 68 | ## Nelle's Home Directory 69 | 70 | Nelle's file system looks like this: 71 | 72 | ![The file system is made up of a root directory that contains sub-directories 73 | titled bin, data, users, and tmp](/images/filesystem.svg) 74 | 75 | * `/` is the *root* of the system 76 | * all other locations can be reached from there via a *path* 77 | * path to homedir is from `/` to directory `Users`, which contains folder `nelle` 78 | * We know exactly where the home is stored by looking at this path. 79 | * Notice that *inside* a path, `/` is a *separator*. (It's `\` on Windows!) 80 | 81 | --- 82 | 83 | ## Working with `ls` 84 | 85 | * type `ls -F`. This adds *option* `F` (for *classify*) to the command. 86 | * now you get also 87 | - a trailing `/` indicates that this is a directory 88 | - `@` indicates a link 89 | - `*` indicates an executable 90 | 91 | ```bash 92 | $ ls -F 93 | ``` 94 | 95 | ``` 96 | Applications/ Documents/ Library/ Music/ Public/ 97 | Desktop/ Downloads/ Movies/ Pictures/ 98 | ``` 99 | 100 | ::: {.callout-caution} 101 | 102 | # question 103 | 104 | What kind of objects does Nelle's home directory contain? 105 | ::: 106 | 107 | 108 | --- 109 | 110 | ## Help 111 | 112 | ::: {.callout-note} 113 | 114 | # Clear Terminal 115 | 116 | * Use the `clear` command to clear terminal. 117 | * you can use your `↑` and `↓` keys to see previous commands, or just scroll up. 118 | 119 | ::: 120 | 121 | 122 | ::: {.callout-tip} 123 | 124 | # Getting Help 125 | 126 | 1. pass the `--help` option to a command: 127 | ```bash 128 | $ ls --help 129 | ``` 130 | 131 | 2. 
Read the manual entry with `man` (MacOS and Linux only) 132 | ```bash 133 | man ls 134 | ``` 135 | 136 | 3. Search internet for `unix man ls` 137 | 138 | 139 | ::: 140 | 141 | 142 | ## More `ls` Flags 143 | 144 | ::: {.callout-warning} 145 | 146 | # Challenge 147 | 148 | You can also use two options at the same time. What does the command `ls` do when used 149 | with the `-l` option? What about if you use both the `-l` and the `-h` option? 150 | ::: 151 | 152 |
153 | Show Solution 154 | 155 | ::: {.callout-note} 156 | # Solution 157 | The `-l` option makes `ls` use a **l**ong listing format, showing not only the file/directory names but also additional information, such as the file size and the time of its last modification. If you use both the `-h` option and the `-l` option, 158 | this makes the file size '**h**uman readable', i.e. displaying something like `5.3K` instead of `5369`. 159 | ::: 160 |
161 | 162 | 163 | --- 164 | 165 | ## More `ls` Challenges 166 | 167 | ::: {.callout-warning} 168 | 169 | # Listing in Reverse Chronological Order 170 | 171 | By default, `ls` lists the contents of a directory in alphabetical 172 | order by name. The command `ls -t` lists items by time of last 173 | change instead of alphabetically. The command `ls -r` lists the 174 | contents of a directory in reverse order. 175 | Which file is displayed last when you combine the `-t` and `-r` options? 176 | Hint: You may need to use the `-l` option to see the 177 | last changed dates. 178 | ::: 179 | 180 |
181 | Show Solution 182 | 183 | ::: {.callout-note} 184 | # Solution 185 | The most recently changed file is listed last when using `-rt`. This 186 | can be very useful for finding your most recent edits or checking to 187 | see if a new output file was written. 188 | ::: 189 |
190 | 191 | 192 | --- 193 | 194 | ## Getting Data 195 | 196 | 1. Let's [download some data](/data/shell-lesson-data.zip) 197 | 2. unzip it and move it to your `home` directory. (`~`, not `Desktop`!) 198 | 199 | --- 200 | 201 | ## Exploring More Directories 202 | 203 | * `ls` can also list directories other than the *current* one. 204 | * Let's see what is on our `home`: 205 | 206 | ```bash 207 | $ cd # goes HOME 208 | $ ls -F . 209 | ``` 210 | 211 | shows for Nelle only the data we just downloaded: 212 | 213 | ``` 214 | shell-lesson-data/ 215 | ``` 216 | 217 | * We can also look *inside* that data from where we are: 218 | 219 | ```bash 220 | $ ls -F shell-lesson-data 221 | ``` 222 | 223 | ``` 224 | exercise-data/ north-pacific-gyre/ 225 | ``` 226 | 227 | * looks intriguing 🧐. Let's try and go there! 228 | 229 | --- 230 | 231 | ## Going into Subdirectories 232 | 233 | * `cd` is for *change directory*. Moves the shell to a different location in the file system. 234 | * Let's go to our data folder: 235 | 236 | ```bash 237 | $ cd shell-lesson-data 238 | $ cd exercise-data 239 | ``` 240 | 241 | * Notice that the `cd` command does not print any output by default. 242 | * Run `ls -F` again to see what's in this directory! 243 | * Run `pwd` to see where we are! 244 | 245 | --- 246 | 247 | ## Coming Back from Subdirectories 248 | 249 | * Now we want to go back up one level. 250 | * It's tempting to say `cd shell-lesson-data` 251 | * But `cd` can only go into *its own subdirectories*. 252 | * It has a special one: `..` is its *parent* directory, so goes one up. 253 | 254 | ```bash 255 | $ cd .. 256 | $ pwd 257 | ``` 258 | 259 | puts Nelle back into 260 | 261 | ``` 262 | /Users/nelle/shell-lesson-data 263 | ``` 264 | 265 | * Notice how `..` is listed if you flag `-a` on the `ls` command. 
266 | 267 | 268 | --- 269 | 270 | ## Hidden Files 271 | 272 | ::: {.callout-tip} 273 | 274 | # Hidden Files 275 | 276 | * Typing `cd` without any arguments puts you back into your Home directory. Do it. 277 | * Let's use `ls -F -a` or `ls -Fa` to list *all* files. Also **hidden** ones! 278 | 279 | ::: 280 | 281 | ::: {.callout-note} 282 | 283 | # Relative and Absolute Paths 284 | 285 | * Up until now, we used *relative paths*. `cd` and `ls` operated **from our current position** in the file system. 286 | * We can also specify the *absolute path*, i.e. starting at the root `/`. This allows us to go anywhere from anywhere. 287 | 288 | ::: 289 | 290 | 291 | --- 292 | 293 | ## More Shortcuts 294 | 295 | ::: {.callout-note} 296 | 297 | # Tilde (`~`) and dash (`-`) 298 | 299 | * The tilde `~` in first position means *current user's home* 300 | * The dash in `cd -` means *go into the directory I was previously in*. 301 | * So: 302 | 303 | 1. `cd ..` brings you *up* one level 304 | 2. `cd -` takes you *back* to wherever you've come from. 305 | ::: 306 | 307 | --- 308 | 309 | ## Challenges 310 | 311 | ::: {.callout-caution} 312 | 313 | # Challenge 314 | 315 | Starting from `/Users/amanda/data`, 316 | which of the following commands could Amanda use to navigate to her home directory, 317 | which is `/Users/amanda`? 318 | 319 | 1. `cd .` 320 | 2. `cd /` 321 | 3. `cd /home/amanda` 322 | 4. `cd ../..` 323 | 5. `cd ~` 324 | 6. `cd home` 325 | 7. `cd ~/data/..` 326 | 8. `cd` 327 | 9. `cd ..` 328 | 329 | ::: 330 | 331 | --- 332 | 333 | ## Solution 334 | 335 | 
336 | Show Solution 337 | 338 | ::: {.callout-note} 339 | # Solution 340 | 341 | 1. No: `.` stands for the current directory. 342 | 2. No: `/` stands for the root directory. 343 | 3. No: Amanda's home directory is `/Users/amanda`. 344 | 4. No: this command goes up two levels, i.e. ends in `/Users`. 345 | 5. Yes: `~` stands for the user's home directory, in this case `/Users/amanda`. 346 | 6. No: this command would navigate into a directory `home` in the current directory 347 | if it exists. 348 | 7. Yes: unnecessarily complicated, but correct. 349 | 8. Yes: shortcut to go back to the user's home directory. 350 | 9. Yes: goes up one level. 351 | 352 | 353 | ::: 354 |
355 | 356 | --- 357 | 358 | 359 | ![](/images/filesystem-challenge.svg){.absolute top=200 right=30 width="550" height="500"} 360 | 361 | ::: {.callout-caution} 362 | 363 | # Challenge 364 | Using the filesystem diagram , if `pwd` displays `/Users/thing`, 365 | what will `ls -F ../backup` display? 366 | 367 | 1. `../backup: No such file or directory` 368 | 2. `2012-12-01 2013-01-08 2013-01-27` 369 | 3. `2012-12-01/ 2013-01-08/ 2013-01-27/` 370 | 4. `original/ pnas_final/ pnas_sub/` 371 | 372 | 373 | 374 | ::: 375 | 376 | 377 | --- 378 | 379 | ## Solution 380 | 381 |
382 | Show Solution 383 | 384 | ::: {.callout-note} 385 | # Solution 386 | 387 | 1. No: there *is* a directory `backup` in `/Users`. 388 | 2. No: this is the content of `Users/thing/backup`, but with `..`, we asked for one level further up. 389 | 3. No: see previous explanation. 390 | 4. Yes: `../backup/` refers to `/Users/backup/`. 391 | 392 | ::: 393 |
394 | 395 | --- 396 | 397 | ![](/images/filesystem-challenge.svg){.absolute top=200 right=60 width="550" height="500"} 398 | 399 | ::: {.callout-caution} 400 | 401 | # Challenge 402 | Using the filesystem diagram below, if `pwd` displays `/Users/backup`,and `-r` tells `ls` to display things in reverse order, 403 | what command(s) will result in the following output: 404 | 405 | ``` 406 | pnas_sub/ pnas_final/ original/ 407 | ``` 408 | 409 | is it: 410 | 411 | 1. `ls pwd`? 412 | 2. `ls -r -F`? 413 | 3. `ls -r -F /Users/backup`? 414 | 415 | 416 | ::: 417 | 418 | --- 419 | 420 | ## Solution 421 | 422 |
423 | Show Solution 424 | 425 | ::: {.callout-note} 426 | # Solution 427 | 428 | 1. No: `pwd` is not the name of a directory. 429 | 2. Yes: `ls` without directory argument lists files and directories 430 | in the current directory. 431 | 3. Yes: uses the absolute path explicitly. 432 | 433 | ::: 434 |
435 | 436 | 437 | --- 438 | 439 | ## General Syntax of Shell Commands {.smaller} 440 | 441 | Let's take this command as an example: 442 | 443 | ```bash 444 | $ ls -F / 445 | ``` 446 | 447 | ![General syntax of a shell command](/images/shell_command_syntax.svg) 448 | 449 | * The space between `ls` and whatever options you put is important. 450 | * Capitalization is important. `ls -s` is not the same as `ls -S`: 451 | 452 | ```bash 453 | $ cd ~/shell-lesson-data 454 | $ ls -s exercise-data # size 455 | $ ls -S exercise-data # sort by size 456 | ``` 457 | 458 | --- 459 | 460 | ## Nelle's Pipeline and Tab Completion 461 | 462 | 1. Nelle organized the output of the assay machine into `north-pacific-gyre/`. Let's go there. 463 | ```bash 464 | $ cd ~/shell-lesson-data/ 465 | $ cd north-pacific-gyre 466 | ``` 467 | 2. Now `north-pacific-gyre` is a mouthful to write. Try instead to type `cd n` and hit the TAB key. 468 | 3. Hitting TAB twice without any leading character gives you a list of files in the current directory. 469 | 470 | 471 | --- 472 | 473 | ::: {.callout-tip} 474 | 475 | # Key Points 476 | 477 | - The file system is responsible for managing information on the disk. 478 | - Information is stored in files, which are stored in directories (folders). 479 | - Directories can also store other directories, which then form a directory tree. 480 | - `pwd` prints the user's current working directory. 481 | - `ls [path]` prints a listing of a specific file or directory; `ls` on its own lists the current working directory. 482 | - `cd [path]` changes the current working directory. 483 | - Most commands take options that begin with a single `-`. 484 | - Directory names in a path are separated with `/` on Unix, but `\` on Windows. 485 | - `/` on its own is the root directory of the whole file system. 486 | - An absolute path specifies a location from the root of the file system. 487 | - A relative path specifies a location starting from the current location. 
488 | - `.` on its own means 'the current directory'; `..` means 'the directory above the current one'. 489 | 490 | ::: -------------------------------------------------------------------------------- /03-filework.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Working with Files and Directories" 3 | format: 4 | revealjs: 5 | theme: _extensions/metropolis-theme/metropolis.scss 6 | chalkboard: true 7 | logo: /images/ScPo-logo.png 8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 9 | incremental: false 10 | code-line-numbers: false 11 | highlight-style: github 12 | author: Florian Oswald and The Software Carpentry 13 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 14 | date: today 15 | date-format: "D MMMM, YYYY" 16 | --- 17 | 18 | 19 | ## Intro 20 | 21 | ::: {.callout-note} 22 | 23 | # Questions 24 | 25 | - How can I create, copy, and delete files and directories? 26 | - How can I edit files? 27 | 28 | ::: 29 | 30 | ::: {.callout-tip} 31 | 32 | # Objectives 33 | 34 | - Create a directory hierarchy that matches a given diagram. 35 | - Create files in that hierarchy using an editor or by copying and renaming existing files. 36 | - Delete, copy and move specified files and/or directories. 37 | 38 | ::: 39 | 40 | --- 41 | 42 | ## Creating Directories 43 | 44 | * Let's create a directory `thesis` here: 45 | ```bash 46 | $ cd ~/shell-lesson-data/exercise-data/writing 47 | $ ls -F 48 | ``` 49 | this outputs: 50 | 51 | ``` 52 | haiku.txt LittleWomen.txt 53 | ``` 54 | 55 | * Use `mkdir` to create: 56 | ```bash 57 | $ mkdir thesis 58 | ``` 59 | 60 | used like this, `thesis` is created in the current directory. 
With the `-p` flag, we can also create nested subdirectories in one go: 61 | 62 | ```bash 63 | $ mkdir -p ../project/data ../project/results 64 | ``` 65 | 66 | --- 67 | 68 | ## Good File Names 69 | 70 | 71 | ::: {.callout-warning} 72 | 73 | # Bad File Names 74 | 75 | 1. Don't use spaces. Spaces don't work well in Unix file names. `north pacific gyre` is not a good one. Use `north-pacific-gyre` instead. 76 | 2. Don't begin with `-`. 77 | 3. Stick with letters, numbers, `.`, `-`, and `_` 78 | 79 | ::: 80 | 81 | --- 82 | 83 | ## Creating a Text File 84 | 85 | * Let's go into the `thesis` directory and create a text file called `draft.txt`. 86 | 87 | ```bash 88 | $ cd thesis 89 | $ nano draft.txt 90 | ``` 91 | 92 | ::: {.callout-note} 93 | 94 | # TEXT Editor 95 | 96 | `nano` is a super simple editor, and you can use it *only* to edit text files (That's normal for *text editors* 😉). You will probably switch to a more powerful editor later on (I recommend `VSCode`), but `nano` is a good starting point. 97 | Notice that the `^` key is the `Ctrl` key, so `^X` means `Ctrl + X`. 98 | ::: 99 | 100 | 101 | --- 102 | 103 | ## Filename Extensions 104 | 105 | 106 | ### Task 107 | 108 | 1. Go to your home directory: `cd` 109 | 2. create an *empty* file with the `touch` command: 110 | 111 | ```bash 112 | $ # this is a comment, by the way 113 | $ cd # so, going home. 114 | $ touch new_doc.pdf # creating an empty file. 115 | ``` 116 | 117 | 3. Open your file browser and double click on `new_doc.pdf`. What is going to happen? 118 | 119 | 120 | --- 121 | 122 | * Ok, let's get rid of that file now. 123 | * use the `rm` command (more later) 124 | ```bash 125 | $ rm new_doc.pdf 126 | ``` 127 | * Caution: whatever you `rm` is gone forever. 128 | * You can add `-i` *interactive* to be safe(r). 
129 | 130 | 131 | --- 132 | 133 | ## Moving Files and Directories 134 | 135 | * Let's go back to the `writing` directory 136 | ```bash 137 | $ cd ~/shell-lesson-data/exercise-data/writing 138 | ``` 139 | * Let's *rename* `draft.txt` to `quotes.txt` with `mv`. 140 | ```bash 141 | $ mv thesis/draft.txt thesis/quotes.txt 142 | ``` 143 | * Now let's actually *move* it into the current dir: 144 | ```bash 145 | $ mv thesis/quotes.txt . 146 | ``` 147 | * Notice: `mv x y` means `x` is gone afterwards! 148 | 149 | --- 150 | 151 | ::: {.callout-caution} 152 | 153 | # Challenge 154 | 155 | Jamie placed the files `maltose.dat` and `sucrose.dat` in the `analyzed` folder by mistake. He wants to move those back to the `raw` folder now: 156 | 157 | ```bash 158 | $ ls -F 159 | analyzed/ raw/ 160 | $ ls -F analyzed 161 | fructose.dat glucose.dat maltose.dat sucrose.dat 162 | $ cd analyzed 163 | ``` 164 | What has to go in the blanks to achieve this? 165 | 166 | ```bash 167 | $ mv sucrose.dat maltose.dat ____/____ 168 | ``` 169 | ::: 170 | 171 | 
172 | Show Solution 173 | 174 | ::: {.callout-note} 175 | # Solution 176 | 177 | ```bash 178 | $ mv sucrose.dat maltose.dat ../raw 179 | ``` 180 | 181 | ::: 182 |
183 | 184 | 185 | --- 186 | 187 | ## Copying Files and Directories 188 | 189 | * `cp x y` is similar to `mv x y`, but you keep `x`. 190 | ```bash 191 | $ cp quotes.txt thesis/quotations.txt 192 | $ ls quotes.txt thesis/quotations.txt 193 | ``` 194 | 195 | * the `-r` option means *recursively* and copies entire folders: 196 | ```bash 197 | $ cp -r thesis thesis_backup 198 | $ ls thesis thesis_backup 199 | ``` 200 | 201 | * Notice that `rm -r mydir` will delete everything inside the `mydir` folder! 202 | 203 | 204 | --- 205 | 206 | ## Using *Wildcards* 207 | 208 | * the `*` character is a *wildcard*, i.e. it matches zero or more characters: 209 | ```bash 210 | $ cd ~/shell-lesson-data/exercise-data/ 211 | $ ls proteins/p* 212 | proteins/pentane.pdb proteins/propane.pdb 213 | ``` 214 | 215 | 216 | --- 217 | 218 | ## Reproducing a Folder Structure 219 | 220 | Suppose we want to create the following structure on our computer: 221 | 222 | ```bash 223 | 2016-05-20/ 224 | └── data 225 | ├── processed 226 | └── raw 227 | ``` 228 | 229 | ::: {.callout-caution} 230 | # Challenge 231 | 232 | Which sequence will achieve this result? 233 | 234 | ``` 235 | 1. 236 | $ mkdir 2016-05-20 237 | $ mkdir 2016-05-20/data 238 | $ mkdir 2016-05-20/data/processed 239 | $ mkdir 2016-05-20/data/raw 240 | ``` 241 | 242 | ``` 243 | 2. 244 | $ mkdir 2016-05-20/data/raw 245 | $ mkdir 2016-05-20/data/processed 246 | ``` 247 | 248 | ``` 249 | 3. 250 | $ mkdir -p 2016-05-20/data/raw 251 | $ mkdir -p 2016-05-20/data/processed 252 | ``` 253 | 254 | ::: 255 | 256 | 257 | --- 258 | 259 | ## Nice Trick 260 | 261 | * Oh by the way. 262 | * If you are on MacOS, try this on the command line: 263 | 264 | ```bash 265 | $ open . 266 | ``` 267 | 268 | * Pretty handy! 269 | 270 | --- 271 | 272 | ::: {.callout-tip} 273 | 274 | - `cp [old] [new]` copies a file. 275 | - `mkdir [path]` creates a new directory. 276 | - `mv [old] [new]` moves (renames) a file or directory. 277 | - `rm [path]` removes (deletes) a file. 
278 | - `*` matches zero or more characters in a filename, so `*.txt` matches all files ending in `.txt`. 279 | - `?` matches any single character in a filename, so `?.txt` matches `a.txt` but not `any.txt`. 280 | - Use of the Control key may be described in many ways, including `Ctrl-X`, `Control-X`, and `^X`. 281 | - The shell does not have a trash bin: once something is deleted, it's really gone. 282 | - Most files' names are `something.extension`. The extension isn't required, and doesn't guarantee anything, but is normally used to indicate the type of data in the file. 283 | - Depending on the type of work you do, you may need a more powerful text editor than Nano. 284 | 285 | ::: 286 | 287 | -------------------------------------------------------------------------------- /04-pipes.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Pipes and Filters" 3 | format: 4 | revealjs: 5 | theme: _extensions/metropolis-theme/metropolis.scss 6 | chalkboard: true 7 | logo: /images/ScPo-logo.png 8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 9 | incremental: false 10 | code-line-numbers: false 11 | highlight-style: github 12 | author: Florian Oswald and The Software Carpentry 13 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 14 | date: today 15 | date-format: "D MMMM, YYYY" 16 | --- 17 | 18 | 19 | ## Combining Commands 20 | 21 | * We are now ready to combine some of the commands we learned. 22 | 23 | * You will see that here is where the real power lies. 24 | 25 | * Let's navigate into our exercise data folder first. 26 | 27 | ```bash 28 | $ cd ~/shell-lesson-data/exercise-data/proteins 29 | $ ls 30 | cubane.pdb ethane.pdb methane.pdb octane.pdb pentane.pdb propane.pdb 31 | ``` 32 | 33 | * Those are *protein data bank* files. 
34 | 35 | --- 36 | 37 | ## Capturing Output 38 | 39 | * Introducing the `wc` word count command. 40 | 41 | ```bash 42 | wc cubane.pdb 43 | 20 156 1158 cubane.pdb 44 | ``` 45 | 20 lines, 156 words, 1158 characters. 46 | 47 | * Let's **redirect** the output of `wc` to a file instead with `>`: 48 | 49 | ```bash 50 | wc -l *.pdb > lengths.txt 51 | ``` 52 | 53 | * no output on screen, you see? but now there is a new file: `lengths.txt`. 54 | * Let's *concatenate* its content (i.e. join together) and print to screen: 55 | ```bash 56 | $ cat lengths.txt 57 | 20 cubane.pdb 58 | 12 ethane.pdb 59 | 9 methane.pdb 60 | 30 octane.pdb 61 | 21 pentane.pdb 62 | 15 propane.pdb 63 | 107 total 64 | ``` 65 | 66 | --- 67 | 68 | ## Reading Text Files 69 | 70 | * `cat` prints the entire thing to screen. 71 | * `tail` only the end 72 | * `head` only the beginning 73 | * `less` lets you scroll and read (arrows up/down or `j` (down) and `k` (up), `q` exits.) 74 | 75 | ```bash 76 | $ head -n 3 ../animal-counts/animals.csv 77 | $ tail -n 2 ../animal-counts/animals.csv 78 | $ less ../../north-pacific-gyre/NENE01729A.txt 79 | ``` 80 | 81 | 82 | --- 83 | 84 | ## Printing Text with `echo` 85 | 86 | * The `echo` function prints text - by default to screen: 87 | ```bash 88 | $ echo hi 89 | hi 90 | ``` 91 | 92 | * But you can redirect it to a file as well: 93 | ```bash 94 | $ echo I said hi! > echofile1.txt 95 | ``` 96 | 97 | ::: {.callout-important} 98 | 99 | # Challenge 100 | 101 | Do 2 times in a row: 102 | ```bash 103 | $ echo I said hi! > echofile1.txt 104 | ``` 105 | 106 | Now do twice (notice `>>`!) 107 | ```bash 108 | $ echo I said hi! >> echofile2.txt 109 | ``` 110 | * What's happening? 111 | ::: 112 | 113 | 114 | --- 115 | 116 | ## Appending to Files 117 | 118 | ::: {.callout-important} 119 | 120 | # Challenge 121 | 122 | Consider the file `shell-lesson-data/exercise-data/animal-counts/animals.csv`. 
What is the result of this: 123 | 124 | ```bash 125 | $ head -n 3 animals.csv > animals-subset.csv 126 | $ tail -n 2 animals.csv >> animals-subset.csv 127 | ``` 128 | 129 | 1. The first three lines of animals.csv? 130 | 2. The last two lines of animals.csv? 131 | 3. The first three lines and the last two lines of animals.csv? 132 | 4. The second and third lines of animals.csv? 133 | ::: 134 | 135 | 136 |
137 | Show Solution 138 | 139 | ::: {.callout-note} 140 | # Solution 141 | Option 3 is correct. 142 | ::: 143 |
144 | 145 | 146 | --- 147 | 148 | ## Filtering Files with `sort` 149 | 150 | * `sort` reads a file and *sorts* its content to screen 151 | * it does not change the file. 152 | 153 | ```bash 154 | $ sort -n lengths.txt 155 | ``` 156 | 157 | ``` 158 | 9 methane.pdb 159 | 12 ethane.pdb 160 | 15 propane.pdb 161 | 20 cubane.pdb 162 | 21 pentane.pdb 163 | 30 octane.pdb 164 | 107 total 165 | ``` 166 | 167 | --- 168 | 169 | ## Filtering Files and using the result 170 | 171 | * Cool 😎 but now we want to use this list. 172 | * Could save it to a new file? 173 | 174 | ```bash 175 | $ sort -n lengths.txt > sorted_lengths.txt 176 | $ head -n 2 sorted_lengths.txt 177 | ``` 178 | 179 | ``` 180 | 9 methane.pdb 181 | 12 ethane.pdb 182 | ``` 183 | 184 | ## Filtering Files and **the pipe** 185 | 186 | * We call `|` the pipe. It takes output from a command and gives it to another command. 187 | * Modern languages use their own version of this (R has a package and now also a native pipe, julia has of course a pipe etc. Stata not sure 😜) 188 | * The **pipe** allows us to do this *without* storing intermediate results. 189 | 190 | ```bash 191 | $ sort -n lengths.txt | head -n 1 192 | ``` 193 | 194 | ``` 195 | 9 methane.pdb 196 | ``` 197 | 198 | * But, wait 🤔. Then we don't even need `lengths.txt`: 199 | 200 | ```bash 201 | $ wc -l *.pdb | sort -n | head -n 1 202 | ``` 203 | 204 | ``` 205 | 9 methane.pdb 206 | ``` 207 | 208 | * That's a *pipeline*. 🤯 209 | 210 | 211 | --- 212 | 213 | 214 | ## Piping Away 215 | 216 | * Make sure we are still in `~/shell-lesson-data/exercise-data/proteins` 217 | 218 | 219 | :::{.callout-important} 220 | 221 | # Pipe Dreams 222 | 223 | Which of the following commands shows us the 3 files with the fewest lines in the current directory? Build the pipeline up from left to right to check! 224 | 225 | 1. `wc -l * > sort -n > head -n 3` 226 | 2. `wc -l * | sort -n | head -n 1-3` 227 | 3. `wc -l * | sort -n | tail -n 4 | head -n 3` 228 | 4. 
`wc -l * | sort -n | head -n 3` 229 | 230 | ::: 231 | 232 | --- 233 | 234 | ## Piping Away 235 | 236 | * Make sure we are still in `~/shell-lesson-data/exercise-data/proteins` 237 | 238 | 239 | 240 |
241 | Show Solution 242 | 243 | ::: {.callout-note} 244 | # Solution 245 | Option 4 is correct. Option 3 finds the ones with *most* lines. 246 | ::: 247 |
248 | 249 | 250 | --- 251 | 252 | ## Cutting and Piping 253 | 254 | * We have a `.csv` file here: `shell-lesson-data/exercise-data/animal-counts` 255 | * Let's use the `cut` command to get parts of it. 256 | 257 | ```bash 258 | $ cd ~/shell-lesson-data/exercise-data/animal-counts 259 | $ cut -d , -f 2 animals.csv 260 | ``` 261 | 262 | ::: {.callout-important} 263 | 264 | # Building a Pipe 265 | 266 | * `uniq` filters **adjacent** matching lines in a file. 267 | * Can you extend the above command with `uniq` (and another command?) such that we get the list of unique animal names? 268 | * Add the `-c` flag to `uniq` to get a contingency table. 269 | 270 | ::: 271 | 272 | --- 273 | 274 | ## Building a Pipe 275 | 276 | 277 |
278 | Show Solution 279 | 280 | ::: {.callout-note} 281 | # Solution 282 | 1. `cut -d , -f 2 animals.csv | sort | uniq` 283 | 1. `cut -d , -f 2 animals.csv | sort | uniq -c` 284 | ::: 285 |
286 | 287 | 288 | --- 289 | 290 | ## House Prices in France 291 | 292 | The below dataset contains information on house sales (price, location, type of house etc). We call one record a _housing transaction_. 293 | 294 | Using the shell: 295 | 296 | 1. Use `wget` to download the data [from here](https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20240408-125738/valeursfoncieres-2023.txt) to your downloads folder (it is saved as `valeursfoncieres-2023.txt`): `wget https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20240408-125738/valeursfoncieres-2023.txt` 297 | 2. use `wc -l` to count how many rows (*lines*) there are 298 | 3. use `head -n 2` to see the first two rows (the *header*) 299 | 4. Use the above solution to build a contingency table that tells us the number of housing transactions per *commune*. Show the 10 cities with most housing transactions. 300 | 5. Compute the average of variable `Valeur fonciere`. You should use the `awk` command like this : `awk 'BEGIN{s=0;}{s+=$1;}END{print s/(NR);}' your_file.txt` 301 | 302 | --- 303 | 304 | ## House Prices in France 305 | 306 | The below dataset contains information on house sales (price, location, type of house etc). We call one record a _housing transaction_. 307 | 308 | Using the shell: 309 | 310 | 5. Compute the average of variable `Valeur fonciere`. You should use the `awk` command like this : `awk 'BEGIN{s=0;}{s+=$1;}END{print s/(NR);}' your_file.txt` 311 | 312 | --- 313 | 314 | ## Real Data 315 | 316 |
317 | Show Solution 318 | 319 | 1. `wget https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20240408-125738/valeursfoncieres-2023.txt` 320 | 2. `wc -l valeursfoncieres-2023.txt` 321 | 3. `head -n 2 valeursfoncieres-2023.txt` 322 | 4. `cut -d '|' -f 18 valeursfoncieres-2023.txt | sort | uniq -c | sort -r | head -n 10 ` 323 | 5. `cut -d '|' -f 11 valeursfoncieres-2023.txt | cut -d , -f 1 | awk 'BEGIN{s=0;}{s+=$1;}END{print s/(NR);}' ` 324 | 6. If you have R installed you can check whether this is the same as reading this column into it (it's not! odd.) 325 | 326 | ```bash 327 | Rscript -e 'x = data.table::fread(cmd = "cut -d \'|\' -f 11 valeursfoncieres-2023.txt | cut -d , -f 1"); x[, mean(`Valeur fonciere`,na.rm = TRUE)]' 328 | ``` 329 | 330 |
331 | 332 | 333 | 334 | 335 | -------------------------------------------------------------------------------- /05-git.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Version Control with `Git`" 3 | format: 4 | revealjs: 5 | theme: _extensions/metropolis-theme/metropolis.scss 6 | chalkboard: true 7 | logo: /images/ScPo-logo.png 8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 9 | incremental: false 10 | code-line-numbers: false 11 | highlight-style: github 12 | author: Florian Oswald and The Software Carpentry 13 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 14 | date: today 15 | date-format: "D MMMM, YYYY" 16 | --- 17 | 18 | ## Version What? 19 | 20 | 21 | ::::: {.callout-note} 22 | # Question 23 | 24 | * What is Version Control and Why Should I Care? 25 | ::::: 26 | 27 | ::::: {.callout-tip} 28 | # Objectives 29 | - Understand the benefits of an automated version control system. 30 | - Understand the basics of how automated version control systems work. 31 | :::::: 32 | 33 | --- 34 | 35 | ## Final.doc 36 | 37 | !["Piled Higher and Deeper" by Jorge Cham, http://www.phdcomics.com](/images/phd101212s.png) 38 | 39 | 40 | 41 | --- 42 | 43 | ## Undo 44 | 45 | * The *latest version* is often best for text documents. 46 | * However, sometimes our view of *best* evolves. Then, we want to *undo*. 47 | * *Undo* means going back in history. 48 | 49 | . . . 50 | 51 | * MS Word etc have *track changes* features. 52 | * Once you accepted the proposed changes of a collaborator, can you go back? 53 | * What about Dropbox-like solutions? (What *is* dropbox actually?) 54 | 55 | --- 56 | 57 | ## Which Version: 20210611_draft.tex 58 | 59 | :::: {.columns} 60 | 61 | ::: {.column width="50%"} 62 | 63 | > Research team 👇 orders files by YYYYMMDD. 64 | 65 | * *Hey, fixed that thing last week.* 66 | * In `20220629-paper.tex`? 
67 | * *Erm. Yes. No. I think `20211203-paper.tex` - messed up the file name.* 68 | * Ok, can you copy it into the latest version? 69 | * *Sure. Damn, can't find it anymore. I'll just write it again. All in my head.* 🤯 70 | ::: 71 | 72 | ::: {.column width="40%"} 73 | !["True Story" by Florian Oswald](/images/which-version.png) 74 | ::: 75 | 76 | :::: 77 | 78 | 79 | --- 80 | 81 | ## Which Version 2: **Why is the sample size so small suddenly**? 82 | 83 | 84 | ::::: {.columns} 85 | 86 | :::: {.column width="40%"} 87 | 88 | * We had 800 observations, now 733. Why? 89 | * Erm...😱 No clue! 90 | * Well you must have changed the code. 91 | * Yes, I *improved* the code in several parts. 92 | * Well you have to find out what happened. 93 | * But that was weeks ago - I don't remember! 😢 94 | :::: 95 | 96 | 97 | 98 | :::: {.column width="50%"} 99 | 100 | 101 | ### Hard Bugs 102 | 103 | * The hard bugs 🐛 are the ones you see only after a while. 104 | * See result today, error was introduced long ago. 105 | * You can rewind dropbox 30 days. What if... ? 106 | * Also, throw away 30 days of work? 107 | * 😱 😱 😱 😱 108 | 109 | :::: 110 | 111 | 112 | 113 | ::::: 114 | 115 | 116 | --- 117 | 118 | ## {background-image="./images/removed-that.png" background-size=100%} 119 | 120 | 121 | 122 | --- 123 | 124 | 125 | ## Setting Up Git 126 | 127 | * We all installed `git`. 128 | * Let's setup our name 129 | 130 | ```bash 131 | $ git config --global user.name "Your Name" 132 | $ git config --global user.email "your@mail.com" 133 | ``` 134 | 135 | * Line Endings on Windows: 136 | 137 | ```bash 138 | git config --global core.autocrlf false 139 | ``` 140 | 141 | --- 142 | 143 | 144 | ## Creating a Git **Repository** 145 | 146 | 147 | ::::: {.callout-note} 148 | # Question 149 | 150 | * Where does Git store information? 
151 | 152 | ::::: 153 | 154 | ::::: {.callout-tip} 155 | # Objectives 156 | - Create a local repository 157 | - Describe purpose of `.git` directory 158 | :::::: 159 | 160 | 161 | --- 162 | 163 | ## House Prices Project 164 | 165 | * Let's create a project folder in our home to look at the house prices from last week. 166 | 167 | ```bash 168 | $ cd # going to home dir 169 | $ mkdir houseprices # create directory 170 | $ cd houseprices 171 | $ git init 172 | ``` 173 | 174 | * Now the directory `~/houseprices` is endowed with `git` version control. 175 | * What does that look like? 176 | 177 | 178 | --- 179 | 180 | ## Where is Git? 181 | 182 | * Remember *hidden files* and folders? 183 | 184 | ```bash 185 | $ ls -a 186 | ./ ../ .git/ 187 | ``` 188 | 189 | * Git for this repository resides in `.git` 190 | 191 | ::: {.callout-warning} 192 | 193 | # Danger Zone 194 | 195 | * If you _delete_ that folder, the entire version control is GONE. 196 | * Be very careful that you really want to do that. 197 | 198 | ::: 199 | 200 | 201 | --- 202 | 203 | 204 | ## Tracking Changes with Git 205 | 206 | 207 | 208 | ::::: {.callout-note} 209 | # Question 210 | 211 | * How do I record changes in Git? 212 | * How do I check the status of my version control repository? 213 | * How do I record notes about what changes I made and why? 214 | ::::: 215 | 216 | ::::: {.callout-tip} 217 | # Objectives 218 | - Understand the benefits of an automated version control system. 219 | - Understand the basics of how automated version control systems work. 220 | :::::: 221 | 222 | 223 | 224 | 225 | --- 226 | 227 | ## Adding Code and Text 228 | 229 | ::: {.callout-note} 230 | 231 | * Notice: The code we produce **is** text. 232 | * Remember what we learned about **file endings**. 233 | 234 | ::: 235 | 236 | * Let's add a shell script where we add our pipeline from last week. 237 | 238 | 1. 
run to get the raw data again: 239 | ```bash 240 | wget https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20240408-125738/valeursfoncieres-2023.txt 241 | ``` 242 | 243 | --- 244 | 245 | ## Adding Code and Text 246 | 247 | 248 | 2. create a script 249 | 250 | ```bash 251 | nano maketable.sh # open nano 252 | # type this: 253 | cd ~/houseprices # make sure we are in the right place 254 | cut -d '|' -f 18 valeursfoncieres-2023.txt | sort | uniq -c | sort -r | head -n 10 255 | # save and exit 256 | ``` 257 | 258 | 3. (Does it work?) 259 | ```bash 260 | ls . # check the new file is there 261 | ./maketable.sh # run it! 262 | ``` 263 | 264 | . . . 265 | 266 | 4. No, it doesn't. 😖 267 | ```bash 268 | chmod +x ./maketable.sh # add executable mode 269 | ls -a 270 | ./maketable.sh 271 | ``` 272 | 273 | 274 | 275 | --- 276 | 277 | ## Viewing Changes 278 | 279 | * Ok, now let's see what `git` makes of our additions to this directory. 280 | 281 | ```tcl 282 | floswald@PTL11077 ~/houseprices (main)> git status 283 | On branch main 284 | 285 | No commits yet 286 | 287 | Untracked files: 288 | (use "git add ..." to include in what will be committed) 289 | valeursfoncieres-2023.txt 290 | maketable.sh 291 | ``` 292 | 293 | * It is actually helpful **not** to use `bash` as a shell... 294 | * Customizing your shell is an extremely effective procrastination device. 295 | * You must know what [*shaving a Yak*](https://projects.csail.mit.edu/gsb/old-archive/gsb-archive/gsb2000-02-11.html) means before you walk out of my class. 296 | 297 | 298 | --- 299 | 300 | ## Seeing the Difference 301 | 302 | * the command `git diff` shows you what changed between versions. 303 | * lets see what it shows now: 304 | 305 | ```bash 306 | $ git diff 307 | ``` 308 | 309 | * It shows nothing, i.e. an _empty_ diff, because _there are no commits yet_ to compare with. 310 | * Ok, let's change that. 
311 | 312 | 313 | --- 314 | 315 | ## Modify-Add-Commit 1 316 | 317 | * git reports about _untracked files_. We need to decide *what to track*. 318 | 319 | 1. Move files to *staging area*: 320 | ```bash 321 | git add maketable.sh 322 | git status 323 | ``` 324 | 325 | * Notice that I did *not* want to track the `csv` file. 326 | 327 | ```bash 328 | On branch main 329 | 330 | No commits yet 331 | 332 | Changes to be committed: 333 | (use "git rm --cached ..." to unstage) 334 | new file: maketable.sh 335 | 336 | Untracked files: 337 | (use "git add ..." to include in what will be committed) 338 | valeursfoncieres-2023.txt 339 | ``` 340 | 341 | 342 | --- 343 | 344 | ## Modify-Add-Commit 2 345 | 346 | * Now, let's *record* what is in the staging area. 347 | 348 | ```bash 349 | $ git commit -m 'added the maketable script' 350 | 351 | [main (root-commit) 9956506] added the maketable script 352 | 1 file changed, 2 insertions(+) 353 | create mode 100644 maketable.sh 354 | ``` 355 | 356 | * check status: 357 | 358 | ```bash 359 | $ git status 360 | 361 | On branch main 362 | Untracked files: 363 | (use "git add ..." to include in what will be committed) 364 | valeursfoncieres-2023.txt 365 | 366 | nothing added to commit but untracked files present (use "git add" to track) 367 | ➜ gasprices git:(main) ✗ 368 | ``` 369 | 370 | --- 371 | 372 | ## Modify-Add-Commit 3 373 | 374 | * Let's check what's in the log. 375 | 376 | ```bash 377 | $ git log 378 | 379 | commit 9956506dc3159403b87aea3b04654c293e82c680 (HEAD -> main) 380 | Author: Florian Oswald 381 | Date: Tue Feb 7 10:50:51 2023 +0100 382 | 383 | added the maketable script 384 | ``` 385 | 386 | --- 387 | 388 | ## Modify-Add-Commit 4 389 | 390 | * Now let's _modify_ the script finally. 391 | 392 | ```bash 393 | $ nano maketable.sh 394 | 395 | # add this line on top 396 | echo hello user, will make a contigency table now. 397 | # save and exit 398 | ``` 399 | 400 | * now - what's the difference in the repo? 
401 | 402 | --- 403 | 404 | ## Diffing 405 | 406 | * there are still the same files here: 407 | 408 | ```bash 409 | $ ls 410 | valeursfoncieres-2023.txt maketable.sh 411 | ``` 412 | 413 | * But we can now _compare_ versions: 414 | 415 | ```bash 416 | $ git diff 417 | 418 | diff --git a/maketable.sh b/maketable.sh 419 | index 7e01058..3b7007e 100644 420 | --- a/maketable.sh 421 | +++ b/maketable.sh 422 | @@ -1,2 +1,3 @@ 423 | +echo hello user, will make a contigency table now. 424 | cd ~/houseprices # make sure we are in the right place 425 | cut -d '|' -f 18 valeursfoncieres-2023.txt | sort | uniq -c | sort -r | head -n 10 426 | ``` 427 | 428 | 429 | --- 430 | 431 | ## Committing Changes Again 432 | 433 | * let's first check everything runs 434 | 435 | ```bash 436 | $ ./maketable.sh 437 | ``` 438 | 439 | * good. commit! 440 | ```bash 441 | $ git add maketable.sh 442 | $ git commit -m 'added message to user' 443 | ``` 444 | 445 | 446 | --- 447 | 448 | ## Adding a README 449 | 450 | * Good. Now let's add a `README` file. 451 | * It's customary to write this in [markdown](https://carpentries-incubator.github.io/markdown-intro/) 452 | 453 | ```bash 454 | $ nano README.md 455 | ``` 456 | write this in nano and save when done. 457 | 458 | ```md 459 | # House Prices 460 | 461 | This repo contains code to analyse house prices in France. 462 | ``` 463 | 464 | * add to staging area, so we can take a snapshot 465 | 466 | ```bash 467 | $ git add README.md 468 | $ git commit -m 'added readme' 469 | ``` 470 | 471 | --- 472 | 473 | ## What is this Staging Area? 474 | 475 | * `git` is like a photographic camera. 476 | * before you take a picture of your friends, you need to arrange them somehow, so that all fit, and so that all 😁. 477 | * You put them _on stage_. Same for files in your repo. 478 | 479 | ![figure from [software carpentry]()](/images/git-staging-area.svg) 480 | 481 | 482 | --- 483 | 484 | ## What is this Staging Area? 
485 | 486 | 487 | ![I took that picture at CDG airport](/images/git-staging-CDG.jpeg) 488 | 489 | 490 | --- 491 | 492 | ## Looking at History 493 | 494 | 495 | 496 | 497 | ::::: {.callout-note} 498 | # Question 499 | * How can I identify old versions of files? 500 | * How do I review my changes? 501 | * How can I recover old versions of files? 502 | 503 | ::::: 504 | 505 | ::::: {.callout-tip} 506 | # Objectives 507 | 508 | 509 | * Explain what the HEAD of a repository is and how to use it. 510 | * Identify and use Git commit numbers. 511 | * Compare various versions of tracked files. 512 | * Restore old versions of files. 513 | :::::: 514 | 515 | 516 | --- 517 | 518 | ## The most recent version: HEAD 519 | 520 | * Let's change the `maketable.sh` script again: 521 | ```bash 522 | $ nano maketable.sh 523 | echo program run successfully 524 | # save exit 525 | 526 | $ git add maketable.sh 527 | ``` 528 | 529 | * The most recent version of our repo is called `HEAD`. 530 | ```bash 531 | $ git diff # compares entire repo to HEAD 532 | $ git diff HEAD maketable.sh 533 | ``` 534 | 535 | --- 536 | 537 | ## Whoops, typo 538 | 539 | * Oh no, we wrote _program run successfully_. That should be _ran_ not _run_. 540 | * What now? 541 | 542 | . . . 543 | 544 | * we have not committed this yet! 545 | * we can just get back the version in HEAD, and edit again: 546 | 547 | ```bash 548 | $ git restore maketable.sh 549 | $ git checkout maketable.sh # also works 550 | ``` 551 | 552 | * edit the script, add and commit. 553 | 554 | --- 555 | 556 | ## How to get a *specific* version 557 | 558 | * What if you want something else than `HEAD`? 559 | * like, the first version of `maketable.sh`? 560 | * look at history: 561 | 562 | ```bash 563 | $ git log --oneline --graph 564 | 565 | * a6f023b (HEAD -> main) added readme 566 | * 9956506 added the maketable script 567 | ``` 568 | 569 | * The `9956506` is the unique identifier of that version. 
570 | * We can go back to that version: 571 | 572 | ```bash 573 | $ git checkout 9956506 maketable.sh 574 | ``` 575 | 576 | 577 | --- 578 | 579 | 580 | ::: {.callout-tip} 581 | 582 | # Key Points 583 | 584 | * `git diff` displays differences between commits. 585 | * `git checkout` recovers old versions of files. 586 | ::: 587 | 588 | --- 589 | 590 | ## So, how does this thing work? 591 | 592 | ![software carpentry image.](/images/git_staging.svg) 593 | 594 | 595 | --- 596 | 597 | ## Version Control with VScode 598 | 599 | * Download [Visual Studio Code](https://code.visualstudio.com/) 600 | * Start 601 | * Open folder `~/houseprices` 602 | * check version control tab on the left. 603 | 604 | 605 | 606 | --- 607 | 608 | ## Version Control with RStudio 609 | 610 | * top right click on *new project* 611 | * Select *existing directory* 612 | * Select `~/houseprices` 613 | * check out the `git` tab in RStudio! 614 | 615 | 616 | --- 617 | 618 | ## Collaborating with Git on GitHub 619 | 620 | * Create repo 621 | * copy ssh remote URL 622 | * connect local to remote repo 623 | 624 | --- 625 | 626 | ## SSH connections 627 | 628 | * Secure Shell Protocol 629 | * Private-Public key pair. It's like a lock, and you have the only key. 630 | * Let's check if you have one already! 631 | 632 | ```bash 633 | ls -la ~/.ssh 634 | ``` 635 | 636 | if error, create one: 637 | 638 | ```bash 639 | ssh-keygen -t ed25519 -C "your@email.com" 640 | ``` 641 | press enter (no passphrase) 642 | 643 | check 644 | ```bash 645 | ls -la ~/.ssh 646 | ``` 647 | 648 | --- 649 | 650 | ## Communicate with GitHub Remote 651 | 652 | * Let's ping the remote server at GitHub now. 653 | 654 | ```bash 655 | ssh -T git@github.com 656 | ``` 657 | 658 | * right, of course GitHub doesn't have our public key yet (the _lock_ for our key!) 
659 | 660 | * copy from your terminal 661 | ```bash 662 | cat ~/.ssh/id_ed25519.pub # or your *.pub 663 | ``` 664 | 665 | * Go to github.com, click top right corner, settings, SSH keys. 666 | 667 | 668 | --- 669 | 670 | ## Adding a Remote to your local Repo 671 | 672 | * Now that we can talk to Github.com, let's add the remote to our local repo. 673 | * We `add` a remote by getting the `SSH` url from the repository (green button) online. 674 | 675 | ```bash 676 | $ git remote add origin git@github.com:YOUR_USER/YOUR_REPO.git 677 | ``` 678 | * `origin` is the _name_ of the remote server. your choice, but _origin_ is common. 679 | * this should set that remote both for sending and retrieving stuff from the repo. _pull_ and _push_, in git language: 680 | 681 | ```bash 682 | $ git remote --v 683 | ``` 684 | 685 | 686 | 687 | --- 688 | 689 | ## Pushing It 690 | 691 | * Now we can _push_ our local repository to the remote repo. 692 | * There will be a full copy of what is in `.git` (i.e., the entire history of the repo) on that remote machine. 693 | * You will be able to use it like a central backup location for your work. 694 | 695 | 696 | ```bash 697 | $ git push -u origin main 698 | ``` 699 | 700 | * the `-u` flag sets the _main_ branch as default _upstream_ branch to track. 701 | 702 | 703 | --- 704 | 705 | ## Branching It 706 | 707 | * Next to different _versions_ of a file/directory _over time_, we can have versions evolving in parallel. 708 | * Imagine development history _branching_ off into 2 separate directions at one point. 709 | * They may converge at some point again, but maybe one of them will turn out a failure and we drop it. 710 | * Branches are hugely useful to organize team work. 711 | 712 | ```bash 713 | $ git checkout -b testing # checkout repo on new branch `testing` 714 | Switched to a new branch 'testing' 715 | ``` 716 | 717 | * Now can develop stuff on the `testing` branch. 718 | * Later on, we can `merge` it back into `main` if we like it. 
719 | 720 | 721 | --- 722 | 723 | ## The Full Picture(s) 724 | 725 | ![picture from [@MarkLodato](https://marklodato.github.io/visual-git-guide/index-en.html) - click for more!](/images/git-images/full.png) 726 | 727 | 728 | --- 729 | 730 | ## Pushing Branches to GitHub 731 | 732 | * Once you created a local branch you can of course copy (_push_) it to your remote to share with others. 733 | * you would amend the push command: 734 | 735 | ```bash 736 | # make sure you are on the desired branch 737 | $ git branch 738 | main 739 | * testing 740 | 741 | $ git push origin testing 742 | ``` 743 | 744 | -------------------------------------------------------------------------------- /06-concepts.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Intro to Generic Programming" 3 | --- 4 | 5 | 6 | ```{r reticulate_config} 7 | #| cache: false 8 | #| include: false 9 | library(reticulate) 10 | use_condaenv("introprog") 11 | ``` 12 | 13 | In this short lecture we introduce a few core concepts used in programming. We will be using both `R` and `python` as examples, however, the concepts are transversal across all/most languages. The implementation details - i.e. how do you _invoke_ a certain concept - will differ across languages. 14 | 15 | [How to install python](https://realpython.com/installing-python/). 16 | [Here](https://swcarpentry.github.io/python-novice-inflammation/) is a nice introduction for Python novices. 17 | 18 | 19 | ## Setup 20 | 21 | Ideally you would try to run all commands in the below for both languages. I recommend that you open two terminal windows, one running `R` and one running `python`. For python we need the `numpy` package to demonstrate array support. Depending on how you installed python, there are different options. 
22 | 23 | * Anaconda installation: `conda install numpy` 24 | * Homebrew or download from python.org : `pip install numpy` 25 | 26 | 27 | Check whether the installation worked by doing 28 | 29 | ```{python} 30 | import numpy as np 31 | # loads the numpy library and gives it 32 | # short name `np` 33 | ``` 34 | 35 | 36 | ## Variables 37 | 38 | Variables are labels for objects. This can be simple numbers, or strings, but often also any other sort of object you could think of: a plot, a table, a matrix, a vector, a list, ... 39 | 40 | What is curious to know about variables is their _scoping_ behaviour: where in our programs can we see which variable? This differs quite importantly across languages and is something that requires some thought. 41 | 42 | First, let's create a variable `x` which holds the value `12.3`: 43 | 44 | ::: {.panel-tabset group="language"} 45 | 46 | ### Python 47 | 48 | ```{python pvars} 49 | #| cache: false 50 | x = 12.3 51 | x + 5 52 | ``` 53 | 54 | ### R 55 | 56 | ```{r rvars} 57 | #| cache: false 58 | x <- 12.3 # = works also 59 | x + 5 60 | ``` 61 | 62 | ::: 63 | 64 | Next, a *function* which will use the variable - here we do not provide `x` as an argument to the function, so which value will it use in each case? 65 | 66 | ::: {.panel-tabset group="language"} 67 | 68 | ### Python 69 | 70 | ```{python pf} 71 | #| cache: false 72 | def myfun(y): 73 | return x + y # must use `return` 74 | # note the indentation! 75 | # function definition finishes after last line of indented block. 76 | 77 | myfun(8) 78 | ``` 79 | 80 | ### R 81 | 82 | ```{r rf} 83 | #| cache: false 84 | myfun <- function(y){ 85 | x + y # can use `return()` 86 | } 87 | myfun(8) 88 | ``` 89 | 90 | ::: 91 | 92 | We see that in both cases, the function looked for the variable `x` in its _enclosing scope_, i.e. the environment where the function was defined (here, the global environment). This only worked because we had defined `x` before. This may or may not work in other languages. 
In general this is called [*lexical scoping*](https://www.gnu.org/software/guile/manual/html_node/Lexical-Scope.html). 93 | 94 | ## Loops 95 | 96 | If we have a repetitive task, it's useful to be able to _iterate_, i.e. do the same thing to a potentially changing input. Consider that we had 4 numbers `2,3,4,5` and we wanted to print them to screen. We could of course write 4 identical `print` statements, each with a different input: 97 | 98 | ::: {.panel-tabset group="language"} 99 | 100 | ### Python 101 | 102 | ```{python ploop0} 103 | #| cache: false 104 | #| eval: false 105 | print("this is number",2) 106 | print("this is number",3) 107 | print("this is number",4) 108 | print("this is number",5) 109 | ``` 110 | 111 | ### R 112 | 113 | ```{r rloop0} 114 | #| cache: false 115 | #| eval: false 116 | print(paste("this is number",2)) 117 | print(paste("this is number",3)) 118 | print(paste("this is number",4)) 119 | print(paste("this is number",5)) 120 | ``` 121 | 122 | ::: 123 | 124 | but you can see that this is a lot of repetitive code, which we want to avoid. Also, adding an additional number would mean a lot of extra work. So, loops are better here: 125 | 126 | 127 | 128 | ::: {.panel-tabset group="language"} 129 | 130 | ### Python 131 | 132 | ```{python ploop} 133 | #| cache: false 134 | for i in range(2,6) : # the upper bound 6 is excluded 135 | print(f"this is number",i) # note the indentation! 
136 | ``` 137 | 138 | ### R 139 | 140 | ```{r rloop} 141 | #| cache: false 142 | for (i in 2:4){ 143 | print(paste("this is number",i)) 144 | } 145 | ``` 146 | 147 | ::: 148 | 149 | ## Useful Datastructures 150 | 151 | * python docs on [data structures](https://docs.python.org/3/tutorial/datastructures.html) 152 | * Article about [R datastructures](http://adv-r.had.co.nz/Data-structures.html) 153 | 154 | concept | Python | R 155 | -----| -------|----- 156 | 1d list | `[1,2]` | `c(1,2)` 157 | 1d vector | `np.array([1,2])` | `c(1,2)` 158 | matrix | `np.array([row, col])` | `matrix(data,rows,cols)` 159 | n-d array | `np.array` | `array` 160 | Dictionary | `dict` | `list` 161 | DataFrame | `pandas.df` | `data.frame` 162 | 163 | ### 1-D list/vector 164 | 165 | ::: {.panel-tabset group="language"} 166 | 167 | ### Python 168 | 169 | ```{python plist} 170 | #| cache: false 171 | li = [1,3] 172 | li + li # not well defined vector space with `+` and `*` 173 | ``` 174 | 175 | ### R 176 | 177 | ```{r rlist} 178 | #| cache: false 179 | li = c(1,3) 180 | li * li # element-by-element 181 | li + li 182 | ``` 183 | 184 | ::: 185 | 186 | in python we use the `numpy` package for linear algebra: 187 | 188 | 189 | ::: {.panel-tabset group="language"} 190 | 191 | ### Python 192 | 193 | ```{python pnp} 194 | #| cache: false 195 | import numpy as np 196 | li = np.array([1,3]) 197 | li * li 198 | li + li 199 | ``` 200 | 201 | ### R 202 | 203 | ```{r} 204 | #| cache: false 205 | li = c(1,3) 206 | li * li # element-by-element 207 | li + li 208 | ``` 209 | 210 | ::: 211 | 212 | ### Matrices 213 | 214 | 215 | ::: {.panel-tabset group="language"} 216 | 217 | ### Python 218 | 219 | ```{python} 220 | #| cache: false 221 | import numpy as np 222 | ma = np.array([[1,3], [2,4]]) 223 | ma * ma 224 | ma + ma 225 | ``` 226 | 227 | ### R 228 | 229 | ```{r} 230 | #| cache: false 231 | ma = matrix(c(1,2,3,4),nrow = 2, ncol = 2) 232 | ma * ma # element-by-element 233 | ma + ma 234 | ``` 235 | 236 | ::: 237 
| 238 | ### N-D arrays 239 | 240 | ::: {.panel-tabset group="language"} 241 | 242 | ### Python 243 | 244 | ```{python} 245 | #| cache: false 246 | a = np.arange(1,9) 247 | np.reshape(a, (2,2,2)) 248 | ``` 249 | 250 | ### R 251 | 252 | ```{r} 253 | #| cache: false 254 | array(1:8,dim = c(2,2,2)) 255 | ``` 256 | 257 | ::: 258 | 259 | 260 | ### Dictionaries 261 | 262 | `Dict`s are lists with a *key -> value* structure. Like a telephone book: 263 | 264 | ::: {.panel-tabset group="language"} 265 | 266 | ### Python 267 | 268 | ```{python} 269 | #| cache: false 270 | di = {'peter' : 1225, 'alice' : 4333} 271 | di 272 | ``` 273 | 274 | ### R 275 | 276 | ```{r} 277 | #| cache: false 278 | di = list(peter = 1225, alice = 4333) 279 | di 280 | ``` 281 | 282 | ::: 283 | 284 | ### DataFrames 285 | 286 | In python, we use the `pandas` package for dataframe support. In R they are built-in as we know. There are many ways to create a `pandas` dataframe. 287 | 288 | * Here is the official pandas [documentation](https://pandas.pydata.org/pandas-docs/stable/index.html). 289 | * in `R`, type `?data.frame` for the help entry. 
290 | 291 | 292 | ::: {.panel-tabset group="language"} 293 | 294 | ### Python 295 | 296 | ```{python} 297 | #| cache: false 298 | import pandas as pd 299 | d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} 300 | pd.DataFrame(d) 301 | ``` 302 | 303 | ### R 304 | 305 | ```{r} 306 | #| cache: false 307 | data.frame(one = c(1,2,3,4.0), two = c(4,3,2,1.0)) 308 | ``` 309 | 310 | ::: -------------------------------------------------------------------------------- /09-R-packages.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Building `R` Packages" 3 | format: 4 | revealjs: 5 | theme: _extensions/metropolis-theme/metropolis.scss 6 | chalkboard: true 7 | logo: /images/ScPo-logo.png 8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 9 | incremental: false 10 | code-line-numbers: false 11 | highlight-style: github 12 | slide-number: true 13 | author: Florian Oswald 14 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 15 | date: today 16 | date-format: "D MMMM, YYYY" 17 | --- 18 | 19 | ## R and Packages 20 | 21 | 22 | ::: {.columns} 23 | 24 | ::: {.column width=45%} 25 | 26 | ::: {.callout-tip} 27 | 28 | # Questions 29 | 30 | 1. Why write our _own_ `R` package? 31 | 2. How to create an `R` package? 32 | 3. What are _unit tests_? 33 | 34 | ::: 35 | 36 | ::: 37 | 38 | ::: {.column width=45%} 39 | 40 | 41 | ::: {.callout-note} 42 | 43 | # Objectives 44 | 45 | * Learn the `RStudio`-powered package development workflow. 46 | * Create a package, and test it. 47 | * Publish the package to github. 48 | * Publish the package docs as a self-contained website. 49 | 50 | ::: 51 | 52 | 53 | ::: 54 | 55 | ::: 56 | 57 | 58 | 59 | 60 | 61 | 62 | --- 63 | 64 | ## R and Packages 65 | 66 | * We have been using `R` packages all the time. 
67 | 68 | * Each time we say `library(xyz)` we are using *external* code provided in the `xyz` package. 69 | 70 | * You can write **your own** packages. 71 | 72 | . . . 73 | 74 | ::: {.callout-tip} 75 | 76 | # What's the point of Packages? 77 | 78 | 1. Extend `R` functionality. 79 | 2. **for researchers**: key tool to ensure _reproducibilty_ of findings 80 | 3. **for researchers**: key tool to _organize_ code in team work 81 | 82 | ::: 83 | 84 | * Let's go through some material from the [`r-pkgs`](https://r-pkgs.org) book! 85 | 86 | 87 | 88 | --- 89 | 90 | 91 | ## Building a Toy Package 92 | 93 | ::: {.columns} 94 | 95 | ::: {.column width=45%} 96 | ### `RStudio` for the win 97 | 98 | * We do this in `RStudio` 99 | 100 | * we use the `devtools` package 101 | 102 | * check you have a recent version: 103 | 104 | ```{r} 105 | #| echo: true 106 | packageVersion("devtools") 107 | ``` 108 | 109 | * if not - reinstall. 110 | 111 | ::: 112 | 113 | ::: {.column width=45%} 114 | 115 | ### Let's Do it! 116 | 117 | ```{r} 118 | #| echo: true 119 | #| eval: true 120 | 121 | library(devtools) 122 | 123 | # create a package `here` 124 | create_package("~/toypackage") 125 | ``` 126 | 127 | 128 | * You see Rstudio jumps to that location 129 | ::: 130 | 131 | ::: 132 | 133 | 134 | --- 135 | 136 | ## Adding Git 137 | 138 | * Of course we want to track our package with `git`. 139 | * We use functions from the `usethis` package. This is loaded by default when attaching the `devtools` package (`use_git` is part of `usethis`...) 140 | 141 | ```{r} 142 | #| eval: true 143 | #| echo: true 144 | #| 145 | library(devtools) 146 | use_git() 147 | ``` 148 | 149 | * Say `Yes` to everything ✌️ 150 | 151 | --- 152 | 153 | ## Adding Code 154 | 155 | * We add `R` _source_ code in the `R/` folder. 156 | * Create as many `.R` files as you want. 157 | * It's good practice to organize tests accompanying source files. 
158 | 159 | ```{r} 160 | #| eval: true 161 | #| echo: true 162 | #| message: true 163 | use_r("sayhello") 164 | ``` 165 | 166 | * What's with that `use_test()` thing? 🤔 Let's worry about this later. 167 | 168 | 169 | --- 170 | 171 | ## Ok, but...Adding Code?? 172 | 173 | * Let's add a function to the file `R/sayhello.R`: 174 | 175 | ```r 176 | # Notice I'm using = instead of < - because 177 | # the font of those slides prints it weirdly 178 | hello = function(who){ 179 | paste("hello,",who) 180 | } 181 | ``` 182 | 183 | * Now, if this were a simple `R` script, we could `source` the `R/sayhello.R` file into global space and try this out. 184 | * We _don't_ want to do that here though. 🤨 185 | * Instead, we want to _load_ the **package**, which _contains_ our function. 186 | * do `load_all()`: 187 | ```r 188 | load_all() 189 | ℹ Loading toypackage 190 | ``` 191 | 192 | --- 193 | 194 | ## Trying out Code 195 | 196 | ::: {.callout-note} 197 | 198 | # `load_all()` 199 | 200 | * The `load_all()` function simulates the process of building, installing, and attaching the `toypackage` package. 201 | * This means that **all** the functions you included in the package will become _visible_ in the global scope (in your console) 202 | * This is _not_ in general the case: Later on we will fine-tune which functions are visible to the user, and which ones are not! 203 | ::: 204 | 205 | * Call the function with your name! 206 | 207 | ```r 208 | hello("Peter") 209 | [1] "hello, Peter" 210 | ``` 211 | 212 | * Great! 213 | 214 | --- 215 | 216 | ## Checking the Package 217 | 218 | * `R` has a rigid set of rules for what a package needs to look like. 219 | * What files should be where, their names and permissions, such that the structure is nicely uniform across all R packages. 
220 | * Particularly relevant for _official_ packages on [CRAN](https://cran.r-project.org/) 221 | * Do this here often: 222 | 223 | ```r 224 | check() 225 | ``` 226 | 227 | * This outputs a bunch of things: 228 | 1. It actually _builds_ our package in a separate process - immune from our current workspace 229 | 2. It runs a battery of checks and returns a report: 230 | 231 | ```r 232 | 0 errors ✔ | 1 warning ✖ | 1 note ✖ 233 | ``` 234 | 235 | --- 236 | 237 | ## Editing DESCRIPTION 238 | 239 | * Open the `DESCRIPTION` file (or type `Ctrl + .` and start typing `desc`) 240 | * Fill in the obviously missing contents. 241 | 242 | ### Adding a LICENSE 243 | 244 | > [Use a license, any license (Jeff Atwood)](https://blog.codinghorror.com/pick-a-license-any-license/) 245 | 246 | Let's 247 | ```r 248 | use_mit_license() 249 | ``` 250 | 251 | --- 252 | 253 | ## Documenting with Roxygen 254 | 255 | * Go back to the `hello` function, place the cursor inside the function body, and do `Code > Insert Roxygen Skeleton`. 256 | * You'll see something like this: 257 | 258 | ```r 259 | #' Title 260 | #' 261 | #' @param who 262 | #' 263 | #' @return 264 | #' @export 265 | #' 266 | #' @examples 267 | hello <- function(who){ 268 | paste("hello,",who) 269 | } 270 | ``` 271 | 272 | * Each line starting with `#'` is part of the **docstring**. 273 | * The `roxygen` package can _separate_ those blocks from our code, and produce valid `R` documentation for us! 🤯 274 | 275 | --- 276 | 277 | ## Building Documentation 278 | 279 | * Let's modify the docstring accordingly. 280 | * execute the `document()` function. 281 | * After that, the documentation is visible to us: 282 | 283 | ```r 284 | ?hello 285 | ℹ Rendering development documentation for "hello" 286 | ``` 287 | * Look in the _Help_ pane in RStudio! 288 | 289 | 290 | --- 291 | 292 | ## NAMESPACE 293 | 294 | * Did you notice the `@export` tag in the docstring? 
295 | * when we ran `document()`, roxygen changed the `NAMESPACE` file based upon that tag. 296 | * Go and look at that file! 297 | * The contents of `NAMESPACE` specify what is _visible_ to a user who does `library(toypackage)`. 298 | * Try removing the `@export` tag, and `document()` again. Look back at `NAMESPACE`! 299 | 300 | ### `check()` again! 301 | 302 | ```r 303 | check() 304 | 0 errors ✔ | 0 warnings ✔ | 0 notes ✔ 305 | ``` 306 | 307 | --- 308 | 309 | ## Time to INSTALL the package 310 | 311 | * Ok, great. Now we have a minimal package that _works to a certain extent_ 🙂. 312 | * We must _install_ it into our package library, in order to be able to use it like any other package (same as when we did `install.packages("ggplot2")`) 313 | * Notice that `R` installs your packages here: 314 | ```{r} 315 | #| echo: true 316 | 317 | .libPaths() 318 | ``` 319 | 320 | * We _install_ our package into that location with `install()` 321 | * Look out for the final message: 322 | ```r 323 | * DONE (toypackage) 324 | ``` 325 | 👏 326 | 327 | --- 328 | 329 | ## New Session - Try it Out! 330 | 331 | * Restart Rstudio 332 | * type into the console 333 | ```r 334 | library(toypackage) 335 | ``` 336 | 337 | * and then let's see our cool 😎 function: 338 | 339 | ```r 340 | hello("John Spencer Blues Explosion") 341 | [1] "hello, John Spencer Blues Explosion" 342 | ``` 343 | 344 | * Works! Bingo! 🎉 345 | 346 | --- 347 | 348 | ## Automatically Testing Our Code 349 | 350 | * We verified ourselves that this _works_. 351 | * We had our own, informal, way to convince ourselves that it works. 352 | * We knew which steps we had to follow until we would conclude that "yes, this works". 353 | 354 | . . . 355 | 356 | ::: {.columns} 357 | 358 | ::: {.column width=45%} 359 | ::: {.callout-caution} 360 | 361 | # The Time Factor 362 | 363 | If you come back to this in 2 months time you probably 364 | 365 | a. won't remember all the steps you have taken (above) 366 | b. 
won't be able to reproduce what you _tested_ today! 367 | 368 | ::: 369 | ::: 370 | 371 | ::: {.column width=45%} 372 | 373 | ::: {.callout-warning} 374 | 375 | # The Scale Factor 376 | 377 | As your package grows, you will find it hard to come back to all components repeatedly, making sure they all _still_ work as intended (now that they may depend on other parts of your code) 378 | 379 | ::: 380 | ::: 381 | 382 | 383 | 384 | ::: 385 | 386 | 387 | # Unit Testing and Continuous Integration (CI) 388 | 389 | 390 | --- 391 | 392 | ## Enter **Unit Testing** 393 | 394 | * Automatic Unit Testing or [_Continuous Integration_ (CI)](https://en.wikipedia.org/wiki/Continuous_integration) is our best response to this. 395 | * We still have to *design* and *write* the tests, but we can offload the work to **run** the tasks repeatedly, and automatically, to a helpful infrastructure. 396 | 397 | ```r 398 | library(devtools) 399 | use_testthat() 400 | ``` 401 | 402 | * then 403 | ```r 404 | use_test("sayhello") 405 | • Modify 'tests/testthat/test-sayhello.R' 406 | ``` 407 | 408 | --- 409 | 410 | ## Writing Unit Tests 411 | 412 | * Ideally, each function in our `R/` folder is _covered_ by a corresponding test. 413 | 414 | ::: {.callout-important} 415 | 416 | # What Is a Test? 417 | 418 | The purpose of a **test** is to verify that some part of your code, a function in most cases, works **as intended**. 419 | ::: 420 | 421 | * Modify `'tests/testthat/test-sayhello.R'` like so 422 | ```r 423 | test_that("hello function works", { 424 | who = "James T. Kirk" 425 | expect_equal(hello(who), paste("hello,",who)) 426 | }) 427 | ``` 428 | 429 | * Ready for 🚀 takeoff? 
430 | 431 | --- 432 | 433 | ## Running all unit tests 434 | 435 | * You can run each test file separately to try it out (you must do `library(testthat)` first) 436 | * It's better practice to test the entire package though: 437 | 438 | ```r 439 | > test() 440 | ℹ Testing toypackage 441 | 442 | Attaching package: ‘testthat’ 443 | 444 | The following object is masked from ‘package:devtools’: 445 | 446 | test_file 447 | 448 | ✔ | F W S OK | Context 449 | ✔ | 1 | sayhello 450 | 451 | ══ Results ══════════════════════════════════════════════════════ 452 | [ FAIL 0 | WARN 0 | SKIP 0 | PASS 1 ] 453 | ``` 454 | 455 | * Celebrate! 🎉 🥳 🎊 456 | 457 | ## Using _other_ packages 458 | 459 | * Most likely our package would depend some _other_ package as well. 460 | * Like we could choose the `export` some of our functions, we now may want to `import` some functions from elsewhere. 461 | * Suppose we want to use the `dplyr` package: 462 | ```r 463 | > use_package("dplyr") 464 | ✔ Adding 'dplyr' to Imports field in DESCRIPTION 465 | • Refer to functions with `dplyr::fun()` 466 | ``` 467 | 468 | * Let's check the `DESCRIPTION` file to see what happened. 469 | 470 | --- 471 | 472 | ## Hook it up to GitHub! 473 | 474 | * It's fairly easy to publish our new package to a github repo. 475 | * Let's `use_github()` 476 | ```r 477 | use_github() 478 | ``` 479 | 480 | * answer all the prompts and end up here! 481 | 482 | ![](/images/toypackage.png) 483 | 484 | 485 | --- 486 | 487 | ## Adding a Readme file 488 | 489 | * We know by now that readme files are very important on any git repo. 490 | * Let's add one here as well! 
491 | * the `usethis::use_readme_rmd()` function is perfect for this: 492 | 493 | ```r 494 | usethis::use_readme_rmd() 495 | ``` 496 | 497 | * If we want to automatically run our tests on a remote server called _github actions_, we can call this function as well to set this up: 498 | 499 | ```r 500 | use_github_actions() 501 | ``` 502 | 503 | * let's re-build the package now. (look for rstudio button `install` in `build` tab) 504 | 505 | 506 | --- 507 | 508 | ## Adding a **Vignette** 509 | 510 | * Vignettes are a great feature of R packages. They are full text introductions of the package to a first time user. 511 | * A _tutorial_ for your package. 512 | * This is going to be much more verbose and spiked with example input and output than the standard documentation. 513 | * Often it features the main use case of your package. 514 | * There is an [entire chapter](https://r-pkgs.org/vignettes.html) on `r-pkgs` dedicated to this! 515 | 516 | ### Adding the Vignette(s) 517 | 518 | ```r 519 | usethis::use_vignette("vignette-toypackage-1") 520 | ``` 521 | 522 | --- 523 | 524 | ## Deploy package documentation on a website 525 | 526 | 
527 |
528 | 529 | * 🚨 Now we are entering the seriously cool zone of R package development 😎 530 | 531 | * Wouldn't it be 🤩 amazing if all of our package documentation, the content of our readme, and any explanatory articles we might have written as vignettes, were **available on a (free to host!) website which is always up to date**? 532 | 533 | 534 | 535 | 536 | --- 537 | 538 | ## You Bet It's Cool 😎 {transition="zoom" transition-speed="slow"} 539 | 540 | 541 | 542 | 543 | 544 | 545 | ![](https://media.giphy.com/media/8q92vsFOM9I2s/giphy-downsized-large.gif) 546 | 547 | 548 | --- 549 | 550 | ## Deploy package documentation on a website 3 551 | 552 | * Ready? 553 | 554 | . . . 555 | 556 | ```r 557 | usethis::use_pkgdown() 558 | ``` 559 | 560 | * `pkgdown` is a package for website and docs building. 561 | 562 | . . . 563 | 564 | * Let's build that site! 565 | 566 | ```r 567 | pkgdown::build_site() 568 | ``` 569 | 570 | . . . 571 | 572 | * Let's get `gh-actions` going 573 | 574 | ```r 575 | usethis::use_pkgdown_github_pages() 576 | ``` 577 | 578 | * commit everything and push to github! 579 | 580 | 581 | --- 582 | 583 | ## Summary 584 | 585 | ::: {.callout-tip} 586 | 587 | # Key Points 588 | 589 | 1. `RStudio` greatly facilitates `R` package development. 590 | 2. `R` packages contain code, data and documentation in a highly structured fashion. 591 | 3. We are encouraged to run automated unit tests. 592 | 4. It is relatively straightforward to publish the package to github for collaboration. 593 | 5. It is equally straightforward to build and publish a full website with package documentation and vignettes, hosted _for free_ on github.com. 
594 | 595 | ::: 596 | 597 | -------------------------------------------------------------------------------- /10-spatial-R.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Spatial Data With `R` 3 | format: 4 | revealjs: 5 | theme: _extensions/metropolis-theme/metropolis.scss 6 | chalkboard: true 7 | logo: /images/ScPo-logo.png 8 | footer: "[SciencesPo Intro To Programming 2023](https://floswald.github.io/ScPoProgramming/)" 9 | incremental: false 10 | code-line-numbers: false 11 | highlight-style: github 12 | slide-number: true 13 | author: Florian Oswald 14 | subtitle: "[SciencesPo Intro To Programming 2023](https://floswald.github.io/ScPoProgramming/)" 15 | date: today 16 | date-format: "D MMMM, YYYY" 17 | --- 18 | 19 | ## Intro 20 | 21 | 22 | In this lecture we will cover some basics about geospatial data and how to handle it with `R`. Spatial data is getting always more important, so we need a powerful tool to work with it. 23 | 24 | ::: {.callout-note} 25 | 26 | # tl;dr 27 | 28 | Yes, `R` is a fully fledged GIS. No, you don't need an ArchGIS (or other) license to do real work with spatial data (I don't have one, and I use it for *real work* 😉). 29 | 30 | ::: 31 | 32 | . . . 33 | 34 | ### Resources 35 | 36 | 1. [Geocomputation with `R`](https://r.geocompx.org/index.html) is our main reference. 37 | 2. The [`sf` package vignettes](https://cran.r-project.org/web/packages/sf/vignettes/sf1.html) are *outstanding*. 38 | 39 | --- 40 | 41 | ## Spatial Data Basics 42 | 43 | * One prime example of spatial data are of course *maps*, providing an answer to the age-old question *where is what*. 44 | * Fundamentally, spatial data still provide an answer to the same question, it is just that the *what* part has gotten much richer over the years. 45 | * The attribute *location* may be only one of many other features of information on a certain observation. 
46 | * Multiple measurements imply that observations can be observed *moving* in space. 47 | * There are two fundamentally different ways in which to consider spatial data: 48 | 49 | --- 50 | 51 | ## Spatial Data Types 52 | 53 | 54 |
55 | 56 | ::: {.fragment} 57 | ::: {.callout-warning } 58 | 59 | # 1. Vector Data 60 | 61 | We represent things with *points, lines and polygons*. We can scale and stretch and transform those easily with mathematical operations. Can increase precision to arbitrary levels (can always zoom in further). 62 | ::: 63 | ::: 64 | 65 | 66 | 67 | ::: {.fragment} 68 | ::: {.callout-warning .fragment} 69 | # 2. Raster Data 70 | 71 | We have fixed-size *tiles* or *cells* (like a mosaic, or like *pixels*), which form a **grid**. Fixed resolution. 72 | ::: 73 | ::: 74 | 75 | ::: {.fragment} 76 | > 👉 This lecture deals only with *Vector* Data. 77 | ::: 78 | 79 | # Vector Data and Coordinate Reference Systems 80 | 81 | --- 82 | 83 | ## Representation of Vector Data 84 | 85 | ::: {.fragment} 86 | * Basically, we concentrate on a 2-dimensional space, even though three-dimensional spaces can be useful as well (any ideas for 87 | examples?) 88 | ::: 89 | ::: {.fragment} 90 | * In other words, we denote a location with a tuple of coordinates $(x,y)$, or $(x,y,z)$ as the case may be, where each coordinate gives the *distance from the origin* in each direction. For example, we could represent Paris by the tuple `c(2.34,48.85)` 91 | ::: 92 | ::: {.fragment} 93 | * One key question in the context of spatial data concerning planet earth you should ask is: *Where is the Origin*? 94 | ::: 95 | ::: {.fragment} 96 | * Another question is, related to the well known fact that the earth is quasi-ellipsoid (i.e. a bit like a squashed football and - just to be sure: **not flat**), *how to represent locations in three dimensions on a 2-dimensional map*? 97 | ::: 98 | 99 | --- 100 | 101 | ## Coordinate Reference Systems (*CRS*) 102 | 103 | ::: {.fragment} 104 | * CRSs use *longitude* and *latitude* to identify locations. 105 | ::: 106 | ::: {.fragment} 107 | * One widely used CRS is the *World Geodetic System 1984*, or WGS84 (used on google maps). 
It measures *angular distance* in degrees in a *geocentric datum* (made for the entire planet). 108 | ::: 109 | ::: {.fragment} 110 | * *longitude* measures East-West distance from the **Prime Meridian Plane**. (left-to-right distance from a starting point) 111 | ::: 112 | ::: {.fragment} 113 | * *latitude* measures North-South distance from the **Equatorial Plane**. (up-down distance from a starting point) 114 | ::: 115 | 116 | --- 117 | 118 | ## One Standard CRS: WGS84 119 | 120 | ::: {columns} 121 | 122 | ::: {.column width=50%} 123 | ![](/images/vector_lonlatglobe.png) 124 | ::: 125 | 126 | ::: {.column width=40%} 127 | * The dashed lines are the WGS84 ellipsoid coordinate frame 128 | * The blue circle is *the origin* at $(0,0)$ : 129 | 1. 0 degrees longitude (x-direction): Prime Meridian through Greenwich, London. 130 | 1. 0 degrees latitude (y-direction): Equator. 131 | ::: 132 | 133 | ::: 134 | 135 | --- 136 | 137 | ## Paris in Different CRS 138 | 139 | ::: {columns} 140 | 141 | ::: {.column width=45%} 142 | ![*Paris* at `c(2.34,48.85)` in WGS84](/images/vector_lonlatparis.png) 143 | ::: 144 | 145 | ::: {.column width=45%} 146 | ![*Paris* at `c(600256.4, 127726.4)` in NTF Lambert North France](/images/vector_projectedparis.png) 147 | 148 | ::: 149 | 150 | ::: 151 | 152 | --- 153 | 154 | 155 | ## Paris Where? {.inverse} 156 | 157 | ::: {columns} 158 | ::: {.column width=45%} 159 | ![*Paris* at `c(600256.4, 127726.4)` in NTF Lambert North France](/images/vector_projectedparis.png) 160 | 161 | ::: 162 | ::: {.column width=45%} 163 | 
164 |
165 |
166 | 167 | ### Task 168 | 169 | 1. Search for *NTF Lambert North France* 170 | 2. What does `c(600256.4, 127726.4)` actually mean? 171 | 172 | ::: 173 | ::: 174 | 175 | 176 | 177 | 178 | 179 | --- 180 | 181 | ## Geocentric vs Local Datum 182 | 183 | ![Figure from [Geocomputation with R](https://r.geocompx.org/spatial-class.html). Geocentric and local geodetic datums shown on top of a geoid (in false color and the vertical exaggeration by 10,000 scale factor). Image of the geoid is adapted from the work of Ince et al. (2019)](/images/02_datum_fig.png) 184 | 185 | 186 | # Vector Spatial Data in `R` 187 | 188 | 189 | ## Working with (Vector) Spatial Data in **R** 190 | 191 | * We rely on a few core libraries. 192 | * `sf` being the main one. That itself relies on several other lower level libraries. 193 | 194 | ```r 195 | install.packages("sf") 196 | ``` 197 | 198 | * Don't try to build from `source` unless you know why. 199 | * For problems, please consult the [package readme](https://github.com/r-spatial/sf#installing). 200 | * Let's try to load the library: 201 | 202 | ```{r} 203 | #| echo: true 204 | #| warning: true 205 | library(sf) 206 | ``` 207 | 208 | * I highly recommend the package vignettes! 209 | 210 | ```r 211 | vignette(package = "sf") # see which vignettes are available 212 | vignette("sf1") # an introduction to the package 213 | ``` 214 | 215 | --- 216 | 217 | ## Working with **sf** 1 218 | 219 | Let's read a *shapefile* from the `sf` package: 220 | 221 | ```{r} 222 | #| echo: true 223 | nc = st_read(system.file("shape/nc.shp", package="sf")) 224 | head(nc[,c("AREA","NAME","FIPS","BIR79")]) 225 | ``` 226 | 227 | --- 228 | 229 | 230 | ## Working with **sf** 2 231 | 232 | * Notice the `geometry` column. 233 | * This is basically a geo-referenced `data.frame`. 234 | 235 | ```{r} 236 | #| echo: true 237 | plot(nc[,"AREA"]) # plot feature "AREA" (i.e. 
column 1) 238 | ``` 239 | 240 | --- 241 | 242 | 243 | ## Working with **sf** 3 244 | 245 | * Works also with `ggplot2` 246 | 247 | ```{r} 248 | #| echo: true 249 | library(ggplot2) 250 | ggplot(nc) + geom_sf(aes(fill = AREA)) + 251 | scale_fill_viridis_c(name = "Area") 252 | ``` 253 | 254 | --- 255 | 256 | 257 | ## Working with **sf** 4: CRS Transform 258 | 259 | ::: {columns} 260 | 261 | ::: {.column width=45%} 262 |
263 | 264 | 265 | ```{r} 266 | #| echo: true 267 | ggplot(nc) + geom_sf(aes(fill = AREA)) + 268 | scale_fill_viridis_c(name = "Area") 269 | ``` 270 | 271 | ::: 272 | 273 | ::: {.column width=45%} 274 | ```{r} 275 | #| echo: true 276 | nc %>% 277 | st_transform("+proj=moll") %>% 278 | ggplot() + geom_sf(aes(fill = AREA)) + 279 | scale_fill_viridis_c(name = "Area") + 280 | ggtitle("Mollweide Projection") 281 | ``` 282 | 283 | ::: 284 | 285 | ::: 286 | 287 | 288 | ## Geometric Operations with **sf** 1 289 | 290 | * the [simple features standard](https://en.wikipedia.org/wiki/Simple_Features) specifies a series of operations. 291 | * the relevant functions start with `st_` (for *spatio-temporal*) 292 | * For 2 geometries `x,y` we can compute things like `st_distance(x,y)`, `st_intersect(x,y)`, etc 293 | * For single geometries we can do things like `st_area(x)`, `st_union(x)`, `st_buffer(x,dist)` etc 294 | 295 | ```{r} 296 | #| echo: true 297 | 298 | st_area(st_union(nc)) 299 | ``` 300 | 301 | * Ooof, how many square km is that now? 🤔 302 | 303 | . . . 
304 | 305 | ```{r} 306 | #| echo: true 307 | 308 | st_area(st_union(nc)) %>% units::set_units(km2) 309 | ``` 310 | 311 | --- 312 | 313 | ## Geometric Operations with **sf** 2 314 | 315 | ```{r} 316 | #| echo: true 317 | # copied from https://github.com/uo-ec607/lectures 318 | nc_centroid = st_centroid(nc) 319 | 320 | ggplot(nc) + 321 | geom_sf(fill = "black", alpha = 0.8, col = "white") + 322 | geom_sf(data = nc_centroid, col = "red") + ## Notice how easy it is to combine different sf objects 323 | labs( 324 | title = "Counties of North Carolina", 325 | subtitle = "Centroids in red" 326 | ) 327 | ``` 328 | 329 | --- 330 | 331 | ## Mapping the Seine 1 {background-image="/images/seine.png" background-position="75% 70%" background-size="40%"} 332 | 333 | ::: {columns} 334 | ::: {.column width=35%} 335 | ```{r} 336 | #| echo: true 337 | # copied from https://github.com/uo-ec607/lectures 338 | # install.packages(c("maps","spData")) 339 | ## Get the data 340 | france = st_as_sf( 341 | maps::map('france', 342 | plot = FALSE, 343 | fill = TRUE) 344 | ) 345 | data("seine", 346 | package = "spData") 347 | 348 | ## Make sure they have the same projection 349 | seine = st_transform(seine, 350 | crs = st_crs(france)) 351 | ``` 352 | ```{r} 353 | #| echo: true 354 | # now, make a base plot: 355 | pseine = ggplot() + 356 | geom_sf(data = france, 357 | alpha = 0.8, 358 | fill = "black", 359 | col = "gray50") + 360 | labs( 361 | title = "Administrative regions of France" 362 | ) 363 | ggsave(plot = pseine, 364 | "images/seine.png", 365 | width=6, height=6) 366 | ``` 367 | ::: 368 | 369 | ::: 370 | 371 | --- 372 | 373 | ## Mapping the Seine 2 {background-image="/images/seine2.png" background-position="75% 70%" background-size="40%"} 374 | 375 | ::: {columns} 376 | ::: {.column width=35%} 377 | 378 | ```{r} 379 | #| echo: true 380 | #| eval: false 381 | # let's add the seine! 
382 | pseine2 = pseine + 383 | geom_sf(data = seine, col = "#05E9FF", lwd = 1) + 384 | labs( 385 | title = "Administrative regions of France", 386 | subtitle = "Also showing the Seine, Marne and Yonne rivers" 387 | ) 388 | ggsave(plot = pseine2, 389 | "images/seine2.png", 390 | width=6, height=6) 391 | ``` 392 | ::: 393 | ::: 394 | 395 | 396 | --- 397 | 398 | 399 | ## Intersect two **sf** objects {background-image="/images/seine3.png" background-position="95% 70%" background-size="45%"} 400 | 401 | 402 | ::: {columns} 403 | ::: {.column width=50%} 404 | ```{r} 405 | #| echo: true 406 | seine = st_transform(seine, crs = st_crs(france)) 407 | sf_use_s2(FALSE) # need to turn off because of invalid geometry 408 | france_intersected = st_intersection(france, seine) 409 | head(france_intersected,2) 410 | ``` 411 | 412 | ```{r} 413 | #| eval: true 414 | #| echo: true 415 | pl3 = france_intersected %>% 416 | ggplot() + 417 | geom_sf(alpha = 0.8, aes(fill = ID, col = ID)) + 418 | labs( 419 | title = "Seine, Marne and Yonne rivers", 420 | caption = "Colours depict French administrative regions" 421 | ) + 422 | theme(legend.title = element_blank()) 423 | ggsave(plot = pl3,"images/seine3.png", 424 | width=7, height=5) 425 | ``` 426 | ::: 427 | ::: 428 | 429 | --- 430 | 431 | ## Join two **sf** objects {background-image="/images/seine4.png" background-position="85% 45%" background-size="45%"} 432 | 433 | 434 | ::: {columns} 435 | ::: {.column width=50%} 436 | ```{r} 437 | #| eval: false 438 | #| echo: true 439 | pl4 = st_join(france, seine) %>% 440 | ## Get rid of regions with no overlap 441 | dplyr::filter(!is.na(name)) %>% 442 | ## Some regions are duplicated b/c two 443 | ## branches of the river network flow through them 444 | dplyr::distinct(ID, .keep_all = T) %>% 445 | ## pipe into ggplot 446 | ggplot() + 447 | geom_sf(alpha = 0.5, 448 | fill = "#01731f", 449 | col = "#fcb4b3", # of borders 450 | linewidth = 0.5) + # of borders 451 | geom_sf(data = seine, col = "#05E9FF", 
lwd = 1) + 452 | labs(title = "Intersected regions only") + 453 | theme_bw() 454 | ggsave(plot = pl4,"images/seine4.png", 455 | width=7, height=5) 456 | ``` 457 | ::: 458 | ::: 459 | 460 | --- 461 | 462 | ## Joining Task { .inverse} 463 | 464 | 465 | ::: {columns} 466 | ::: {.column width=35%} 467 | * Modify the code chunk on the previous slide. 468 | * We want to have different colors for the shown departements, instead of all "#01731f". 469 | * I.e. make this for me 👉 470 | ```{r} 471 | #| eval: false 472 | #| echo: false 473 | d5 = st_join(france, seine) %>% 474 | ## Get rid of regions with no overlap 475 | dplyr::filter(!is.na(name)) %>% 476 | ## Some regions are duplicated b/c two 477 | ## branches of the river network flow through them 478 | dplyr::distinct(ID, .keep_all = T) 479 | 480 | my_colors = palette.colors(nrow(d5), palette = "Alphabet") 481 | names(my_colors) <- NULL 482 | 483 | ## pipe into ggplot 484 | pl5 = ggplot(data = d5) + 485 | geom_sf(aes(fill = ID), 486 | col = "#fcb4b3", # of borders 487 | linewidth = 0.5) + # of borders 488 | geom_sf(data = seine, col = "#05E9FF", lwd = 1.5) + 489 | labs(title = "Intersected regions only", fill = "Departement") + 490 | theme_bw() + 491 | scale_fill_manual(values = my_colors) 492 | ggsave(plot = pl5,"images/seine5.png", 493 | width=7, height=5) 494 | ``` 495 | ::: 496 | 497 | ::: {.column width=55%} 498 | ![](images/seine5.png) 499 | ::: 500 | ::: 501 | 502 | --- 503 | 504 | ## Joining Task Solution 505 | 506 | ```{r} 507 | #| eval: false 508 | #| echo: true 509 | d5 = st_join(france, seine) %>% 510 | ## Get rid of regions with no overlap 511 | dplyr::filter(!is.na(name)) %>% 512 | ## Some regions are duplicated b/c two 513 | ## branches of the river network flow through them 514 | dplyr::distinct(ID, .keep_all = T) 515 | 516 | my_colors = palette.colors(nrow(d5), palette = "Alphabet") 517 | names(my_colors) <- NULL 518 | 519 | ## pipe into ggplot 520 | pl5 = ggplot(data = d5) + 521 | geom_sf(aes(fill = 
ID), 522 | col = "#fcb4b3", # of borders 523 | linewidth = 0.5) + # of borders 524 | geom_sf(data = seine, col = "#05E9FF", lwd = 1.5) + 525 | labs(title = "Intersected regions only", fill = "Departement") + 526 | theme_bw() + 527 | scale_fill_manual(values = my_colors) 528 | ggsave(plot = pl5,"images/seine5.png", 529 | width=7, height=5) 530 | ``` 531 | 532 | 533 | --- 534 | 535 | ## Distances 536 | 537 | * Another typical question could be: 538 | 539 | > What's the (straight-line) distance between 2 points? 540 | 541 | As in 542 | 543 | >What's the distance between the centroids of the Seine-Maritime and Nievre Departements? 544 | 545 | --- 546 | 547 | ## Task: Distances {.inverse} 548 | 549 | Modifying the plot from the previous task, produce 2 new plots 550 | 551 | 1. One that colors only the concerned departments, and marks their respective centroids with a point. 552 | 2. Another one with the same coloring, but where a straight solid line connects both centroids, and we print the distance in km into the plot title. 
553 | 554 | --- 555 | 556 | ## Task Desired Result: Distances {.inverse} 557 | 558 | **Hint:** 559 | 560 | ```r 561 | # start from here 562 | p6 = ggplot(d5) + geom_sf() 563 | ``` 564 | 565 | ::: {layout-ncol=2} 566 | 567 | ![Figure 1](images/distance1.png) 568 | 569 | ![Figure 2](images/distance2.png) 570 | 571 | Desired Outputs 572 | 573 | ::: 574 | 575 | 576 | --- 577 | 578 | ## Task Solution 579 | 580 | ```{r} 581 | #| echo: true 582 | #| eval: false 583 | cvec = rep(NA, length(unique(d5$ID))) # one fill colour per ID; NA = not highlighted 584 | names(cvec) <- unique(d5$ID) 585 | cvec["Seine-Maritime"] <- "purple" 586 | cvec["Nievre"] <- "brown" 587 | 588 | p6 = ggplot(d5) + geom_sf() 589 | p6 = ggplot(d5) + geom_sf(aes(fill = ID)) 590 | p6 = p6 + scale_fill_manual(values = cvec, limits= c("Seine-Maritime","Nievre")) 591 | subdeps = d5 %>% dplyr::filter(ID %in% c("Seine-Maritime","Nievre")) 592 | p6 = p6 + geom_sf(data = st_centroid(subdeps)) 593 | ggsave(plot = p6, "images/distance1.png", width = 5,height=4) 594 | 595 | dists = st_distance(st_centroid(subdeps)) %>% units::set_units("km") # centroid-to-centroid, not the minimum polygon-to-polygon distance 596 | coords = st_centroid(subdeps) %>% st_coordinates() 597 | coords = data.frame(lon = coords[1,"X"], 598 | lat = coords[1,"Y"], 599 | lon_end = coords[2,"X"], 600 | lat_end = coords[2,"Y"]) 601 | p7 = p6 + geom_segment(data = coords, aes(lon, lat, xend = lon_end, yend = lat_end)) 602 | p7 = p7 + ggtitle(paste("Distance between Centroids:",round(dists[1,2],0), "km")) # dists is a 2x2 matrix; [1,2] is the off-diagonal pair 603 | ggsave(plot = p7, "images/distance2.png",width = 5,height=4) 604 | 605 | 606 | ``` 607 | 608 | 609 | # Raster Data 610 | 611 | --- 612 | 613 | ## What's Different? 614 | 615 | * We have a grid (*pixels*) where each cell contains one single data value - usually our measure of interest. 616 | * We can have multiple *layers* of measurements (e.g. temperature, humidity and elevation for a grid cell) 617 | * CRS considerations equally apply. 618 | * Remote Sensing Data (e.g. Satellite images) are often in raster format. 
619 | 620 | --- 621 | 622 | ## Raster Resources 623 | 624 | * [Chapter 6 of geocompr](https://r.geocompx.org/raster-vector.html) is a great starting point. 625 | * [R as GIS for Economists](https://tmieno2.github.io/R-as-GIS-for-Economists/index.html) is in general a great resource, [chapter 4](https://tmieno2.github.io/R-as-GIS-for-Economists/raster-basics.html) particularly so. 626 | * [`R` package `{raster}`](https://cran.r-project.org/web/packages/raster/index.html) is the very mature and traditional solution. 627 | * [`R` package `{stars}`](https://cran.r-project.org/web/packages/stars/index.html) is a great recent development in this space. 628 | 629 | --- 630 | 631 | ## Other Spatial Resources 632 | 633 | * [Spatial Data Science](https://r-spatial.org/book/): Still WIP but looks like the ultimate authority amongst books. 634 | * [mapview](https://r-spatial.github.io/mapview/) : great for interactive and quick mapping 635 | * [tmap](https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html): same 636 | * [Analyzing US Census Data](https://walker-data.com/census-r/) by Kyle Walker is a brilliant intro to his package [`{tidycensus}`](https://walker-data.com/tidycensus/). 
637 | * Nice mapping [examples](https://ryanpeek.github.io/2017-11-05-mapping-with-sf-Part-2/) 638 | * [`sf` and `raster`](https://nceas.github.io/oss-lessons/spatial-data-gis-law/3-mon-intro-gis-in-r.html) intro by NCEAS 639 | 640 | 641 | # End -------------------------------------------------------------------------------- /11-NLP-R.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quick Intro to NLP with `R` 3 | format: 4 | revealjs: 5 | theme: _extensions/metropolis-theme/metropolis.scss 6 | chalkboard: true 7 | logo: /images/ScPo-logo.png 8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 9 | incremental: false 10 | code-line-numbers: false 11 | highlight-style: github 12 | slide-number: true 13 | author: Florian Oswald 14 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)" 15 | date: today 16 | date-format: "D MMMM, YYYY" 17 | execute: 18 | echo: true 19 | cache: true 20 | --- 21 | 22 | ## Intro 23 | 24 | 25 | In this lecture we will introduce the most basic language models with R. 26 | 27 | This is based on a nice [introduction](https://datascienceplus.com/an-introduction-to-k-gram-language-models-in-r/) by Valerio Gherardi, author of the `kgrams` package for R. 28 | 29 | --- 30 | 31 | ## Natural Language Processing (NLP) basics 32 | 33 | * We are all familiar with *large* language models (LLMs for short) by now. 34 | * ChatGPT (short for _Chat Generative Pretrained Transformer_) is a proprietary solution; by now there are many open-source alternatives. 35 | * We will not be able to go into the details of those, but we will see some simpler cousins. 36 | 37 | 38 | ## $k$-gram language models 39 | 40 | * Let $w_i$ be the $i$-th word in a sentence, i.e. $s = w_1 w_2 \dots w_n$ 41 | * An NLP model gives the probability of observing this sentence, i.e. $\Pr(s)$. 
42 | * As usual, we can *sample* from $\Pr(s)$ to obtain *random* sentences. 43 | * In general all the $s$ at our disposal come from a certain _corpus_, i.e. a collection of sentences/words. 44 | 45 | ## Continuation Probabilities 46 | 47 | * Define a sequence of words as _context_: $c = w_1 w_2 \dots w_m$ 48 | * We can _predict_ the next word in the sequence by computing $\Pr(w|c)$, i.e. $\Pr(w|c)$ is the probability that the next word is $w$, given context $c$. 49 | * That is in a nutshell what ChatGPT computes for you. 50 | 51 | ## Dictionaries 52 | 53 | * The list of known words in an NLP model is called the _dictionary_. 54 | * This also tells us how to deal with _unknown_ words - those are mapped to the `UNK` (unknown word) token. 55 | * It also tells us how to deal with the end of sentences, by introducing an `EOS` (end of sentence) token. 56 | * `kgram` models (below) also include a `BOS` (beginning of sentence) token. Each sentence is left-padded with $N-1$ `BOS` tokens ($N$ the order of the model). This helps predict _the first word of the next sentence_ from the preceding $N-1$ tokens. 57 | 58 | ## $k$-gram Models 59 | 60 | * A $k$-gram model makes a _Markovian_ assumption on continuation probabilities. 61 | * We assume that the next word depends only on the last $N-1$ words, where $N$ is the _order_ of the model. 62 | * We have 63 | 64 | $$\begin{align} 65 | \Pr(w|c) &= \Pr(w|w_1 w_2 \cdots w_{N-1})\\ 66 | c &= \cdots w_{-1} w_0 w_1 w_2 \cdots w_{N-1} 67 | \end{align}$$ 68 | 69 | * We call the $k$-tuples of words $(w_1, w_2,\dots, w_k)$ _k-grams_. 70 | * You can see that we can only capture relatively short range dependencies. 71 | * As $N$ becomes too large, memory requirements explode. 72 | 73 | ## Estimating Continuation Probabilities 74 | 75 | * We can make a table from our corpus, counting how many times each $k$-gram occurs. 
76 | * While this is simple, we need a _smoothing_ technique to account for the fact that many potentially sensible sentences are never observed in our corpus. 77 | * The smoothing will take some probability from the very frequently observed sequences and give some to the rarer ones, simply speaking. 78 | 79 | $$\hat{\Pr}_{MLE}(w|c) = \frac{C(w_1 w_2 \cdots w_{k} w)}{C(w_1 w_2 \cdots w_{k})}$$ 80 | 81 | * Our data is sparse: many sequences are not in our corpus, hence the above estimator incorrectly assigns zero probability to them. 82 | * If the context $w_1 w_2 \cdots w_{k}$ is not in the data, the estimator is not defined. 83 | 84 | ## Training and Testing NLP Models 85 | 86 | * We need an evaluation metric: how good is this model? 87 | * Widely used is [perplexity](https://en.wikipedia.org/wiki/Perplexity): The larger the _perplexity_ of a discrete probability distribution, the less likely it will be that an observer could guess the next value to be drawn from it. 88 | * We will evaluate the cross-entropy per word $H=-\frac{1}{W} \sum_s \ln \Pr(s)$, where $W$ is the total number of words in our corpus; the perplexity is then $e^H$. 
89 | 90 | ## Training a k-gram model in R 91 | 92 | ```{r} 93 | #| echo: true 94 | library(kgrams) 95 | ``` 96 | We can get the spoken text from the following Shakespeare plays: 97 | 98 | ```{r} 99 | #| echo: true 100 | 101 | playcodes <- c( 102 | "All's Well That Ends Well" = "AWW", 103 | "Antony and Cleopatra" = "Ant", 104 | "As You Like It" = "AYL", 105 | "The Comedy of Errors" = "Err", 106 | "Coriolanus" = "Cor", 107 | "Cymbeline" = "Cym", 108 | "Hamlet" = "Ham", 109 | "Henry IV, Part 1" = "1H4", 110 | "Henry IV, Part 2" = "2H4", 111 | "Henry V" = "H5", 112 | "Henry VI, Part 1" = "1H6", 113 | "Henry VI, Part 2" = "2H6", 114 | "Henry VI, Part 3" = "3H6", 115 | "Henry VIII" = "H8", 116 | "Julius Caesar" = "JC", 117 | "King John" = "Jn", 118 | "King Lear" = "Lr", 119 | "Love's Labor's Lost" = "LLL", 120 | "Macbeth" = "Mac", 121 | "Measure for Measure" = "MM", 122 | "The Merchant of Venice" = "MV", 123 | "The Merry Wives of Windsor" = "Wiv", 124 | "A Midsummer Night's Dream" = "MND", 125 | "Much Ado About Nothing" = "Ado", 126 | "Othello" = "Oth", 127 | "Pericles" = "Per", 128 | "Richard II" = "R2", 129 | "Richard III" = "R3", 130 | "Romeo and Juliet" = "Rom", 131 | "The Taming of the Shrew" = "Shr", 132 | "The Tempest" = "Tmp", 133 | "Timon of Athens" = "Tim", 134 | "Titus Andronicus" = "Tit", 135 | "Troilus and Cressida" = "Tro", 136 | "Twelfth Night" = "TN", 137 | "Two Gentlemen of Verona" = "TGV", 138 | "Two Noble Kinsmen" = "TNK", 139 | "The Winter's Tale" = "WT" 140 | ) 141 | ``` 142 | 143 | ## Estimating 2 144 | 145 | We could get the text from "Much Ado about Nothing" as follows: 146 | 147 | ```{r} 148 | #| echo: true 149 | get_url_con <- function(playcode) { 150 | stopifnot(playcode %in% playcodes) 151 | url <- paste0("https://www.folgerdigitaltexts.org/", playcode, "/text") 152 | con <- url(url) 153 | } 154 | 155 | con <- get_url_con("Ado") 156 | open(con) 157 | readLines(con, 10) 158 | ``` 159 | 160 | ```{r} 161 | #| echo: true 162 | close(con) 163 | ``` 
164 | 165 | ## Defining Training and Testing Data 166 | 167 | We will use all plays but "Hamlet" as training data, and reserve this last one for testing our model. 168 | 169 | ```{r} 170 | train_playcodes <- playcodes[names(playcodes) != c("Hamlet")] 171 | test_playcodes <- playcodes[names(playcodes) == c("Hamlet")] 172 | ``` 173 | 174 | We want to pre-process the text data. Here we want to remove some HTML tags and make everything lower-case. 175 | 176 | ```{r} 177 | .preprocess <- function(x) { 178 | # Remove html tags 179 | x <- gsub("<[^>]+>", "", x) 180 | # Lower-case and remove characters not alphanumeric or punctuation 181 | x <- kgrams::preprocess(x) 182 | return(x) 183 | } 184 | ``` 185 | 186 | ## Preprocessing Text 187 | 188 | * We need to split sentences at sensible punctuation marks `.!?:;` and insert `EOS` and `BOS` tokens into the data. 189 | * This will treat `.!?:;` as regular _words_, hence the model will be able to *predict* those. 190 | 191 | ```{r} 192 | .tknz_sent <- function(x) { 193 | # Collapse everything to a single string 194 | x <- paste(x, collapse = " ") 195 | # Tokenize sentences 196 | x <- kgrams::tknz_sent(x, keep_first = TRUE) 197 | # Remove empty sentences 198 | x <- x[x != ""] 199 | return(x) 200 | } 201 | ``` 202 | 203 | ## Making $k$-gram frequency counts 204 | 205 | * Let us now make a table of occurrences of all $k$-grams in our corpus. 206 | * We set an _order_: 207 | 208 | ```{r} 209 | N = 5 210 | freqs = kgram_freqs(N, .preprocess = .preprocess, .tknz_sent = .tknz_sent) 211 | summary(freqs) 212 | ``` 213 | 214 | * So, for now this is an empty model as you can see. Let's train it on our corpus! 
215 | 216 | ## Training the NLP model 217 | 218 | ```{r} 219 | lapply(train_playcodes, 220 | function(playcode) { 221 | con <- get_url_con(playcode) 222 | process_sentences(text = con, freqs = freqs, verbose = FALSE) 223 | }) 224 | ``` 225 | 226 | ## Checking the Frequency tables 227 | 228 | * The `freqs` object was modified during the previous call. 229 | * Let's check it quickly: 230 | 231 | ```{r} 232 | query(freqs, c("leonato", "pound of flesh", "smartphones")) 233 | ``` 234 | 235 | * Last thing to do: choose a smoother. 236 | ```{r} 237 | 238 | smoothers() 239 | ``` 240 | 241 | Let's choose the _modified Kneser-Ney_ smoother and set some default parameters: 242 | 243 | ```{r} 244 | info("mkn") 245 | ``` 246 | 247 | ## Building the model 248 | 249 | ```{r} 250 | model <- language_model(freqs, smoother = "mkn", D1 = 0.5, D2 = 0.5, D3 = 0.5) 251 | summary(model) 252 | ``` 253 | 254 | ## Making Predictions with the model 255 | 256 | * Now we can compute probabilities for given sentences: 257 | 258 | ```{r} 259 | sentences <- c( 260 | "I have a letter from monsieur Berowne to one lady Rosaline.", 261 | "I have an email from monsieur Valerio to one lady Judit." 262 | ) 263 | probability(sentences, model) 264 | ``` 265 | 266 | or we can get the _continuation probability_ for a context: 267 | 268 | ```{r} 269 | context <- "pound of" 270 | words <- c("flesh", "bananas") 271 | probability(words %|% context, model) 272 | ``` 273 | 274 | ## Tuning our models 275 | 276 | * Remember we held out "Hamlet" from our training data. Let's use it to test performance now! 277 | 278 | ```{r} 279 | con <- get_url_con(test_playcodes) 280 | perplexity(text = con, model = model) 281 | ``` 282 | 283 | This applies the same transformations and tokenization to test data as it does to training data (which is important). 
284 | 285 | ## Tuning More 286 | 287 | * We could now create a grid over the parameters of the model (`D1`, `D2` etc) as well as the order of the models 288 | * We would then choose those parameters for which the perplexity is smallest. 289 | * Suppose we find that the $k=4$ model works best. 290 | * Let's use it to create some random sentences! 291 | 292 | ```{r} 293 | param(model, "N") <- 4 294 | ``` 295 | 296 | ## Random Text generation 297 | 298 | ```{r} 299 | set.seed(840) 300 | sample_sentences(model, 10, max_length = 20) 301 | ``` 302 | 303 | ## Temperature 304 | 305 | * The temperature parameter makes the pdf smoother (high temperature) or rougher (low temperature). Smaller values mean the model will not deviate much from its implied distribution; higher values mean there will be much more randomness in output. 306 | 307 | ```{r} 308 | set.seed(841) 309 | sample_sentences(model, 10, max_length = 20) # Normal temperature 310 | ``` 311 | 312 | ## High temperature 313 | 314 | ```{r} 315 | set.seed(841) 316 | sample_sentences(model, 10, max_length = 20, t = 10) 317 | ``` 318 | 319 | 320 | ## Low temperature 321 | 322 | ```{r} 323 | set.seed(841) 324 | sample_sentences(model, 10, max_length = 20, t = 0.1) 325 | ``` 326 | 327 | 328 | 329 | # End -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/__packages: -------------------------------------------------------------------------------- 1 | kgrams 2 | -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.RData -------------------------------------------------------------------------------- 
/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.RData -------------------------------------------------------------------------------- 
/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.RData -------------------------------------------------------------------------------- 
/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.RData -------------------------------------------------------------------------------- 
/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.RData -------------------------------------------------------------------------------- 
/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.RData -------------------------------------------------------------------------------- 
/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.RData -------------------------------------------------------------------------------- 
/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdx -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.RData -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdb -------------------------------------------------------------------------------- /11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdx -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Florian Oswald 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | 
copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /R/sayhello.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/R/sayhello.R -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ScPo Intro To Programming 2 | 3 | This repository contains the course material for the introductory programming course at Sciences Po. 4 | 5 | You can find the course website [here](https://floswald.github.io/ScPoProgramming/). 6 | 7 | ## License 8 | 9 | You are free to copy and remix this content as long as you stick to the terms laid out in the LICENSE file. Thanks. 
-------------------------------------------------------------------------------- /ScPoProgramming.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /_extensions/metropolis-theme/_extension.yml: -------------------------------------------------------------------------------- 1 | title: Quarto Metropolis Theme 2 | author: Patrick Schratz 3 | version: 1.0.0 4 | contributes: 5 | formats: 6 | revealjs: 7 | theme: metropolis.scss 8 | -------------------------------------------------------------------------------- /_extensions/metropolis-theme/metropolis.scss: -------------------------------------------------------------------------------- 1 | /*-- scss:defaults --*/ 2 | 3 | // fonts 4 | @import url(https://fonts.googleapis.com/css?family=Fira+Sans:300,300i,400,400i,500,500i,700,700i); 5 | @import url(https://cdn.rawgit.com/tonsky/FiraCode/1.204/distr/fira_code.css); 6 | @import url("https://fonts.googleapis.com/css?family=Roboto+Mono|JetBrains+Mono&display=swap"); 7 | @import url("https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap"); 8 | 9 | $font-family-sans-serif: "Roboto", "Fira Sans", "Droid Serif", serif !default; 10 | $font-family-monospace: "JetBrains Mono", "Fira Code", monospace; 11 | $presentation-font-size-root: 30px; 12 | $presentation-line-height: 1.5em; 13 | $presentation-heading-font-weight: 400; 14 | 15 | // colors 16 | $body-bg: #fafafa !default; 17 | $body-color: #000 !default; 18 | // $link-color: #EB811B !default; 19 | $selection-bg: #26351c; 20 | 21 | // headings 22 | // $presentation-heading-font: $font-family-sans-serif, serif !default; 23 | // 
$presentation-heading-color: #383d3d !default; 24 | 25 | /*-- scss:rules --*/ 26 | 27 | .reveal a { 28 | line-height: 1.5em; 29 | color: #eb811b; 30 | font-weight: 300; 31 | } 32 | 33 | .reveal .footer a { 34 | color: #eb811b !important; 35 | } 36 | 37 | .reveal p { 38 | font-weight: 300; 39 | } 40 | 41 | .reveal .slide ul li, 42 | .reveal .slide ol li { 43 | font-weight: 300; 44 | } 45 | 46 | // maximum height of code blocks before scrolling is used 47 | .reveal pre.sourceCode code { 48 | max-height: 700px; // default 500 49 | } 50 | 51 | // title slide 52 | .title-slide { 53 | background-color: #fafafa; 54 | border-top: 80px solid #fafafa; 55 | } 56 | 57 | h1.title { 58 | color: #1a292c; 59 | font-size: 45px; 60 | text-shadow: none; 61 | font-weight: 400; 62 | text-align: left; 63 | margin-left: 15px; 64 | padding-top: 80px; 65 | } 66 | p.subtitle { 67 | // margin-top: -10px; 68 | // padding-bottom: -20px; 69 | color: #1a292c; 70 | text-shadow: none; 71 | font-weight: 300; 72 | font-size: 40px; 73 | text-align: left; 74 | margin-left: 15px; 75 | } 76 | p.author { 77 | color: #1a292c; 78 | text-shadow: none; 79 | font-weight: 300; 80 | font-size: 30px; 81 | text-align: left; 82 | margin-left: 15px; 83 | margin-bottom: -10px; 84 | margin-top: 0px; 85 | } 86 | 87 | p.date { 88 | color: #1a292c; 89 | text-shadow: none; 90 | font-weight: 300; 91 | font-size: 30px; 92 | text-align: left; 93 | margin-left: 15px; 94 | // margin-bottom: -30px; 95 | } 96 | 97 | p.subtitle:after { 98 | content: ""; 99 | display: block; 100 | border: none; 101 | background-color: #eb811b; 102 | color: #eb811b; 103 | height: 1px; 104 | margin: 25px 0 25px; 105 | } 106 | 107 | // Section break slide 108 | hr, 109 | h1::after { 110 | content: ""; 111 | display: block; 112 | border: none; 113 | background-color: #eb811b; 114 | color: #eb811b; 115 | height: 1px; 116 | margin: 1em 10px 0 10px; 117 | } 118 | 119 | // Override h1 style for title slide (remove section break slide style) 120 | hr, 121 
| h1.title::after { 122 | content: ""; 123 | display: block; 124 | border: none; 125 | background-color: transparent !important; 126 | color: transparent !important; 127 | height: 0px; 128 | margin: 0px !important; 129 | } 130 | 131 | h2::after.title { 132 | margin: 10px 15px 35px 0; 133 | } 134 | 135 | .reveal .slide-number a { 136 | font-size: 120%; 137 | background-color: #fafafa; 138 | border-radius: 12px; 139 | padding: 5px; 140 | } 141 | 142 | // inline 143 | .reveal code { 144 | font-size: 70%; 145 | background-color: #afb8c133; 146 | color: #000; 147 | padding: 4px; 148 | border-radius: 6px; 149 | } 150 | 151 | // code blocks 152 | .reveal div.sourceCode pre code { 153 | font-size: 100%; 154 | } 155 | 156 | // code output 157 | .reveal pre code { 158 | font-size: 100%; 159 | padding-top: 15px; 160 | } 161 | 162 | 163 | 164 | .column { 165 | // #column; 166 | // border: 2px solid red; 167 | border-radius: 10px !important; 168 | padding: 10px; 169 | margin: 5px; 170 | // background-color: #ededed; 171 | // background-color: #ffffff; 172 | } 173 | 174 | 175 | .reveal h2 { 176 | background-color: #23373b; 177 | padding: 5px 0px 5px 10px; 178 | color: #fafafa; 179 | border-radius: 12px; 180 | } 181 | 182 | .inverse { 183 | background-color: #fff3f2; 184 | padding: 5px 0px 5px 10px; 185 | color: #870000; 186 | border-radius: 12px; 187 | } 188 | 189 | .small-font { 190 | font-size: 70%; 191 | } 192 | 193 | iframe { 194 | display: block; 195 | margin-right: auto; 196 | margin-left: auto; 197 | } 198 | 199 | .center { 200 | text-align: center; 201 | } 202 | 203 | // 204 | .reveal .slide-menu-button .fa-bars::before { 205 | background-image: url('data:image/svg+xml,'); 206 | } 207 | 208 | .reveal .slide-chalkboard-buttons .fa-easel2::before { 209 | padding-bottom: 6px; 210 | background-image: url('data:image/svg+xml,'); 211 | } 212 | 213 | .reveal .slide-chalkboard-buttons .fa-brush::before { 214 | padding-bottom: 6px; 215 | background-image: 
url('data:image/svg+xml,'); 216 | } 217 | 218 | .reveal .progress { 219 | color: #23373b; 220 | } 221 | -------------------------------------------------------------------------------- /_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | website: 5 | title: "ScPoProgramming" 6 | favicon: /images/ScPo-logo.png 7 | twitter-card: true 8 | google-analytics: "G-TQGG8QBSRH" 9 | body-footer: © Florian Oswald, 2024 10 | sidebar: 11 | style: "docked" 12 | contents: 13 | - section: "Lessons:" 14 | contents: 15 | - href: 01-shell-intro.qmd 16 | text: 1. Shell Intro 17 | - href: 02-filedir.qmd 18 | text: 2. Files and Directories 19 | - href: 03-filework.qmd 20 | text: 3. Working with Files 21 | - href: 04-pipes.qmd 22 | text: 4. Filters and Pipes 23 | - href: 05-git.qmd 24 | text: 5. `Git` Version Control 25 | - href: https://raw.githack.com/ScPoEcon/ScPoEconometrics-Slides/master/chapter_intro_programming/chapter_intro.html 26 | text: 6. `R` intro 27 | - href: https://raw.githack.com/ScPoEcon/ScPoEconometrics-Slides/master/chapter_tidy_programming/chapter_tidy.html 28 | text: 7. `R Tidyverse` 29 | - href: 06-concepts.qmd 30 | text: 8. `R` and `python` generics 31 | - href: https://raw.githack.com/floswald/lectures/master/05-datatable/05-datatable.html 32 | text: 9. `R data.table` 33 | - href: 09-R-packages.qmd 34 | text: 10. Building `R` packages 35 | - href: 10-spatial-R.qmd 36 | text: 11. Spatial Data with `R` 37 | - href: https://floswald.github.io/julia-bootcamp/01-variables.html 38 | text: 12. `julia` intro 1 39 | - href: https://floswald.github.io/julia-bootcamp/02-functions.html 40 | text: 13. 
`julia` intro 2 41 | 42 | navbar: 43 | left: 44 | - href: index.qmd 45 | text: Home 46 | logo: /images/ScPo-logo.png 47 | background: "#ba0202" 48 | foreground: "#faf7f7" 49 | format: 50 | html: 51 | theme: journal 52 | linkcolor: "#ba0202" 53 | css: styles.css 54 | toc: true 55 | highlight-style: github 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /about.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "About" 3 | --- 4 | 5 | About this site 6 | -------------------------------------------------------------------------------- /custom.scss: -------------------------------------------------------------------------------- 1 | $code-block-bg: #ded9ca; 2 | $code-block-border-color: #000000; -------------------------------------------------------------------------------- /data/brexit.csv: -------------------------------------------------------------------------------- 1 | "","startdate","enddate","pollster","poll_type","samplesize","remain","leave","undecided","spread" 2 | "1",2016-06-23,2016-06-23,"YouGov","Online",4772,0.52,0.48,0,0.04 3 | "2",2016-06-22,2016-06-22,"Populus","Online",4700,0.55,0.45,0,0.1 4 | "3",2016-06-20,2016-06-22,"YouGov","Online",3766,0.51,0.49,0,0.02 5 | "4",2016-06-20,2016-06-22,"Ipsos MORI","Telephone",1592,0.49,0.46,0.01,0.03 6 | "5",2016-06-20,2016-06-22,"Opinium","Online",3011,0.44,0.45,0.09,-0.01 7 | "6",2016-06-17,2016-06-22,"ComRes","Telephone",1032,0.54,0.46,0,0.08 8 | "7",2016-06-17,2016-06-22,"ComRes","Telephone",1032,0.48,0.42,0.11,0.06 9 | "8",2016-06-16,2016-06-22,"TNS","Online",2320,0.41,0.43,0.16,-0.02 10 | "9",2016-06-20,2016-06-20,"Survation/IG Group","Telephone",1003,0.45,0.44,0.11,0.01 11 | "10",2016-06-18,2016-06-19,"YouGov","Online",1652,0.42,0.44,0.13,-0.02 12 | "11",2016-06-16,2016-06-19,"ORB/Telegraph","Telephone",800,0.53,0.46,0.02,0.07 13 | 
"12",2016-06-17,2016-06-18,"Survation","Telephone",1004,0.45,0.42,0.13,0.03 14 | "13",2016-06-16,2016-06-17,"YouGov","Online",1694,0.44,0.43,0.09,0.01 15 | "14",2016-06-14,2016-06-17,"Opinium","Online",2006,0.44,0.44,0.12,0 16 | "15",2016-06-15,2016-06-16,"YouGov","Online",1734,0.42,0.44,0.09,-0.02 17 | "16",2016-06-15,2016-06-15,"Survation","Telephone",1104,0.42,0.45,0.13,-0.03 18 | "17",2016-06-10,2016-06-15,"BMG Research","Online",1468,0.37,0.47,0.16,-0.1 19 | "18",2016-06-10,2016-06-15,"BMG Research","Telephone",1064,0.46,0.43,0.11,0.03 20 | "19",2016-06-11,2016-06-14,"Ipsos MORI","Telephone",1257,0.43,0.49,0.03,-0.06 21 | "20",2016-06-12,2016-06-13,"YouGov","Online",1905,0.39,0.46,0.15,-0.07 22 | "21",2016-06-10,2016-06-13,"ICM","Telephone",1000,0.45,0.5,0.05,-0.05 23 | "22",2016-06-10,2016-06-13,"ICM","Online",2001,0.44,0.49,0.07,-0.05 24 | "23",2016-06-09,2016-06-13,"ComRes","Telephone",1002,0.46,0.45,0.09,0.01 25 | "24",2016-06-07,2016-06-13,"TNS","Online",2497,0.4,0.47,0.13,-0.07 26 | "25",2016-06-09,2016-06-12,"ORB","Telephone",800,0.48,0.49,0.03,-0.01 27 | "26",2016-06-09,2016-06-10,"YouGov","Online",1671,0.42,0.43,0.11,-0.01 28 | "27",2016-06-07,2016-06-10,"Opinium","Online",2009,0.44,0.42,0.13,0.02 29 | "28",2016-06-08,2016-06-09,"ORB","Online",2052,0.45,0.55,0,-0.1 30 | "29",2016-06-05,2016-06-06,"YouGov","Online",2001,0.43,0.42,0.11,0.01 31 | "30",2016-06-03,2016-06-05,"ICM","Online",2047,0.43,0.48,0.09,-0.05 32 | "31",2016-06-02,2016-06-05,"ORB","Telephone",800,0.48,0.47,0.05,0.01 33 | "32",2016-06-01,2016-06-03,"YouGov","Online",3405,0.41,0.45,0.11,-0.04 34 | "33",2016-05-31,2016-05-31,"Opinium","Online",2007,0.43,0.41,0.16,0.02 35 | "34",2016-05-31,2016-05-31,"Opinium","Online",2007,0.4,0.43,0.16,-0.03 36 | "35",2016-05-30,2016-05-31,"YouGov","Online",1735,0.41,0.41,0.13,0 37 | "36",2016-05-27,2016-05-29,"ICM","Telephone",1004,0.42,0.45,0.15,-0.03 38 | "37",2016-05-27,2016-05-29,"ICM","Online",2052,0.44,0.47,0.09,-0.03 39 | 
"38",2016-05-25,2016-05-29,"ORB","Telephone",800,0.51,0.46,0.03,0.05 40 | "39",2016-05-20,2016-05-25,"BMG Research","Online",1638,0.44,0.45,0.12,-0.01 41 | "40",2016-05-24,2016-05-24,"Survation","Telephone",1013,0.44,0.38,0.18,0.06 42 | "41",2016-05-23,2016-05-24,"YouGov","Online",1756,0.41,0.41,0.13,0 43 | "42",2016-05-19,2016-05-23,"TNS","Online",1213,0.41,0.43,0.16,-0.02 44 | "43",2016-05-20,2016-05-22,"ICM","Online",2003,0.45,0.45,0.1,0 45 | "44",2016-05-18,2016-05-22,"ORB","Telephone",800,0.55,0.42,0.03,0.13 46 | "45",2016-05-17,2016-05-19,"Opinium","Online",2008,0.44,0.4,0.14,0.04 47 | "46",2016-05-16,2016-05-17,"YouGov","Online",1648,0.44,0.4,0.12,0.04 48 | "47",2016-05-14,2016-05-17,"ComRes","Telephone",1000,0.52,0.41,0.07,0.11 49 | "48",2016-05-14,2016-05-16,"Ipsos MORI","Telephone",1002,0.55,0.37,0.05,0.18 50 | "49",2016-05-13,2016-05-15,"ICM","Telephone",1002,0.47,0.39,0.14,0.08 51 | "50",2016-05-13,2016-05-15,"ICM","Online",2048,0.43,0.47,0.1,-0.04 52 | "51",2016-05-11,2016-05-15,"ORB","Telephone",800,0.55,0.4,0.05,0.15 53 | "52",2016-05-10,2016-05-12,"TNS","Online",1222,0.38,0.41,0.21,-0.03 54 | "53",2016-04-29,2016-04-29,"YouGov","Telephone",996,0.36,0.39,0.22,-0.03 55 | "54",2016-04-29,2016-04-29,"YouGov","Online",1973,0.38,0.4,0.16,-0.02 56 | "55",2016-05-06,2016-05-08,"ICM","Online",2005,0.44,0.46,0.11,-0.02 57 | "56",2016-05-04,2016-05-06,"YouGov","Online",3378,0.42,0.4,0.13,0.02 58 | "57",2016-04-29,2016-04-29,"ICM","Online",2040,0.44,0.45,0.11,-0.01 59 | "58",2016-04-27,2016-04-29,"ICM","Online",2029,0.43,0.46,0.11,-0.03 60 | "59",2016-04-26,2016-04-29,"Opinium","Online",2005,0.42,0.41,0.14,0.01 61 | "60",2016-04-27,2016-04-29,"ORB","Online",2000,0.49,0.51,0,-0.02 62 | "61",2016-04-26,2016-04-28,"TNS","Online",1221,0.39,0.36,0.26,0.03 63 | "62",2016-04-25,2016-04-26,"YouGov","Online",1650,0.41,0.42,0.13,-0.01 64 | "63",2016-04-25,2016-04-26,"Survation","Telephone",1003,0.45,0.38,0.17,0.07 65 | "64",2016-04-22,2016-04-26,"BMG 
Research","Online",2001,0.43,0.45,0.13,-0.02 66 | "65",2016-04-22,2016-04-24,"ICM","Online",2001,0.44,0.46,0.1,-0.02 67 | "66",2016-04-20,2016-04-24,"ORB","Telephone",800,0.51,0.43,0.06,0.08 68 | "67",2016-04-16,2016-04-19,"ComRes","Telephone",1002,0.51,0.4,0.09,0.11 69 | "68",2016-04-16,2016-04-18,"Ipsos MORI","Telephone",1026,0.49,0.39,0.08,0.1 70 | "69",2016-04-15,2016-04-17,"ICM","Telephone",1003,0.48,0.41,0.11,0.07 71 | "70",2016-04-15,2016-04-17,"ICM","Online",2008,0.43,0.44,0.13,-0.01 72 | "71",2016-04-13,2016-04-17,"ORB","Telephone",800,0.53,0.41,0.06,0.12 73 | "72",2016-04-12,2016-04-14,"TNS","Online",1198,0.38,0.34,0.28,0.04 74 | "73",2016-04-12,2016-04-14,"YouGov","Online",3371,0.4,0.39,0.16,0.01 75 | "74",2016-04-11,2016-04-12,"YouGov","Online",1693,0.39,0.39,0.17,0 76 | "75",2016-04-07,2016-04-11,"TNS","Online",1198,0.35,0.35,0.3,0 77 | "76",2016-04-08,2016-04-10,"ComRes","Telephone",1002,0.45,0.38,0.17,0.07 78 | "77",2016-04-08,2016-04-10,"ICM","Online",2030,0.42,0.45,0.12,-0.03 79 | "78",2016-04-06,2016-04-07,"YouGov","Online",1612,0.4,0.38,0.16,0.02 80 | "79",2016-03-29,2016-03-29,"YouGov","Online",3754,0.39,0.38,0.18,0.01 81 | "80",2016-04-01,2016-04-03,"ICM","Online",2007,0.44,0.43,0.13,0.01 82 | "81",2016-03-29,2016-03-29,"ORB","Telephone",800,0.51,0.44,0.05,0.07 83 | "82",2016-03-29,2016-03-29,"Opinium","Online",1966,0.39,0.43,0.18,-0.04 84 | "83",2016-03-24,2016-03-29,"TNS","Online",1193,0.35,0.35,0.3,0 85 | "84",2016-03-24,2016-03-29,"BMG Research","Online",1518,0.41,0.45,0.14,-0.04 86 | "85",2016-03-24,2016-03-28,"ORB","Online",2002,0.51,0.49,0,0.02 87 | "86",2016-03-22,2016-03-24,"ICM","Online",1970,0.45,0.43,0.12,0.02 88 | "87",2016-03-19,2016-03-22,"Ipsos MORI","Telephone",1023,0.49,0.41,0.1,0.08 89 | "88",2016-03-17,2016-03-22,"YouGov","Online",1688,0.4,0.37,0.19,0.03 90 | "89",2016-03-18,2016-03-20,"ComRes","Telephone",1002,0.48,0.41,0.11,0.07 91 | "90",2016-03-18,2016-03-20,"ICM","Online",2000,0.41,0.43,0.17,-0.02 92 | 
"91",2016-03-17,2016-03-19,"Survation","Telephone",1006,0.46,0.35,0.19,0.11 93 | "92",2016-03-11,2016-03-14,"ORB","Telephone",823,0.47,0.49,0.04,-0.02 94 | "93",2016-03-11,2016-03-13,"ICM","Online",2031,0.43,0.41,0.16,0.02 95 | "94",2016-03-04,2016-03-11,"Greenberg Quinlan Rosner Research","Online",2282,0.45,0.4,0.16,0.05 96 | "95",2016-03-02,2016-03-10,"Populus/Number Cruncher Politics","Online",4047,0.48,0.45,0.07,0.03 97 | "96",2016-03-04,2016-03-06,"Populus/Number Cruncher Politics","Telephone",966,0.49,0.35,0.15,0.14 98 | "97",2016-03-04,2016-03-06,"ICM","Online",2051,0.4,0.41,0.19,-0.00999999999999995 99 | "98",2016-03-02,2016-03-03,"YouGov","Online",1695,0.4,0.37,0.18,0.03 100 | "99",2016-03-01,2016-03-02,"YouGov","Online",1705,0.4,0.35,0.19,0.05 101 | "100",2016-02-29,2016-02-29,"YouGov","Online",2233,0.39,0.37,0.19,0.02 102 | "101",2016-02-26,2016-02-29,"ICM","Online",2003,0.41,0.41,0.18,0 103 | "102",2016-02-26,2016-02-28,"Populus/Number Cruncher Politics","Online",2071,0.39,0.45,0.18,-0.06 104 | "103",2016-02-26,2016-02-28,"Populus/Number Cruncher Politics","Telephone",1002,0.48,0.37,0.15,0.11 105 | "104",2016-02-24,2016-02-25,"ORB","Online",2014,0.48,0.52,0,-0.04 106 | "105",2016-02-21,2016-02-23,"YouGov","Online",3482,0.37,0.38,0.25,-0.01 107 | "106",2016-02-17,2016-02-23,"BMG Research","Online",1517,0.38,0.36,0.25,0.02 108 | "107",2016-02-19,2016-02-22,"ICM","Online",2021,0.42,0.4,0.17,0.02 109 | "108",2016-02-19,2016-02-22,"ComRes","Telephone",1000,0.51,0.39,0.1,0.12 110 | "109",2016-02-13,2016-02-20,"Survation","Telephone",938,0.45,0.32,0.23,0.13 111 | "110",2016-02-18,2016-02-19,"Opinium","Online",1033,0.4,0.41,0.19,-0.00999999999999995 112 | "111",2016-02-13,2016-02-16,"Ipsos MORI","Telephone",497,0.54,0.36,0.1,0.18 113 | "112",2016-02-11,2016-02-15,"TNS","Online",1079,0.36,0.39,0.25,-0.03 114 | "113",2016-02-12,2016-02-14,"ICM","Online",2001,0.43,0.39,0.18,0.04 115 | "114",2016-02-11,2016-02-14,"ComRes","Telephone",1105,0.49,0.41,0.1,0.08 116 | 
"115",2016-02-05,2016-02-07,"ICM","Online",2018,0.41,0.42,0.17,-0.01 117 | "116",2016-02-03,2016-02-04,"YouGov/The Times","Online",1675,0.36,0.45,0.19,-0.09 118 | "117",2016-01-29,2016-01-31,"ICM","Online",2002,0.42,0.39,0.19,0.03 119 | "118",2016-01-27,2016-01-28,"YouGov","Online",1735,0.38,0.42,0.2,-0.04 120 | "119",2016-01-23,2016-01-25,"Ipsos MORI","Telephone",513,0.55,0.36,0.09,0.19 121 | "120",2016-01-21,2016-01-25,"BMG Research","Online",1511,0.44,0.42,0.14,0.02 122 | "121",2016-01-22,2016-01-24,"ComRes","Telephone",1006,0.54,0.36,0.1,0.18 123 | "122",2016-01-22,2016-01-24,"ICM","Online",2010,0.41,0.41,0.18,0 124 | "123",2016-01-20,2016-01-21,"ORB","Online",2015,0.52,0.48,0,0.04 125 | "124",2016-01-15,2016-01-17,"ICM","Online",2023,0.42,0.4,0.17,0.02 126 | "125",2016-01-15,2016-01-16,"Survation","Online",1017,0.38,0.4,0.22,-0.02 127 | "126",2016-01-08,2016-01-14,"Panelbase","Online",2087,0.42,0.45,0.12,-0.03 128 | "127",2016-01-08,2016-01-10,"ICM","Online",2055,0.44,0.38,0.18,0.06 129 | -------------------------------------------------------------------------------- /data/shell-lesson-data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/data/shell-lesson-data.zip -------------------------------------------------------------------------------- /images/02_datum_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/02_datum_fig.png -------------------------------------------------------------------------------- /images/PHD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/PHD.png 
-------------------------------------------------------------------------------- /images/ScPo-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/ScPo-logo.png -------------------------------------------------------------------------------- /images/Tux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/Tux.png -------------------------------------------------------------------------------- /images/bad.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/bad.gif -------------------------------------------------------------------------------- /images/distance1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/distance1.png -------------------------------------------------------------------------------- /images/distance2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/distance2.png -------------------------------------------------------------------------------- /images/filesystem-challenge.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/filesystem-challenge.odg -------------------------------------------------------------------------------- /images/filesystem.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 20 | 22 | 42 | 44 | 45 | 47 | image/svg+xml 48 | 50 | 51 | 52 | 53 | 54 | 68 | 71 | 73 | 81 | / 97 | 98 | 100 | 103 | 111 | 118 | 126 | 127 | bin 143 | 144 | 147 | 155 | data 171 | 172 | 175 | 183 | Users 199 | 200 | 203 | 211 | tmp 227 | 228 | 230 | 238 | 247 | 255 | 264 | 279 | 284 | 285 | 286 | 287 | -------------------------------------------------------------------------------- /images/find-file-tree.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/find-file-tree.odg -------------------------------------------------------------------------------- /images/git-images/full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/git-images/full.png -------------------------------------------------------------------------------- /images/git-staging-CDG.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/git-staging-CDG.jpeg -------------------------------------------------------------------------------- /images/git-staging-area.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 9 | 10 | .git 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 22 | 23 | 26 | 27 | 28 | 29 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 43 | 44 | 48 | 49 | 50 | 53 | 54 | 55 | 58 | 59 | 60 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | git add 82 | 83 | 84 | 85 | 86 | 87 | git commit 88 | 89 | staging area 90 | 91 | repository 92 | 93 | 94 | 
-------------------------------------------------------------------------------- /images/homedir.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/homedir.odg -------------------------------------------------------------------------------- /images/nano-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/nano-screenshot.png -------------------------------------------------------------------------------- /images/phd101212s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/phd101212s.png -------------------------------------------------------------------------------- /images/redirects-and-pipes.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | background 6 | 7 | 8 | 9 | 10 | 11 | 12 | Layer 1 13 | 14 | 15 | 16 | wc -l *.pdb 17 | 18 | wc -l *.pdb 19 | 20 | 21 | $ 22 | Output in Shell 23 | 24 | OUT 25 | 26 | 27 | wc -l *.pdb 28 | 29 | Output in File 30 | 31 | OUT 32 | wc -l *.pdb 33 | 34 | wc -l *.pdb 35 | 36 | OUT 37 | 38 | 39 | 40 | $ 41 | Output in Shell 42 | 43 | sort -n 44 | IN 45 | 46 | OUT 47 | 48 | IN 49 | 50 | OUT 51 | head -n 1 52 | 53 | lengths 54 | $ 55 | $ 56 | | 57 | sort -n 58 | | 59 | head -n 1 60 | wc -l *.pdb 61 | $ 62 | | 63 | sort -n 64 | | 65 | head -n 1 66 | wc -l *.pdb 67 | $ 68 | > 69 | lengths 70 | 71 | 72 | -------------------------------------------------------------------------------- /images/removed-that.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/removed-that.png -------------------------------------------------------------------------------- /images/seine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine.png -------------------------------------------------------------------------------- /images/seine2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine2.png -------------------------------------------------------------------------------- /images/seine3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine3.png -------------------------------------------------------------------------------- /images/seine4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine4.png -------------------------------------------------------------------------------- /images/seine5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine5.png -------------------------------------------------------------------------------- /images/toypackage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/toypackage.png 
-------------------------------------------------------------------------------- /images/vector_lonlat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_lonlat.png -------------------------------------------------------------------------------- /images/vector_lonlatglobe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_lonlatglobe.png -------------------------------------------------------------------------------- /images/vector_lonlatparis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_lonlatparis.png -------------------------------------------------------------------------------- /images/vector_projected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_projected.png -------------------------------------------------------------------------------- /images/vector_projectedparis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_projectedparis.png -------------------------------------------------------------------------------- /images/which-version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/which-version.png 
-------------------------------------------------------------------------------- /index.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: ScPoProgramming 3 | author: "[Florian Oswald](https://floswald.github.io)" 4 | subtitle: "SciencesPo Paris, Ecole Doctorale 2024" 5 | --- 6 | 7 | # Welcome to *Introduction to Programming* 8 | 9 | * This website is the home of the course _Introduction to Programming_ taught to first year PhD students (M1) in the doctoral program of the [department of economics](https://www.sciencespo.fr/department-economics/) at Sciences Po Paris. 10 | * This course assumes **no prior programming** experience. 11 | * Below you will find setup instructions and a syllabus. 12 | * You can obtain all material for this course from the associated github repository at [link](https://github.com/floswald/ScPoProgramming/) 13 | 14 | ## Objectives of this Course 15 | 16 | * After this course, we want you to be able to participate in or conduct your own research project in an efficient way. *Research* nowadays means data sciencey stuff in most cases, certainly in Economics. 17 | * We want you to have a basic understanding of how an operating system (in particular, *your* OS) works. 18 | * We want you to be able to achieve a basic level of automation in repetitive tasks. 19 | * We want you to know what Version Control is and how to use it in a research project. 20 | * We will introduce some (hopefully) useful `R` programming. 
21 | 22 | # Syllabus 23 | 24 | | Session Number | Topic | Author | 25 | |:------------:|:-----------|:-------------:| 26 | | 1 | [The Unix Shell](01-shell-intro.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald | 27 | | 2 | [Shell: Files and Directories](02-filedir.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald | 28 | | 3 | [Shell: Working with Files and Directories](03-filework.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald | 29 | | 4 | [Shell: Pipes and Filters](04-pipes.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald | 30 | | 5 | [`Git` Version Control](05-git.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald | 31 | | | [Homework 1](https://github.com/floswald/scpoproghw1): complete and run a bash script on gh-actions | Florian Oswald | 32 | | 6 | [`R` Intro](https://raw.githack.com/ScPoEcon/ScPoEconometrics-Slides/master/chapter_intro_programming/chapter_intro.html) | Florian Oswald| 33 | | 7 | [`R {tidyverse}`](https://raw.githack.com/ScPoEcon/ScPoEconometrics-Slides/master/chapter_tidy_programming/chapter_tidy.html) | [Grant McDermott](https://grantmcdermott.com/) + Florian Oswald| 34 | | 8 | [`R` and `python` generics](06-concepts.qmd) | Florian Oswald| 35 | | 9 | [`R {data.table}`](https://raw.githack.com/floswald/lectures/master/05-datatable/05-datatable.html) | [Grant McDermott](https://grantmcdermott.com/) + Florian Oswald| 36 | | 10 | [Building `R` packages](09-R-packages.qmd) | Florian Oswald| 37 | | 11 | [Spatial Data with `R`](10-spatial-R.qmd) | Florian Oswald| 38 | | 12 | [`julia` intro 1](https://floswald.github.io/julia-bootcamp/01-variables.html) | Florian Oswald| 39 | | 13 | [`julia` intro 2](https://floswald.github.io/julia-bootcamp/02-functions.html) | Florian Oswald| 40 | | 14 | [Quick Intro to NLP with R](11-NLP-R.qmd) | Florian Oswald| 
41 | 42 | 43 | # Setup Instructions 44 | 45 | * You must bring your own laptop to each class. 46 | * Please make sure you have an up-to-date operating system, i.e. run a software update before we start. 47 | * Everybody should [install R](https://cran.r-project.org/) or make sure they have a recent version installed. 48 | * Everybody should [install RStudio](https://posit.co/download/rstudio-desktop/) or run an update on the installed program. 49 | 50 | 51 | ## Windows and Mac Specific Instructions 52 | 53 | There are different instructions depending on whether you have a Mac or a Windows computer. Unix-based computers are similar to Macs in most respects. 54 | 55 | ### Windows 56 | 57 | We need to install some things that make your Windows computer resemble a Unix box a bit. In particular, we want to be able to use the Unix shell. Therefore, I want you to download and install 58 | 59 | * [GitForWindows](https://gitforwindows.org/) 60 | 61 | *Specifics:* 62 | 63 | 1. During the installation process, choose all default settings. 64 | 2. At one point, you are offered a choice for a default editor being used for `git`. If you know `vim` already, why not (I use `vim`), otherwise I recommend `nano`, which is a simple-to-use editor that runs inside your command line - we want to avoid having to open an external window of a separate editor for our tasks (i.e. don't choose notepad and other standalone editors). 65 | 66 | 67 | ### Mac 68 | 69 | You should be all set. To make sure we really have everything we need, open `Terminal.app` (in *Applications > Utilities* or do `Cmd + Space` to get spotlight search and type `terminal`). Then paste this code and hit enter: 70 | 71 | ```bash 72 | xcode-select --install 73 | ``` 74 | 75 | click on *install* (don't click on `get Xcode`) 76 | 77 | 78 | ### Unix 79 | 80 | Same, all set. 
Maybe open a terminal and type 81 | 82 | ```bash 83 | git --version 84 | ``` 85 | 86 | if that throws an error, install it with your package manager, e.g. 87 | 88 | ```bash 89 | sudo apt install git-all 90 | ``` 91 | 92 | 93 | # Code of Conduct 94 | 95 | If you decide to participate in this course, I expect you to abide by the following minimal code of conduct. 96 | 97 | 1. Be polite to the other class participants. 98 | 2. While in class, do not spend time on messaging apps, chat rooms, computer games, or similar content. 99 | 100 | You can expect your instructor to abide by the same code of conduct, so this is a matter of mutual respect. If you are found in breach of the above you will be given a single warning, and I will ask you to no longer join the course after a second time. Your grade will be "fail". 101 | 102 | 103 | 104 | # License 105 | 106 | All lectures of this course are derived from the work of the [Software Carpentry](https://software-carpentry.org/license/). Their material is licensed under [creative commons license 4.0](https://creativecommons.org/licenses/by/4.0/), whereby I am allowed to share and remix the content, if appropriate attribution is given. 107 | 108 | Those terms apply to anyone wanting to use material on this website as well. Thank you. 
-------------------------------------------------------------------------------- /scripts/01-shell-intro.sh: -------------------------------------------------------------------------------- 1 | # this line is a comment (it starts with the # character) 2 | 3 | # (bash) shell commands for 01-shell-intro 4 | 5 | -------------------------------------------------------------------------------- /scripts/_tidy_tasks.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tidying, Visualising and Summarising Data - Tasks" 3 | author: "Mylène Feuillade, Gustave Kenedi, Florian Oswald and Pierre Villedieu" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Task 1: Data wrangling 13 | 14 | Load the data by running the following code: 15 | 16 | ```{r} 17 | library(dslabs) 18 | data(polls_us_election_2016) 19 | 20 | library(tidyverse) 21 | ``` 22 | 23 | 1\. Which polls had a missing `grade`? 24 | 25 | Only showing the first 6 rows using `head`, otherwise the document would be very long. 26 | 27 | ```{r} 28 | polls_us_election_2016 %>% 29 | filter(is.na(grade)) %>% 30 | head 31 | ``` 32 | 33 | 2\. Which polls were (i) polled by American Strategies, GfK Group or Merrill Poll, (ii) had a sample size greater than 1,000, _and_ (iii) started on October 20th, 2016? (*Hint: for (i) `%in%` might come in handy. Recall that vectors are created using the `c()` function. For (iii) make sure to check the format of the variable containing the poll's start date.*) 34 | 35 | ```{r} 36 | polls_us_election_2016 %>% 37 | filter(pollster %in% c("American Strategies","GfK Group","Merrill Poll") & 38 | samplesize > 1000 & 39 | startdate == "2016-10-20") 40 | ``` 41 | 42 | 3\. 
Which polls (i) did not have missing poll data for Johnson, (ii) had a combined raw poll vote share for Trump and Clinton greater than 95% _and_ (iii) were done in the state of Ohio? (*Hint: it might be practical to first create a variable containing the combined raw poll vote share for Trump and Clinton and then filter.*) 43 | 44 | ```{r} 45 | polls_us_election_2016 %>% 46 | mutate(rawpoll_clintontrump = rawpoll_clinton + rawpoll_trump) %>% 47 | filter(!is.na(rawpoll_johnson) & rawpoll_clintontrump > 95 & state == "Ohio") 48 | ``` 49 | 50 | 51 | 4\. Which state had the highest average Trump vote share for polls which had at least a sample size of 2,000? (*Hint: you'll have to use `filter`, `group_by`, `summarise` and `arrange`. To obtain ranking in descending order check `arrange`'s help page.*) 52 | 53 | ```{r} 54 | polls_us_election_2016 %>% 55 | filter(samplesize >= 2000) %>% 56 | group_by(state) %>% 57 | summarise(mean_trump = mean(rawpoll_trump)) %>% 58 | arrange(desc(mean_trump)) 59 | ``` 60 | 61 | 62 | ## Task 2: Understanding the data 63 | 64 | Load the data by running the following code: 65 | 66 | ```{r, echo = T, eval = F} 67 | library(dslabs) 68 | data(gapminder, package = "dslabs") 69 | ``` 70 | 71 | 1\. Compute the average population per continent per year, `mean_pop`, and assign the output to a new object `gapminder_mean`. (*Hint: you should have one observation (row) per continent for each year. You'll have to use `group_by` and `summarise`.*) 72 | 73 | ```{r} 74 | gapminder_mean <- gapminder %>% 75 | group_by(continent, year) %>% 76 | summarise(mean_pop = mean(population)) 77 | ``` 78 | 79 | 80 | ## Task 3: Visualising data 81 | 82 | Using the `gapminder` data, create the following plots using `ggplot2`. Don't forget to label the axes. 83 | 84 | 1\. A histogram of life expectancy in 2015. 
(*Hint: do you need to specify a `y` in `aes()` for a histogram?*) Once you've created the histogram, within the appropriate `geom_*` set: `binwidth` to 5, `boundary` to 45, `colour` to "white" and `fill` to "#d90502". What does each of these options do?
*Optional:* Using the previous graph, facet it by continent such that each continent's plot is a new row. (*Hint: check the help for `facet_grid`.*) 85 | 86 | The basic histogram: 87 | 88 | ```{r} 89 | gapminder %>% 90 | filter(year == 2015) %>% 91 | ggplot() + 92 | aes(x = life_expectancy) + 93 | geom_histogram() 94 | ``` 95 | 96 | The fancy histogram (with axis labels): 97 | 98 | ```{r} 99 | life_exp_hist <- gapminder %>% 100 | filter(year == 2015) %>% 101 | ggplot() + 102 | aes(x = life_expectancy) + 103 | geom_histogram(binwidth = 5, 104 | boundary = 45, 105 | colour = "white", 106 | fill = "#d90502") + 107 | labs(x = "Life expectancy", 108 | y = "Frequency") 109 | life_exp_hist 110 | ``` 111 | 112 | The faceted fancy histogram: 113 | 114 | ```{r} 115 | life_exp_hist + 116 | facet_grid(rows = vars(continent)) 117 | ``` 118 | 119 | 2\. A boxplot of average life expectancy per year by continent. Within the appropriate `geom_*` set: `colour` to "black" and `fill` to "#d90502". (*Hint: you need to group by both `continent` and `year`.*) 120 | 121 | ```{r} 122 | gapminder %>% 123 | group_by(continent, year) %>% 124 | summarise(mean_life_exp = mean(life_expectancy)) %>% 125 | ggplot() + 126 | aes(x = continent, y = mean_life_exp) + 127 | geom_boxplot(colour = "black", 128 | fill = "#d90502") + 129 | labs(x = "Continent", 130 | y = "Life expectancy") 131 | ``` 132 | 133 | 3\. A scatter plot of fertility rate (y-axis) with respect to infant mortality (x-axis) in 2015. Once you've created the scatter plot, within the appropriate `geom_*` set: `size` to 3, `alpha` to 0.5, `colour` to "#d90502". 
134 | 135 | The basic scatter plot: 136 | 137 | ```{r} 138 | gapminder %>% 139 | filter(year == 2015) %>% 140 | ggplot() + 141 | aes(x = infant_mortality, y = fertility) + 142 | geom_point() 143 | ``` 144 | 145 | The fancy scatter plot with axis labels: 146 | 147 | ```{r} 148 | gapminder %>% 149 | filter(year == 2015) %>% 150 | ggplot() + 151 | aes(x = infant_mortality, y = fertility) + 152 | geom_point(size = 3, 153 | alpha = 0.5, 154 | colour = "#d90502") + 155 | labs(x = "Infant mortality", y = "Fertility") 156 | ``` 157 | 158 | 159 | ## Task 4: Summarising data 160 | 161 | 1\. Compute the mean of GDP in 2011 and assign to object `mean_GDP`. You should exclude missing values. (*Hint: read the help for `mean` to remove `NA`s*). 162 | 163 | ```{r} 164 | mean_GDP <- gapminder %>% 165 | filter(year == 2011) %>% 166 | summarise(mean(gdp, na.rm = T)) 167 | mean_GDP 168 | ``` 169 | 170 | 2\. Compute the median of GDP in 2011 and assign to object `median_GDP`. Again, you should exclude missing values. Is it greater or smaller than the average? 171 | 172 | ```{r} 173 | median_GDP <- gapminder %>% 174 | filter(year == 2011) %>% 175 | summarise(median(gdp, na.rm = T)) 176 | median_GDP 177 | ``` 178 | 179 | **The median is much smaller than the average.** 180 | 181 | 3\. Create a density plot of GDP in 2011 using `geom_density`. A density plot is a way of representing the distribution of a numeric variable. Add the following code to your plot to show the median and mean as vertical lines. What do you observe? 182 | `geom_vline(xintercept = as.numeric(mean_GDP), colour = "red") +
183 | geom_vline(xintercept = as.numeric(median_GDP), colour = "orange")` 184 | 185 | ```{r} 186 | gdp_density <- gapminder %>% 187 | filter(year == 2011) %>% 188 | ggplot() + 189 | aes(x = gdp) + 190 | geom_density() + 191 | geom_vline(xintercept = as.numeric(mean_GDP), colour = "red") + 192 | geom_vline(xintercept = as.numeric(median_GDP), colour = "orange") 193 | gdp_density 194 | ``` 195 | 196 | **The distribution of GDP is highly ***skewed***: there are many countries with small GDPs and very few with huge GDPs (U.S., Japan, China). In such cases, the average will be (significantly) greater than the median. To see this more clearly, here's a graph where I've transformed the x-axis such that each tick is 10 times larger than the previous one (the scale is therefore not linear, i.e. the first tick is 100,000, the second is 1 million, the third is 10 million, etc.).** 197 | 198 | ```{r} 199 | gdp_density + 200 | scale_x_log10() 201 | ``` 202 | 203 | 4\. Compute the correlation between fertility and infant mortality in 2015. To drop `NA`s in either variable set the argument `use` to "pairwise.complete.obs" in your `cor()` function. Is this correlation consistent with the graph you produced in Task 3? 204 | 205 | ```{r} 206 | gapminder %>% 207 | filter(year == 2015) %>% 208 | summarise(cor(fertility, infant_mortality, use = "pairwise.complete.obs")) 209 | ``` 210 | 211 | **This correlation is positive and strong (relatively close to 1) which is consistent with the graph produced in Task 3. 
Indeed, that graph displayed a positive relationship between these two variables and the points were not that dispersed.** 212 | -------------------------------------------------------------------------------- /scripts/geotask.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) # needed for ggplot(), geom_sf(), labs(), theme_bw() used in the plots below 2 | library(sf) 3 | library(dplyr) 4 | library(here) 5 | 6 | destdir = file.path(here(),"data","shapefiles","departements-20140306-100m") 7 | 8 | if (!file.exists(file.path(destdir,"departements-20140306-100m.shp"))){ 9 | dir.create(file.path(here(),"data","shapefiles"), showWarnings = FALSE) 10 | download.file(url = "https://www.data.gouv.fr/fr/datasets/r/3096e551-c68d-40ce-8972-a228c94c0ad1", 11 | destfile = file.path(here(),"data","shapefiles","departements-20140306-100m.zip")) 12 | unzip(file.path(here(),"data","shapefiles","departements-20140306-100m.zip"), 13 | exdir = destdir) 14 | } 15 | 16 | # load departments shapefile 17 | deps = st_read(file.path(destdir,"departements-20140306-100m.shp")) 18 | 19 | plot(deps[,"code_insee"]) 20 | 21 | # subset to continental france 22 | deps = deps %>% 23 | dplyr::filter(!(code_insee %in% c("2A","2B","971","972","973","974","976"))) 24 | 25 | 26 | # Seine data 27 | data("seine", 28 | package = "spData") 29 | 30 | ## Make sure they have the same projection 31 | seine = st_transform(seine, 32 | crs = st_crs(deps)) 33 | 34 | # intersect deps and rivers 35 | deps_seine = st_join(deps, seine) %>% 36 | ## Get rid of regions with no overlap 37 | dplyr::filter(!is.na(name)) %>% 38 | dplyr::distinct(code_insee, .keep_all = T) 39 | 40 | # reproduce plot from class 41 | ggplot() + 42 | geom_sf(data = deps_seine,alpha = 0.5, 43 | aes(fill = nom), 44 | col = "#fcb4b3", # of borders 45 | linewidth = 0.5) + # of borders 46 | geom_sf(data = seine, col = "#05E9FF", lwd = 1) + 47 | labs(title = "Intersected regions only") + 48 | theme_bw() 49 | 50 | 51 | # # get communes shapefile from 52 | # # 
https://www.data.gouv.fr/fr/datasets/contours-des-communes-de-france-simplifie-avec-regions-et-departement-doutre-mer-rapproches/ 53 | # stable url: https://www.data.gouv.fr/fr/datasets/r/00c0c560-3ad1-4a62-9a29-c34c98c3701e 54 | 55 | commdir = file.path(here(),"data","shapefiles","communes") 56 | 57 | if (!file.exists(file.path(commdir,"a-com2022-topo-2154.json"))){ 58 | dir.create(commdir, showWarnings = FALSE) 59 | 60 | download.file(url = "https://www.data.gouv.fr/fr/datasets/r/00c0c560-3ad1-4a62-9a29-c34c98c3701e", 61 | destfile = file.path(commdir,"a-com2022-topo-2154.json"),quiet = FALSE) 62 | } 63 | 64 | 65 | comms = st_read(file.path(commdir,"a-com2022-topo-2154.json"),layer = "a_com2022") 66 | st_crs(comms) <- 2154 # set the initial CRS 67 | 68 | comms = comms %>% 69 | st_transform(4326) %>% 70 | dplyr::rename(code_insee = codgeo) 71 | 72 | # subset to relevant departments only: i.e. the ones of the join above 73 | co_d = comms %>% 74 | dplyr::filter(dep %in% unique(deps_seine$code_insee)) 75 | 76 | # reproduce plot above but now apply a color code that tells us 77 | # how many communes the rivers traverse *in each departement* 78 | 79 | # join co with seine 80 | co_seine = st_join(co_d, seine) %>% 81 | ## Get rid of regions with no overlap 82 | dplyr::filter(!is.na(name)) %>% 83 | dplyr::distinct(code_insee, .keep_all = T) 84 | 85 | # plot all the communes that are traversed and color by river name 86 | ggplot(co_seine) + geom_sf(aes(fill = name)) 87 | 88 | # add the rivers themselves 89 | ggplot(co_seine) + 90 | geom_sf(aes(fill = name)) + 91 | geom_sf(data = seine, col = "#05E9FF", lwd = 0.6) 92 | 93 | 94 | co_d_seine = co_seine %>% 95 | st_set_geometry(NULL) %>% # can get rid of geometry 96 | dplyr::group_by(dep) %>% 97 | dplyr::summarise(ncomms = dplyr::n()) 98 | 99 | # merge with deps_seine and plot again 100 | deps_seine %>% 101 | dplyr::inner_join(co_d_seine, by = c("code_insee" = "dep")) %>% 102 | ggplot() + 103 | geom_sf(alpha = 0.9, 104 | 
aes(fill = ncomms), 105 | col = "grey", # of borders 106 | linewidth = 0.2) + # of borders 107 | geom_sf(data = seine, col = "#05E9FF", lwd = 1) + 108 | # scale_fill_gradient2(low = "white",high = "red") + 109 | scale_fill_viridis_c() + 110 | labs(title = "Number of Communes by Department", 111 | subtitle = "Traversed by one of Seine, Marne or Yonne", 112 | fill = "Number of\nCommunes") + 113 | theme_bw() 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /scripts/lon-lat-geocomp.r: -------------------------------------------------------------------------------- 1 | # this is copied from 2 | # https://raw.githubusercontent.com/geocompx/geocompr/main/code/02-vectorplots.R 3 | 4 | 5 | library(globe) 6 | library(dplyr) 7 | library(sf) 8 | 9 | london_lonlat = st_point(c(-0.1, 51.5)) %>% 10 | st_sfc() %>% 11 | st_sf(crs = 4326, geometry = .) 12 | london_osgb = st_transform(london_lonlat, 27700) 13 | origin_osgb = st_point(c(0, 0)) %>% 14 | st_sfc() %>% 15 | st_sf(crs = 27700, geometry = .) 16 | london_orign = rbind(london_osgb, origin_osgb) 17 | 18 | paris = c(2.34,48.85) 19 | 20 | paris_lonlat = st_point(paris) %>% 21 | st_sfc() %>% 22 | st_sf(crs = 4326, geometry = .) 23 | paris_lambert = st_transform(paris_lonlat, 27561) 24 | origin_lambert = st_point(c(0, 0)) %>% 25 | st_sfc() %>% 26 | st_sf(crs = 27561, geometry = .) 
27 | paris_origin = rbind(paris_lambert, origin_lambert) 28 | 29 | png("images/vector_lonlat.png") 30 | globe::globeearth(eye = c(0, 0)) 31 | gratmat = st_coordinates(st_graticule())[, 1:2] 32 | globe::globelines(loc = gratmat, col = "grey", lty = 3) 33 | globe::globelines(loc = matrix(c(-90, 90, 0, 0), ncol = 2)) 34 | globe::globelines(loc = matrix(c(0, 0, -90, 90), ncol = 2)) 35 | globe::globepoints(loc = c(-0.1, 51.5), pch = 4, cex = 2, lwd = 3, col = "red") 36 | globe::globepoints(loc = c(0, 0), pch = 1, cex = 2, lwd = 3, col = "blue") 37 | dev.off() 38 | png("images/vector_projected.png") 39 | uk = rnaturalearth::ne_countries(scale = 50) %>% 40 | st_as_sf() %>% 41 | filter(grepl(pattern = "United Kingdom|Ire", x = name_long)) %>% 42 | st_transform(27700) 43 | plot(uk$geometry) 44 | plot(london_orign$geometry[1], add = TRUE, pch = 4, cex = 2, lwd = 3, col = "red") 45 | plot(london_orign$geometry[2], add = TRUE, pch = 1, cex = 2, lwd = 3, col = "blue") 46 | abline(h = seq(0, 9e5, length.out = 10), col = "grey", lty = 3) 47 | abline(v = seq(0, 9e5, length.out = 10), col = "grey", lty = 3) 48 | dev.off() 49 | 50 | 51 | # globe 52 | png("images/vector_lonlatglobe.png") 53 | globe::globeearth(eye = c(0,0)) 54 | gratmat = st_coordinates(st_graticule())[, 1:2] 55 | globe::globelines(loc = gratmat, col = "grey", lty = 3) 56 | globe::globelines(loc = matrix(c(-90, 90, 0, 0), ncol = 2)) 57 | globe::globelines(loc = matrix(c(0, 0, -90, 90), ncol = 2)) 58 | globe::globepoints(loc = c(0, 0), pch = 1, cex = 2, lwd = 3, col = "blue") 59 | dev.off() 60 | 61 | # paris 62 | png("images/vector_lonlatparis.png") 63 | globe::globeearth(eye = c(0,0)) 64 | gratmat = st_coordinates(st_graticule())[, 1:2] 65 | globe::globelines(loc = gratmat, col = "grey", lty = 3) 66 | globe::globelines(loc = matrix(c(-90, 90, 0, 0), ncol = 2)) 67 | globe::globelines(loc = matrix(c(0, 0, -90, 90), ncol = 2)) 68 | globe::globepoints(loc = paris, pch = 4, cex = 2, lwd = 3, col = "red") 69 | 
globe::globepoints(loc = c(0, 0), pch = 1, cex = 2, lwd = 3, col = "blue") 70 | dev.off() 71 | png("images/vector_projectedparis.png") 72 | france = rnaturalearth::ne_states(country = "France", returnclass = "sf") %>% 73 | filter(!name %in% c("Guyane française", "Martinique", "Guadeloupe", "La Réunion", "Mayotte")) %>% 74 | st_as_sf() %>% 75 | st_transform(27561) 76 | plot(france$geometry) 77 | plot(paris_origin$geometry[1], add = TRUE, pch = 4, cex = 2, lwd = 3, col = "red") 78 | plot(paris_origin$geometry[2], add = TRUE, pch = 1, cex = 2, lwd = 3, col = "blue") 79 | abline(h = seq(-9e5, 9e5, length.out = 15), col = "grey", lty = 3) 80 | abline(v = seq(0, 13e5, length.out = 10), col = "grey", lty = 3) 81 | dev.off() 82 | -------------------------------------------------------------------------------- /styles.css: -------------------------------------------------------------------------------- 1 | /* css styles */ 2 | --------------------------------------------------------------------------------