├── .gitignore
├── .nojekyll
├── 01-shell-intro.qmd
├── 02-filedir.qmd
├── 03-filework.qmd
├── 04-pipes.qmd
├── 05-git.qmd
├── 06-concepts.qmd
├── 09-R-packages.qmd
├── 10-spatial-R.qmd
├── 11-NLP-R.qmd
├── 11-NLP-R_cache
└── revealjs
│ ├── __packages
│ ├── unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.RData
│ ├── unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdb
│ ├── unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdx
│ ├── unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.RData
│ ├── unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdb
│ ├── unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdx
│ ├── unnamed-chunk-12_18410f94565577aaf66decee3574d410.RData
│ ├── unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdb
│ ├── unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdx
│ ├── unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.RData
│ ├── unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdb
│ ├── unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdx
│ ├── unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.RData
│ ├── unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdb
│ ├── unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdx
│ ├── unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.RData
│ ├── unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdb
│ ├── unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdx
│ ├── unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.RData
│ ├── unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdb
│ ├── unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdx
│ ├── unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.RData
│ ├── unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdb
│ ├── unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdx
│ ├── unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.RData
│ ├── unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdb
│ ├── unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdx
│ ├── unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.RData
│ ├── unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdb
│ ├── unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdx
│ ├── unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.RData
│ ├── unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdb
│ ├── unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdx
│ ├── unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.RData
│ ├── unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdb
│ ├── unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdx
│ ├── unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.RData
│ ├── unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdb
│ ├── unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdx
│ ├── unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.RData
│ ├── unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdb
│ ├── unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdx
│ ├── unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.RData
│ ├── unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdb
│ ├── unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdx
│ ├── unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.RData
│ ├── unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdb
│ ├── unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdx
│ ├── unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.RData
│ ├── unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdb
│ ├── unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdx
│ ├── unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.RData
│ ├── unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdb
│ ├── unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdx
│ ├── unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.RData
│ ├── unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdb
│ ├── unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdx
│ ├── unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.RData
│ ├── unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdb
│ ├── unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdx
│ ├── unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.RData
│ ├── unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdb
│ └── unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdx
├── LICENSE
├── R
└── sayhello.R
├── README.md
├── ScPoProgramming.Rproj
├── _extensions
└── metropolis-theme
│ ├── _extension.yml
│ └── metropolis.scss
├── _quarto.yml
├── about.qmd
├── custom.scss
├── data
├── brexit.csv
└── shell-lesson-data.zip
├── images
├── 02_datum_fig.png
├── PHD.png
├── ScPo-logo.png
├── Tux.png
├── bad.gif
├── distance1.png
├── distance2.png
├── filesystem-challenge.odg
├── filesystem-challenge.svg
├── filesystem.svg
├── find-file-tree.odg
├── find-file-tree.svg
├── git-images
│ └── full.png
├── git-staging-CDG.jpeg
├── git-staging-area.svg
├── git_staging.svg
├── home-directories.svg
├── homedir.odg
├── nano-screenshot.png
├── phd101212s.png
├── redirects-and-pipes.svg
├── removed-that.png
├── seine.png
├── seine2.png
├── seine3.png
├── seine4.png
├── seine5.png
├── shell_command_syntax.svg
├── shell_script_for_loop_flow_chart.svg
├── standard-filesystem-hierarchy.svg
├── toypackage.png
├── vector_lonlat.png
├── vector_lonlatglobe.png
├── vector_lonlatparis.png
├── vector_projected.png
├── vector_projectedparis.png
└── which-version.png
├── index.qmd
├── scripts
├── 01-shell-intro.sh
├── _tidy_tasks.Rmd
├── geotask.R
└── lon-lat-geocomp.r
└── styles.css
/.gitignore:
--------------------------------------------------------------------------------
1 | /.quarto/
2 | /_site/
3 | .Rproj.user
4 | .Rhistory
5 | .Rdata
6 | .httr-oauth
7 | .DS_Store
8 | /data/shapefiles/
9 | .quarto
10 |
--------------------------------------------------------------------------------
/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/.nojekyll
--------------------------------------------------------------------------------
/01-shell-intro.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "The Unix Shell"
3 | format:
4 | revealjs:
5 | chalkboard: false
6 | logo: /images/ScPo-logo.png
7 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
8 | incremental: false
9 | code-line-numbers: false
10 | highlight-style: github
11 | author: Florian Oswald and Software Carpentry
12 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
13 | date: today
14 | date-format: "D MMMM, YYYY"
15 | ---
16 |
17 | ## Intro
18 |
19 | ::::: {.callout-note}
20 | # Question
21 |
22 | * What is a command shell and why would I use one?
23 | :::::
24 |
25 | ::::: {.callout-tip}
26 | # Objectives
27 | - Explain how the shell relates to the keyboard, the screen, the operating system, and users' programs.
28 | - Explain when and why command-line interfaces should be used instead of graphical interfaces.
29 | ::::::
30 |
31 | ---
32 |
33 |
34 | ## Do You GUI?
35 |
36 | ::: {.callout-tip}
37 |
38 | # What's a GUI
39 | A *Graphical User Interface* (GUI) lets the user interact by clicking with a mouse and using menus.
40 | :::
41 |
42 | . . .
43 |
44 | * I love 😍 a good Graphical User Interface (GUI)
45 |
46 | . . .
47 |
48 | * But. Bad things can happen.
49 |
50 | . . .
51 |
52 | * ☠️
53 |
54 | ---
55 |
56 | ## {background-image="/images/bad.gif" background-size="cover"}
57 |
58 | ---
59 |
60 | ## Bad.
61 |
62 | ::::: {.columns}
63 | ::::: {.column width="50%"}
64 | ::: {.callout-warning}
65 |
66 | # No More than 65,536 Rows
67 |
68 | * Public Health England [missed nearly 16,000 covid cases](https://www.influentialsoftware.com/how-an-nhs-test-and-trace-excel-error-lost-16000-covid-19-cases/)
69 | * They used an `.xls` document to collect data.
70 | :::
71 | :::::
72 |
73 |
74 | ::::: {.column width="50%"}
75 | ::: {.callout-warning}
76 |
77 | # No Growth with High Debt?
78 |
79 | * [Reinhart and Rogoff controversy](https://theconversation.com/the-reinhart-rogoff-error-or-how-not-to-excel-at-economics-13646).^[Both are eminent researchers and we do *not* imply misconduct.]
80 | * Inadvertently did not select all relevant countries on spreadsheet.
81 | :::
82 | :::::
83 |
84 | :::::
85 |
86 |
87 | ---
88 |
89 | ## Or Do you CLI?
90 |
91 | ::: {.callout-tip}
92 |
93 | # What's a CLI
94 | A *Command Line Interface* (CLI) allows interaction via (text) commands.
95 | :::
96 |
97 |
98 | * CLIs can collect commands somewhere - *reproducible*
99 | * But one has to learn a *language*.
100 | * They are great for *long, repetitive tasks*.
101 | * It's often the **only** way to interact with high-performance computing. 👉 show [DANTE](https://www.ipgp.fr/en/details-du-cluster-dante/)
102 |
103 |
104 | ---
105 |
106 | ## The Shell {.smaller}
107 |
108 | * The *shell* is a program where we can type in commands and get output.
109 | * We often use very simple programs - good for just one thing.
110 | * There is tremendous power in *combining* those little programs.
111 | * It's a *platform* approach to an Operating System.
112 |
113 | ::: {.callout-tip}
114 | # Unix is a Platform
115 |
116 | A protocol and many small programs that interact with each other according to its rules.
117 | :::
118 |
119 | 
120 |
121 |
122 | ---
123 |
124 | ## Go! 🚀
125 |
126 | 1. Open your terminal! (`GitBash` on Windows)
127 | 2. You should see something like
128 | ```bash
129 | $
130 | ```
131 | which is called the **prompt**.
132 |
133 | 3. You **don't have to type** the `$`!
134 | 3. next to it, you see a cursor.
135 |
136 |
137 | ---
138 |
139 | ## First command: `ls`
140 |
141 | * type `ls` and hit enter
142 | * you see something like this as output:
143 |
144 | ```
145 | Desktop Downloads Movies Pictures
146 | Documents Library Music Public
147 | ```
148 |
149 | * By default, the terminal opens in your home directory.
150 | * `ls` *lists* the content of that directory.
151 |
152 | ---
153 |
154 | ## First Error!
155 |
156 | ::: {.callout-caution}
157 | * If you mistype a command, or look for a program that is not installed, you get an error. Like:
158 |
159 | ```
160 | bash-3.2$ ks
161 | ```
162 |
163 | ```
164 | bash: ks: command not found
165 | ```
166 | :::
167 |
168 | * Look for a spelling mistake (it's `ls` not `ks`)
169 | * Or install the required program.
170 |
171 |
172 | ## Nelle's Pipeline: A Typical Problem {.smaller}
173 |
174 | * Nelle Nemo is a marine biologist. 🌊 🐡
175 | * Just sampled gelatinous marine life in the
176 | [Great Pacific Garbage Patch](http://en.wikipedia.org/wiki/Great_Pacific_Garbage_Patch).
177 | * From 1520 samples she obtained measures of the relative abundance of 300 proteins.
178 | * Her supervisor, *Professor Jones*, handed over to her a program called `goostats.sh`.
179 | * `goostats.sh` needs to be run on each of the 1520 samples.
180 | * Paper needs to be ready by the end of the month.
181 |
182 | ---
183 |
184 | ## {background-image="/images/PHD.png" background-size="50%" background-position="center"}
185 |
186 |
187 | ---
188 |
189 | ## Battle Plan
190 |
191 | * Using a GUI to run `goostats.sh`, Nelle will have to use her mouse to select and open a file 1520 times.
192 | * If `goostats.sh` takes 30 secs to run, this will take more than 12 hours of Nelle's *active time*.
193 | * With the help of the shell, Nelle could make her computer go through that list of files instead.
194 | * **Bonus** : she will have a working pipeline, that can be re-run each time she wants to add data or reproduce previous output.
195 |
196 | ---
197 |
198 | ## What Does Nelle Need {.smaller}
199 |
200 | Nelle needs to learn the following things:
201 |
202 | - navigate to a file/directory
203 | - create a file/directory
204 | - check the length of a file
205 | - chain commands together
206 | - retrieve a set of files
207 | - iterate over files
208 | - run a shell script containing her pipeline
209 |
210 | And we will be right next to her. 🙂
211 |
212 |
213 | ---
214 |
215 | ##
216 |
217 | ::: {.callout-tip}
218 |
219 | # Key Points
220 |
221 | - A shell is a program whose primary purpose is to read commands and run other programs.
222 | - This lesson uses Bash, the default shell in many implementations of Unix.
223 | - Programs can be run in Bash by entering commands at the command-line prompt.
224 | - The shell's main advantages are its high action-to-keystroke ratio, its support for automating repetitive tasks, and its capacity to access networked machines.
225 | - The shell's main disadvantages are its primarily textual nature and how cryptic its commands and operation can be.
226 | - ChatGPT can help you write Bash scripts, but it is still fundamental to know the basics.
227 |
228 | :::
229 |
230 |
231 |
232 |
233 |
234 |
--------------------------------------------------------------------------------
/02-filedir.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Navigating Files and Directories"
3 | format:
4 | revealjs:
5 | theme: _extensions/metropolis-theme/metropolis.scss
6 | chalkboard: true
7 | logo: /images/ScPo-logo.png
8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
9 | incremental: false
10 | code-line-numbers: false
11 | highlight-style: github
12 | author: Florian Oswald and The Software Carpentry
13 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
14 | date: today
15 | date-format: "D MMMM, YYYY"
16 | ---
17 |
18 |
19 | ## Intro
20 |
21 | ::::: {.callout-note}
22 | # Questions
23 |
24 | - How can I move around on my computer?
25 | - How can I see what files and directories I have?
26 | - How can I specify the location of a file or directory on my computer?
27 | :::::
28 |
29 | ::::: {.callout-tip}
30 | # Objectives
31 | - Explain the similarities and differences between a file and a directory.
32 | - Translate an absolute path into a relative path and vice versa.
33 | - Construct absolute and relative paths that identify specific files and directories.
34 | - Use options and arguments to change the behaviour of a shell command.
35 | - Demonstrate the use of tab completion and explain its advantages.
36 | ::::::
37 |
38 |
39 | ---
40 |
41 | ## The File System {.smaller}
42 |
43 | * The file system organizes data into files and directories on your computer.
44 | * Let's start by finding out where we are, by running the `pwd` command - *print working directory*.
45 |
46 | ```bash
47 | $ pwd
48 | ```
49 |
50 | with output:
51 |
52 | ```
53 | /Users/nelle
54 | ```
55 |
56 | ::: {.callout-note}
57 |
58 | # Home Directory Variations
59 |
60 | * Linux: `/home/nelle`
61 | * Windows: `C:\Users\nelle` (or `C:\Documents and Settings\nelle` on very old versions)
62 |
63 | If `pwd` does not return your home directory, you may need to navigate there first with `cd`.
64 | :::
65 |
66 | ---
67 |
68 | ## Nelle's Home Directory
69 |
70 | Nelle's file system looks like this:
71 |
72 | 
74 |
75 | * `/` is the *root* of the system
76 | * all other locations can be reached from there via a *path*
77 | * the path to the home directory goes from `/` to the directory `Users`, which contains the folder `nelle`
78 | * We know exactly where the home is stored by looking at this path.
79 | * Notice that *inside* a path, `/` is a *separator*. (It's `\` on Windows!)
80 |
81 | ---
82 |
83 | ## Working with `ls`
84 |
85 | * type `ls -F`. This adds the *option* `-F` (think *classify*) to the command.
86 | * now each entry also gets a marker showing what kind of object it is:
87 | - a trailing `/` indicates that this is a directory
88 | - `@` indicates a link
89 | - `*` indicates an executable
90 |
91 | ```bash
92 | $ ls -F
93 | ```
94 |
95 | ```
96 | Applications/ Documents/ Library/ Music/ Public/
97 | Desktop/ Downloads/ Movies/ Pictures/
98 | ```
99 |
100 | ::: {.callout-caution}
101 |
102 | # question
103 |
104 | What kind of objects does Nelle's home directory contain?
105 | :::
106 |
107 |
108 | ---
109 |
110 | ## Help
111 |
112 | ::: {.callout-note}
113 |
114 | # Clear Terminal
115 |
116 | * Use the `clear` command to clear terminal.
117 | * you can use your `↑` and `↓` keys to see previous commands, or just scroll up.
118 |
119 | :::
120 |
121 |
122 | ::: {.callout-tip}
123 |
124 | # Getting Help
125 |
126 | 1. pass the `--help` option to a command:
127 | ```bash
128 | $ ls --help
129 | ```
130 |
131 | 2. Read the manual entry with `man` (MacOS and Linux only)
132 | ```bash
133 | man ls
134 | ```
135 |
136 | 3. Search internet for `unix man ls`
137 |
138 |
139 | :::
140 |
141 |
142 | ## More `ls` Flags
143 |
144 | ::: {.callout-warning}
145 |
146 | # Challenge
147 |
148 | You can also use two options at the same time. What does the command `ls` do when used
149 | with the `-l` option? What about if you use both the `-l` and the `-h` option?
150 | :::
151 |
152 |
153 | Show Solution
154 |
155 | ::: {.callout-note}
156 | # Solution
157 | The `-l` option makes `ls` use a **l**ong listing format, showing not only the file/directory names but also additional information, such as the file size and the time of its last modification. If you use both the `-h` option and the `-l` option,
158 | this makes the file size '**h**uman readable', i.e. displaying something like `5.3K` instead of `5369`.
159 | :::
160 |
161 |
162 |
163 | ---
164 |
165 | ## More `ls` Challenges
166 |
167 | ::: {.callout-warning}
168 |
169 | # Listing in Reverse Chronological Order
170 |
171 | By default, `ls` lists the contents of a directory in alphabetical
172 | order by name. The command `ls -t` lists items by time of last
173 | change instead of alphabetically. The command `ls -r` lists the
174 | contents of a directory in reverse order.
175 | Which file is displayed last when you combine the `-t` and `-r` options?
176 | Hint: You may need to use the `-l` option to see the
177 | last changed dates.
178 | :::
179 |
180 |
181 | Show Solution
182 |
183 | ::: {.callout-note}
184 | # Solution
185 | The most recently changed file is listed last when using `-rt`. This
186 | can be very useful for finding your most recent edits or checking to
187 | see if a new output file was written.
188 | :::
189 |
190 |
191 |
192 | ---
193 |
194 | ## Getting Data
195 |
196 | 1. Let's [download some data](/data/shell-lesson-data.zip)
197 | 2. unzip it and move it to your `home` directory. (`~`, not `Desktop`!)
198 |
199 | ---
200 |
201 | ## Exploring More Directories
202 |
203 | * `ls` can also list directories other than the *current* one.
204 | * Let's see what is on our `home`:
205 |
206 | ```bash
207 | $ cd # goes HOME
208 | $ ls -F .
209 | ```
210 |
211 | shows for Nelle only the data we just downloaded:
212 |
213 | ```
214 | shell-lesson-data/
215 | ```
216 |
217 | * We can also look *inside* that data from where we are:
218 |
219 | ```bash
220 | $ ls -F shell-lesson-data
221 | ```
222 |
223 | ```
224 | exercise-data/ north-pacific-gyre/
225 | ```
226 |
227 | * looks intriguing 🧐. Let's try and go there!
228 |
229 | ---
230 |
231 | ## Going into Subdirectories
232 |
233 | * `cd` is for *change directory*. Moves the shell to a different location in the file system.
234 | * Let's go to our data folder:
235 |
236 | ```bash
237 | $ cd shell-lesson-data
238 | $ cd exercise-data
239 | ```
240 |
241 | * Notice that the `cd` command does not print any output by default.
242 | * Run `ls -F` again to see what's in this directory!
243 | * Run `pwd` to see where we are!
244 |
245 | ---
246 |
247 | ## Coming Back from Subdirectories
248 |
249 | * Now we want to go back up one level.
250 | * It's tempting to say `cd shell-lesson-data`
251 | * But with a relative path, `cd` can only see the *subdirectories of the current directory*.
252 | * There is a special directory name for this: `..` refers to the *parent* directory, so `cd ..` goes one level up.
253 |
254 | ```bash
255 | $ cd ..
256 | $ pwd
257 | ```
258 |
259 | puts Nelle back into
260 |
261 | ```
262 | /Users/nelle/shell-lesson-data
263 | ```
264 |
265 | * Notice how `..` is listed if you flag `-a` on the `ls` command.
266 |
267 |
268 | ---
269 |
270 | ## Hidden Files
271 |
272 | ::: {.callout-tip}
273 |
274 | # Hidden Files
275 |
276 | * Typing `cd` without any arguments puts you back into your Home directory. Do it.
277 | * Let's use `ls -F -a` or `ls -Fa` to list *all* files. Also **hidden** ones!
278 |
279 | :::
280 |
281 | ::: {.callout-note}
282 |
283 | # Relative and Absolute Paths
284 |
285 | * Up until now, we used *relative paths*. `cd` and `ls` operated **from our current position** in the file system.
286 | * We can also specify the *absolute path*, i.e. starting at the root `/`. This allows us to go anywhere from anywhere.
287 |
288 | :::
289 |
290 |
291 | ---
292 |
293 | ## More Shortcuts
294 |
295 | ::: {.callout-note}
296 |
297 | # Tilde (`~`) and dash (`-`)
298 |
299 | * The tilde `~` in first position means *current user's home*
300 | * The dash in `cd -` means *go into the directory I was previously in*.
301 | * So:
302 |
303 | 1. `cd ..` brings you *up* one level
304 | 2. `cd -` takes you *back* to wherever you've come from.
305 | :::
306 |
307 | ---
308 |
309 | ## Challenges
310 |
311 | ::: {.callout-caution}
312 |
313 | # Challenge
314 |
315 | Starting from `/Users/amanda/data`,
316 | which of the following commands could Amanda use to navigate to her home directory,
317 | which is `/Users/amanda`?
318 |
319 | 1. `cd .`
320 | 2. `cd /`
321 | 3. `cd /home/amanda`
322 | 4. `cd ../..`
323 | 5. `cd ~`
324 | 6. `cd home`
325 | 7. `cd ~/data/..`
326 | 8. `cd`
327 | 9. `cd ..`
328 |
329 | :::
330 |
331 | ---
332 |
333 | ## Solution
334 |
335 |
336 | Show Solution
337 |
338 | ::: {.callout-note}
339 | # Solution
340 |
341 | 1. No: `.` stands for the current directory.
342 | 2. No: `/` stands for the root directory.
343 | 3. No: Amanda's home directory is `/Users/amanda`.
344 | 4. No: this command goes up two levels, i.e. ends in `/Users`.
345 | 5. Yes: `~` stands for the user's home directory, in this case `/Users/amanda`.
346 | 6. No: this command would navigate into a directory `home` in the current directory
347 | if it exists.
348 | 7. Yes: unnecessarily complicated, but correct.
349 | 8. Yes: shortcut to go back to the user's home directory.
350 | 9. Yes: goes up one level.
351 |
352 |
353 | :::
354 |
355 |
356 | ---
357 |
358 |
359 | {.absolute top=200 right=30 width="550" height="500"}
360 |
361 | ::: {.callout-caution}
362 |
363 | # Challenge
364 | Using the filesystem diagram, if `pwd` displays `/Users/thing`,
365 | what will `ls -F ../backup` display?
366 |
367 | 1. `../backup: No such file or directory`
368 | 2. `2012-12-01 2013-01-08 2013-01-27`
369 | 3. `2012-12-01/ 2013-01-08/ 2013-01-27/`
370 | 4. `original/ pnas_final/ pnas_sub/`
371 |
372 |
373 |
374 | :::
375 |
376 |
377 | ---
378 |
379 | ## Solution
380 |
381 |
382 | Show Solution
383 |
384 | ::: {.callout-note}
385 | # Solution
386 |
387 | 1. No: there *is* a directory `backup` in `/Users`.
388 | 2. No: this is the content of `/Users/thing/backup`, but with `..`, we asked for one level further up.
389 | 3. No: see previous explanation.
390 | 4. Yes: `../backup/` refers to `/Users/backup/`.
391 |
392 | :::
393 |
394 |
395 | ---
396 |
397 | {.absolute top=200 right=60 width="550" height="500"}
398 |
399 | ::: {.callout-caution}
400 |
401 | # Challenge
402 | Using the filesystem diagram below, if `pwd` displays `/Users/backup`, and `-r` tells `ls` to display things in reverse order,
403 | what command(s) will result in the following output:
404 |
405 | ```
406 | pnas_sub/ pnas_final/ original/
407 | ```
408 |
409 | is it:
410 |
411 | 1. `ls pwd`?
412 | 2. `ls -r -F`?
413 | 3. `ls -r -F /Users/backup`?
414 |
415 |
416 | :::
417 |
418 | ---
419 |
420 | ## Solution
421 |
422 |
423 | Show Solution
424 |
425 | ::: {.callout-note}
426 | # Solution
427 |
428 | 1. No: `pwd` is not the name of a directory.
429 | 2. Yes: `ls` without directory argument lists files and directories
430 | in the current directory.
431 | 3. Yes: uses the absolute path explicitly.
432 |
433 | :::
434 |
435 |
436 |
437 | ---
438 |
439 | ## General Syntax of Shell Commands {.smaller}
440 |
441 | Let's take as example this command:
442 |
443 | ```bash
444 | $ ls -F /
445 | ```
446 |
447 | 
448 |
449 | * The space between `ls` and whatever options you put is important.
450 | * Capitalization is important. `ls -s` is not the same as `ls -S`:
451 |
452 | ```bash
453 | $ cd ~/shell-lesson-data
454 | $ ls -s exercise-data # size
455 | $ ls -S exercise-data # sort by size
456 | ```
457 |
458 | ---
459 |
460 | ## Nelle's Pipeline and Tab Completion
461 |
462 | 1. Nelle organized the output of the assay machine into `north-pacific-gyre/`. let's go there.
463 | ```bash
464 | $ cd ~/shell-lesson-data/
465 | $ cd north-pacific-gyre
466 | ```
467 | 2. Now `north-pacific-gyre` is a mouthful to write. try instead to type `cd n` and hit the TAB key.
468 | 3. hitting TAB twice without any leading character, gives you a list of files in `pwd`.
469 |
470 |
471 | ---
472 |
473 | ::: {.callout-tip}
474 |
475 | # Key Points
476 |
477 | - The file system is responsible for managing information on the disk.
478 | - Information is stored in files, which are stored in directories (folders).
479 | - Directories can also store other directories, which then form a directory tree.
480 | - `pwd` prints the user's current working directory.
481 | - `ls [path]` prints a listing of a specific file or directory; `ls` on its own lists the current working directory.
482 | - `cd [path]` changes the current working directory.
483 | - Most commands take options that begin with a single `-`.
484 | - Directory names in a path are separated with `/` on Unix, but `\` on Windows.
485 | - `/` on its own is the root directory of the whole file system.
486 | - An absolute path specifies a location from the root of the file system.
487 | - A relative path specifies a location starting from the current location.
488 | - `.` on its own means 'the current directory'; `..` means 'the directory above the current one'.
489 |
490 | :::
--------------------------------------------------------------------------------
/03-filework.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Working with Files and Directories"
3 | format:
4 | revealjs:
5 | theme: _extensions/metropolis-theme/metropolis.scss
6 | chalkboard: true
7 | logo: /images/ScPo-logo.png
8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
9 | incremental: false
10 | code-line-numbers: false
11 | highlight-style: github
12 | author: Florian Oswald and The Software Carpentry
13 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
14 | date: today
15 | date-format: "D MMMM, YYYY"
16 | ---
17 |
18 |
19 | ## Intro
20 |
21 | ::: {.callout-note}
22 |
23 | # Questions
24 |
25 | - How can I create, copy, and delete files and directories?
26 | - How can I edit files?
27 |
28 | :::
29 |
30 | ::: {.callout-tip}
31 |
32 | # Objectives
33 |
34 | - Create a directory hierarchy that matches a given diagram.
35 | - Create files in that hierarchy using an editor or by copying and renaming existing files.
36 | - Delete, copy and move specified files and/or directories.
37 |
38 | :::
39 |
40 | ---
41 |
42 | ## Creating Directories
43 |
44 | * Let's create a directory `thesis` here:
45 | ```bash
46 | $ cd ~/shell-lesson-data/exercise-data/writing
47 | $ ls -F
48 | ```
49 | this outputs:
50 |
51 | ```
52 | haiku.txt LittleWomen.txt
53 | ```
54 |
55 | * Use `mkdir` to create:
56 | ```bash
57 | $ mkdir thesis
58 | ```
59 |
60 | Used like this, `thesis` is created in the current directory. With the `-p` flag, we can create nested subdirectories in one go:
61 |
62 | ```bash
63 | $ mkdir -p ../project/data ../project/results
64 | ```
65 |
66 | ---
67 |
68 | ## Good File Names
69 |
70 |
71 | ::: {.callout-warning}
72 |
73 | # Bad File Names
74 |
75 | 1. Don't use spaces. Spaces don't work well in Unix file names. `north pacific gyre` is not a good one. Use `north-pacific-gyre` instead.
76 | 2. Don't begin with `-`.
77 | 3. Stick with letters, numbers, `.`, `-`, and `_`
78 |
79 | :::
80 |
81 | ---
82 |
83 | ## Creating a Text File
84 |
85 | * Let's go into the `thesis` directory and create a text file called `draft.txt`.
86 |
87 | ```bash
88 | $ cd thesis
89 | $ nano draft.txt
90 | ```
91 |
92 | ::: {.callout-note}
93 |
94 | # TEXT Editor
95 |
96 | `nano` is a super simple editor, and you can use it *only* to edit text files (That's normal for *text editors* 😉). You will probably switch to a more powerful editor later on (I recommend `VSCode`), but `nano` is a good starting point.
97 | Notice that `^` key is the `Ctrl` key, so `^X` means `Ctrl + X`.
98 | :::
99 |
100 |
101 | ---
102 |
103 | ## Filename Extensions
104 |
105 |
106 | ### Task
107 |
108 | 1. Go to your home directory: `cd`
109 | 2. create an *empty* file with the `touch` command:
110 |
111 | ```bash
112 | $ # this is a comment, by the way
113 | $ cd # so, going home.
114 | $ touch new_doc.pdf # creating an empty file.
115 | ```
116 |
117 | 3. Open your file browser and double click on `new_doc.pdf`. What is going to happen?
118 |
119 |
120 | ---
121 |
122 | * Ok, let's get rid of that file now.
123 | * use the `rm` command (more later)
124 | ```bash
125 | $ rm new_doc.pdf
126 | ```
127 | * Caution: whatever you `rm` is gone forever.
128 | * You can add `-i` *interactive* to be safe(r).
129 |
130 |
131 | ---
132 |
133 | ## Moving Files and Directories
134 |
135 | * Let's go back to the `writing` directory
136 | ```bash
137 | $ cd ~/shell-lesson-data/exercise-data/writing
138 | ```
139 | * Let's *rename* `draft.txt` to `quotes.txt` with `mv`.
140 | ```bash
141 | $ mv thesis/draft.txt thesis/quotes.txt
142 | ```
143 | * Now let's actually *move* it into the current dir:
144 | ```bash
145 | $ mv thesis/quotes.txt .
146 | ```
147 | * Notice: `mv x y` means `x` is gone afterwards!
148 |
149 | ---
150 |
151 | ::: {.callout-caution}
152 |
153 | # Challenge
154 |
155 | Jamie placed the `maltose.dat` and `sucrose.dat` files in the `analyzed` folder by mistake. He wants to move those back to the `raw` folder now:
156 |
157 | ```bash
158 | $ ls -F
159 | analyzed/ raw/
160 | $ ls -F analyzed
161 | fructose.dat glucose.dat maltose.dat sucrose.dat
162 | $ cd analyzed
163 | ```
164 | What has to go in the blanks to achieve this?
165 |
166 | ```bash
167 | $ mv sucrose.dat maltose.dat ____/____
168 | ```
169 | :::
170 |
171 |
172 | Show Solution
173 |
174 | ::: {.callout-note}
175 | # Solution
176 |
177 | ```bash
178 | $ mv sucrose.dat maltose.dat ../raw
179 | ```
180 |
181 | :::
182 |
183 |
184 |
185 | ---
186 |
187 | ## Copying Files and Directories
188 |
189 | * `cp x y` is similar to `mv x y`, but you keep `x`.
190 | ```bash
191 | $ cp quotes.txt thesis/quotations.txt
192 | $ ls quotes.txt thesis/quotations.txt
193 | ```
194 |
195 | * the `-r` option means *recursively* and copies entire folders:
196 | ```bash
197 | $ cp -r thesis thesis_backup
198 | $ ls thesis thesis_backup
199 | ```
200 |
201 | * Notice that `rm -r mydir` will delete everything inside the `mydir` folder!
202 |
203 |
204 | ---
205 |
206 | ## Using *Wildcards*
207 |
208 | * the `*` character is a *wildcard*, i.e. it matches zero or more characters:
209 | ```bash
210 | $ cd ~/shell-lesson-data/exercise-data/
211 | $ ls proteins/p*
212 | proteins/pentane.pdb proteins/propane.pdb
213 | ```
214 |
215 |
216 | ---
217 |
218 | ## Reproducing a Folder Structure
219 |
220 | Suppose we want to create the following structure on our computer:
221 |
222 | ```bash
223 | 2016-05-20/
224 | └── data
225 | ├── processed
226 | └── raw
227 | ```
228 |
229 | ::: {.callout-caution}
230 | # Challenge
231 |
232 | Which sequence will achieve this result?
233 |
234 | ```
235 | 1.
236 | $ mkdir 2016-05-20
237 | $ mkdir 2016-05-20/data
238 | $ mkdir 2016-05-20/data/processed
239 | $ mkdir 2016-05-20/data/raw
240 | ```
241 |
242 | ```
243 | 2.
244 | $ mkdir 2016-05-20/data/raw
245 | $ mkdir 2016-05-20/data/processed
246 | ```
247 |
248 | ```
249 | 3.
250 | $ mkdir -p 2016-05-20/data/raw
251 | $ mkdir -p 2016-05-20/data/processed
252 | ```
253 |
254 | :::
255 |
256 |
257 | ---
258 |
259 | ## Nice Trick
260 |
261 | * Oh by the way.
262 | * If you are on macOS, try this on the command line:
263 |
264 | ```bash
265 | $ open .
266 | ```
267 |
268 | * Pretty handy!
269 |
270 | ---
271 |
272 | ::: {.callout-tip}
273 |
274 | - `cp [old] [new]` copies a file.
275 | - `mkdir [path]` creates a new directory.
276 | - `mv [old] [new]` moves (renames) a file or directory.
277 | - `rm [path]` removes (deletes) a file.
278 | - `*` matches zero or more characters in a filename, so `*.txt` matches all files ending in `.txt`.
279 | - `?` matches any single character in a filename, so `?.txt` matches `a.txt` but not `any.txt`.
280 | - Use of the Control key may be described in many ways, including `Ctrl-X`, `Control-X`, and `^X`.
281 | - The shell does not have a trash bin: once something is deleted, it's really gone.
282 | - Most files' names are `something.extension`. The extension isn't required, and doesn't guarantee anything, but is normally used to indicate the type of data in the file.
283 | - Depending on the type of work you do, you may need a more powerful text editor than Nano.
284 |
285 | :::
286 |
287 |
--------------------------------------------------------------------------------
/04-pipes.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pipes and Filters"
3 | format:
4 | revealjs:
5 | theme: _extensions/metropolis-theme/metropolis.scss
6 | chalkboard: true
7 | logo: /images/ScPo-logo.png
8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
9 | incremental: false
10 | code-line-numbers: false
11 | highlight-style: github
12 | author: Florian Oswald and The Software Carpentry
13 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
14 | date: today
15 | date-format: "D MMMM, YYYY"
16 | ---
17 |
18 |
19 | ## Combining Commands
20 |
21 | * We are now ready to combine some of the commands we learned.
22 |
23 | * You will see that here is where the real power lies.
24 |
25 | * Let's navigate into our exercise data folder first.
26 |
27 | ```bash
28 | $ cd ~/shell-lesson-data/exercise-data/proteins
29 | $ ls
30 | cubane.pdb ethane.pdb methane.pdb octane.pdb pentane.pdb propane.pdb
31 | ```
32 |
33 | * Those are *protein data bank* files.
34 |
35 | ---
36 |
37 | ## Capturing Output
38 |
39 | * Introducing the `wc` word count command.
40 |
41 | ```bash
42 | wc cubane.pdb
43 | 20 156 1158 cubane.pdb
44 | ```
45 | 20 lines, 156 words, 1158 characters.
46 |
47 | * Let's **redirect** the output of `wc` to a file instead with `>`:
48 |
49 | ```bash
50 | wc -l *.pdb > lengths.txt
51 | ```
52 |
53 | * no output on screen, you see? But now there is a new file: `lengths.txt`.
54 | * Let's *concatenate* its content (i.e. join together) and print to screen:
55 | ```bash
56 | $ cat lengths.txt
57 | 20 cubane.pdb
58 | 12 ethane.pdb
59 | 9 methane.pdb
60 | 30 octane.pdb
61 | 21 pentane.pdb
62 | 15 propane.pdb
63 | 107 total
64 | ```
65 |
66 | ---
67 |
68 | ## Reading Text Files
69 |
70 | * `cat` prints the entire thing to screen.
71 | * `tail` only the end
72 | * `head` only the beginning
73 | * `less` lets you scroll and read (arrows up/down or `j` (down) and `k` (up); `q` exits).
74 |
75 | ```bash
76 | $ head -n 3 ../animal-counts/animals.csv
77 | $ tail -n 2 ../animal-counts/animals.csv
78 | $ less ../../north-pacific-gyre/NENE01729A.txt
79 | ```
80 |
81 |
82 | ---
83 |
84 | ## Printing Text with `echo`
85 |
86 | * The `echo` function prints text - by default to screen:
87 | ```bash
88 | $ echo hi
89 | hi
90 | ```
91 |
92 | * But you can redirect it to a file as well:
93 | ```bash
94 | $ echo I said hi! > echofile1.txt
95 | ```
96 |
97 | ::: {.callout-important}
98 |
99 | # Challenge
100 |
101 | Do 2 times in a row:
102 | ```bash
103 | $ echo I said hi! > echofile1.txt
104 | ```
105 |
106 | Now do twice (notice `>>`!)
107 | ```bash
108 | $ echo I said hi! >> echofile2.txt
109 | ```
110 | * What's happening?
111 | :::
112 |
113 |
114 | ---
115 |
116 | ## Appending to Files
117 |
118 | ::: {.callout-important}
119 |
120 | # Challenge
121 |
122 | Consider the file `shell-lesson-data/exercise-data/animal-counts/animals.csv`. What is the result of this:
123 |
124 | ```bash
125 | $ head -n 3 animals.csv > animals-subset.csv
126 | $ tail -n 2 animals.csv >> animals-subset.csv
127 | ```
128 |
129 | 1. The first three lines of animals.csv?
130 | 2. The last two lines of animals.csv?
131 | 3. The first three lines and the last two lines of animals.csv?
132 | 4. The second and third lines of animals.csv?
133 | :::
134 |
135 |
136 |
137 | Show Solution
138 |
139 | ::: {.callout-note}
140 | # Solution
141 | Option 3 is correct.
142 | :::
143 |
144 |
145 |
146 | ---
147 |
148 | ## Filtering Files with `sort`
149 |
150 | * `sort` reads a file and prints its *sorted* content to the screen
151 | * it does not change the file.
152 |
153 | ```bash
154 | $ sort -n lengths.txt
155 | ```
156 |
157 | ```
158 | 9 methane.pdb
159 | 12 ethane.pdb
160 | 15 propane.pdb
161 | 20 cubane.pdb
162 | 21 pentane.pdb
163 | 30 octane.pdb
164 | 107 total
165 | ```
166 |
167 | ---
168 |
169 | ## Filtering Files and using the result
170 |
171 | * Cool 😎 but now we want to use this list.
172 | * Could save it to a new file?
173 |
174 | ```bash
175 | $ sort -n lengths.txt > sorted_lengths.txt
176 | $ head -n 2 sorted_lengths.txt
177 | ```
178 |
179 | ```
180 | 9 methane.pdb
181 | 12 ethane.pdb
182 | ```
183 |
184 | ## Filtering Files and **the pipe**
185 |
186 | * We call `|` the pipe. It takes output from a command and gives it to another command.
187 | * Modern languages use their own version of this (R has a package and now also a native pipe, julia has of course a pipe etc. Stata not sure 😜)
188 | * The **pipe** allows us to do this *without* storing intermediate results.
189 |
190 | ```bash
191 | $ sort -n lengths.txt | head -n 1
192 | ```
193 |
194 | ```
195 | 9 methane.pdb
196 | ```
197 |
198 | * But, wait 🤔. Then we don't even need `lengths.txt`:
199 |
200 | ```bash
201 | $ wc -l *.pdb | sort -n | head -n 1
202 | ```
203 |
204 | ```
205 | 9 methane.pdb
206 | ```
207 |
208 | * That's a *pipeline*. 🤯
209 |
210 |
211 | ---
212 |
213 |
214 | ## Piping Away
215 |
216 | * Make sure we are still in `~/shell-lesson-data/exercise-data/proteins`
217 |
218 |
219 | :::{.callout-important}
220 |
221 | # Pipe Dreams
222 |
223 | Which of the following commands shows us the 3 files with the least number of lines in the current directory? Build the pipeline up from left to right to check!
224 |
225 | 1. `wc -l * > sort -n > head -n 3`
226 | 2. `wc -l * | sort -n | head -n 1-3`
227 | 3. `wc -l * | sort -n | tail -n 4 | head -n 3`
228 | 4. `wc -l * | sort -n | head -n 3`
229 |
230 | :::
231 |
232 | ---
233 |
234 | ## Piping Away
235 |
236 | * Make sure we are still in `~/shell-lesson-data/exercise-data/proteins`
237 |
238 |
239 |
240 |
241 | Show Solution
242 |
243 | ::: {.callout-note}
244 | # Solution
245 | Option 4 is correct. Option 3 finds the ones with *most* lines.
246 | :::
247 |
248 |
249 |
250 | ---
251 |
252 | ## Cutting and Piping
253 |
254 | * We have a `.csv` file here: `shell-lesson-data/exercise-data/animal-counts`
255 | * Let's use the `cut` command to get parts of it.
256 |
257 | ```bash
258 | $ cd ~/shell-lesson-data/exercise-data/animal-counts
259 | $ cut -d , -f 2 animals.csv
260 | ```
261 |
262 | ::: {.callout-important}
263 |
264 | # Building a Pipe
265 |
266 | * `uniq` filters **adjacent** matching lines in a file.
267 | * Can you extend the above command with `uniq` (and another command?) such that we get the list of unique animal names?
268 | * Add the `-c` flag to `uniq` to get a contingency table.
269 |
270 | :::
271 |
272 | ---
273 |
274 | ## Building a Pipe
275 |
276 |
277 |
278 | Show Solution
279 |
280 | ::: {.callout-note}
281 | # Solution
282 | 1. `cut -d , -f 2 animals.csv | sort | uniq`
283 | 1. `cut -d , -f 2 animals.csv | sort | uniq -c`
284 | :::
285 |
286 |
287 |
288 | ---
289 |
290 | ## House Prices in France
291 |
292 | The below dataset contains information on house sales (price, location, type of house etc). We call one record a _housing transaction_.
293 |
294 | Using the shell:
295 |
296 | 1. Use `wget` to download the data [from here](https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20240408-125738/valeursfoncieres-2023.txt) into your downloads folder (it will be saved as `valeursfoncieres-2023.txt`): `wget https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20240408-125738/valeursfoncieres-2023.txt`
297 | 2. use `wc -l` to count how many rows (*lines*) there are
298 | 3. use `head -n 2` to see the first two rows (the *header*)
299 | 4. Use the above solution to build a contingency table that tells us the number of housing transactions per *commune*. Show the 10 cities with most housing transactions.
300 | 5. Compute the average of variable `Valeur fonciere`. You should use the `awk` command like this: `awk 'BEGIN{s=0;}{s+=$1;}END{print s/(NR);}' your_file.txt`
301 |
302 | ---
303 |
304 | ## House Prices in France
305 |
306 | The below dataset contains information on house sales (price, location, type of house etc). We call one record a _housing transaction_.
307 |
308 | Using the shell:
309 |
310 | 5. Compute the average of variable `Valeur fonciere`. You should use the `awk` command like this: `awk 'BEGIN{s=0;}{s+=$1;}END{print s/(NR);}' your_file.txt`
311 |
312 | ---
313 |
314 | ## Real Data
315 |
316 |
317 | Show Solution
318 |
319 | 1. `wget https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20240408-125738/valeursfoncieres-2023.txt`
320 | 2. `wc -l valeursfoncieres-2023.txt`
321 | 3. `head -n 2 valeursfoncieres-2023.txt`
322 | 4. `cut -d '|' -f 18 valeursfoncieres-2023.txt | sort | uniq -c | sort -r | head -n 10 `
323 | 5. `cut -d '|' -f 11 valeursfoncieres-2023.txt | cut -d , -f 1 | awk 'BEGIN{s=0;}{s+=$1;}END{print s/(NR);}' `
324 | 6. If you have R installed you can check whether this is the same as reading this column into it (it's not! odd.)
325 |
326 | ```bash
327 | Rscript -e 'x = data.table::fread(cmd = "cut -d \"|\" -f 11 valeursfoncieres-2023.txt | cut -d , -f 1"); x[, mean(`Valeur fonciere`,na.rm = TRUE)]'
328 | ```
329 |
330 |
331 |
332 |
333 |
334 |
335 |
--------------------------------------------------------------------------------
/05-git.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Version Control with `Git`"
3 | format:
4 | revealjs:
5 | theme: _extensions/metropolis-theme/metropolis.scss
6 | chalkboard: true
7 | logo: /images/ScPo-logo.png
8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
9 | incremental: false
10 | code-line-numbers: false
11 | highlight-style: github
12 | author: Florian Oswald and The Software Carpentry
13 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
14 | date: today
15 | date-format: "D MMMM, YYYY"
16 | ---
17 |
18 | ## Version What?
19 |
20 |
21 | ::::: {.callout-note}
22 | # Question
23 |
24 | * What is Version Control and Why Should I Care?
25 | :::::
26 |
27 | ::::: {.callout-tip}
28 | # Objectives
29 | - Understand the benefits of an automated version control system.
30 | - Understand the basics of how automated version control systems work.
31 | ::::::
32 |
33 | ---
34 |
35 | ## Final.doc
36 |
37 | 
38 |
39 |
40 |
41 | ---
42 |
43 | ## Undo
44 |
45 | * The *latest version* is often best for text documents.
46 | * However, sometimes our view of *best* evolves. Then, we want to *undo*.
47 | * *Undo* means going back in history.
48 |
49 | . . .
50 |
51 | * MS Word etc have *track changes* features.
52 | * Once you accepted the proposed changes of a collaborator, can you go back?
53 | * What about Dropbox-like solutions? (What *is* dropbox actually?)
54 |
55 | ---
56 |
57 | ## Which Version: 20210611_draft.tex
58 |
59 | :::: {.columns}
60 |
61 | ::: {.column width="50%"}
62 |
63 | > Research team 👇 orders files by YYYYMMDD.
64 |
65 | * *Hey, fixed that thing last week.*
66 | * In `20220629-paper.tex`?
67 | * *Erm. Yes. No. I think `20211203-paper.tex` - messed up the file name.*
68 | * Ok, can you copy it into the latest version?
69 | * *Sure. Damn, can't find it anymore. I'll just write it again. All in my head.* 🤯
70 | :::
71 |
72 | ::: {.column width="40%"}
73 | 
74 | :::
75 |
76 | ::::
77 |
78 |
79 | ---
80 |
81 | ## Which Version 2: **Why is the sample size so small suddenly**?
82 |
83 |
84 | ::::: {.columns}
85 |
86 | :::: {.column width="40%"}
87 |
88 | * We had 800 observations, now 733. Why?
89 | * Erm...😱 No clue!
90 | * Well you must have changed the code.
91 | * Yes, I *improved* the code in several parts.
92 | * Well you have to find out what happened.
93 | * But that was weeks ago - I don't remember! 😢
94 | ::::
95 |
96 |
97 |
98 | :::: {.column width="50%"}
99 |
100 |
101 | ### Hard Bugs
102 |
103 | * The hard bugs 🐛 are the ones you see only after a while.
104 | * See result today, error was introduced long ago.
105 | * You can rewind dropbox 30 days. What if... ?
106 | * Also, throw away 30 days of work?
107 | * 😱 😱 😱 😱
108 |
109 | ::::
110 |
111 |
112 |
113 | :::::
114 |
115 |
116 | ---
117 |
118 | ## {background-image="./images/removed-that.png" background-size=100%}
119 |
120 |
121 |
122 | ---
123 |
124 |
125 | ## Setting Up Git
126 |
127 | * We all installed `git`.
128 | * Let's set up our name and email:
129 |
130 | ```bash
131 | $ git config --global user.name "Your Name"
132 | $ git config --global user.email "your@mail.com"
133 | ```
134 |
135 | * Line Endings on Windows:
136 |
137 | ```bash
138 | git config --global core.autocrlf false
139 | ```
140 |
141 | ---
142 |
143 |
144 | ## Creating a Git **Repository**
145 |
146 |
147 | ::::: {.callout-note}
148 | # Question
149 |
150 | * Where does Git store information?
151 |
152 | :::::
153 |
154 | ::::: {.callout-tip}
155 | # Objectives
156 | - Create a local repository
157 | - Describe purpose of `.git` directory
158 | ::::::
159 |
160 |
161 | ---
162 |
163 | ## House Prices Project
164 |
165 | * Let's create a project folder in our home to look at the house prices from last week.
166 |
167 | ```bash
168 | $ cd # going to home dir
169 | $ mkdir houseprices # create directory
170 | $ cd houseprices
171 | $ git init
172 | ```
173 |
174 | * Now the directory `~/houseprices` is endowed with `git` version control.
175 | * What does that look like?
176 |
177 |
178 | ---
179 |
180 | ## Where is Git?
181 |
182 | * Remember *hidden files* and folders?
183 |
184 | ```bash
185 | $ ls -a
186 | ./ ../ .git/
187 | ```
188 |
189 | * Git for this repository resides in `.git`
190 |
191 | ::: {.callout-warning}
192 |
193 | # Danger Zone
194 |
195 | * If you _delete_ that folder, the entire version control is GONE.
196 | * Be very careful that you really want to do that.
197 |
198 | :::
199 |
200 |
201 | ---
202 |
203 |
204 | ## Tracking Changes with Git
205 |
206 |
207 |
208 | ::::: {.callout-note}
209 | # Question
210 |
211 | * How do I record changes in Git?
212 | * How do I check the status of my version control repository?
213 | * How do I record notes about what changes I made and why?
214 | :::::
215 |
216 | ::::: {.callout-tip}
217 | # Objectives
218 | - Understand the benefits of an automated version control system.
219 | - Understand the basics of how automated version control systems work.
220 | ::::::
221 |
222 |
223 |
224 |
225 | ---
226 |
227 | ## Adding Code and Text
228 |
229 | ::: {.callout-note}
230 |
231 | * Notice: The code we produce **is** text.
232 | * Remember what we learned about **file endings**.
233 |
234 | :::
235 |
236 | * Let's add a shell script where we add our pipeline from last week.
237 |
238 | 1. run to get the raw data again:
239 | ```bash
240 | wget https://static.data.gouv.fr/resources/demandes-de-valeurs-foncieres/20240408-125738/valeursfoncieres-2023.txt
241 | ```
242 |
243 | ---
244 |
245 | ## Adding Code and Text
246 |
247 |
248 | 2. create a script
249 |
250 | ```bash
251 | nano maketable.sh # open nano
252 | # type this:
253 | cd ~/houseprices # make sure we are in the right place
254 | cut -d '|' -f 18 valeursfoncieres-2023.txt | sort | uniq -c | sort -r | head -n 10
255 | # save and exit
256 | ```
257 |
258 | 3. (Does it work?)
259 | ```bash
260 | ls . # check the new file is there
261 | ./maketable.sh # run it!
262 | ```
263 |
264 | . . .
265 |
266 | 4. No, it doesn't. 😖
267 | ```bash
268 | chmod +x ./maketable.sh # add executable mode
269 | ls -a
270 | ./maketable.sh
271 | ```
272 |
273 |
274 |
275 | ---
276 |
277 | ## Viewing Changes
278 |
279 | * Ok, now let's see what `git` makes of our additions to this directory.
280 |
281 | ```tcl
282 | floswald@PTL11077 ~/houseprices (main)> git status
283 | On branch main
284 |
285 | No commits yet
286 |
287 | Untracked files:
288 |   (use "git add <file>..." to include in what will be committed)
289 | valeursfoncieres-2023.txt
290 | maketable.sh
291 | ```
292 |
293 | * It is actually helpful **not** to use `bash` as a shell...
294 | * Customizing your shell is an extremely effective procrastination device.
295 | * You must know what [*shaving a Yak*](https://projects.csail.mit.edu/gsb/old-archive/gsb-archive/gsb2000-02-11.html) means before you walk out of my class.
296 |
297 |
298 | ---
299 |
300 | ## Seeing the Difference
301 |
302 | * the command `git diff` shows you what changed between versions.
303 | * lets see what it shows now:
304 |
305 | ```bash
306 | $ git diff
307 | ```
308 |
309 | * It shows nothing, i.e. an _empty_ diff, because _there are no commits yet_ to compare with.
310 | * Ok, let's change that.
311 |
312 |
313 | ---
314 |
315 | ## Modify-Add-Commit 1
316 |
317 | * git reports about _untracked files_. We need to decide *what to track*.
318 |
319 | 1. Move files to *staging area*:
320 | ```bash
321 | git add maketable.sh
322 | git status
323 | ```
324 |
325 | * Notice that I did *not* want to track the data file `valeursfoncieres-2023.txt`.
326 |
327 | ```bash
328 | On branch main
329 |
330 | No commits yet
331 |
332 | Changes to be committed:
333 |   (use "git rm --cached <file>..." to unstage)
334 | new file: maketable.sh
335 |
336 | Untracked files:
337 |   (use "git add <file>..." to include in what will be committed)
338 | valeursfoncieres-2023.txt
339 | ```
340 |
341 |
342 | ---
343 |
344 | ## Modify-Add-Commit 2
345 |
346 | * Now, let's *record* what is in the staging area.
347 |
348 | ```bash
349 | $ git commit -m 'added the maketable script'
350 |
351 | [main (root-commit) 9956506] added the maketable script
352 | 1 file changed, 2 insertions(+)
353 | create mode 100644 maketable.sh
354 | ```
355 |
356 | * check status:
357 |
358 | ```bash
359 | $ git status
360 |
361 | On branch main
362 | Untracked files:
363 |   (use "git add <file>..." to include in what will be committed)
364 | valeursfoncieres-2023.txt
365 |
366 | nothing added to commit but untracked files present (use "git add" to track)
367 | ➜ houseprices git:(main) ✗
368 | ```
369 |
370 | ---
371 |
372 | ## Modify-Add-Commit 3
373 |
374 | * Let's check what's in the log.
375 |
376 | ```bash
377 | $ git log
378 |
379 | commit 9956506dc3159403b87aea3b04654c293e82c680 (HEAD -> main)
380 | Author: Florian Oswald
381 | Date: Tue Feb 7 10:50:51 2023 +0100
382 |
383 | added the maketable script
384 | ```
385 |
386 | ---
387 |
388 | ## Modify-Add-Commit 4
389 |
390 | * Now let's _modify_ the script finally.
391 |
392 | ```bash
393 | $ nano maketable.sh
394 |
395 | # add this line on top
396 | echo hello user, will make a contigency table now.
397 | # save and exit
398 | ```
399 |
400 | * now - what's the difference in the repo?
401 |
402 | ---
403 |
404 | ## Diffing
405 |
406 | * there are still the same files here:
407 |
408 | ```bash
409 | $ ls
410 | valeursfoncieres-2023.txt maketable.sh
411 | ```
412 |
413 | * But we can now _compare_ versions:
414 |
415 | ```bash
416 | $ git diff
417 |
418 | diff --git a/maketable.sh b/maketable.sh
419 | index 7e01058..3b7007e 100644
420 | --- a/maketable.sh
421 | +++ b/maketable.sh
422 | @@ -1,2 +1,3 @@
423 | +echo hello user, will make a contigency table now.
424 |  cd ~/houseprices # make sure we are in the right place
425 |  cut -d '|' -f 18 valeursfoncieres-2023.txt | sort | uniq -c | sort -r | head -n 10
426 | ```
427 |
428 |
429 | ---
430 |
431 | ## Committing Changes Again
432 |
433 | * let's first check everything runs
434 |
435 | ```bash
436 | $ ./maketable.sh
437 | ```
438 |
439 | * good. commit!
440 | ```bash
441 | $ git add maketable.sh
442 | $ git commit -m 'added message to user'
443 | ```
444 |
445 |
446 | ---
447 |
448 | ## Adding a README
449 |
450 | * Good. Now let's add a `README` file.
451 | * It's customary to write this in [markdown](https://carpentries-incubator.github.io/markdown-intro/)
452 |
453 | ```bash
454 | $ nano README.md
455 | ```
456 | write this in nano and save when done.
457 |
458 | ```md
459 | # House Prices
460 |
461 | This repo contains code to analyse housing transactions (house prices) in France.
462 | ```
463 |
464 | * add to staging area, so we can take a snapshot
465 |
466 | ```bash
467 | $ git add README.md
468 | $ git commit -m 'added readme'
469 | ```
470 |
471 | ---
472 |
473 | ## What is this Staging Area?
474 |
475 | * `git` is like a photographic camera.
476 | * before you take a picture of your friends, you need to arrange them somehow, so that all fit, and so that all 😁.
477 | * You put them _on stage_. Same for files in your repo.
478 |
479 | ![figure from [software carpentry]()](/images/git-staging-area.svg)
480 |
481 |
482 | ---
483 |
484 | ## What is this Staging Area?
485 |
486 |
487 | 
488 |
489 |
490 | ---
491 |
492 | ## Looking at History
493 |
494 |
495 |
496 |
497 | ::::: {.callout-note}
498 | # Question
499 | * How can I identify old versions of files?
500 | * How do I review my changes?
501 | * How can I recover old versions of files?
502 |
503 | :::::
504 |
505 | ::::: {.callout-tip}
506 | # Objectives
507 |
508 |
509 | * Explain what the HEAD of a repository is and how to use it.
510 | * Identify and use Git commit numbers.
511 | * Compare various versions of tracked files.
512 | * Restore old versions of files.
513 | ::::::
514 |
515 |
516 | ---
517 |
518 | ## The most recent version: HEAD
519 |
520 | * Let's change the `maketable.sh` script again:
521 | ```bash
522 | $ nano maketable.sh
523 | echo program run successfully
524 | # save exit
525 |
526 | $ git add maketable.sh
527 | ```
528 |
529 | * The most recent version of our repo is called `HEAD`.
530 | ```bash
531 | $ git diff # compares working directory to the staging area (empty now: we just staged our change)
532 | $ git diff HEAD maketable.sh
533 | ```
534 |
535 | ---
536 |
537 | ## Whoops, typo
538 |
539 | * Oh no, we wrote _program run successfully_. That should be _ran_ not _run_.
540 | * What now?
541 |
542 | . . .
543 |
544 | * we have not committed this yet!
545 | * we can just get back the version in HEAD, and edit again:
546 |
547 | ```bash
548 | $ git restore --staged --worktree maketable.sh # undo both the staging (git add) and the edit
549 | $ git checkout HEAD maketable.sh # also works
550 | ```
551 |
552 | * edit the script, add and commit.
553 |
554 | ---
555 |
556 | ## How to get a *specific* version
557 |
558 | * What if you want something else than `HEAD`?
559 | * like, the first version of `maketable.sh`?
560 | * look at history:
561 |
562 | ```bash
563 | $ git log --oneline --graph
564 |
565 | * a6f023b (HEAD -> main) added readme
566 | * 9956506 added the maketable script
567 | ```
568 |
569 | * The `9956506` is the unique identifier of that version.
570 | * We can go back to that version:
571 |
572 | ```bash
573 | $ git checkout 9956506 maketable.sh
574 | ```
575 |
576 |
577 | ---
578 |
579 |
580 | ::: {.callout-tip}
581 |
582 | # Key Points
583 |
584 | * `git diff` displays differences between commits.
585 | * `git checkout` recovers old versions of files.
586 | :::
587 |
588 | ---
589 |
590 | ## So, how does this thing work?
591 |
592 | 
593 |
594 |
595 | ---
596 |
597 | ## Version Control with VScode
598 |
599 | * Download [Visual Studio Code](https://code.visualstudio.com/)
600 | * Start
601 | * Open folder `~/houseprices`
602 | * check version control tab on the left.
603 |
604 |
605 |
606 | ---
607 |
608 | ## Version Control with RStudio
609 |
610 | * top right click on *new project*
611 | * Select *existing directory*
612 | * Select `~/houseprices`
613 | * check out the `git` tab in RStudio!
614 |
615 |
616 | ---
617 |
618 | ## Collaborating with Git on GitHub
619 |
620 | * Create repo
621 | * copy ssh remote URL
622 | * connect local to remote repo
623 |
624 | ---
625 |
626 | ## SSH connections
627 |
628 | * Secure Shell Protocol
629 | * Private-Public key pair. It's like a lock, and you have the only key.
630 | * Let's check if you have one already!
631 |
632 | ```bash
633 | ls -la ~/.ssh
634 | ```
635 |
636 | if error, create one:
637 |
638 | ```bash
639 | ssh-keygen -t ed25519 -C "your@email.com"
640 | ```
641 | press enter (no passphrase)
642 |
643 | check
644 | ```bash
645 | ls -la ~/.ssh
646 | ```
647 |
648 | ---
649 |
650 | ## Communicate with GitHub Remote
651 |
652 | * Let's ping the remote server at GitHub now.
653 |
654 | ```bash
655 | ssh -T git@github.com
656 | ```
657 |
658 | * right, of course Github doesn't have our public key yet (the _lock_ for our key!)
659 |
660 | * copy from your terminal
661 | ```bash
662 | cat ~/.ssh/id_ed25519.pub # or your *.pub
663 | ```
664 |
665 | * Go to github.com, click top right corner, settings, SSH keys.
666 |
667 |
668 | ---
669 |
670 | ## Adding a Remote to your local Repo
671 |
672 | * Now that we can talk to Github.com, let's add the remote to our local repo.
673 | * We `add` a remote by getting the `SSH` url from the repository (green button) online.
674 |
675 | ```bash
676 | $ git remote add origin git@github.com:YOUR_USER/YOUR_REPO.git
677 | ```
678 | * `origin` is the _name_ of the remote server. your choice, but _origin_ is common.
679 | * this should set that remote both for sending and retrieving stuff from the repo. _pull_ and _push_, in git language:
680 |
681 | ```bash
682 | $ git remote --v
683 | ```
684 |
685 |
686 |
687 | ---
688 |
689 | ## Pushing It
690 |
691 | * Now we can _push_ our local repository to the remote repo.
692 | * There will be a full copy of what is in `.git` (i.e., the entire history of the repo) on that remote machine.
693 | * You will be able to use it like a central backup location for your work.
694 |
695 |
696 | ```bash
697 | $ git push -u origin main
698 | ```
699 |
700 | * the `-u` flag sets the _main_ branch as default _upstream_ branch to track.
701 |
702 |
703 | ---
704 |
705 | ## Branching It
706 |
707 | * Next to different _versions_ of a file/directory _over time_, we can have versions evolving in parallel.
708 | * Imagine development history _branching_ off into 2 separate directions at one point.
709 | * They may converge at some point again, but maybe one of them will turn out a failure and we drop it.
710 | * Branches are hugely useful to organize team work.
711 |
712 | ```bash
713 | $ git checkout -b testing # checkout repo on new branch `testing`
714 | Switched to a new branch 'testing'
715 | ```
716 |
717 | * Now we can develop stuff on the `testing` branch.
718 | * Later on, we can `merge` it back into `main` if we like it.
719 |
720 |
721 | ---
722 |
723 | ## The Full Picture(s)
724 |
725 |  - click for more!](/images/git-images/full.png)
726 |
727 |
728 | ---
729 |
730 | ## Pushing Branches to GitHub
731 |
732 | * Once you created a local branch you can of course copy (_push_) it to your remote to share with others.
733 | * you would amend the push command:
734 |
735 | ```bash
736 | # make sure you are on the desired branch
737 | $ git branch
738 | main
739 | * testing
740 |
741 | $ git push origin testing
742 | ```
743 |
744 |
--------------------------------------------------------------------------------
/06-concepts.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Intro to Generic Programming"
3 | ---
4 |
5 |
6 | ```{r reticulate_config}
7 | #| cache: false
8 | #| include: false
9 | library(reticulate)
10 | use_condaenv("introprog")
11 | ```
12 |
13 | In this short lecture we introduce a few core concepts used in programming. We will be using both `R` and `python` as examples, however, the concepts are transversal across all/most languages. The implementation details - i.e. how do you _invoke_ a certain concept - will differ across languages.
14 |
15 | [How to install python](https://realpython.com/installing-python/).
16 | [Here](https://swcarpentry.github.io/python-novice-inflammation/) is a nice introduction for Python novices.
17 |
18 |
19 | ## Setup
20 |
21 | Ideally you would try to run all commands in the below for both languages. I recommend that you open two terminal windows, one running `R` and one running `python`. For python we need the `numpy` package to demonstrate array support. Depending on how you installed python, there are different options.
22 |
23 | * Anaconda installation: `conda install numpy`
24 | * Homebrew or download from python.org : `pip install numpy`
25 |
26 |
27 | Check whether the installation worked by doing
28 |
29 | ```{python}
30 | import numpy as np
31 | # loads the numpy library and gives it
32 | # short name `np`
33 | ```
34 |
35 |
36 | ## Variables
37 |
38 | Variables are labels for objects. This can be simple numbers, or strings, but often also any other sort of object you could think of: a plot, a table, a matrix, a vector, a list, ...
39 |
40 | What is curious to know about variables is their _scoping_ behaviour: where in our programs can we see which variable? This differs quite importantly across languages and is something that requires some thought.
41 |
42 | First, let's create a variable `x` which holds the value `12.3`:
43 |
44 | ::: {.panel-tabset group="language"}
45 |
46 | ### Python
47 |
48 | ```{python pvars}
49 | #| cache: false
50 | x = 12.3
51 | x + 5
52 | ```
53 |
54 | ### R
55 |
56 | ```{r rvars}
57 | #| cache: false
58 | x <- 12.3 # = works also
59 | x + 5
60 | ```
61 |
62 | :::
63 |
64 | Next, a *function* which will use the variables - here we do not provide `x` as an argument to the function, so which value will it use in each case?
65 |
66 | ::: {.panel-tabset group="language"}
67 |
68 | ### Python
69 |
70 | ```{python pf}
71 | #| cache: false
72 | def myfun(y):
73 | return x + y # must use `return`
74 | # note the indentation!
75 | # function definition finishes after last line of indented block.
76 |
77 | myfun(8)
78 | ```
79 |
80 | ### R
81 |
82 | ```{r rf}
83 | #| cache: false
84 | myfun <- function(y){
85 | x + y # can use `return()`
86 | }
87 | myfun(8)
88 | ```
89 |
90 | :::
91 |
92 | We see that in both cases, the function looked for the variable `x` in its _enclosing scope_, i.e. the environment where the function was defined (here the global environment, which is also where we called it from). This only worked because we had defined `x` before calling the function. This may or may not work in other languages. In general this is called [*lexical scoping*](https://www.gnu.org/software/guile/manual/html_node/Lexical-Scope.html).
93 |
94 | ## Loops
95 |
96 | If we have a repetitive task, it's useful to be able to _iterate_, i.e. do the same thing to a potentially changing input. Consider that we had 4 numbers `2,3,4,5` and we wanted to print them to screen. We could of course write 4 nearly identical `print` statements, each with a different input:
97 |
98 | ::: {.panel-tabset group="language"}
99 |
100 | ### Python
101 |
102 | ```{python ploop0}
103 | #| cache: false
104 | #| eval: false
105 | print("this is number",2)
106 | print("this is number",3)
107 | print("this is number",4)
108 | print("this is number",5)
109 | ```
110 |
111 | ### R
112 |
113 | ```{r rloop0}
114 | #| cache: false
115 | #| eval: false
116 | print(paste("this is number",2))
117 | print(paste("this is number",3))
118 | print(paste("this is number",4))
119 | print(paste("this is number",5))
120 | ```
121 |
122 | :::
123 |
124 | but you can see that this is a lot of repetitive code, which we want to avoid. Also, adding an additional number would mean a lot of extra work. So, loops are better here:
125 |
126 |
127 |
128 | ::: {.panel-tabset group="language"}
129 |
130 | ### Python
131 |
132 | ```{python ploop}
133 | #| cache: false
134 | for i in range(2,5) :
135 | print(f"this is number",i) # note the indentation!
136 | ```
137 |
138 | ### R
139 |
140 | ```{r rloop}
141 | #| cache: false
142 | for (i in 2:5){ # 2:5 includes both end points: 2,3,4,5
143 |   print(paste("this is number",i))
144 | }
145 | ```
146 |
147 | :::
148 |
149 | ## Useful Datastructures
150 |
151 | * python docs on [data structures](https://docs.python.org/3/tutorial/datastructures.html)
152 | * Article about [R datastructures](http://adv-r.had.co.nz/Data-structures.html)
153 |
154 | concept | Python | R
155 | -----| -------|-----
156 | 1d list | `[1,2]` | `c(1,2)`
157 | 1d vector | `np.array([1,2])` | `c(1,2)`
158 | matrix | `np.array([row, col])` | `matrix(data,rows,cols)`
159 | n-d array | `np.array` | `array`
160 | Dictionary | `dict` | `list`
161 | DataFrame | `pandas.DataFrame` | `data.frame`
162 |
163 | ### 1-D list/vector
164 |
165 | ::: {.panel-tabset group="language"}
166 |
167 | ### Python
168 |
169 | ```{python plist}
170 | #| cache: false
171 | li = [1,3]
172 | li + li # not well defined vector space with `+` and `*`
173 | ```
174 |
175 | ### R
176 |
177 | ```{r rlist}
178 | #| cache: false
179 | li = c(1,3)
180 | li * li # element-by-element
181 | li + li
182 | ```
183 |
184 | :::
185 |
186 | in python we use the `numpy` package for linear algebra:
187 |
188 |
189 | ::: {.panel-tabset group="language"}
190 |
191 | ### Python
192 |
193 | ```{python pnp}
194 | #| cache: false
195 | import numpy as np
196 | li = np.array([1,3])
197 | li * li
198 | li + li
199 | ```
200 |
201 | ### R
202 |
203 | ```{r}
204 | #| cache: false
205 | li = c(1,3)
206 | li * li # element-by-element
207 | li + li
208 | ```
209 |
210 | :::
211 |
212 | ### Matrices
213 |
214 |
215 | ::: {.panel-tabset group="language"}
216 |
217 | ### Python
218 |
219 | ```{python}
220 | #| cache: false
221 | import numpy as np
222 | ma = np.array([[1,3], [2,4]])
223 | ma * ma
224 | ma + ma
225 | ```
226 |
227 | ### R
228 |
229 | ```{r}
230 | #| cache: false
231 | ma = matrix(c(1,2,3,4),nrow = 2, ncol = 2)
232 | ma * ma # element-by-element
233 | ma + ma
234 | ```
235 |
236 | :::
237 |
238 | ### N-D arrays
239 |
240 | ::: {.panel-tabset group="language"}
241 |
242 | ### Python
243 |
244 | ```{python}
245 | #| cache: false
246 | a = np.arange(1,9)
247 | np.reshape(a, (2,2,2))
248 | ```
249 |
250 | ### R
251 |
252 | ```{r}
253 | #| cache: false
254 | array(1:8,dim = c(2,2,2))
255 | ```
256 |
257 | :::
258 |
259 |
260 | ### Dictionaries
261 |
262 | `Dict`s are lists with a *key -> value* structure. Like a telephone book:
263 |
264 | ::: {.panel-tabset group="language"}
265 |
266 | ### Python
267 |
268 | ```{python}
269 | #| cache: false
270 | di = {'peter' : 1225, 'alice' : 4333}
271 | di
272 | ```
273 |
274 | ### R
275 |
276 | ```{r}
277 | #| cache: false
278 | di = list(peter = 1225, alice = 4333)
279 | di
280 | ```
281 |
282 | :::
283 |
284 | ### DataFrames
285 |
286 | In python, we use the `pandas` package for dataframe support. In R they are built-in as we know. There are many ways to create a `pandas` dataframe.
287 |
288 | * Here is the official pandas [documentation](https://pandas.pydata.org/pandas-docs/stable/index.html).
289 | * in `R`, type `?data.frame` for the help entry.
290 |
291 |
292 | ::: {.panel-tabset group="language"}
293 |
294 | ### Python
295 |
296 | ```{python}
297 | #| cache: false
298 | import pandas as pd
299 | d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}
300 | pd.DataFrame(d)
301 | ```
302 |
303 | ### R
304 |
305 | ```{r}
306 | #| cache: false
307 | data.frame(one = c(1,2,3,4.0), two = c(4,3,2,1.0))
308 | ```
309 |
310 | :::
--------------------------------------------------------------------------------
/09-R-packages.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Building `R` Packages"
3 | format:
4 | revealjs:
5 | theme: _extensions/metropolis-theme/metropolis.scss
6 | chalkboard: true
7 | logo: /images/ScPo-logo.png
8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
9 | incremental: false
10 | code-line-numbers: false
11 | highlight-style: github
12 | slide-number: true
13 | author: Florian Oswald
14 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
15 | date: today
16 | date-format: "D MMMM, YYYY"
17 | ---
18 |
19 | ## R and Packages
20 |
21 |
22 | ::: {.columns}
23 |
24 | ::: {.column width=45%}
25 |
26 | ::: {.callout-tip}
27 |
28 | # Questions
29 |
30 | 1. Why write our _own_ `R` package?
31 | 2. How to create an `R` package?
32 | 3. What are _unit tests_?
33 |
34 | :::
35 |
36 | :::
37 |
38 | ::: {.column width=45%}
39 |
40 |
41 | ::: {.callout-note}
42 |
43 | # Objectives
44 |
45 | * Learn the `RStudio`-powered package development workflow.
46 | * Create a package, and test it.
47 | * Publish the package to github.
48 | * Publish the package docs as a self-contained website.
49 |
50 | :::
51 |
52 |
53 | :::
54 |
55 | :::
56 |
57 |
58 |
59 |
60 |
61 |
62 | ---
63 |
64 | ## R and Packages
65 |
66 | * We have been using `R` packages all the time.
67 |
68 | * Each time we say `library(xyz)` we are using *external* code provided in the `xyz` package.
69 |
70 | * You can write **your own** packages.
71 |
72 | . . .
73 |
74 | ::: {.callout-tip}
75 |
76 | # What's the point of Packages?
77 |
78 | 1. Extend `R` functionality.
79 | 2. **for researchers**: key tool to ensure _reproducibility_ of findings
80 | 3. **for researchers**: key tool to _organize_ code in team work
81 |
82 | :::
83 |
84 | * Let's go through some material from the [`r-pkgs`](https://r-pkgs.org) book!
85 |
86 |
87 |
88 | ---
89 |
90 |
91 | ## Building a Toy Package
92 |
93 | ::: {.columns}
94 |
95 | ::: {.column width=45%}
96 | ### `RStudio` for the win
97 |
98 | * We do this in `RStudio`
99 |
100 | * we use the `devtools` package
101 |
102 | * check you have a recent version:
103 |
104 | ```{r}
105 | #| echo: true
106 | packageVersion("devtools")
107 | ```
108 |
109 | * if not - reinstall.
110 |
111 | :::
112 |
113 | ::: {.column width=45%}
114 |
115 | ### Let's Do it!
116 |
117 | ```{r}
118 | #| echo: true
119 | #| eval: true
120 |
121 | library(devtools)
122 |
123 | # create a package `here`
124 | create_package("~/toypackage")
125 | ```
126 |
127 |
128 | * You see Rstudio jumps to that location
129 | :::
130 |
131 | :::
132 |
133 |
134 | ---
135 |
136 | ## Adding Git
137 |
138 | * Of course we want to track our package with `git`.
139 | * We use functions from the `usethis` package. This is loaded by default when attaching the `devtools` package (`use_git` is part of `usethis`...)
140 |
141 | ```{r}
142 | #| eval: true
143 | #| echo: true
144 | #|
145 | library(devtools)
146 | use_git()
147 | ```
148 |
149 | * Say `Yes` to everything ✌️
150 |
151 | ---
152 |
153 | ## Adding Code
154 |
155 | * We add `R` _source_ code in the `R/` folder.
156 | * Create as many `.R` files as you want.
157 | * It's good practice to organize tests accompanying source files.
158 |
159 | ```{r}
160 | #| eval: true
161 | #| echo: true
162 | #| message: true
163 | use_r("sayhello")
164 | ```
165 |
166 | * What's with that `use_test()` thing? 🤔 Let's worry about this later.
167 |
168 |
169 | ---
170 |
171 | ## Ok, but...Adding Code??
172 |
173 | * Let's add a function to the file `R/sayhello.R`:
174 |
175 | ```r
176 | # Notice I'm using = instead of < - because
177 | # the font of those slides prints it weirdly
178 | hello = function(who){
179 | paste("hello,",who)
180 | }
181 | ```
182 |
183 | * Now, if this were a simple `R` script, we could `source` the `R/sayhello.R` file into global space and try this out.
184 | * We _don't_ want to do that here though. 🤨
185 | * Instead, we want to _load_ the **package**, which _contains_ our function.
186 | * do `load_all()`:
187 | ```r
188 | load_all()
189 | ℹ Loading toypackage
190 | ```
191 |
192 | ---
193 |
194 | ## Trying out Code
195 |
196 | ::: {.callout-note}
197 |
198 | # `load_all()`
199 |
200 | * The `load_all()` function simulates the process of building, installing, and attaching the `toypackage` package.
201 | * This means that **all** the functions you included in the package will become _visible_ in the global scope (in your console)
202 | * This is _not_ in general the case: Later on we will fine-tune which functions are visible to the user, and which ones are not!
203 | :::
204 |
205 | * Call the function with your name!
206 |
207 | ```r
208 | hello("Peter")
209 | [1] "hello, Peter"
210 | ```
211 |
212 | * Great!
213 |
214 | ---
215 |
216 | ## Checking the Package
217 |
218 | * `R` has a rigid set of rules for what a package needs to look like.
219 | * What files should be where, their names and permissions, such that the structure is nicely uniform across all R packages.
220 | * Particularly relevant for _official_ packages on [CRAN](https://cran.r-project.org/)
221 | * Do this here often:
222 |
223 | ```r
224 | check()
225 | ```
226 |
227 | * This outputs a bunch of things:
228 | 1. It actually _builds_ our package in a separate process - immune from our current workspace
229 | 2. It runs a battery of checks and returns a report:
230 |
231 | ```r
232 | 0 errors ✔ | 1 warning ✖ | 1 note ✖
233 | ```
234 |
235 | ---
236 |
237 | ## Editing DESCRIPTION
238 |
239 | * Open the `DESCRIPTION` file (or type `Ctrl + .` and start typing `desc`)
240 | * Fill in the obviously missing contents.
241 |
242 | ### Adding a LICENSE
243 |
244 | > [Use a license, any license (Jeff Atwood)](https://blog.codinghorror.com/pick-a-license-any-license/)
245 |
246 | Let's
247 | ```r
248 | use_mit_license()
249 | ```
250 |
251 | ---
252 |
253 | ## Documenting with Roxygen
254 |
255 | * Go back to the `hello` function, place the cursor inside the function body, and do `Code > Insert Roxygen Skeleton`.
256 | * You'll see something like this:
257 |
258 | ```r
259 | #' Title
260 | #'
261 | #' @param who
262 | #'
263 | #' @return
264 | #' @export
265 | #'
266 | #' @examples
267 | hello <- function(who){
268 | paste("hello,",who)
269 | }
270 | ```
271 |
272 | * Each line starting with `#'` is part of the **docstring**.
273 | * The `roxygen` package can _separate_ those blocks from our code, and produce valid `R` documentation for us! 🤯
274 |
275 | ---
276 |
277 | ## Building Documentation
278 |
279 | * Let's modify the docstring accordingly.
280 | * execute the `document()` function.
281 | * After that, the documentation is visible to us:
282 |
283 | ```r
284 | ?hello
285 | ℹ Rendering development documentation for "hello"
286 | ```
287 | * Look in the _Help_ pane in RStudio!
288 |
289 |
290 | ---
291 |
292 | ## NAMESPACE
293 |
294 | * Did you notice the `@export` tag in the docstring?
295 | * when we ran `document()`, roxygen changed the `NAMESPACE` file based upon that tag.
296 | * Go and look at that file!
297 | * The contents of `NAMESPACE` specify what is _visible_ to a user who does `library(toypackage)`.
298 | * Try removing the `@export` tag, and `document()` again. Look back at `NAMESPACE`!
299 |
300 | ### `check()` again!
301 |
302 | ```r
303 | check()
304 | 0 errors ✔ | 0 warnings ✔ | 0 notes ✔
305 | ```
306 |
307 | ---
308 |
309 | ## Time to INSTALL the package
310 |
311 | * Ok, great. Now we have a minimal package that _works to a certain extent_ 🙂.
312 | * We must _install_ it into our package library, in order to be able to use it like any other package (same as when we did `install.packages("ggplot2")`)
313 | * Notice that `R` installs your packages here:
314 | ```{r}
315 | #| echo: true
316 |
317 | .libPaths()
318 | ```
319 |
320 | * We _install_ our package into that location with `install()`
321 | * Look out for the final message:
322 | ```r
323 | * DONE (toypackage)
324 | ```
325 | 👏
326 |
327 | ---
328 |
329 | ## New Session - Try it Out!
330 |
331 | * Restart Rstudio
332 | * type into the console
333 | ```r
334 | library(toypackage)
335 | ```
336 |
337 | * and then let's see our cool 😎 function:
338 |
339 | ```r
340 | hello("John Spencer Blues Explosion")
341 | [1] "hello, John Spencer Blues Explosion"
342 | ```
343 |
344 | * Works! Bingo! 🎉
345 |
346 | ---
347 |
348 | ## Automatically Testing Our Code
349 |
350 | * We verified ourselves that this _works_.
351 | * We had our own, informal, way to convince ourselves that it works.
352 | * We knew which steps we had to follow until we would conclude that "yes, this works".
353 |
354 | . . .
355 |
356 | ::: {.columns}
357 |
358 | ::: {.column width=45%}
359 | ::: {.callout-caution}
360 |
361 | # The Time Factor
362 |
363 | If you come back to this in 2 months time you probably
364 |
365 | a. won't remember all the steps you have taken (above)
366 | b. won't be able to reproduce what you _tested_ today!
367 |
368 | :::
369 | :::
370 |
371 | ::: {.column width=45%}
372 |
373 | ::: {.callout-warning}
374 |
375 | # The Scale Factor
376 |
377 | As your package grows, you will find it hard to come back to all components repeatedly, making sure they all _still_ work as intended (now that they may depend on other parts of your code)
378 |
379 | :::
380 | :::
381 |
382 |
383 |
384 | :::
385 |
386 |
387 | # Unit Testing and Continuous Integration (CI)
388 |
389 |
390 | ---
391 |
392 | ## Enter **Unit Testing**
393 |
394 | * Automatic Unit Testing or [_Continuous Integration_ (CI)](https://en.wikipedia.org/wiki/Continuous_integration) is our best response to this.
395 | * We still have to *design* and *write* the tests, but we can offload the work to **run** the tasks repeatedly, and automatically, to a helpful infrastructure.
396 |
397 | ```r
398 | library(devtools)
399 | use_testthat()
400 | ```
401 |
402 | * then
403 | ```r
404 | use_test("sayhello")
405 | • Modify 'tests/testthat/test-sayhello.R'
406 | ```
407 |
408 | ---
409 |
410 | ## Writing Unit Tests
411 |
412 | * Ideally, each function in our `R/` folder is _covered_ by a corresponding test.
413 |
414 | ::: {.callout-important}
415 |
416 | # What Is a Test?
417 |
418 | The purpose of a **test** is to verify that some part of your code, a function in most cases, works **as intended**.
419 | :::
420 |
421 | * Modify `'tests/testthat/test-sayhello.R'` like so
422 | ```r
423 | test_that("hello function works", {
424 | who = "James T. Kirk"
425 | expect_equal(hello(who), paste("hello,",who))
426 | })
427 | ```
428 |
429 | * Ready for 🚀 takeoff?
430 |
431 | ---
432 |
433 | ## Running all unit tests
434 |
435 | * You can run each test file separately to try it out (you must do `library(testthat)` first)
436 | * It's better practice to test the entire package though:
437 |
438 | ```r
439 | > test()
440 | ℹ Testing toypackage
441 |
442 | Attaching package: ‘testthat’
443 |
444 | The following object is masked from ‘package:devtools’:
445 |
446 | test_file
447 |
448 | ✔ | F W S OK | Context
449 | ✔ | 1 | sayhello
450 |
451 | ══ Results ══════════════════════════════════════════════════════
452 | [ FAIL 0 | WARN 0 | SKIP 0 | PASS 1 ]
453 | ```
454 |
455 | * Celebrate! 🎉 🥳 🎊
456 |
457 | ## Using _other_ packages
458 |
459 | * Most likely our package would depend on some _other_ package as well.
460 | * Just as we could choose to `export` some of our functions, we now may want to `import` some functions from elsewhere.
461 | * Suppose we want to use the `dplyr` package:
462 | ```r
463 | > use_package("dplyr")
464 | ✔ Adding 'dplyr' to Imports field in DESCRIPTION
465 | • Refer to functions with `dplyr::fun()`
466 | ```
467 |
468 | * Let's check the `DESCRIPTION` file to see what happened.
469 |
470 | ---
471 |
472 | ## Hook it up to GitHub!
473 |
474 | * It's fairly easy to publish our new package to a github repo.
475 | * Let's `use_github()`
476 | ```r
477 | use_github()
478 | ```
479 |
480 | * answer all the prompts and end up here!
481 |
482 | 
483 |
484 |
485 | ---
486 |
487 | ## Adding a Readme file
488 |
489 | * We know by now that readme files are very important on any git repo.
490 | * Let's add one here as well!
491 | * the `usethis::use_readme_rmd()` function is perfect for this:
492 |
493 | ```r
494 | usethis::use_readme_rmd()
495 | ```
496 |
497 | * If we want to automatically run our tests on a remote server called _github actions_, we can call this function as well to set this up:
498 |
499 | ```r
500 | use_github_actions()
501 | ```
502 |
503 | * let's re-build the package now. (look for rstudio button `install` in `build` tab)
504 |
505 |
506 | ---
507 |
508 | ## Adding a **Vignette**
509 |
510 | * Vignettes are a great feature of R packages. They are full text introductions of the package to a first time user.
511 | * A _tutorial_ for your package.
512 | * This is going to be much more verbose and spiked with example input and output than the standard documentation.
513 | * Often it features the main use case of your package.
514 | * There is an [entire chapter](https://r-pkgs.org/vignettes.html) on `r-pkgs` dedicated to this!
515 |
516 | ### Adding the Vignette(s)
517 |
518 | ```r
519 | usethis::use_vignette("vignette-toypackage-1")
520 | ```
521 |
522 | ---
523 |
524 | ## Deploy package documentation on a website
525 |
526 |
527 |
528 |
529 | * 🚨 Now we are entering the seriously cool zone of R package development 😎
530 |
531 | * Wouldn't it be 🤩 amazing if all of our package documentation, the content of our readme, and any explanatory articles we might have written as vignettes, were **available on a (free to host!) website which is always up to date**?
532 |
533 |
534 |
535 |
536 | ---
537 |
538 | ## You Bet It's Cool 😎 {transition="zoom" transition-speed="slow"}
539 |
540 |
541 |
542 |
543 |
544 |
545 | 
546 |
547 |
548 | ---
549 |
550 | ## Deploy package documentation on a website 3
551 |
552 | * Ready?
553 |
554 | . . .
555 |
556 | ```r
557 | usethis::use_pkgdown()
558 | ```
559 |
560 | * `pkgdown` is a package for website and docs building.
561 |
562 | . . .
563 |
564 | * Let's build that site!
565 |
566 | ```r
567 | pkgdown::build_site() # build the complete documentation website locally
568 | ```
569 |
570 | . . .
571 |
572 | * Let's get `gh-actions` going
573 |
574 | ```r
575 | usethis::use_pkgdown_github_pages()
576 | ```
577 |
578 | * commit everything and push to github!
579 |
580 |
581 | ---
582 |
583 | ## Summary
584 |
585 | ::: {.callout-tip}
586 |
587 | # Key Points
588 |
589 | 1. `RStudio` greatly facilitates `R` package development.
590 | 2. `R` packages contain code, data and documentation in highly structured fashion.
591 | 3. We are encouraged to run automated unit tests.
592 | 4. It is relatively straightforward to publish the package to github for collaboration.
593 | 5. It is equally straightforward to build and publish a full website with package documentation and vignettes, hosted _for free_ on github.com.
594 |
595 | :::
596 |
597 |
--------------------------------------------------------------------------------
/10-spatial-R.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Spatial Data With `R`
3 | format:
4 | revealjs:
5 | theme: _extensions/metropolis-theme/metropolis.scss
6 | chalkboard: true
7 | logo: /images/ScPo-logo.png
8 | footer: "[SciencesPo Intro To Programming 2023](https://floswald.github.io/ScPoProgramming/)"
9 | incremental: false
10 | code-line-numbers: false
11 | highlight-style: github
12 | slide-number: true
13 | author: Florian Oswald
14 | subtitle: "[SciencesPo Intro To Programming 2023](https://floswald.github.io/ScPoProgramming/)"
15 | date: today
16 | date-format: "D MMMM, YYYY"
17 | ---
18 |
19 | ## Intro
20 |
21 |
22 | In this lecture we will cover some basics about geospatial data and how to handle it with `R`. Spatial data is becoming ever more important, so we need a powerful tool to work with it.
23 |
24 | ::: {.callout-note}
25 |
26 | # tl;dr
27 |
28 | Yes, `R` is a fully fledged GIS. No, you don't need an ArcGIS (or other) license to do real work with spatial data (I don't have one, and I use it for *real work* 😉).
29 |
30 | :::
31 |
32 | . . .
33 |
34 | ### Resources
35 |
36 | 1. [Geocomputation with `R`](https://r.geocompx.org/index.html) is our main reference.
37 | 2. The [`sf` package vignettes](https://cran.r-project.org/web/packages/sf/vignettes/sf1.html) are *outstanding*.
38 |
39 | ---
40 |
41 | ## Spatial Data Basics
42 |
43 | * One prime example of spatial data are of course *maps*, providing an answer to the age-old question *where is what*.
44 | * Fundamentally, spatial data still provide an answer to the same question, it is just that the *what* part has gotten much richer over the years.
45 | * The attribute *location* may be only one of many other features of information on a certain observation.
46 | * Multiple measurements imply that observations can be observed *moving* in space.
47 | * There are two fundamentally different ways in which to consider spatial data:
48 |
49 | ---
50 |
51 | ## Spatial Data Types
52 |
53 |
54 |
55 |
56 | ::: {.fragment}
57 | ::: {.callout-warning }
58 |
59 | # 1. Vector Data
60 |
61 | We represent things with *points, lines and polygons*. We can scale and stretch and transform those easily with mathematical operations. Can increase precision to arbitrary levels (can always zoom in further).
62 | :::
63 | :::
64 |
65 |
66 |
67 | ::: {.fragment}
68 | ::: {.callout-warning}
69 | # 2. Raster Data
70 |
71 | We have fixed-size *tiles* or *cells* (like a mosaic, or like *pixels*), which form a **grid**. Fixed resolution.
72 | :::
73 | :::
74 |
75 | ::: {.fragment}
76 | > 👉 This lecture deals only with *Vector* Data.
77 | :::
78 |
79 | # Vector Data and Coordinate Reference Systems
80 |
81 | ---
82 |
83 | ## Representation of Vector Data
84 |
85 | ::: {.fragment}
86 | * Basically, we concentrate on a 2-dimensional space, even though three-dimensional spaces can be useful as well (any ideas for
87 | examples?)
88 | :::
89 | ::: {.fragment}
90 | * In other words, we denote a location with a tuple of coordinates $(x,y)$, or $(x,y,z)$ as the case may be, where each coordinate gives the *distance from the origin* in each direction. For example, we could represent Paris by the tuple `c(2.34,48.85)`
91 | :::
92 | ::: {.fragment}
93 | * One key question in the context of spatial data concerning planet earth you should ask is: *Where is the Origin*?
94 | :::
95 | ::: {.fragment}
96 | * Another question is, related to the well known fact that the earth is quasi-ellipsoid (i.e. a bit like a squashed football and - just to be sure: **not flat**), *how to represent locations in three dimensions on a 2-dimensional map*?
97 | :::
98 |
99 | ---
100 |
101 | ## Coordinate Reference Systems (*CRS*)
102 |
103 | ::: {.fragment}
104 | * CRSs use *longitude* and *latitude* to identify locations.
105 | :::
106 | ::: {.fragment}
107 | * One widely used CRS is the *World Geodetic System 1984*, or WGS84 (used on google maps). It measures *angular distance* in degrees in a *geocentric datum* (made for the entire planet).
108 | :::
109 | ::: {.fragment}
110 | * *longitude* measures East-West distance from the **Prime Meridian Plane**. (left-to-right distance from a starting point)
111 | :::
112 | ::: {.fragment}
113 | * *latitude* measures North-South distance from the **Equatorial Plane**. (up-down distance from a starting point)
114 | :::
115 |
116 | ---
117 |
118 | ## One Standard CRS: WGS84
119 |
120 | ::: {.columns}
121 |
122 | ::: {.column width=50%}
123 | 
124 | :::
125 |
126 | ::: {.column width=40%}
127 | * The dashed lines are the WGS84 ellipsoid coordinate frame
128 | * The blue circle is *the origin* at $(0,0)$ :
129 | 1. 0 degrees longitude (x-direction): Prime Meridian through Greenwich, London.
130 | 1. 0 degrees latitude (y-direction): Equator.
131 | :::
132 |
133 | :::
134 |
135 | ---
136 |
137 | ## Paris in Different CRS
138 |
139 | ::: {.columns}
140 |
141 | ::: {.column width=45%}
142 | 
143 | :::
144 |
145 | ::: {.column width=45%}
146 | 
147 |
148 | :::
149 |
150 | :::
151 |
152 | ---
153 |
154 |
155 | ## Paris Where? {.inverse}
156 |
157 | ::: {.columns}
158 | ::: {.column width=45%}
159 | 
160 |
161 | :::
162 | ::: {.column width=45%}
163 |
164 |
165 |
166 |
167 | ### Task
168 |
169 | 1. Search for *NTF Lambert North France*
170 | 2. What does `c(600256.4, 127726.4)` actually mean?
171 |
172 | :::
173 | :::
174 |
175 |
176 |
177 |
178 |
179 | ---
180 |
181 | ## Geocentric vs Local Datum
182 |
183 | . Geocentric and local geodetic datums shown on top of a geoid (in false color and the vertical exaggeration by 10,000 scale factor). Image of the geoid is adapted from the work of Ince et al. (2019)](/images/02_datum_fig.png)
184 |
185 |
186 | # Vector Spatial Data in `R`
187 |
188 |
189 | ## Working with (Vector) Spatial Data in **R**
190 |
191 | * We rely on a few core libraries.
192 | * `sf` being the main one. That itself relies on several other lower level libraries.
193 |
194 | ```r
195 | install.packages("sf")
196 | ```
197 |
198 | * Don't try to build from `source` unless you know why.
199 | * For problems, please consult the [package readme](https://github.com/r-spatial/sf#installing).
200 | * Let's try to load the library:
201 |
202 | ```{r}
203 | #| echo: true
204 | #| warning: true
205 | library(sf)
206 | ```
207 |
208 | * I highly recommend the package vignettes!
209 |
210 | ```r
211 | vignette(package = "sf") # see which vignettes are available
212 | vignette("sf1") # an introduction to the package
213 | ```
214 |
215 | ---
216 |
217 | ## Working with **sf** 1
218 |
219 | Let's read a *shapefile* from the `sf` package:
220 |
221 | ```{r}
222 | #| echo: true
223 | nc = st_read(system.file("shape/nc.shp", package="sf"))
224 | head(nc[,c("AREA","NAME","FIPS","BIR79")])
225 | ```
226 |
227 | ---
228 |
229 |
230 | ## Working with **sf** 2
231 |
232 | * Notice the `geometry` column.
233 | * This is basically a geo-referenced `data.frame`.
234 |
235 | ```{r}
236 | #| echo: true
237 | plot(nc[,"AREA"]) # plot feature "AREA" (i.e. column 1)
238 | ```
239 |
240 | ---
241 |
242 |
243 | ## Working with **sf** 3
244 |
245 | * Works also with `ggplot2`
246 |
247 | ```{r}
248 | #| echo: true
249 | library(ggplot2)
250 | ggplot(nc) + geom_sf(aes(fill = AREA)) +
251 | scale_fill_viridis_c(name = "Area")
252 | ```
253 |
254 | ---
255 |
256 |
257 | ## Working with **sf** 4: CRS Transform
258 |
259 | ::: {.columns}
260 |
261 | ::: {.column width=45%}
262 |
263 |
264 |
265 | ```{r}
266 | #| echo: true
267 | ggplot(nc) + geom_sf(aes(fill = AREA)) +
268 | scale_fill_viridis_c(name = "Area")
269 | ```
270 |
271 | :::
272 |
273 | ::: {.column width=45%}
274 | ```{r}
275 | #| echo: true
276 | nc %>%
277 | st_transform("+proj=moll") %>%
278 | ggplot() + geom_sf(aes(fill = AREA)) +
279 | scale_fill_viridis_c(name = "Area") +
280 | ggtitle("Mollweide Projection")
281 | ```
282 |
283 | :::
284 |
285 | :::
286 |
287 |
288 | ## Geometric Operations with **sf** 1
289 |
290 | * the [simple features standard](https://en.wikipedia.org/wiki/Simple_Features) specifies a series of operations.
291 | * the relevant functions start with `st_` (for *spatio-temporal*)
292 | * For 2 geometries `x,y` we can compute things like `st_distance(x,y)`, `st_intersection(x,y)`, etc
293 | * For single geometries we can do things like `st_area(x)`, `st_union(x)`, `st_buffer(x,dist)` etc
294 |
295 | ```{r}
296 | #| echo: true
297 |
298 | st_area(st_union(nc))
299 | ```
300 |
301 | * Ooof, how many square km is that now? 🤔
302 |
303 | . . .
304 |
305 | ```{r}
306 | #| echo: true
307 |
308 | st_area(st_union(nc)) %>% units::set_units(km2)
309 | ```
310 |
311 | ---
312 |
313 | ## Geometric Operations with **sf** 2
314 |
315 | ```{r}
316 | #| echo: true
317 | # copied from https://github.com/uo-ec607/lectures
318 | nc_centroid = st_centroid(nc)
319 |
320 | ggplot(nc) +
321 | geom_sf(fill = "black", alpha = 0.8, col = "white") +
322 | geom_sf(data = nc_centroid, col = "red") + ## Notice how easy it is to combine different sf objects
323 | labs(
324 | title = "Counties of North Carolina",
325 | subtitle = "Centroids in red"
326 | )
327 | ```
328 |
329 | ---
330 |
331 | ## Mapping the Seine 1 {background-image="/images/seine.png" background-position="75% 70%" background-size="40%"}
332 |
333 | ::: {.columns}
334 | ::: {.column width=35%}
335 | ```{r}
336 | #| echo: true
337 | # copied from https://github.com/uo-ec607/lectures
338 | # install.packages(c("maps","spData"))
339 | ## Get the data
340 | france = st_as_sf(
341 | maps::map('france',
342 | plot = FALSE,
343 | fill = TRUE)
344 | )
345 | data("seine",
346 | package = "spData")
347 |
348 | ## Make sure they have the same projection
349 | seine = st_transform(seine,
350 | crs = st_crs(france))
351 | ```
352 | ```{r}
353 | #| echo: true
354 | # now, make a base plot:
355 | pseine = ggplot() +
356 | geom_sf(data = france,
357 | alpha = 0.8,
358 | fill = "black",
359 | col = "gray50") +
360 | labs(
361 | title = "Administrative regions of France"
362 | )
363 | ggsave(plot = pseine,
364 | "images/seine.png",
365 | width=6, height=6)
366 | ```
367 | :::
368 |
369 | :::
370 |
371 | ---
372 |
373 | ## Mapping the Seine 2 {background-image="/images/seine2.png" background-position="75% 70%" background-size="40%"}
374 |
375 | ::: {.columns}
376 | ::: {.column width=35%}
377 |
378 | ```{r}
379 | #| echo: true
380 | #| eval: false
381 | # let's add the seine!
382 | pseine2 = pseine +
383 | geom_sf(data = seine, col = "#05E9FF", lwd = 1) +
384 | labs(
385 | title = "Administrative regions of France",
386 | subtitle = "Also showing the Seine, Marne and Yonne rivers"
387 | )
388 | ggsave(plot = pseine2,
389 | "images/seine2.png",
390 | width=6, height=6)
391 | ```
392 | :::
393 | :::
394 |
395 |
396 | ---
397 |
398 |
399 | ## Intersect two **sf** objects {background-image="/images/seine3.png" background-position="95% 70%" background-size="45%"}
400 |
401 |
402 | ::: {.columns}
403 | ::: {.column width=50%}
404 | ```{r}
405 | #| echo: true
406 | seine = st_transform(seine, crs = st_crs(france))
407 | sf_use_s2(FALSE) # need to turn off because of invalid geometry
408 | france_intersected = st_intersection(france, seine)
409 | head(france_intersected,2)
410 | ```
411 |
412 | ```{r}
413 | #| eval: true
414 | #| echo: true
415 | pl3 = france_intersected %>%
416 | ggplot() +
417 | geom_sf(alpha = 0.8, aes(fill = ID, col = ID)) +
418 | labs(
419 | title = "Seine, Marne and Yonne rivers",
420 | caption = "Colours depict French administrative regions"
421 | ) +
422 | theme(legend.title = element_blank())
423 | ggsave(plot = pl3,"images/seine3.png",
424 | width=7, height=5)
425 | ```
426 | :::
427 | :::
428 |
429 | ---
430 |
431 | ## Join two **sf** objects {background-image="/images/seine4.png" background-position="85% 45%" background-size="45%"}
432 |
433 |
434 | ::: {.columns}
435 | ::: {.column width=50%}
436 | ```{r}
437 | #| eval: false
438 | #| echo: true
439 | pl4 = st_join(france, seine) %>%
440 | ## Get rid of regions with no overlap
441 | dplyr::filter(!is.na(name)) %>%
442 | ## Some regions are duplicated b/c two
443 | ## branches of the river network flow through them
444 | dplyr::distinct(ID, .keep_all = T) %>%
445 | ## pipe into ggplot
446 | ggplot() +
447 | geom_sf(alpha = 0.5,
448 | fill = "#01731f",
449 | col = "#fcb4b3", # of borders
450 | linewidth = 0.5) + # of borders
451 | geom_sf(data = seine, col = "#05E9FF", lwd = 1) +
452 | labs(title = "Intersected regions only") +
453 | theme_bw()
454 | ggsave(plot = pl4,"images/seine4.png",
455 | width=7, height=5)
456 | ```
457 | :::
458 | :::
459 |
460 | ---
461 |
462 | ## Joining Task { .inverse}
463 |
464 |
465 | ::: {.columns}
466 | ::: {.column width=35%}
467 | * Modify the code chunk on the previous slide.
468 | * We want to have different colors for the shown departements, instead of all "#01731f".
469 | * I.e. make this for me 👉
470 | ```{r}
471 | #| eval: false
472 | #| echo: false
473 | d5 = st_join(france, seine) %>%
474 | ## Get rid of regions with no overlap
475 | dplyr::filter(!is.na(name)) %>%
476 | ## Some regions are duplicated b/c two
477 | ## branches of the river network flow through them
478 | dplyr::distinct(ID, .keep_all = T)
479 |
480 | my_colors = palette.colors(nrow(d5), palette = "Alphabet")
481 | names(my_colors) <- NULL
482 |
483 | ## pipe into ggplot
484 | pl5 = ggplot(data = d5) +
485 | geom_sf(aes(fill = ID),
486 | col = "#fcb4b3", # of borders
487 | linewidth = 0.5) + # of borders
488 | geom_sf(data = seine, col = "#05E9FF", lwd = 1.5) +
489 | labs(title = "Intersected regions only", fill = "Departement") +
490 | theme_bw() +
491 | scale_fill_manual(values = my_colors)
492 | ggsave(plot = pl5,"images/seine5.png",
493 | width=7, height=5)
494 | ```
495 | :::
496 |
497 | ::: {.column width=55%}
498 | 
499 | :::
500 | :::
501 |
502 | ---
503 |
504 | ## Joining Task Solution
505 |
506 | ```{r}
507 | #| eval: false
508 | #| echo: true
509 | d5 = st_join(france, seine) %>%
510 | ## Get rid of regions with no overlap
511 | dplyr::filter(!is.na(name)) %>%
512 | ## Some regions are duplicated b/c two
513 | ## branches of the river network flow through them
514 | dplyr::distinct(ID, .keep_all = T)
515 |
516 | my_colors = palette.colors(nrow(d5), palette = "Alphabet")
517 | names(my_colors) <- NULL
518 |
519 | ## pipe into ggplot
520 | pl5 = ggplot(data = d5) +
521 | geom_sf(aes(fill = ID),
522 | col = "#fcb4b3", # of borders
523 | linewidth = 0.5) + # of borders
524 | geom_sf(data = seine, col = "#05E9FF", lwd = 1.5) +
525 | labs(title = "Intersected regions only", fill = "Departement") +
526 | theme_bw() +
527 | scale_fill_manual(values = my_colors)
528 | ggsave(plot = pl5,"images/seine5.png",
529 | width=7, height=5)
530 | ```
531 |
532 |
533 | ---
534 |
535 | ## Distances
536 |
537 | * Another typical question could be:
538 |
539 | > What's the (straight-line) distance between 2 points?
540 |
541 | As in
542 |
543 | >What's the distance between the centroids of the Seine-Maritime and Nievre Departements?
544 |
545 | ---
546 |
547 | ## Task: Distances {.inverse}
548 |
549 | Modifying the plot from the previous task, produce 2 new plots
550 |
551 | 1. One that colors only the concerned departments, and marks their respective centroids with a point.
552 | 2. Another one with the same coloring, but where a straight solid line connects both centroids, and we print the distance in km into the plot title.
553 |
554 | ---
555 |
556 | ## Task Desired Result: Distances {.inverse}
557 |
558 | **Hint:**
559 |
560 | ```r
561 | # start from here
562 | p6 = ggplot(d5) + geom_sf()
563 | ```
564 |
565 | ::: {layout-ncol=2}
566 |
567 | 
568 |
569 | 
570 |
571 | Desired Outputs
572 |
573 | :::
574 |
575 |
576 | ---
577 |
578 | ## Task Solution
579 |
580 | ```{r}
581 | #| echo: true
582 | #| eval: false
583 | ## Fill colours: NA for every departement except the two we compare
584 | deps = c("Seine-Maritime", "Nievre")
585 | cvec = setNames(rep(NA, length(unique(d5$ID))), unique(d5$ID))
586 | cvec[deps] <- c("purple", "brown")
587 |
588 | ## Plot 1: colour only the two departements and mark their centroids
589 | subdeps = d5 %>% dplyr::filter(ID %in% deps)
590 | centroids = st_centroid(subdeps)
591 | p6 = ggplot(d5) + geom_sf(aes(fill = ID)) +
592 |   scale_fill_manual(values = cvec, limits = deps) +
593 |   geom_sf(data = centroids)
594 | ggsave(plot = p6, "images/distance1.png", width = 5, height = 4)
595 |
596 | ## Plot 2: connect both centroids and print their distance in the title.
597 | ## Measure between the centroid POINTS: st_distance(subdeps) would measure
598 | ## between the two polygons, not the centroid distance the title announces.
599 | dists = st_distance(centroids) %>% units::set_units("km")
600 | xy = st_coordinates(centroids) # one row per centroid, columns X and Y
601 | coords = data.frame(lon = xy[1,"X"], lat = xy[1,"Y"],
602 |                     lon_end = xy[2,"X"], lat_end = xy[2,"Y"])
603 | p7 = p6 + geom_segment(data = coords, aes(lon, lat, xend = lon_end, yend = lat_end)) +
604 |   ggtitle(paste("Distance between Centroids:", round(dists[1,2],0), "km"))
605 | ggsave(plot = p7, "images/distance2.png", width = 5, height = 4)
606 | ```
607 |
608 |
609 | # Raster Data
610 |
611 | ---
612 |
613 | ## What's Different?
614 |
615 | * We have a grid (*pixels*) where each cell contains one single data value - usually our measure of interest.
616 | * We can have multiple *layers* of measurements (e.g. temperature, humidity and elevation for a grid cell)
617 | * CRS considerations equally apply.
618 | * Remote sensing data (e.g. satellite images) are often in raster format.
619 |
620 | ---
621 |
622 | ## Raster Resources
623 |
624 | * [Chapter 6 of geocompr](https://r.geocompx.org/raster-vector.html) is a great starting point.
625 | * [R as GIS for Economists](https://tmieno2.github.io/R-as-GIS-for-Economists/index.html) is in general a great resource, [chapter 4](https://tmieno2.github.io/R-as-GIS-for-Economists/raster-basics.html) in particular so.
626 | * [`R` package `{raster}`](https://cran.r-project.org/web/packages/raster/index.html) is the very mature and traditional solution.
627 | * [`R` package `{stars}`](https://cran.r-project.org/web/packages/stars/index.html) is a great recent development in this space.
628 |
629 | ---
630 |
631 | ## Other Spatial Resources
632 |
633 | * [Spatial Data Science](https://r-spatial.org/book/): Still WIP but looks like the ultimate authority amongst books.
634 | * [mapview](https://r-spatial.github.io/mapview/): great for interactive and quick mapping
635 | * [tmap](https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html): same
636 | * [Analyzing US Census Data](https://walker-data.com/census-r/) by Kyle Walker is a brilliant intro to his package [`{tidycensus}`](https://walker-data.com/tidycensus/).
637 | * Nice mapping [examples](https://ryanpeek.github.io/2017-11-05-mapping-with-sf-Part-2/)
638 | * [`sf` and `raster`](https://nceas.github.io/oss-lessons/spatial-data-gis-law/3-mon-intro-gis-in-r.html) intro by NCEAS
639 |
640 |
641 | # End
--------------------------------------------------------------------------------
/11-NLP-R.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Quick Intro to NLP with `R`
3 | format:
4 | revealjs:
5 | theme: _extensions/metropolis-theme/metropolis.scss
6 | chalkboard: true
7 | logo: /images/ScPo-logo.png
8 | footer: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
9 | incremental: false
10 | code-line-numbers: false
11 | highlight-style: github
12 | slide-number: true
13 | author: Florian Oswald
14 | subtitle: "[SciencesPo Intro To Programming 2024](https://floswald.github.io/ScPoProgramming/)"
15 | date: today
16 | date-format: "D MMMM, YYYY"
17 | execute:
18 | echo: true
19 | cache: true
20 | ---
21 |
22 | ## Intro
23 |
24 |
25 | In this lecture we will introduce the most basic language models with R.
26 |
27 | This is based on a nice [introduction](https://datascienceplus.com/an-introduction-to-k-gram-language-models-in-r/) by Valerio Gherardi, author of the `kgrams` package for R.
28 |
29 | ---
30 |
31 | ## Natural Language Processing (NLP) basics
32 |
33 | * We are all familiar with *large* language models (LLMs for short) by now.
34 | * ChatGPT (short for _Chat Generative Pretrained Transformer_) is a proprietary solution; by now there are many open-source alternatives.
35 | * We will not be able to go into the details of those, but see some simpler cousins.
36 |
37 |
38 | ## $k$-gram language models
39 |
40 | * Let $w_i$ be the $i$-th word in a sentence, i.e. $s = w_1 w_2 \dots w_n$
41 | * An NLP model gives the probability of observing this sentence, i.e. $\Pr(s)$.
42 | * As usual, we can *sample* from $\Pr(s)$ to obtain *random* sentences.
43 | * In general all the $s$ at our disposal come from a certain _corpus_, i.e. a collection of sentences/words.
44 |
45 | ## Continuation Probabilities
46 |
47 | * Define a sequence of words as the _context_: $c = w_1 w_2 \dots w_m$
48 | * We can _predict_ the next word in the sequence by computing $\Pr(w|c)$, i.e. $\Pr(w|c)$ is the probability that the next word is $w$, given context $c$.
49 | * That is in a nutshell what ChatGPT computes for you.
50 |
51 | ## Dictionaries
52 |
53 | * The list of known words in an NLP model is called the _dictionary_.
54 | * This also tells us how to deal with _unknown_ words - those are mapped to the `UNK` (unknown word token).
55 | * It also tells us how to deal with the end of sentences, by introducing an `EOS` (end of sentence) token.
56 | * `kgram` models (below) also include a `BOS` (beginning of sentence) token. Each sentence is left-padded with $N-1$ `BOS` tokens ($N$ the order of the model). This helps predict _the first word of the next sentence_ from the preceding $N-1$ tokens.
57 |
58 | ## $k$-gram Models
59 |
60 | * A $k$-gram model makes a _Markovian_ assumption on continuation probabilities.
61 | * We assume that the next word depends only on the last $N-1$ words, where $N$ is the _order_ of the model.
62 | * We have
63 |
64 | $$\begin{align}
65 | \Pr(w|c) &= \Pr(w|w_1 w_2 \cdots w_{N-1})\\
66 | c &= \cdots w_{-1} w_0 w_1 w_2 \cdots w_{N-1}
67 | \end{align}$$
68 |
69 | * We call the $k$ tuples of words $(w_1, w_2,\dots, w_k)$ _k-grams_.
70 | * You can see that we can only capture relatively short range dependencies.
71 | * As $N$ becomes too large, memory requirements explode.
72 |
73 | ## Estimating Continuation Probabilities
74 |
75 | * We can make a table from our corpus, counting how many times each $k$-gram occurs.
76 | * While this is simple, we need a _smoothing_ technique to account for the fact that many potentially sensible sentences are never observed in our Corpus.
77 | * The smoothing will take some probability from the very frequently observed sequences and give some to the rarer ones, simply speaking.
78 |
79 | $$\hat{\Pr}_{MLE}(w|c) = \frac{C(w_1 w_2 \cdots w_{k} w)}{C(w_1 w_2 \cdots w_{k})}$$
80 |
81 | * Our data is sparse: many sequences are not in our corpus, hence the above estimator incorrectly assigns zero probability to them.
82 | * If the context $w_1 w_2 \cdots w_{k}$ does not occur in the data, the estimator is not even defined (we would divide by a zero count).
83 |
84 | ## Training and Testing NLP Models
85 |
86 | * We need an evaluation metric: how good is this model?
87 | * Widely used is [perplexity](https://en.wikipedia.org/wiki/Perplexity): The larger the _perplexity_ of a discrete probability distribution, the less likely it will be that an observer could guess the next value to be drawn from it.
88 | * We will evaluate the perplexity $e^H$, where $H=-\frac{1}{W} \sum_s \ln \Pr(s)$ is the cross-entropy per word and $W$ is the total number of words in our corpus.
89 |
90 | ## Training a k-gram model in R
91 |
92 | ```{r}
93 | #| echo: true
94 | library(kgrams)
95 | ```
96 | We can get the spoken text from the following Shakespeare plays:
97 |
98 | ```{r}
99 | #| echo: true
100 |
101 | playcodes <- c(
102 | "All's Well That Ends Well" = "AWW",
103 | "Antony and Cleopatra" = "Ant",
104 | "As You Like It" = "AYL",
105 | "The Comedy of Errors" = "Err",
106 | "Coriolanus" = "Cor",
107 | "Cymbeline" = "Cym",
108 | "Hamlet" = "Ham",
109 | "Henry IV, Part 1" = "1H4",
110 | "Henry IV, Part 2" = "2H4",
111 | "Henry V" = "H5",
112 | "Henry VI, Part 1" = "1H6",
113 | "Henry VI, Part 2" = "2H6",
114 | "Henry VI, Part 3" = "3H6",
115 | "Henry VIII" = "H8",
116 | "Julius Caesar" = "JC",
117 | "King John" = "Jn",
118 | "King Lear" = "Lr",
119 | "Love's Labor's Lost" = "LLL",
120 | "Macbeth" = "Mac",
121 | "Measure for Measure" = "MM",
122 | "The Merchant of Venice" = "MV",
123 | "The Merry Wives of Windsor" = "Wiv",
124 | "A Midsummer Night's Dream" = "MND",
125 | "Much Ado About Nothing" = "Ado",
126 | "Othello" = "Oth",
127 | "Pericles" = "Per",
128 | "Richard II" = "R2",
129 | "Richard III" = "R3",
130 | "Romeo and Juliet" = "Rom",
131 | "The Taming of the Shrew" = "Shr",
132 | "The Tempest" = "Tmp",
133 | "Timon of Athens" = "Tim",
134 | "Titus Andronicus" = "Tit",
135 | "Troilus and Cressida" = "Tro",
136 | "Twelfth Night" = "TN",
137 | "Two Gentlemen of Verona" = "TGV",
138 | "Two Noble Kinsmen" = "TNK",
139 | "The Winter's Tale" = "WT"
140 | )
141 | ```
142 |
143 | ## Estimating 2
144 |
145 | We could get the text from "Much Ado about Nothing" as follows:
146 |
147 | ```{r}
148 | #| echo: true
149 | get_url_con <- function(playcode) {  # playcode: one of the codes stored in `playcodes`
150 |   stopifnot(playcode %in% playcodes)  # abort on codes that are not in `playcodes`
151 |   play_url <- paste0("https://www.folgerdigitaltexts.org/", playcode, "/text")  # don't shadow base::url
152 |   url(play_url)  # connection to the play's text, returned visibly; callers open/close it
153 | }
154 |
155 | con <- get_url_con("Ado")
156 | open(con)
157 | readLines(con, 10)
158 | ```
159 |
160 | ```{r}
161 | #| echo: true
162 | close(con)
163 | ```
164 |
165 | ## Defining Training and Testing Data
166 |
167 | We will use all plays but "Hamlet" as training data, and reserve this last one for testing our model.
168 |
169 | ```{r}
170 | train_playcodes <- playcodes[!(names(playcodes) %in% "Hamlet")]  # every play except Hamlet
171 | test_playcodes <- playcodes[names(playcodes) %in% "Hamlet"]  # Hamlet only, held out for testing
172 | ```
173 |
174 | We want to pre-process the text data. Here we want to remove some html tags and make everything lower-case.
175 |
176 | ```{r}
177 | .preprocess <- function(x) {
178 |   # Strip everything of the form <...>, i.e. the html tags
179 |   no_tags <- gsub("<[^>]+>", "", x)
180 |   # Hand over to kgrams' own preprocessing: lower-case the text and
181 |   # drop characters that are neither alphanumeric nor punctuation
182 |   kgrams::preprocess(no_tags)
183 | }
184 | ```
185 |
186 | ## Preprocessing Text
187 |
188 | * We need to split sentences at sensible punctuation marks `.!?:;` and insert `EOS` and `BOS` tokens into the data.
189 | * This will treat `.!?:;` as regular _words_, hence the model will be able to *predict* those.
190 |
191 | ```{r}
192 | .tknz_sent <- function(x) {
193 |   # Glue all elements of x together into one single string
194 |   joined <- paste(x, collapse = " ")
195 |   # Split that string into sentences with the kgrams tokenizer
196 |   sents <- kgrams::tknz_sent(joined, keep_first = TRUE)
197 |   # Keep only the sentences that are not empty
198 |   nonempty <- sents != ""
199 |   sents[nonempty]
200 | }
201 | ```
202 |
203 | ## Making $k$-gram frequency counts
204 |
205 | * Let us now make a table of occurrences of all $k$-grams in our corpus.
206 | * We set an _order_:
207 |
208 | ```{r}
209 | N = 5
210 | freqs = kgram_freqs(N, .preprocess = .preprocess, .tknz_sent = .tknz_sent)
211 | summary(freqs)
212 | ```
213 |
214 | * So, for now this is an empty model as you can see. Let's train it on our corpus!
215 |
216 | ## Training the NLP model
217 |
218 | ```{r}
219 | lapply(train_playcodes,
220 | function(playcode) {
221 | con <- get_url_con(playcode)
222 | process_sentences(text = con, freqs = freqs, verbose = FALSE)
223 | })
224 | ```
225 |
226 | ## Checking the Frequency tables
227 |
228 | * the `freqs` object was modified during the previous call.
229 | * Let's check it quickly:
230 |
231 | ```{r}
232 | query(freqs, c("leonato", "pound of flesh", "smartphones"))
233 | ```
234 |
235 | * Last thing to do: choose a smoother.
236 | ```{r}
237 |
238 | smoothers()
239 | ```
240 |
241 | Let's choose the _modified Kneser-Ney_ smoother and set some default parameters:
242 |
243 | ```{r}
244 | info("mkn")
245 | ```
246 |
247 | ## Building the model
248 |
249 | ```{r}
250 | model <- language_model(freqs, smoother = "mkn", D1 = 0.5, D2 = 0.5, D3 = 0.5)
251 | summary(model)
252 | ```
253 |
254 | ## Making Predictions with the model
255 |
256 | * Now we can compute probabilities for given sentences:
257 |
258 | ```{r}
259 | sentences <- c(
260 | "I have a letter from monsieur Berowne to one lady Rosaline.",
261 | "I have an email from monsieur Valerio to one lady Judit."
262 | )
263 | probability(sentences, model)
264 | ```
265 |
266 | or we can get the _continuation probability_ for a context:
267 |
268 | ```{r}
269 | context <- "pound of"
270 | words <- c("flesh", "bananas")
271 | probability(words %|% context, model)
272 | ```
273 |
274 | ## Tuning our models
275 |
276 | * Remember we held out "Hamlet" from our training data. Let's use it to test performance now!
277 |
278 | ```{r}
279 | con <- get_url_con(test_playcodes)
280 | perplexity(text = con, model = model)
281 | ```
282 |
283 | This applies the same transformations and tokenization to the test data as it does to the training data (which is important).
284 |
285 | ## Tuning More
286 |
287 | * We could now create a grid over the parameters of the model (`D1`, `D2` etc) as well as the order of the models
288 | * We would then choose those parameters for which the perplexity is smallest.
289 | * Suppose we find that the $k=4$ model works best.
290 | * Let's use it to create some random sentences!
291 |
292 | ```{r}
293 | param(model, "N") <- 4
294 | ```
295 |
296 | ## Random Text generation
297 |
298 | ```{r}
299 | set.seed(840)
300 | sample_sentences(model, 10, max_length = 20)
301 | ```
302 |
303 | ## Temperature
304 |
305 | * The temperature parameter makes the pdf smoother or rougher. Smaller values mean the model will not deviate much from its implied distribution, higher values mean there will be much more randomness in the output.
306 |
307 | ```{r}
308 | set.seed(841)
309 | sample_sentences(model, 10, max_length = 20) # Normal temperature
310 | ```
311 |
312 | ## High temperature
313 |
314 | ```{r}
315 | set.seed(841)
316 | sample_sentences(model, 10, max_length = 20, t = 10)
317 | ```
318 |
319 |
320 | ## Low temperature
321 |
322 | ```{r}
323 | set.seed(841)
324 | sample_sentences(model, 10, max_length = 20, t = 0.1)
325 | ```
326 |
327 |
328 |
329 | # End
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/__packages:
--------------------------------------------------------------------------------
1 | kgrams
2 |
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-10_b67627f3927e12ad713028cf98e417bb.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-11_c35984ea9b2f979ff7f85284fee4cda0.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-12_18410f94565577aaf66decee3574d410.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-13_17c61bf27164538fea6ca0a3e4dc5b20.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-14_3cbee9f25d7102231341290c4fc06f0d.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-15_50c5eb25cbf3bf122ec79bddb67172f2.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-16_1c82565a0983d4f9ac69861c5f41d0d1.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-17_a7915cd327518d28ed8f0a9e584a9247.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-18_13c48c8c58c1b5cd21bb3ca46d39505a.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-19_60f976a677f568d76e9935e9173d5545.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-1_b443c34df83ffb4e47c67e5e9ac4cfce.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-20_f90113178a75d661e8a9b319f8dd63b9.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-21_0cc656c58d55c349f872ab6c59a1c9c6.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-2_ea58cca509eaa82089e5339e5054c58a.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-3_3ba45688564a3db789f5c5f910f2d7c8.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-4_1ac64f41a7478af12db8d4afc3796ea6.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-5_ca8c5c862543df61f31a683820040a75.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-6_502f7c554aaaed64ff954e58c7fbaa66.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-7_2750b1c3b2e57236a5af4f8faef1c5d1.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-8_485a42cd3be166890574dd8e3464bfd8.rdx
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.RData
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdb
--------------------------------------------------------------------------------
/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/11-NLP-R_cache/revealjs/unnamed-chunk-9_56df5bed1c943e1c4727cc65d4f7ab22.rdx
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Florian Oswald
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/R/sayhello.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/R/sayhello.R
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ScPo Intro To Programming
2 |
3 | This repository contains the course material for the introductory programming course at Sciences Po.
4 |
5 | You can find the course website [here](https://floswald.github.io/ScPoProgramming/).
6 |
7 | ## License
8 |
9 | You are free to copy and remix this content as long as you stick to the terms laid out in the LICENSE file. Thanks.
--------------------------------------------------------------------------------
/ScPoProgramming.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/_extensions/metropolis-theme/_extension.yml:
--------------------------------------------------------------------------------
1 | title: Quarto Metropolis Theme
2 | author: Patrick Schratz
3 | version: 1.0.0
4 | contributes:
5 | formats:
6 | revealjs:
7 | theme: metropolis.scss
8 |
--------------------------------------------------------------------------------
/_extensions/metropolis-theme/metropolis.scss:
--------------------------------------------------------------------------------
1 | /*-- scss:defaults --*/
2 |
3 | // fonts
4 | @import url(https://fonts.googleapis.com/css?family=Fira+Sans:300,300i,400,400i,500,500i,700,700i);
5 | @import url(https://cdn.rawgit.com/tonsky/FiraCode/1.204/distr/fira_code.css);
6 | @import url("https://fonts.googleapis.com/css?family=Roboto+Mono|JetBrains+Mono&display=swap");
7 | @import url("https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap");
8 |
9 | $font-family-sans-serif: "Roboto", "Fira Sans", "Droid Serif", serif !default;
10 | $font-family-monospace: "JetBrains Mono", "Fira Code", monospace;
11 | $presentation-font-size-root: 30px;
12 | $presentation-line-height: 1.5em;
13 | $presentation-heading-font-weight: 400;
14 |
15 | // colors
16 | $body-bg: #fafafa !default;
17 | $body-color: #000 !default;
18 | // $link-color: #EB811B !default;
19 | $selection-bg: #26351c;
20 |
21 | // headings
22 | // $presentation-heading-font: $font-family-sans-serif, serif !default;
23 | // $presentation-heading-color: #383d3d !default;
24 |
25 | /*-- scss:rules --*/
26 |
27 | .reveal a {
28 | line-height: 1.5em;
29 | color: #eb811b;
30 | font-weight: 300;
31 | }
32 |
33 | .reveal .footer a {
34 | color: #eb811b !important;
35 | }
36 |
37 | .reveal p {
38 | font-weight: 300;
39 | }
40 |
41 | .reveal .slide ul li,
42 | .reveal .slide ol li {
43 | font-weight: 300;
44 | }
45 |
46 | // maximum height of code blocks before scrolling is used
47 | .reveal pre.sourceCode code {
48 | max-height: 700px; // default 500
49 | }
50 |
51 | // title slide
52 | .title-slide {
53 | background-color: #fafafa;
54 | border-top: 80px solid #fafafa;
55 | }
56 |
57 | h1.title {
58 | color: #1a292c;
59 | font-size: 45px;
60 | text-shadow: none;
61 | font-weight: 400;
62 | text-align: left;
63 | margin-left: 15px;
64 | padding-top: 80px;
65 | }
66 | p.subtitle {
67 | // margin-top: -10px;
68 | // padding-bottom: -20px;
69 | color: #1a292c;
70 | text-shadow: none;
71 | font-weight: 300;
72 | font-size: 40px;
73 | text-align: left;
74 | margin-left: 15px;
75 | }
76 | p.author {
77 | color: #1a292c;
78 | text-shadow: none;
79 | font-weight: 300;
80 | font-size: 30px;
81 | text-align: left;
82 | margin-left: 15px;
83 | margin-bottom: -10px;
84 | margin-top: 0px;
85 | }
86 |
87 | p.date {
88 | color: #1a292c;
89 | text-shadow: none;
90 | font-weight: 300;
91 | font-size: 30px;
92 | text-align: left;
93 | margin-left: 15px;
94 | // margin-bottom: -30px;
95 | }
96 |
97 | p.subtitle:after {
98 | content: "";
99 | display: block;
100 | border: none;
101 | background-color: #eb811b;
102 | color: #eb811b;
103 | height: 1px;
104 | margin: 25px 0 25px;
105 | }
106 |
107 | // Section break slide
108 | hr,
109 | h1::after {
110 | content: "";
111 | display: block;
112 | border: none;
113 | background-color: #eb811b;
114 | color: #eb811b;
115 | height: 1px;
116 | margin: 1em 10px 0 10px;
117 | }
118 |
119 | // Override h1 style for title slide (remove section break slide style)
120 | hr,
121 | h1.title::after {
122 | content: "";
123 | display: block;
124 | border: none;
125 | background-color: transparent !important;
126 | color: transparent !important;
127 | height: 0px;
128 | margin: 0px !important;
129 | }
130 |
131 | h2::after.title {
132 | margin: 10px 15px 35px 0;
133 | }
134 |
135 | .reveal .slide-number a {
136 | font-size: 120%;
137 | background-color: #fafafa;
138 | border-radius: 12px;
139 | padding: 5px;
140 | }
141 |
142 | // inline
143 | .reveal code {
144 | font-size: 70%;
145 | background-color: #afb8c133;
146 | color: #000;
147 | padding: 4px;
148 | border-radius: 6px;
149 | }
150 |
151 | // code blocks
152 | .reveal div.sourceCode pre code {
153 | font-size: 100%;
154 | }
155 |
156 | // code output
157 | .reveal pre code {
158 | font-size: 100%;
159 | padding-top: 15px;
160 | }
161 |
162 |
163 |
164 | .column {
165 | // #column;
166 | // border: 2px solid red;
167 | border-radius: 10px !important;
168 | padding: 10px;
169 | margin: 5px;
170 | // background-color: #ededed;
171 | // background-color: #ffffff;
172 | }
173 |
174 |
175 | .reveal h2 {
176 | background-color: #23373b;
177 | padding: 5px 0px 5px 10px;
178 | color: #fafafa;
179 | border-radius: 12px;
180 | }
181 |
182 | .inverse {
183 | background-color: #fff3f2;
184 | padding: 5px 0px 5px 10px;
185 | color: #870000;
186 | border-radius: 12px;
187 | }
188 |
189 | .small-font {
190 | font-size: 70%;
191 | }
192 |
193 | iframe {
194 | display: block;
195 | margin-right: auto;
196 | margin-left: auto;
197 | }
198 |
199 | .center {
200 | text-align: center;
201 | }
202 |
203 | //
204 | .reveal .slide-menu-button .fa-bars::before {
205 | background-image: url('data:image/svg+xml,');
206 | }
207 |
208 | .reveal .slide-chalkboard-buttons .fa-easel2::before {
209 | padding-bottom: 6px;
210 | background-image: url('data:image/svg+xml,');
211 | }
212 |
213 | .reveal .slide-chalkboard-buttons .fa-brush::before {
214 | padding-bottom: 6px;
215 | background-image: url('data:image/svg+xml,');
216 | }
217 |
218 | .reveal .progress {
219 | color: #23373b;
220 | }
221 |
--------------------------------------------------------------------------------
/_quarto.yml:
--------------------------------------------------------------------------------
1 | project:
2 | type: website
3 |
4 | website:
5 | title: "ScPoProgramming"
6 | favicon: /images/ScPo-logo.png
7 | twitter-card: true
8 | google-analytics: "G-TQGG8QBSRH"
9 | body-footer: © Florian Oswald, 2024
10 | sidebar:
11 | style: "docked"
12 | contents:
13 | - section: "Lessons:"
14 | contents:
15 | - href: 01-shell-intro.qmd
16 | text: 1. Shell Intro
17 | - href: 02-filedir.qmd
18 | text: 2. Files and Directories
19 | - href: 03-filework.qmd
20 | text: 3. Working with Files
21 | - href: 04-pipes.qmd
22 | text: 4. Filters and Pipes
23 | - href: 05-git.qmd
24 | text: 5. `Git` Version Control
25 | - href: https://raw.githack.com/ScPoEcon/ScPoEconometrics-Slides/master/chapter_intro_programming/chapter_intro.html
26 | text: 6. `R` intro
27 | - href: https://raw.githack.com/ScPoEcon/ScPoEconometrics-Slides/master/chapter_tidy_programming/chapter_tidy.html
28 | text: 7. `R Tidyverse`
29 | - href: 06-concepts.qmd
30 | text: 8. `R` and `python` generics
31 | - href: https://raw.githack.com/floswald/lectures/master/05-datatable/05-datatable.html
32 | text: 9. `R data.table`
33 | - href: 09-R-packages.qmd
34 | text: 10. Building `R` packages
35 | - href: 10-spatial-R.qmd
36 | text: 11. Spatial Data with `R`
37 | - href: https://floswald.github.io/julia-bootcamp/01-variables.html
38 | text: 12. `julia` intro 1
39 | - href: https://floswald.github.io/julia-bootcamp/02-functions.html
40 | text: 13. `julia` intro 2
41 |
42 | navbar:
43 | left:
44 | - href: index.qmd
45 | text: Home
46 | logo: /images/ScPo-logo.png
47 | background: "#ba0202"
48 | foreground: "#faf7f7"
49 | format:
50 | html:
51 | theme: journal
52 | linkcolor: "#ba0202"
53 | css: styles.css
54 | toc: true
55 | highlight-style: github
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/about.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "About"
3 | ---
4 |
5 | About this site
6 |
--------------------------------------------------------------------------------
/custom.scss:
--------------------------------------------------------------------------------
1 | $code-block-bg: #ded9ca;
2 | $code-block-border-color: #000000;
--------------------------------------------------------------------------------
/data/brexit.csv:
--------------------------------------------------------------------------------
1 | "","startdate","enddate","pollster","poll_type","samplesize","remain","leave","undecided","spread"
2 | "1",2016-06-23,2016-06-23,"YouGov","Online",4772,0.52,0.48,0,0.04
3 | "2",2016-06-22,2016-06-22,"Populus","Online",4700,0.55,0.45,0,0.1
4 | "3",2016-06-20,2016-06-22,"YouGov","Online",3766,0.51,0.49,0,0.02
5 | "4",2016-06-20,2016-06-22,"Ipsos MORI","Telephone",1592,0.49,0.46,0.01,0.03
6 | "5",2016-06-20,2016-06-22,"Opinium","Online",3011,0.44,0.45,0.09,-0.01
7 | "6",2016-06-17,2016-06-22,"ComRes","Telephone",1032,0.54,0.46,0,0.08
8 | "7",2016-06-17,2016-06-22,"ComRes","Telephone",1032,0.48,0.42,0.11,0.06
9 | "8",2016-06-16,2016-06-22,"TNS","Online",2320,0.41,0.43,0.16,-0.02
10 | "9",2016-06-20,2016-06-20,"Survation/IG Group","Telephone",1003,0.45,0.44,0.11,0.01
11 | "10",2016-06-18,2016-06-19,"YouGov","Online",1652,0.42,0.44,0.13,-0.02
12 | "11",2016-06-16,2016-06-19,"ORB/Telegraph","Telephone",800,0.53,0.46,0.02,0.07
13 | "12",2016-06-17,2016-06-18,"Survation","Telephone",1004,0.45,0.42,0.13,0.03
14 | "13",2016-06-16,2016-06-17,"YouGov","Online",1694,0.44,0.43,0.09,0.01
15 | "14",2016-06-14,2016-06-17,"Opinium","Online",2006,0.44,0.44,0.12,0
16 | "15",2016-06-15,2016-06-16,"YouGov","Online",1734,0.42,0.44,0.09,-0.02
17 | "16",2016-06-15,2016-06-15,"Survation","Telephone",1104,0.42,0.45,0.13,-0.03
18 | "17",2016-06-10,2016-06-15,"BMG Research","Online",1468,0.37,0.47,0.16,-0.1
19 | "18",2016-06-10,2016-06-15,"BMG Research","Telephone",1064,0.46,0.43,0.11,0.03
20 | "19",2016-06-11,2016-06-14,"Ipsos MORI","Telephone",1257,0.43,0.49,0.03,-0.06
21 | "20",2016-06-12,2016-06-13,"YouGov","Online",1905,0.39,0.46,0.15,-0.07
22 | "21",2016-06-10,2016-06-13,"ICM","Telephone",1000,0.45,0.5,0.05,-0.05
23 | "22",2016-06-10,2016-06-13,"ICM","Online",2001,0.44,0.49,0.07,-0.05
24 | "23",2016-06-09,2016-06-13,"ComRes","Telephone",1002,0.46,0.45,0.09,0.01
25 | "24",2016-06-07,2016-06-13,"TNS","Online",2497,0.4,0.47,0.13,-0.07
26 | "25",2016-06-09,2016-06-12,"ORB","Telephone",800,0.48,0.49,0.03,-0.01
27 | "26",2016-06-09,2016-06-10,"YouGov","Online",1671,0.42,0.43,0.11,-0.01
28 | "27",2016-06-07,2016-06-10,"Opinium","Online",2009,0.44,0.42,0.13,0.02
29 | "28",2016-06-08,2016-06-09,"ORB","Online",2052,0.45,0.55,0,-0.1
30 | "29",2016-06-05,2016-06-06,"YouGov","Online",2001,0.43,0.42,0.11,0.01
31 | "30",2016-06-03,2016-06-05,"ICM","Online",2047,0.43,0.48,0.09,-0.05
32 | "31",2016-06-02,2016-06-05,"ORB","Telephone",800,0.48,0.47,0.05,0.01
33 | "32",2016-06-01,2016-06-03,"YouGov","Online",3405,0.41,0.45,0.11,-0.04
34 | "33",2016-05-31,2016-05-31,"Opinium","Online",2007,0.43,0.41,0.16,0.02
35 | "34",2016-05-31,2016-05-31,"Opinium","Online",2007,0.4,0.43,0.16,-0.03
36 | "35",2016-05-30,2016-05-31,"YouGov","Online",1735,0.41,0.41,0.13,0
37 | "36",2016-05-27,2016-05-29,"ICM","Telephone",1004,0.42,0.45,0.15,-0.03
38 | "37",2016-05-27,2016-05-29,"ICM","Online",2052,0.44,0.47,0.09,-0.03
39 | "38",2016-05-25,2016-05-29,"ORB","Telephone",800,0.51,0.46,0.03,0.05
40 | "39",2016-05-20,2016-05-25,"BMG Research","Online",1638,0.44,0.45,0.12,-0.01
41 | "40",2016-05-24,2016-05-24,"Survation","Telephone",1013,0.44,0.38,0.18,0.06
42 | "41",2016-05-23,2016-05-24,"YouGov","Online",1756,0.41,0.41,0.13,0
43 | "42",2016-05-19,2016-05-23,"TNS","Online",1213,0.41,0.43,0.16,-0.02
44 | "43",2016-05-20,2016-05-22,"ICM","Online",2003,0.45,0.45,0.1,0
45 | "44",2016-05-18,2016-05-22,"ORB","Telephone",800,0.55,0.42,0.03,0.13
46 | "45",2016-05-17,2016-05-19,"Opinium","Online",2008,0.44,0.4,0.14,0.04
47 | "46",2016-05-16,2016-05-17,"YouGov","Online",1648,0.44,0.4,0.12,0.04
48 | "47",2016-05-14,2016-05-17,"ComRes","Telephone",1000,0.52,0.41,0.07,0.11
49 | "48",2016-05-14,2016-05-16,"Ipsos MORI","Telephone",1002,0.55,0.37,0.05,0.18
50 | "49",2016-05-13,2016-05-15,"ICM","Telephone",1002,0.47,0.39,0.14,0.08
51 | "50",2016-05-13,2016-05-15,"ICM","Online",2048,0.43,0.47,0.1,-0.04
52 | "51",2016-05-11,2016-05-15,"ORB","Telephone",800,0.55,0.4,0.05,0.15
53 | "52",2016-05-10,2016-05-12,"TNS","Online",1222,0.38,0.41,0.21,-0.03
54 | "53",2016-04-29,2016-04-29,"YouGov","Telephone",996,0.36,0.39,0.22,-0.03
55 | "54",2016-04-29,2016-04-29,"YouGov","Online",1973,0.38,0.4,0.16,-0.02
56 | "55",2016-05-06,2016-05-08,"ICM","Online",2005,0.44,0.46,0.11,-0.02
57 | "56",2016-05-04,2016-05-06,"YouGov","Online",3378,0.42,0.4,0.13,0.02
58 | "57",2016-04-29,2016-04-29,"ICM","Online",2040,0.44,0.45,0.11,-0.01
59 | "58",2016-04-27,2016-04-29,"ICM","Online",2029,0.43,0.46,0.11,-0.03
60 | "59",2016-04-26,2016-04-29,"Opinium","Online",2005,0.42,0.41,0.14,0.01
61 | "60",2016-04-27,2016-04-29,"ORB","Online",2000,0.49,0.51,0,-0.02
62 | "61",2016-04-26,2016-04-28,"TNS","Online",1221,0.39,0.36,0.26,0.03
63 | "62",2016-04-25,2016-04-26,"YouGov","Online",1650,0.41,0.42,0.13,-0.01
64 | "63",2016-04-25,2016-04-26,"Survation","Telephone",1003,0.45,0.38,0.17,0.07
65 | "64",2016-04-22,2016-04-26,"BMG Research","Online",2001,0.43,0.45,0.13,-0.02
66 | "65",2016-04-22,2016-04-24,"ICM","Online",2001,0.44,0.46,0.1,-0.02
67 | "66",2016-04-20,2016-04-24,"ORB","Telephone",800,0.51,0.43,0.06,0.08
68 | "67",2016-04-16,2016-04-19,"ComRes","Telephone",1002,0.51,0.4,0.09,0.11
69 | "68",2016-04-16,2016-04-18,"Ipsos MORI","Telephone",1026,0.49,0.39,0.08,0.1
70 | "69",2016-04-15,2016-04-17,"ICM","Telephone",1003,0.48,0.41,0.11,0.07
71 | "70",2016-04-15,2016-04-17,"ICM","Online",2008,0.43,0.44,0.13,-0.01
72 | "71",2016-04-13,2016-04-17,"ORB","Telephone",800,0.53,0.41,0.06,0.12
73 | "72",2016-04-12,2016-04-14,"TNS","Online",1198,0.38,0.34,0.28,0.04
74 | "73",2016-04-12,2016-04-14,"YouGov","Online",3371,0.4,0.39,0.16,0.01
75 | "74",2016-04-11,2016-04-12,"YouGov","Online",1693,0.39,0.39,0.17,0
76 | "75",2016-04-07,2016-04-11,"TNS","Online",1198,0.35,0.35,0.3,0
77 | "76",2016-04-08,2016-04-10,"ComRes","Telephone",1002,0.45,0.38,0.17,0.07
78 | "77",2016-04-08,2016-04-10,"ICM","Online",2030,0.42,0.45,0.12,-0.03
79 | "78",2016-04-06,2016-04-07,"YouGov","Online",1612,0.4,0.38,0.16,0.02
80 | "79",2016-03-29,2016-03-29,"YouGov","Online",3754,0.39,0.38,0.18,0.01
81 | "80",2016-04-01,2016-04-03,"ICM","Online",2007,0.44,0.43,0.13,0.01
82 | "81",2016-03-29,2016-03-29,"ORB","Telephone",800,0.51,0.44,0.05,0.07
83 | "82",2016-03-29,2016-03-29,"Opinium","Online",1966,0.39,0.43,0.18,-0.04
84 | "83",2016-03-24,2016-03-29,"TNS","Online",1193,0.35,0.35,0.3,0
85 | "84",2016-03-24,2016-03-29,"BMG Research","Online",1518,0.41,0.45,0.14,-0.04
86 | "85",2016-03-24,2016-03-28,"ORB","Online",2002,0.51,0.49,0,0.02
87 | "86",2016-03-22,2016-03-24,"ICM","Online",1970,0.45,0.43,0.12,0.02
88 | "87",2016-03-19,2016-03-22,"Ipsos MORI","Telephone",1023,0.49,0.41,0.1,0.08
89 | "88",2016-03-17,2016-03-22,"YouGov","Online",1688,0.4,0.37,0.19,0.03
90 | "89",2016-03-18,2016-03-20,"ComRes","Telephone",1002,0.48,0.41,0.11,0.07
91 | "90",2016-03-18,2016-03-20,"ICM","Online",2000,0.41,0.43,0.17,-0.02
92 | "91",2016-03-17,2016-03-19,"Survation","Telephone",1006,0.46,0.35,0.19,0.11
93 | "92",2016-03-11,2016-03-14,"ORB","Telephone",823,0.47,0.49,0.04,-0.02
94 | "93",2016-03-11,2016-03-13,"ICM","Online",2031,0.43,0.41,0.16,0.02
95 | "94",2016-03-04,2016-03-11,"Greenberg Quinlan Rosner Research","Online",2282,0.45,0.4,0.16,0.05
96 | "95",2016-03-02,2016-03-10,"Populus/Number Cruncher Politics","Online",4047,0.48,0.45,0.07,0.03
97 | "96",2016-03-04,2016-03-06,"Populus/Number Cruncher Politics","Telephone",966,0.49,0.35,0.15,0.14
98 | "97",2016-03-04,2016-03-06,"ICM","Online",2051,0.4,0.41,0.19,-0.00999999999999995
99 | "98",2016-03-02,2016-03-03,"YouGov","Online",1695,0.4,0.37,0.18,0.03
100 | "99",2016-03-01,2016-03-02,"YouGov","Online",1705,0.4,0.35,0.19,0.05
101 | "100",2016-02-29,2016-02-29,"YouGov","Online",2233,0.39,0.37,0.19,0.02
102 | "101",2016-02-26,2016-02-29,"ICM","Online",2003,0.41,0.41,0.18,0
103 | "102",2016-02-26,2016-02-28,"Populus/Number Cruncher Politics","Online",2071,0.39,0.45,0.18,-0.06
104 | "103",2016-02-26,2016-02-28,"Populus/Number Cruncher Politics","Telephone",1002,0.48,0.37,0.15,0.11
105 | "104",2016-02-24,2016-02-25,"ORB","Online",2014,0.48,0.52,0,-0.04
106 | "105",2016-02-21,2016-02-23,"YouGov","Online",3482,0.37,0.38,0.25,-0.01
107 | "106",2016-02-17,2016-02-23,"BMG Research","Online",1517,0.38,0.36,0.25,0.02
108 | "107",2016-02-19,2016-02-22,"ICM","Online",2021,0.42,0.4,0.17,0.02
109 | "108",2016-02-19,2016-02-22,"ComRes","Telephone",1000,0.51,0.39,0.1,0.12
110 | "109",2016-02-13,2016-02-20,"Survation","Telephone",938,0.45,0.32,0.23,0.13
111 | "110",2016-02-18,2016-02-19,"Opinium","Online",1033,0.4,0.41,0.19,-0.00999999999999995
112 | "111",2016-02-13,2016-02-16,"Ipsos MORI","Telephone",497,0.54,0.36,0.1,0.18
113 | "112",2016-02-11,2016-02-15,"TNS","Online",1079,0.36,0.39,0.25,-0.03
114 | "113",2016-02-12,2016-02-14,"ICM","Online",2001,0.43,0.39,0.18,0.04
115 | "114",2016-02-11,2016-02-14,"ComRes","Telephone",1105,0.49,0.41,0.1,0.08
116 | "115",2016-02-05,2016-02-07,"ICM","Online",2018,0.41,0.42,0.17,-0.01
117 | "116",2016-02-03,2016-02-04,"YouGov/The Times","Online",1675,0.36,0.45,0.19,-0.09
118 | "117",2016-01-29,2016-01-31,"ICM","Online",2002,0.42,0.39,0.19,0.03
119 | "118",2016-01-27,2016-01-28,"YouGov","Online",1735,0.38,0.42,0.2,-0.04
120 | "119",2016-01-23,2016-01-25,"Ipsos MORI","Telephone",513,0.55,0.36,0.09,0.19
121 | "120",2016-01-21,2016-01-25,"BMG Research","Online",1511,0.44,0.42,0.14,0.02
122 | "121",2016-01-22,2016-01-24,"ComRes","Telephone",1006,0.54,0.36,0.1,0.18
123 | "122",2016-01-22,2016-01-24,"ICM","Online",2010,0.41,0.41,0.18,0
124 | "123",2016-01-20,2016-01-21,"ORB","Online",2015,0.52,0.48,0,0.04
125 | "124",2016-01-15,2016-01-17,"ICM","Online",2023,0.42,0.4,0.17,0.02
126 | "125",2016-01-15,2016-01-16,"Survation","Online",1017,0.38,0.4,0.22,-0.02
127 | "126",2016-01-08,2016-01-14,"Panelbase","Online",2087,0.42,0.45,0.12,-0.03
128 | "127",2016-01-08,2016-01-10,"ICM","Online",2055,0.44,0.38,0.18,0.06
129 |
--------------------------------------------------------------------------------
/data/shell-lesson-data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/data/shell-lesson-data.zip
--------------------------------------------------------------------------------
/images/02_datum_fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/02_datum_fig.png
--------------------------------------------------------------------------------
/images/PHD.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/PHD.png
--------------------------------------------------------------------------------
/images/ScPo-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/ScPo-logo.png
--------------------------------------------------------------------------------
/images/Tux.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/Tux.png
--------------------------------------------------------------------------------
/images/bad.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/bad.gif
--------------------------------------------------------------------------------
/images/distance1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/distance1.png
--------------------------------------------------------------------------------
/images/distance2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/distance2.png
--------------------------------------------------------------------------------
/images/filesystem-challenge.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/filesystem-challenge.odg
--------------------------------------------------------------------------------
/images/filesystem.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
287 |
--------------------------------------------------------------------------------
/images/find-file-tree.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/find-file-tree.odg
--------------------------------------------------------------------------------
/images/git-images/full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/git-images/full.png
--------------------------------------------------------------------------------
/images/git-staging-CDG.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/git-staging-CDG.jpeg
--------------------------------------------------------------------------------
/images/git-staging-area.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
94 |
--------------------------------------------------------------------------------
/images/homedir.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/homedir.odg
--------------------------------------------------------------------------------
/images/nano-screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/nano-screenshot.png
--------------------------------------------------------------------------------
/images/phd101212s.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/phd101212s.png
--------------------------------------------------------------------------------
/images/redirects-and-pipes.svg:
--------------------------------------------------------------------------------
1 |
2 |
72 |
--------------------------------------------------------------------------------
/images/removed-that.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/removed-that.png
--------------------------------------------------------------------------------
/images/seine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine.png
--------------------------------------------------------------------------------
/images/seine2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine2.png
--------------------------------------------------------------------------------
/images/seine3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine3.png
--------------------------------------------------------------------------------
/images/seine4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine4.png
--------------------------------------------------------------------------------
/images/seine5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/seine5.png
--------------------------------------------------------------------------------
/images/toypackage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/toypackage.png
--------------------------------------------------------------------------------
/images/vector_lonlat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_lonlat.png
--------------------------------------------------------------------------------
/images/vector_lonlatglobe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_lonlatglobe.png
--------------------------------------------------------------------------------
/images/vector_lonlatparis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_lonlatparis.png
--------------------------------------------------------------------------------
/images/vector_projected.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_projected.png
--------------------------------------------------------------------------------
/images/vector_projectedparis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/vector_projectedparis.png
--------------------------------------------------------------------------------
/images/which-version.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/floswald/ScPoProgramming/79282fd68db97b0da9482b17adaf27019c21fa39/images/which-version.png
--------------------------------------------------------------------------------
/index.qmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: ScPoProgramming
3 | author: "[Florian Oswald](https://floswald.github.io)"
4 | subtitle: "SciencesPo Paris, Ecole Doctorale 2024"
5 | ---
6 |
7 | # Welcome to *Introduction to Programming*
8 |
9 | * This website is the home of the course _Introduction to Programming_ taught to first year PhD students (M1) in the doctoral program of the [department of economics](https://www.sciencespo.fr/department-economics/) at Sciences Po Paris.
10 | * This course assumes **no prior programming** experience.
11 | * Below you will find setup instructions and a syllabus.
12 | * You can obtain all material for this course from the associated github repository at [link](https://github.com/floswald/ScPoProgramming/)
13 |
14 | ## Objectives of this Course
15 |
16 | * After this course, we want you to be able to participate in or conduct your own research project in an efficient way. *Research* nowadays means data sciencey stuff in most cases, certainly in Economics.
17 | * We want you to have a basic understanding of how an operating system (in particular, *your* OS) works.
18 | * We want you to be able to achieve a basic level of automation in repetitive tasks.
19 | * We want you to know what Version Control is and how to use it in a research project.
20 | * We will introduce some (hopefully) useful `R` programming.
21 |
22 | # Syllabus
23 |
24 | | Session Number | Topic | Author |
25 | |:------------:|:-----------|:-------------:|
26 | | 1 | [The Unix Shell](01-shell-intro.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald |
27 | | 2 | [Shell: Files and Directories](02-filedir.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald |
28 | | 3 | [Shell: Working with Files and Directories](03-filework.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald |
29 | | 4 | [Shell: Pipes and Filters](04-pipes.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald |
30 | | 5 | [`Git` Version Control](05-git.qmd) | [The Software Carpentry Project](https://software-carpentry.org/) + Florian Oswald |
31 | | | [Homework 1](https://github.com/floswald/scpoproghw1): complete and run a bash script on gh-actions | Florian Oswald |
32 | | 6 | [`R` Intro](https://raw.githack.com/ScPoEcon/ScPoEconometrics-Slides/master/chapter_intro_programming/chapter_intro.html) | Florian Oswald|
33 | | 7 | [`R {tidyverse}`](https://raw.githack.com/ScPoEcon/ScPoEconometrics-Slides/master/chapter_tidy_programming/chapter_tidy.html) | [Grant McDermott](https://grantmcdermott.com/) + Florian Oswald|
34 | | 8 | [`R` and `python` generics](06-concepts.qmd) | Florian Oswald|
35 | | 9 | [`R {data.table}`](https://raw.githack.com/floswald/lectures/master/05-datatable/05-datatable.html) | [Grant McDermott](https://grantmcdermott.com/) + Florian Oswald|
36 | | 10 | [Building `R` packages](09-R-packages.qmd) | Florian Oswald|
37 | | 11 | [Spatial Data with `R`](10-spatial-R.qmd) | Florian Oswald|
38 | | 12 | [`julia` intro 1](https://floswald.github.io/julia-bootcamp/01-variables.html) | Florian Oswald|
39 | | 13 | [`julia` intro 2](https://floswald.github.io/julia-bootcamp/02-functions.html) | Florian Oswald|
40 | | 14 | [Quick Intro to NLP with R](11-NLP-R.qmd) | Florian Oswald|
41 |
42 |
43 | # Setup Instructions
44 |
45 | * You must bring your own laptop to each class.
46 | * Please make sure you have an up to date operating system, i.e. run a software update before we start.
47 | * Everybody should [install R](https://cran.r-project.org/) or make sure they have a recent version installed.
48 | * Everybody should [install RStudio](https://posit.co/download/rstudio-desktop/) or run an update on the installed program.
49 |
50 |
51 | ## Windows and Mac Specific Instructions
52 |
53 | There are different instructions depending on whether you have a Mac or a Windows computer. Unix-based computers are similar to Macs in most respects.
54 |
55 | ### Windows
56 |
57 | We need to install some things that make your Windows computer resemble a Unix box a bit. In particular, we want to be able to use the Unix shell. Therefore, I want you to download and install
58 |
59 | * [GitForWindows](https://gitforwindows.org/)
60 |
61 | *Specifics:*
62 |
63 | 1. During the installation process, choose all default settings.
64 | 2. At one point, you are offered a choice for a default editor being used for `git`. If you know `vim` already, why not (I use `vim`), otherwise I recommend `nano`, which is a simple to use editor that runs inside your command line - we want to avoid having to open an external window of a separate editor for our tasks (i.e. don't choose notepad and other standalone editors).
65 |
66 |
67 | ### Mac
68 |
69 | You should be all set. To make sure we really have everything we need, open `Terminal.app` (in *Applications > Utilities* or do `Cmd + Space` to get spotlight search and type `terminal`). Then paste this code and hit enter:
70 |
71 | ```bash
72 | xcode-select --install
73 | ```
74 |
75 | click on *install* (don't click on `get Xcode`)
76 |
77 |
78 | ### Unix
79 |
80 | Same, all set. Maybe open a terminal and type
81 |
82 | ```bash
83 | git --version
84 | ```
85 |
86 | if that throws an error, install it with your package manager, e.g.
87 |
88 | ```bash
89 | sudo apt install git-all
90 | ```
91 |
92 |
93 | # Code of Conduct
94 |
95 | If you decide to participate in this course, I expect you to abide by the following minimal code of conduct.
96 |
97 | 1. Be polite to the other class participants.
98 | 2. While in class, do not spend time on messaging apps, chat rooms, computer games, or similar content.
99 |
100 | You can expect your instructor to abide by the same code of conduct, so this is a matter of mutual respect. If you are found in breach of the above you will be given a single warning, and I will ask you to no longer join the course after a second time. Your grade will be "fail".
101 |
102 |
103 |
104 | # License
105 |
106 | All lectures of this course are derived from the work of the [Software Carpentry](https://software-carpentry.org/license/). Their material is licensed under [creative commons license 4.0](https://creativecommons.org/licenses/by/4.0/), whereby I am allowed to share and remix the content, if appropriate attribution is given.
107 |
108 | Those terms apply to anyone wanting to use material on this website as well. Thank you.
--------------------------------------------------------------------------------
/scripts/01-shell-intro.sh:
--------------------------------------------------------------------------------
1 | # this line is a comment (it starts with the # character)
2 |
3 | # (bash) shell commands for 01-shell-intro
4 |
5 |
--------------------------------------------------------------------------------
/scripts/_tidy_tasks.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Tidying, Visualising and Summarising Data - Tasks"
3 | author: "Mylène Feuillade, Gustave Kenedi, Florian Oswald and Pierre Villedieu"
4 | date: "`r Sys.Date()`"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ## Task 1: Data wrangling
13 |
14 | Load the data by running the following code:
15 |
16 | ```{r}
17 | library(dslabs)
18 | data(polls_us_election_2016)
19 |
20 | library(tidyverse)
21 | ```
22 |
23 | 1\. Which polls had a missing `grade`?
24 |
25 | Only showing the first 6 rows using `head`, otherwise the document would be very long.
26 |
27 | ```{r}
28 | polls_us_election_2016 %>%
29 | filter(is.na(grade)) %>%
30 | head
31 | ```
32 |
33 | 2\. Which polls were (i) polled by American Strategies, GfK Group or Merrill Poll, (ii) had a sample size greater than 1,000, _and_ (iii) started on October 20th, 2016? (*Hint: for (i) `%in%` might come in handy. Recall that vectors are created using the `c()` function. For (iii) make sure to check the format of the variable containing the poll's start date.*)
34 |
35 | ```{r}
36 | polls_us_election_2016 %>%
37 | filter(pollster %in% c("American Strategies","GfK Group","Merrill Poll") &
38 | samplesize > 1000 &
39 | startdate == "2016-10-20")
40 | ```
41 |
42 | 3\. Which polls (i) did not have missing poll data for Johnson, (ii) had a combined raw poll vote share for Trump and Clinton greater than 95% _and_ (iii) were done in the state of Ohio? (*Hint: it might be practical to first create a variable containing the combined raw poll vote share for Trump and Clinton and then filter.*)
43 |
44 | ```{r}
45 | polls_us_election_2016 %>%
46 | mutate(rawpoll_clintontrump = rawpoll_clinton + rawpoll_trump) %>%
47 | filter(!is.na(rawpoll_johnson) & rawpoll_clintontrump > 95 & state == "Ohio")
48 | ```
49 |
50 |
51 | 4\. Which state had the highest average Trump vote share for polls which had at least a sample size of 2,000? (*Hint: you'll have to use `filter`, `group_by`, `summarise` and `arrange`. To obtain ranking in descending order check `arrange`'s help page.*)
52 |
53 | ```{r}
54 | polls_us_election_2016 %>%
55 | filter(samplesize >= 2000) %>%
56 | group_by(state) %>%
57 | summarise(mean_trump = mean(rawpoll_trump)) %>%
58 | arrange(desc(mean_trump))
59 | ```
60 |
61 |
62 | ## Task 2: Understanding the data
63 |
64 | Load the data by running the following code:
65 |
66 | ```{r, echo = T, eval = F}
67 | library(dslabs)
68 | data(gapminder, package = "dslabs")
69 | ```
70 |
71 | 1\. Compute the average population per continent per year, `mean_pop`, and assign the output to a new object `gapminder_mean`. (*Hint: you should have one observation (row) per continent for each year. You'll have to use `group_by` and `summarise`.*)
72 |
73 | ```{r}
74 | gapminder_mean <- gapminder %>%
75 | group_by(continent, year) %>%
76 | summarise(mean_pop = mean(population))
77 | ```
78 |
79 |
80 | ## Task 3: Visualising data
81 |
82 | Using the `gapminder` data, create the following plots using `ggplot2`. Don't forget to label the axes.
83 |
84 | 1\. A histogram of life expectancy in 2015. (*Hint: do you need to specify a `y` in `aes()` for a histogram?*) Once you've created the histogram, within the appropriate `geom_*` set: `binwidth` to 5, `boundary` to 45, `colour` to "white" and `fill` to "#d90502". What does each of these options do?
*Optional:* Using the previous graph, facet it by continent such that each continent's plot is a new row. (*Hint: check the help for `facet_grid`.*)
85 |
86 | The basic histogram:
87 |
88 | ```{r}
89 | gapminder %>%
90 | filter(year == 2015) %>%
91 | ggplot() +
92 | aes(x = life_expectancy) +
93 | geom_histogram()
94 | ```
95 |
96 | The fancy histogram (with axis labels):
97 |
98 | ```{r}
99 | life_exp_hist <- gapminder %>%
100 | filter(year == 2015) %>%
101 | ggplot() +
102 | aes(x = life_expectancy) +
103 | geom_histogram(binwidth = 5,
104 | boundary = 45,
105 | colour = "white",
106 | fill = "#d90502") +
107 | labs(x = "Life expectancy",
108 | y = "Frequency")
109 | life_exp_hist
110 | ```
111 |
112 | The faceted fancy histogram:
113 |
114 | ```{r}
115 | life_exp_hist +
116 | facet_grid(rows = vars(continent))
117 | ```
118 |
119 | 2\. A boxplot of average life expectancy per year by continent. Within the appropriate `geom_*` set: `colour` to "black" and `fill` to "#d90502". (*Hint: you need to group by both `continent` and `year`.*)
120 |
121 | ```{r}
122 | gapminder %>%
123 | group_by(continent, year) %>%
124 | summarise(mean_life_exp = mean(life_expectancy)) %>%
125 | ggplot() +
126 | aes(x = continent, y = mean_life_exp) +
127 | geom_boxplot(colour = "black",
128 | fill = "#d90502") +
129 | labs(x = "Continent",
130 | y = "Life expectancy")
131 | ```
132 |
133 | 3\. A scatter plot of fertility rate (y-axis) with respect to infant mortality (x-axis) in 2015. Once you've created the scatter plot, within the appropriate `geom_*` set: `size` to 3, `alpha` to 0.5, `colour` to "#d90502".
134 |
135 | The basic scatter plot:
136 |
137 | ```{r}
138 | gapminder %>%
139 | filter(year == 2015) %>%
140 | ggplot() +
141 | aes(x = infant_mortality, y = fertility) +
142 | geom_point()
143 | ```
144 |
145 | The fancy scatter plot with axis labels:
146 |
147 | ```{r}
148 | gapminder %>%
149 | filter(year == 2015) %>%
150 | ggplot() +
151 | aes(x = infant_mortality, y = fertility) +
152 | geom_point(size = 3,
153 | alpha = 0.5,
154 | colour = "#d90502") +
155 | labs(x = "Infant mortality", y = "Fertility")
156 | ```
157 |
158 |
159 | ## Task 4: Summarising data
160 |
161 | 1\. Compute the mean of GDP in 2011 and assign to object `mean_GDP`. You should exclude missing values. (*Hint: read the help for `mean` to remove `NA`s*).
162 |
163 | ```{r}
164 | mean_GDP <- gapminder %>%
165 |   filter(year == 2011) %>%
166 |   summarise(mean(gdp, na.rm = TRUE)) # TRUE rather than T: T is an ordinary variable that can be overwritten
167 | mean_GDP
168 | ```
169 |
170 | 2\. Compute the median of GDP in 2011 and assign to object `median_GDP`. Again, you should exclude missing values. Is it greater or smaller than the average?
171 |
172 | ```{r}
173 | median_GDP <- gapminder %>%
174 |   filter(year == 2011) %>%
175 |   summarise(median(gdp, na.rm = TRUE)) # TRUE rather than T: T is an ordinary variable that can be overwritten
176 | median_GDP
177 | ```
178 |
179 | **The median is much smaller than the average.**
180 |
181 | 3\. Create a density plot of GDP in 2011 using `geom_density`. A density plot is a way of representing the distribution of a numeric variable. Add the following code to your plot to show the median and mean as vertical lines. What do you observe?
182 | `geom_vline(xintercept = as.numeric(mean_GDP), colour = "red") +
183 | geom_vline(xintercept = as.numeric(median_GDP), colour = "orange")`
184 |
185 | ```{r}
186 | gdp_density <- gapminder %>%
187 | filter(year == 2011) %>%
188 | ggplot() +
189 | aes(x = gdp) +
190 | geom_density() +
191 | geom_vline(xintercept = as.numeric(mean_GDP), colour = "red") +
192 | geom_vline(xintercept = as.numeric(median_GDP), colour = "orange")
193 | gdp_density
194 | ```
195 |
196 | **The distribution of GDP is highly ***skewed***: there are many countries with small GDPs and very few with huge GDPs (U.S., Japan, China). In such cases, the average will be (significantly) greater than the median. To see this more clearly, here's a graph where I've transformed the x-axis such that each tick is 10 times larger than the previous one (the scale is therefore not linear, i.e. the first tick is 100,000, the second is 1 million, the third is 10 million, etc.).**
197 |
198 | ```{r}
199 | gdp_density +
200 | scale_x_log10()
201 | ```
202 |
203 | 4\. Compute the correlation between fertility and infant mortality in 2015. To drop `NA`s in either variable set the argument `use` to "pairwise.complete.obs" in your `cor()` function. Is this correlation consistent with the graph you produced in Task 3?
204 |
205 | ```{r}
206 | gapminder %>%
207 | filter(year == 2015) %>%
208 | summarise(cor(fertility, infant_mortality, use = "pairwise.complete.obs"))
209 | ```
210 |
211 | **This correlation is positive and strong (relatively close to 1) which is consistent with the graph produced in Task 3. Indeed, that graph displayed a positive relationship between these two variables and the points were not that dispersed.**
212 |
--------------------------------------------------------------------------------
/scripts/geotask.R:
--------------------------------------------------------------------------------
1 | library(ggplot2) # provides ggplot()/geom_sf()/labs()/theme_bw() used by the plots below
2 | library(sf)
3 | library(dplyr)
4 | library(here)
5 | 
6 | destdir = file.path(here(),"data","shapefiles","departements-20140306-100m")
7 | # download and unzip the departments shapefile only if the .shp is not there yet
8 | if (!file.exists(file.path(destdir,"departements-20140306-100m.shp"))){
9 |   dir.create(file.path(here(),"data","shapefiles"), showWarnings = FALSE, recursive = TRUE) # recursive: also creates data/ if missing
10 |   download.file(url = "https://www.data.gouv.fr/fr/datasets/r/3096e551-c68d-40ce-8972-a228c94c0ad1",
11 |                 destfile = file.path(here(),"data","shapefiles","departements-20140306-100m.zip"), mode = "wb") # binary mode: the URL has no .zip suffix, so Windows would otherwise do a text transfer and corrupt the archive
12 |   unzip(file.path(here(),"data","shapefiles","departements-20140306-100m.zip"),
13 |         exdir = destdir)
14 | }
15 |
16 | # load departments shapefile
17 | shp_file <- file.path(destdir, "departements-20140306-100m.shp")
18 | deps <- st_read(shp_file)
19 | plot(deps[, "code_insee"]) # quick visual check of what was read
20 | 
21 | # subset to continental france: drop the department codes listed here
22 | non_continental <- c("2A","2B","971","972","973","974","976")
23 | deps <- dplyr::filter(deps, !(code_insee %in% non_continental))
24 | 
25 | # Seine data (loaded from the spData package)
26 | data("seine", package = "spData")
27 | 
28 | # reproject the rivers so that both layers share the departments' CRS
29 | seine <- st_transform(seine, crs = st_crs(deps))
30 | 
31 | # spatial join of departments and rivers
32 | deps_rivers <- st_join(deps, seine)
33 | # rows without a river name had no overlap: drop them
34 | deps_rivers <- dplyr::filter(deps_rivers, !is.na(name))
35 | # keep one row per department code, with all its columns
36 | deps_seine <- dplyr::distinct(deps_rivers, code_insee, .keep_all = TRUE)
37 | 
38 | # reproduce plot from class
39 | dep_layer <- geom_sf(data = deps_seine,
40 |                      mapping = aes(fill = nom),
41 |                      alpha = 0.5,
42 |                      col = "#fcb4b3",   # of borders
43 |                      linewidth = 0.5)   # of borders
44 | river_layer <- geom_sf(data = seine, col = "#05E9FF", lwd = 1)
45 | ggplot() +
46 |   dep_layer +
47 |   river_layer +
48 |   labs(title = "Intersected regions only") + theme_bw()
49 |
50 |
51 | # # get communes shapefile from
52 | # # https://www.data.gouv.fr/fr/datasets/contours-des-communes-de-france-simplifie-avec-regions-et-departement-doutre-mer-rapproches/
53 | # stable url: https://www.data.gouv.fr/fr/datasets/r/00c0c560-3ad1-4a62-9a29-c34c98c3701e
54 |
55 | commdir = file.path(here(),"data","shapefiles","communes")
56 | # fetch the communes TopoJSON only when it is not already stored locally
57 | if (!file.exists(file.path(commdir,"a-com2022-topo-2154.json"))){
58 |   # recursive: also works when data/shapefiles does not exist yet
59 |   dir.create(commdir, showWarnings = FALSE, recursive = TRUE)
60 |   download.file(url = "https://www.data.gouv.fr/fr/datasets/r/00c0c560-3ad1-4a62-9a29-c34c98c3701e",
61 |                 destfile = file.path(commdir,"a-com2022-topo-2154.json"),quiet = FALSE, mode = "wb") # binary mode: store the bytes as served (no text-mode translation on Windows)
62 | }
63 |
64 |
65 | # read the communes layer from the TopoJSON file
66 | comms_file <- file.path(commdir, "a-com2022-topo-2154.json")
67 | comms <- st_read(comms_file, layer = "a_com2022")
68 | st_crs(comms) <- 2154 # set the initial CRS
69 | 
70 | # transform to CRS 4326 and rename the `codgeo` column to `code_insee`
71 | comms <- st_transform(comms, 4326)
72 | comms <- dplyr::rename(comms, code_insee = codgeo)
73 | 
74 | # subset to relevant departments only: i.e. the ones of the join above
75 | seine_deps <- unique(deps_seine$code_insee)
76 | co_d <- dplyr::filter(comms, dep %in% seine_deps)
77 | 
78 | # reproduce plot above but now apply a color code that tells us
79 | # how many communes the rivers traverse *in each departement*
80 | 
81 | # join co with seine, drop communes without a river, keep each commune once
82 | co_rivers <- st_join(co_d, seine)
83 | co_rivers <- dplyr::filter(co_rivers, !is.na(name))
84 | co_seine <- dplyr::distinct(co_rivers, code_insee, .keep_all = TRUE)
85 | 
86 | # plot all the communes that are traversed and color by river name
87 | communes_plot <- ggplot(co_seine) + geom_sf(aes(fill = name))
88 | communes_plot
89 | 
90 | # add the rivers themselves
91 | communes_plot +
92 |   geom_sf(data = seine, col = "#05E9FF", lwd = 0.6)
93 | 
94 | # number of traversed communes per department; geometry is not needed for counting
95 | co_flat <- st_set_geometry(co_seine, NULL)
96 | co_by_dep <- dplyr::group_by(co_flat, dep)
97 | co_d_seine <- dplyr::summarise(co_by_dep, ncomms = dplyr::n())
98 |
99 | # merge with deps_seine and plot again
100 | deps_counts <- dplyr::inner_join(deps_seine, co_d_seine,
101 |                                  by = c("code_insee" = "dep"))
102 | ggplot(deps_counts) +
103 |   geom_sf(mapping = aes(fill = ncomms),
104 |           alpha = 0.9,
105 |           col = "grey",      # of borders
106 |           linewidth = 0.2) + # of borders
107 |   geom_sf(data = seine, col = "#05E9FF", lwd = 1) +
108 |   # scale_fill_gradient2(low = "white",high = "red") +
109 |   scale_fill_viridis_c() +
110 |   labs(title = "Number of Communes by Department",
111 |        subtitle = "Traversed by one of Seine, Marne or Yonne",
112 |        fill = "Number of\nCommunes") +
113 |   theme_bw()
114 |
115 |
116 |
117 |
118 |
119 |
120 |
--------------------------------------------------------------------------------
/scripts/lon-lat-geocomp.r:
--------------------------------------------------------------------------------
1 | # this is copied from
2 | # https://raw.githubusercontent.com/geocompx/geocompr/main/code/02-vectorplots.R
3 |
4 |
5 | library(globe)
6 | library(dplyr)
7 | library(sf)
8 | # Build London and Paris as sf points in lon/lat (crs 4326) and in a projected CRS, each paired with the point (0, 0) of that projected CRS.
9 | london_lonlat = st_point(c(-0.1, 51.5)) %>%
10 | st_sfc() %>%
11 | st_sf(crs = 4326, geometry = .)
12 | london_osgb = st_transform(london_lonlat, 27700) # same point, transformed to CRS 27700
13 | origin_osgb = st_point(c(0, 0)) %>%
14 | st_sfc() %>%
15 | st_sf(crs = 27700, geometry = .)
16 | london_orign = rbind(london_osgb, origin_osgb) # row 1: London, row 2: (0, 0) in CRS 27700
17 | 
18 | paris = c(2.34,48.85) # also passed as `loc` to globe::globepoints() when the Paris globe is drawn
19 | 
20 | paris_lonlat = st_point(paris) %>%
21 | st_sfc() %>%
22 | st_sf(crs = 4326, geometry = .)
23 | paris_lambert = st_transform(paris_lonlat, 27561) # same point, transformed to CRS 27561 (a Lambert projection, going by the variable name)
24 | origin_lambert = st_point(c(0, 0)) %>%
25 | st_sfc() %>%
26 | st_sf(crs = 27561, geometry = .)
27 | paris_origin = rbind(paris_lambert, origin_lambert) # row 1: Paris, row 2: (0, 0) in CRS 27561
28 | # London figures: a globe view (images/vector_lonlat.png) and a projected map view (images/vector_projected.png)
29 | png("images/vector_lonlat.png") # relative path: resolved against the current working directory
30 | globe::globeearth(eye = c(0, 0))
31 | gratmat = st_coordinates(st_graticule())[, 1:2] # first two coordinate columns of the graticule
32 | globe::globelines(loc = gratmat, col = "grey", lty = 3) # graticule drawn in grey with line type 3
33 | globe::globelines(loc = matrix(c(-90, 90, 0, 0), ncol = 2)) # rows (-90, 0) and (90, 0); presumably a piece of the equator -- confirm (lon, lat) order
34 | globe::globelines(loc = matrix(c(0, 0, -90, 90), ncol = 2)) # rows (0, -90) and (0, 90); presumably the prime meridian -- confirm (lon, lat) order
35 | globe::globepoints(loc = c(-0.1, 51.5), pch = 4, cex = 2, lwd = 3, col = "red") # same coordinates as london_lonlat
36 | globe::globepoints(loc = c(0, 0), pch = 1, cex = 2, lwd = 3, col = "blue") # the point (0, 0)
37 | dev.off() # close the PNG device opened above
38 | png("images/vector_projected.png") # second figure: the same two points in CRS 27700
39 | uk = rnaturalearth::ne_countries(scale = 50) %>%
40 | st_as_sf() %>%
41 | filter(grepl(pattern = "United Kingdom|Ire", x = name_long)) %>%
42 | st_transform(27700)
43 | plot(uk$geometry)
44 | plot(london_orign$geometry[1], add = TRUE, pch = 4, cex = 2, lwd = 3, col = "red") # first geometry: London
45 | plot(london_orign$geometry[2], add = TRUE, pch = 1, cex = 2, lwd = 3, col = "blue") # second geometry: (0, 0) of CRS 27700
46 | abline(h = seq(0, 9e5, length.out = 10), col = "grey", lty = 3) # 10 evenly spaced horizontal grid lines between 0 and 9e5
47 | abline(v = seq(0, 9e5, length.out = 10), col = "grey", lty = 3) # 10 evenly spaced vertical grid lines between 0 and 9e5
48 | dev.off() # close the PNG device opened above
49 |
50 |
51 | # globe: graticule plus the (0, 0) marker only, no city marker; written to images/vector_lonlatglobe.png
52 | png("images/vector_lonlatglobe.png") # relative path: resolved against the current working directory
53 | globe::globeearth(eye = c(0,0))
54 | gratmat = st_coordinates(st_graticule())[, 1:2] # first two coordinate columns of the graticule
55 | globe::globelines(loc = gratmat, col = "grey", lty = 3)
56 | globe::globelines(loc = matrix(c(-90, 90, 0, 0), ncol = 2)) # rows (-90, 0) and (90, 0)
57 | globe::globelines(loc = matrix(c(0, 0, -90, 90), ncol = 2)) # rows (0, -90) and (0, 90)
58 | globe::globepoints(loc = c(0, 0), pch = 1, cex = 2, lwd = 3, col = "blue")
59 | dev.off() # close the PNG device opened above
60 |
61 | # paris: a globe view (images/vector_lonlatparis.png) and a projected map view (images/vector_projectedparis.png), with Paris as the red marker
62 | png("images/vector_lonlatparis.png") # relative path: resolved against the current working directory
63 | globe::globeearth(eye = c(0,0))
64 | gratmat = st_coordinates(st_graticule())[, 1:2] # first two coordinate columns of the graticule
65 | globe::globelines(loc = gratmat, col = "grey", lty = 3)
66 | globe::globelines(loc = matrix(c(-90, 90, 0, 0), ncol = 2)) # rows (-90, 0) and (90, 0)
67 | globe::globelines(loc = matrix(c(0, 0, -90, 90), ncol = 2)) # rows (0, -90) and (0, 90)
68 | globe::globepoints(loc = paris, pch = 4, cex = 2, lwd = 3, col = "red") # `paris` is the c(2.34,48.85) vector defined near the top of the script
69 | globe::globepoints(loc = c(0, 0), pch = 1, cex = 2, lwd = 3, col = "blue") # the point (0, 0)
70 | dev.off() # close the PNG device opened above
71 | png("images/vector_projectedparis.png") # second figure: the same two points in CRS 27561
72 | france = rnaturalearth::ne_states(country = "France", returnclass = "sf") %>%
73 | filter(!name %in% c("Guyane française", "Martinique", "Guadeloupe", "La Réunion", "Mayotte")) %>% # drop the regions named in this list
74 | st_as_sf() %>% # NOTE(review): probably redundant, since returnclass = "sf" is already requested -- confirm
75 | st_transform(27561)
76 | plot(france$geometry)
77 | plot(paris_origin$geometry[1], add = TRUE, pch = 4, cex = 2, lwd = 3, col = "red") # first geometry: Paris
78 | plot(paris_origin$geometry[2], add = TRUE, pch = 1, cex = 2, lwd = 3, col = "blue") # second geometry: (0, 0) of CRS 27561
79 | abline(h = seq(-9e5, 9e5, length.out = 15), col = "grey", lty = 3) # 15 evenly spaced horizontal grid lines between -9e5 and 9e5
80 | abline(v = seq(0, 13e5, length.out = 10), col = "grey", lty = 3) # 10 evenly spaced vertical grid lines between 0 and 13e5
81 | dev.off() # close the PNG device opened above
82 |
--------------------------------------------------------------------------------
/styles.css:
--------------------------------------------------------------------------------
1 | /* css styles */
2 |
--------------------------------------------------------------------------------