├── .gitmodules ├── LICENSE ├── README.md ├── data ├── bes_data.csv ├── bes_exercise_data.csv ├── contacts.csv ├── scores.csv └── scores_extra.csv ├── exercises ├── part1_problems.ipynb ├── part1_solutions.ipynb ├── part2_problems.ipynb └── part2_solutions.ipynb ├── figures ├── pandas_base.html ├── pandas_columns.html ├── pandas_index.html ├── pandas_loc_cell.html ├── pandas_loc_col.html ├── pandas_loc_multi1.html ├── pandas_loc_multi2.html ├── pandas_loc_multi3.html ├── pandas_loc_multi_col.html ├── pandas_loc_multi_row.html ├── pandas_loc_new_index.html └── pandas_loc_row.html ├── notebooks ├── 01_tech_check.ipynb ├── 02_base_python.ipynb └── 03_pandas.ipynb └── slides ├── 01_intro.html ├── 01_intro.md ├── 02_base_python.html ├── 02_base_python.qmd ├── 03_pandas.html ├── 03_pandas.qmd ├── _quarto.yml ├── minimal-theme.css └── presentation.yaml /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "reveal.js"] 2 | path = reveal.js 3 | url = git@github.com:hakimel/reveal.js 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 mmjh-dev 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ds3-intro-python 2 | 3 | Introduction to Python mini-course given at Hertie DS3 (https://ds3.ai). 4 | 5 | Author: Dr Musashi Jacobs-Harukawa, DDSS Princeton 6 | 7 | ## Repository 8 | 9 | ``` 10 | ├── data 11 | ├── exercises 12 | ├── figures 13 | ├── notebooks 14 | └── slides 15 | ``` 16 | 17 | - `data` contains data files for use in examples and exercises. 18 | - `exercises` contains problem sets and solutions to accompany lecture. 19 | - `figures` contains figures used in lecture. 20 | - `notebooks` contains the notebooks accompanying the lecture 21 | - `slides` contains the markdown and Quarto files used to create the slides, and the slides themselves. 
22 | 23 | 24 | -------------------------------------------------------------------------------- /data/contacts.csv: -------------------------------------------------------------------------------- 1 | StudentID,FirstName,LastName,Age,Gender,Scholarship,Email 2 | 5a01,Alice,Smith,20,Female,True,alice@gmail.com 3 | 5a02,Bob,Higgins,21,Male,True,bob@hotmail.com 4 | 5b05,Charlie,Wylie,22,Male,False,charlie@yahoo.com 5 | 5b10,David,Card,20,Male,False,david@gmail.com 6 | 5e04,Eva,Longman,23,Female,False,eva@outlook.com 7 | 5b11,Frankie,Krueger,20,Female,True,frankie@outlook.com 8 | 6a01,Gerald,Nivea,19,Male,False,gerald@gmail.com 9 | -------------------------------------------------------------------------------- /data/scores.csv: -------------------------------------------------------------------------------- 1 | student_id,math,english,history,biology,art 2 | 5a01,95,97,80,81,86 3 | 5a12,78,91,89,86,81 4 | 5b05,85,86,94,88,82 5 | 5b10,90,89,87,99,91 6 | 5e04,88,90,85,88,84 7 | -------------------------------------------------------------------------------- /data/scores_extra.csv: -------------------------------------------------------------------------------- 1 | student_id,math,english,history,biology,art 2 | 5b11,93,84,82,95,91 3 | 5c01,88,73,72,80,84 4 | -------------------------------------------------------------------------------- /exercises/part1_problems.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 1 Exercises\n", 8 | "\n", 9 | "_Refer to the lecture notes, examples, and the Internet to help you complete these tasks._" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Task 1: Animal Sounds\n", 17 | "\n", 18 | "1. 
Create a dictionary of animals and their sounds and call it `animal_sounds`.\n", 19 | " - Each value should be the corresponding sound for the animal.\n", 20 | " - If your native language is not English, use the sounds from your language!\n", 21 | "2. Use a for loop to print the statement \"In my language, the **ANIMAL** makes the sound **SOUND**\" for each key-value pair in your dictionary.\n", 22 | "\n", 23 | "**_Extra Challenge_**:\n", 24 | "\n", 25 | "1. Create two separate lists, `animals` and `sounds`.\n", 26 | " - `animals` should be the list of the animals used in the previous task.\n", 27 | " - `sounds` should be the list of corresponding sounds.\n", 28 | " - Also: Make sure the `type` of `animals` and `sounds` is `list`!\n", 29 | "2. Create an empty dictionary called `animal_sounds`.\n", 30 | " - Hint: This can be done with `{}`\n", 31 | "3. Use a for loop to populate the dictionary with the information from animals and sounds.\n", 32 | " - _In the same for-loop, print the same statements as in the previous section_." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Task 2: Writing a Menu\n", 40 | "\n", 41 | "A menu typically consists of the following information:\n", 42 | "\n", 43 | "- Course\n", 44 | "- Dish\n", 45 | "- Description\n", 46 | "- Price\n", 47 | "\n", 48 | "In this exercise, you will experiment with different ways of representing this information.\n", 49 | "\n", 50 | "1. Dictionary of Dictionaries (Nested Hierarchy)\n", 51 | " - Create a dictionary called `menu1`.\n", 52 | " - For each dish, create a second dictionary with the keys `'course'`, `'price'`, and `'description'`. Fill these in accordingly.\n", 53 | " \n", 54 | "2. 
Dictionary of Lists\n", 55 | " - Create a dictionary called `menu2`.\n", 56 | " - For each of the keys `'dish'`, `'course'`, `'description'` and `'price'`, write a list of all of the values.\n", 57 | " - Hint: `'course'` will contain many repeated values.\n", 58 | "\n", 59 | "**_Extra Challenge_**:\n", 60 | "\n", 61 | "- For both methods, find a way to iterate over the dictionary to print out a menu.\n", 62 | "- The fancier the better!\n", 63 | " - Note that you can get the length of a string using the `len` function. You can use this to create aligned columns!" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# Task 3:\n", 71 | "\n", 72 | "Similar to the exercise of making a sentence from the fewest letters possible. \n", 73 | "\n", 74 | "- Create a list of five letters and a space, call it `letters`.\n", 75 | "- Figure out the longest sentence you can make from those letters.\n", 76 | "- Use the indices of the list to write a sentence.\n", 77 | "- Create a new sentence using a for loop and the `join` function.\n", 78 | "\n", 79 | "**_Extra Challenge_**:\n", 80 | "\n", 81 | "There are other, smarter ways of doing this with dictionaries and lists. See if you can find a better method than the one below!" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Task 4 (Bonus):\n", 89 | "\n", 90 | "A prime number is a natural number ($\\mathbb{N}$) that is greater than 1 and is not the product of two smaller natural numbers.\n", 91 | "\n", 92 | "Write code that prints all prime numbers less than 10000\n", 93 | "\n", 94 | "For an additional challenge, write `%%timeit` at the top of the codeblock to see how long your code takes to execute. See how fast you can make your code." 
95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "ds3", 108 | "language": "python", 109 | "name": "ds3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.11.4" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 4 126 | } 127 | -------------------------------------------------------------------------------- /exercises/part1_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Part 1 Solutions\n", 12 | "\n", 13 | "_Model solutions_.\n", 14 | "\n", 15 | "Note that there are many possible ways to solve these problems, but the best solutions will maximise readability and efficiency. Therefore you should aim to iterate where possible, but make it clear what you are doing at each step." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "slideshow": { 22 | "slide_type": "slide" 23 | } 24 | }, 25 | "source": [ 26 | "## Task 1: Animal Sounds\n", 27 | "\n", 28 | "1. Create a dictionary of animals and their sounds and call it `animal_sounds`.\n", 29 | " - Each value should be the corresponding sound for the animal.\n", 30 | " - If your native language is not English, use the sounds from your language!\n", 31 | "2. Use a for loop to print the statement \"In my language, the **ANIMAL** makes the sound **SOUND**\" for each key-value pair in your dictionary.\n", 32 | "\n", 33 | "**_Extra Challenge_**:\n", 34 | "\n", 35 | "1. 
Create two separate lists, `animals` and `sounds`.\n", 36 | " - `animals` should be the list of the animals used in the previous task.\n", 37 | " - `sounds` should be the list of corresponding sounds.\n", 38 | " - Also: Make sure the `type` of `animals` and `sounds` is `list`!\n", 39 | "2. Create an empty dictionary called `animal_sounds`.\n", 40 | " - Hint: This can be done with `{}`\n", 41 | "3. Use a for loop to populate the dictionary with the information from animals and sounds.\n", 42 | " - _In the same for-loop, print the same statements as in the previous section_." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "slideshow": { 50 | "slide_type": "slide" 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "# Task 1 - Part 1\n", 56 | "# Dictionary of animal sounds\n", 57 | "animal_sounds = {\n", 58 | " 'dog': 'wanwan',\n", 59 | " 'cat': 'nyaa',\n", 60 | " 'mouse': 'chuu',\n", 61 | " 'frog': 'kerokero',\n", 62 | " 'elephant': 'paoon'\n", 63 | "}\n", 64 | "\n", 65 | "# Task 1 - Part 2\n", 66 | "# Iterate over keys to create statements.\n", 67 | "for animal in animal_sounds.keys():\n", 68 | " print(\"In Japanese, the \"+animal+\" makes the sound \"+animal_sounds[animal]+\"!\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "slideshow": { 76 | "slide_type": "slide" 77 | } 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# Task 1 - Part 2: Bonus Solution\n", 82 | "# You can use f-strings to make this a bit easier to read:\n", 83 | "for animal in animal_sounds.keys():\n", 84 | " print(f\"In Japanese, the {animal} makes the sound {animal_sounds[animal]}!\")\n", 85 | " # Note the `f` before the string. 
This creates an f-string, which can take values in `{}`" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "slideshow": { 93 | "slide_type": "slide" 94 | } 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# Extra Challenge:\n", 99 | "\n", 100 | "# Part 1 - Create lists. Note that they are aligned on index.\n", 101 | "animals = ['dog', 'cat', 'mouse', 'frog', 'elephant']\n", 102 | "sounds = ['wanwan', 'nyaa', 'chuu', 'kerokero', 'paoon']" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "slideshow": { 110 | "slide_type": "slide" 111 | } 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "# Part 2 - Create empty dict\n", 116 | "animal_sounds = {}\n", 117 | "\n", 118 | "# Part 3 - Solution 1: Using `enumerate`\n", 119 | "# `enumerate` creates an iterator that returns a counter and the objects in the iterable.\n", 120 | "for i, animal in enumerate(animals):\n", 121 | " animal_sounds[animal] = sounds[i]\n", 122 | " print(f\"In Japanese, the {animal} makes the sound {animal_sounds[animal]}!\")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "slideshow": { 130 | "slide_type": "slide" 131 | } 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "# Part 3 - Solution 2: Using `range`\n", 136 | "\n", 137 | "# We need to 'reset' the dict by making it empty again\n", 138 | "animal_sounds = {}\n", 139 | "\n", 140 | "# `range` returns a sequence of the length you have given it.\n", 141 | "# Here I use `len(animals)` because the iterator needs to be the length of the sequences we are\n", 142 | "# iterating over.\n", 143 | "# `range` is a very useful function, and worth reading the documentation on.\n", 144 | "for i in range(len(animals)):\n", 145 | " animal = animals[i]\n", 146 | " sound = sounds[i]\n", 147 | " animal_sounds[animal] = sound\n", 148 | " print(f\"In Japanese, the {animal} makes the sound 
{sound}!\")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "slide" 157 | } 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# Part 3 - Solution 3: Using zip and dict\n", 162 | "\n", 163 | "# `zip` creates an iterator.\n", 164 | "# Given two sequences, [1, 2, 3] and [a, b, c], zip will return the sequence [(1, a), (2, b), (3, c)]\n", 165 | "# Pass this to the dict function, which can take a list of tuples to construct a dictionary!\n", 166 | "\n", 167 | "animal_sounds = dict(zip(animals, sounds))\n", 168 | "\n", 169 | "# I'm not going to repeat the print portion.\n", 170 | "# This is probably the most \"pythonic\" solution, but that's always debatable." 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "slideshow": { 177 | "slide_type": "slide" 178 | } 179 | }, 180 | "source": [ 181 | "## Task 2: Writing a Menu\n", 182 | "\n", 183 | "A menu typically consists of the following information:\n", 184 | "\n", 185 | "- Course\n", 186 | "- Dish\n", 187 | "- Description\n", 188 | "- Price\n", 189 | "\n", 190 | "In this exercise, you will experiment with different ways of representing this information.\n", 191 | "\n", 192 | "1. Dictionary of Dictionaries (Nested Hierarchy)\n", 193 | " - Create a dictionary called `menu1`.\n", 194 | " - For each dish, create a second dictionary with the keys `'course'`, `'price'`, and `'description'`. Fill these in accordingly.\n", 195 | " \n", 196 | "2. 
Dictionary of Lists\n", 197 | " - Create a dictionary called `menu2`.\n", 198 | " - For each of the keys `'dish'`, `'course'`, `'description'` and `'price'`, write a list of all of the values.\n", 199 | " - Hint: `'course'` will contain many repeated values.\n", 200 | "\n", 201 | "**_Extra Challenge_**:\n", 202 | "\n", 203 | "- For both methods, find a way to iterate over the dictionary to print out a menu.\n", 204 | "- The fancier the better!\n", 205 | " - Note that you can get the length of a string using the `len` function. You can use this to create aligned columns!" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "slide" 214 | } 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "# Approach 1: Nested Dictionaries\n", 219 | "menu1 = {} # Creating an empty dictionary\n", 220 | "\n", 221 | "# Adding three items.\n", 222 | "menu1['Karaage'] = {'price': 5.0,\n", 223 | " 'course': 'Starter',\n", 224 | " 'description': 'Japanese fried chicken'}\n", 225 | "menu1['Salmon Teriyaki'] = {'price': 9.0,\n", 226 | " 'course': 'Main',\n", 227 | " 'description': 'Pan-fried salmon over rice with teriyaki sauce'}\n", 228 | "menu1['Mochi'] = {'price': 3.5,\n", 229 | " 'course': 'Dessert',\n", 230 | " 'description': 'Sweet rice paste'}" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "slideshow": { 238 | "slide_type": "slide" 239 | } 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "# Approach 2 - Dictionary of Lists\n", 244 | "# This is my less-preferred approach, but important for Week 2.\n", 245 | "menu2 = {\n", 246 | " 'course': ['Starter', 'Main', 'Dessert'],\n", 247 | " 'name': ['Karaage', 'Salmon Teriyaki', 'Mochi'],\n", 248 | " 'description': ['Japanese fried chicken', 'Pan-fried salmon over rice with teriyaki sauce', 'Sweet rice paste'],\n", 249 | " 'price': [5.0, 9.0, 3.5]\n", 250 | "}" 251 | ] 252 | }, 253 | 
{ 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "slideshow": { 258 | "slide_type": "slide" 259 | } 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "# Extra Challenge - Printing out a Menu\n", 264 | "# Creating list for courses. Could be done with existing information, but order is not guaranteed.\n", 265 | "courses = ['Starter', 'Main', 'Dessert']\n", 266 | "\n", 267 | "# Use a nested for loop to create sections in menu\n", 268 | "for course in courses:\n", 269 | " # Each course should be headed by the name of the course, and be followed by a blank space.\n", 270 | " print(course)\n", 271 | " for item in menu1.keys():\n", 272 | " # Within each course, we list out the items in that course. This can be done with an `if` statement.\n", 273 | " if menu1[item]['course']==course: # Checking if item is in course.\n", 274 | " # Aligned menu, using dashes.\n", 275 | " print(\n", 276 | " item + \\\n", 277 | " (\"-\"*(20-len(item))) + \\\n", 278 | " menu1[item]['description'] + \\\n", 279 | " (\"-\"*(60-len(menu1[item]['description']))) + \\\n", 280 | " str(menu1[item]['price']) # float needs to be coerced to string\n", 281 | " )\n", 282 | " print(\"\\n\") # Trailing blank space." 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "slideshow": { 289 | "slide_type": "slide" 290 | } 291 | }, 292 | "source": [ 293 | "# Task 3:\n", 294 | "\n", 295 | "Similar to the exercise of making a sentence from the fewest letters possible. \n", 296 | "\n", 297 | "- Create a list of five letters and a space, call it `letters`.\n", 298 | "- Figure out the longest sentence you can make from those letters.\n", 299 | "- Use the indices of the list to write a sentence.\n", 300 | "- Create a new sentence using a for loop and the `join` function.\n", 301 | "\n", 302 | " **_Extra Challenge_**:\n", 303 | "\n", 304 | "There are other, smarter ways of doing this with dictionaries and lists. 
See if you can find a better method than the one below!" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "slideshow": { 312 | "slide_type": "slide" 313 | } 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "# Example:\n", 318 | "letters = ['a', 't', 'r', 's', 'e', ' ']\n", 319 | "sentence_indices = [3, 4, 4, 5, 3, 0, 2, 0, 5, 4, 0, 1, 5, 3, 4, 0, 5, 1, 2, 4, 4, 5, 1, 4, 0, 5, 1, 2, 4, 0, 1, 3]\n", 320 | "\n", 321 | "sentence = []\n", 322 | "for index in sentence_indices:\n", 323 | " sentence.append(letters[index])\n", 324 | "\n", 325 | "\"\".join(sentence) # `\" \".join` would join a sequence of values with a space between each value.\n", 326 | " # `\"\".join` just pastes them all together." 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "# Task 4 (Bonus):\n", 334 | "\n", 335 | "A prime number is a natural number ($\\mathbb{N}$) that is greater than 1 and is not the product of two smaller natural numbers.\n", 336 | "\n", 337 | "Write code that prints all prime numbers less than 10000\n", 338 | "\n", 339 | "For an additional challenge, write `%%timeit` at the top of the codeblock to see how long your code takes to execute. See how fast you can make your code." 
340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# There are a variety of ways you could approach this.\n", 349 | "# My approach will use the modulo operator, '%'\n", 350 | "# x % y gives the remainder of the division of x by y\n", 351 | "# In other words, it gives the remainder after you find the maximum number of\n", 352 | "# times y goes into x.\n", 353 | "# e.g.\n", 354 | "print(3 % 2)\n", 355 | "print(5 % 3)\n", 356 | "print(12 % 2)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "# We want to generate a growing list of primes.\n", 366 | "# We can do this with a list and the '.append()' method.\n", 367 | "# list.append() appends the argument to the list.\n", 368 | "isPrime = True\n", 369 | "primes = [2] # Start with the first prime, 2\n", 370 | "\n", 371 | "for i in range(3, 10001): # Start from 3, finish at 10000\n", 372 | " # Check if divisible with no remainder by any prime found so far\n", 373 | " for p in primes:\n", 374 | " # Here we need some kind of conditional logic. 
I will use 'if' again.\n", 375 | " if i % p == 0: # If zero remainder, then multiple and not prime\n", 376 | " isPrime = False\n", 377 | " # After all primes checked, if still isPrime=True, then we can append\n", 378 | " if isPrime == True:\n", 379 | " primes.append(i)\n", 380 | " isPrime = True # Resetting the 'trigger'" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "%%timeit\n", 390 | "# I can compare a few methods for speed\n", 391 | "# Using '%%timeit'\n", 392 | "\n", 393 | "isPrime = True\n", 394 | "primes = [2] # Start with the first prime, 2\n", 395 | "\n", 396 | "for i in range(3, 10001): # Baseline: no step argument, so every integer is checked\n", 397 | " for p in primes:\n", 398 | " if i % p == 0:\n", 399 | " isPrime = False\n", 400 | " if isPrime == True:\n", 401 | " primes.append(i)\n", 402 | " isPrime = True" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "%%timeit\n", 412 | "# Now we skip even numbers\n", 413 | "\n", 414 | "isPrime = True\n", 415 | "primes = [2]\n", 416 | "\n", 417 | "for i in range(3, 10001, 2): # Third argument is step\n", 418 | " for p in primes:\n", 419 | " if i % p == 0:\n", 420 | " isPrime = False\n", 421 | " if isPrime == True:\n", 422 | " primes.append(i)\n", 423 | " isPrime = True" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "%%timeit\n", 433 | "# Can also cut out numbers larger than the square root of n\n", 434 | "\n", 435 | "isPrime = True\n", 436 | "primes = [2]\n", 437 | "\n", 438 | "for i in range(3, 10001, 2): # Third argument is step\n", 439 | " for p in primes:\n", 440 | " if i % p == 0:\n", 441 | " isPrime = False\n", 442 | " if p > (i**0.5): # Check if p is larger than sqrt of i\n", 443 | " break # 'break' breaks the current loop\n", 444 | " if isPrime == 
True:\n", 445 | " primes.append(i)\n", 446 | " isPrime = True" 447 | ] 448 | } 449 | ], 450 | "metadata": { 451 | "celltoolbar": "Slideshow", 452 | "kernelspec": { 453 | "display_name": "Python 3 (ipykernel)", 454 | "language": "python", 455 | "name": "python3" 456 | }, 457 | "language_info": { 458 | "codemirror_mode": { 459 | "name": "ipython", 460 | "version": 3 461 | }, 462 | "file_extension": ".py", 463 | "mimetype": "text/x-python", 464 | "name": "python", 465 | "nbconvert_exporter": "python", 466 | "pygments_lexer": "ipython3", 467 | "version": "3.11.4" 468 | } 469 | }, 470 | "nbformat": 4, 471 | "nbformat_minor": 4 472 | } 473 | -------------------------------------------------------------------------------- /exercises/part2_problems.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 2 Exercises\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Task 1: Functions\n", 15 | "\n", 16 | "### Task 1a: Numeric Functions\n", 17 | "\n", 18 | "In this exercise you write functions whose domain are either scalar numbers or numeric vectors.\n", 19 | "\n", 20 | "#### Scalar Functions\n", 21 | "\n", 22 | "- One Input: Absolute value\n", 23 | "- Two Inputs: Calculate the difference between the first input and the largest multiple of the second input that is less than the first input. Therefore, if the inputs are (41, 10), the function should calculate 41 - 4\\*10 = 1.\n", 24 | "- Challenge: Write a function that returns the factors of the input. For example, 132 = 2\\*2\\*3\\*11, so $f(132) = \\{2, 2, 3, 11\\}$\n", 25 | "\n", 26 | "#### Vector Functions\n", 27 | "\n", 28 | "- One Input: Write a summary statistics function. 
Given a vector, this function should return the following statistics in a `pd.Series` object with corresponding index labels: number of elements, sum, mean, median, variance, standard deviation, and any other statistics that you think are helpful.\n", 29 | "- Two Inputs: Write a function that, given two equal-length inputs, determines whether each element in the first is divisible by the second. The output should be a vector of equal length to the inputs, indicating with True/False values whether the elements of the first vector were divisible by the corresponding element in the second. CHALLENGE: Allow the function to take either a scalar or vector input as its second argument.\n", 30 | "\n", 31 | "### Task 1b: String Functions\n", 32 | "\n", 33 | "#### Scalar Functions\n", 34 | "\n", 35 | "- One Input: Write a function that divides a string into a list of words. Note: the `str.split()` function is useful here.\n", 36 | "- Two Inputs: Write a function that calculates the number of times the second argument occurs in the first. e.g. \"How many times does the letter e occur in this sentence?\"\n", 37 | "\n", 38 | "#### Vector Function\n", 39 | "\n", 40 | "- One Input: Write a function that, given a vector/list/series of strings, returns a series where the index is the unique words in the input, and the values are the number of times each unique word occurs in the entire input. Therefore, if I took a list containing all of the State of the Union Addresses, I want a function that tells me a) what the unique words in the collection of all Addresses are, and b) how many times those words occur in the total collection.\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Task 2: Apply\n", 48 | "\n", 49 | "### Task 2a: Element-Wise Operations\n", 50 | "\n", 51 | "1. Using the `Age` variable from the BES data subset (link below), calculate the age of each respondent rounded down to the nearest multiple of 5. 
Try writing this both using a defined function and with a `lambda` function.\n", 52 | "2. Recode the column `y09` as 0 and 1.\n", 53 | "3. Write a function that gets the lower bound from the income bounds reported in column `y01`, and returns it as an integer.\n", 54 | "\n", 55 | "\n", 56 | "### Task 2b: Grouped Functions\n", 57 | "\n", 58 | "These were not covered in the lecture, but you are encouraged to try them out. The relevant command [documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html)\n", 59 | "\n", 60 | "1. Calculate the summary statistics on `Age` for each region, and each region/constituency.\n", 61 | "2. Calculate the median income bracket (`y01`) per region and region/constituency.\n", 62 | "3. Calculate the most commonly given answer to `a02` per region and region/income bracket.\n", 63 | "4. Calculate the most commonly given answer to `a02` and `y06` per region." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "data_url = \"https://raw.githubusercontent.com/mmjh-dev/ds3-intro-python/main/data/bes_exercise_data.csv\"" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "ds3", 79 | "language": "python", 80 | "name": "ds3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.11.4" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 4 97 | } 98 | -------------------------------------------------------------------------------- /exercises/part2_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 2 Exercises\n" 8 | 
] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Task 1: Functions\n", 15 | "\n", 16 | "### Task 1a: Numeric Functions\n", 17 | "\n", 18 | "In this exercise you write functions whose domain are either scalar numbers or numeric vectors.\n", 19 | "\n", 20 | "#### Scalar Functions\n", 21 | "\n", 22 | "- One Input: Absolute value\n", 23 | "- Two Inputs: Calculate the difference between the first input and the largest multiple of the second input that is less than the first input. Therefore, if the inputs are (41, 10), the function should calculate 41 - 4\\*10 = 1.\n", 24 | "- Challenge: Write a function that returns the factors of the input. For example, 132 = 2\\*2\\*3\\*11, so $f(132) = \\{2, 2, 3, 11\\}$\n", 25 | "\n", 26 | "#### Vector Functions\n", 27 | "\n", 28 | "- One Input: Write a summary statistics function. Given a vector, this function should return the following statistics in a `pd.Series` object with corresponding index labels: number of elements, sum, mean, median, variance, standard deviation, and any other statistics that you think are helpful.\n", 29 | "- Two Inputs: Write a function that given two equal-length inputs, determines whether each element in the first is divisible by the second. The output should be a vector of equal length to the inputs, indicating with True/False values whether the arguments of the first vector were divisible by the corresponding element in the second. CHALLENGE: Allow the function to take either a scalar or vector input as its second argument.\n", 30 | "\n", 31 | "### Task 1b: String Functions\n", 32 | "\n", 33 | "#### Scalar Functions\n", 34 | "\n", 35 | "- One Input: Write a function that divides a string into a list of words. Note: the `str.split()` function is useful here.\n", 36 | "- Two Inputs: Write a function that calculates the number of times the second argument occurs in the first. e.g. 
\"How many times does the letter e occur in this sentence?\"\n", 37 | "\n", 38 | "#### Vector Function\n", 39 | "\n", 40 | "- One Input: Write a function that, given a vector/list/series of strings, returns a series where the index is are the unique words in the input, and the values are the number of times that unique word occurs in the entire input. Therefore, if I took a list containing all of the State of the Union Address, I want a function that tells me a) what the unique words in the collection of all Addresses is, and b) how many times those words occur in the total collection.\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "def absolute_value(x):\n", 50 | " \"\"\"\n", 51 | " There is in fact also a abs() function.\n", 52 | " This is just another way to implement it.\n", 53 | " \"\"\"\n", 54 | " if x < 0:\n", 55 | " x = x*-1\n", 56 | " return x\n", 57 | "\n", 58 | "def largest_mult_diff(x, y):\n", 59 | " \"\"\"\n", 60 | " There are a variety of ways to do this.\n", 61 | " Modulo operations are probably the easiest: x % y\n", 62 | " \"\"\"\n", 63 | " return x % y\n", 64 | "\n", 65 | "def factorize(x):\n", 66 | " \"\"\"\n", 67 | " Factorizes x.\n", 68 | " \"\"\"\n", 69 | " # Initial values\n", 70 | " remainder = x\n", 71 | " divide = 2\n", 72 | " # Store factors as we find them\n", 73 | " factors = []\n", 74 | " while remainder > 1: # When factor=1, then we've finished factorizing\n", 75 | " # While because a single prime can be a factor multiple times \n", 76 | " while (remainder % divide) == 0: # Check if it cleanly divides.\n", 77 | " factors.append(divide) # If it cleanly divides, then add it to the list of factors.\n", 78 | " remainder = remainder//divide # Update the remainder, try again.\n", 79 | " divide += 1 # Increment up through all integers. 
Faster to try only primes.\n", 80 | " return factors" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 2, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "import pandas as pd\n", 90 | "\n", 91 | "def series_summary(x):\n", 92 | " \"\"\"\n", 93 | " Returns key statistics of a series.\n", 94 | " \"\"\"\n", 95 | " if not isinstance(x, pd.Series): # Checks if input is pd.Series object\n", 96 | " x = pd.Series(x) # If not, then make it be so\n", 97 | " index = ['n', 'mean', 'median', 'variance', 'std'] # 5 statistics\n", 98 | " data = [len(x),\n", 99 | " x.mean(),\n", 100 | " x.median(),\n", 101 | " x.var(),\n", 102 | " x.std()\n", 103 | " ]\n", 104 | " return pd.Series(data, index=index)\n", 105 | "\n", 106 | "def check_divisible(x, y):\n", 107 | " return x%y==0" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 3, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "import re\n", 117 | "\n", 118 | "# There is a way to do this without re, but the regex solution is the most efficient and generalisable (can deal with weird characters)\n", 119 | "def split_into_words(x, delim=\" \"):\n", 120 | " x = re.sub(r'[^\\w ]+', '', x) # This pattern deletes everything but letters and spaces\n", 121 | " x = x.split(delim)\n", 122 | " return x\n", 123 | "\n", 124 | "def count_occurrences(e, x):\n", 125 | " \"\"\"\n", 126 | " Returns the number of times 'e' occurs in x.\n", 127 | " \"\"\"\n", 128 | " count = len(x.split(e))-1\n", 129 | " # We don't need to count the occurrences, we can just break up the string on 'e' and\n", 130 | " # count how many parts it gets split into.\n", 131 | " return count" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 4, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "['Hello', 'World', 'My', 'name', 'is', 'Myles', 'Morales', 'How', 'are', 'you']\n", 144 | "5\n" 145 | ] 
146 | } 147 | ], 148 | "source": [ 149 | "print(split_into_words('Hello World! My name is Myles Morales. How are you?'))\n", 150 | "print(count_occurrences('e', 'Hello World! My name is Myles Morales. How are you?'))" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 5, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "from collections import Counter\n", 160 | "\n", 161 | "def text_col_to_dfm(text_series):\n", 162 | " text_series = text_series.str.lower().str.replace(r\"[^\\w ]\", '')\n", 163 | " index = text_series.index.values\n", 164 | " tokens = list(set(text_series.str.split(\" \").sum()))\n", 165 | " data = []\n", 166 | " for i in index:\n", 167 | " row = []\n", 168 | " l = Counter(text_series.values[i].split(\" \"))\n", 169 | " for token in tokens:\n", 170 | " row.append(l.get(token, 0))\n", 171 | " data.append(row)\n", 172 | " df = pd.DataFrame(index=index, columns=tokens, data=data)\n", 173 | " return df" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 6, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/html": [ 184 | "
\n", 185 | "\n", 198 | "\n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | "
newsreporthellojelloworld
000101
100110
211001
\n", 236 | "
" 237 | ], 238 | "text/plain": [ 239 | " news report hello jello world\n", 240 | "0 0 0 1 0 1\n", 241 | "1 0 0 1 1 0\n", 242 | "2 1 1 0 0 1" 243 | ] 244 | }, 245 | "execution_count": 6, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "text_input_series = pd.Series(\n", 252 | " ['Hello World!', 'Hello Jello!', 'World News Report']\n", 253 | ")\n", 254 | "text_col_to_dfm(text_input_series)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "## Task 2: Apply\n", 262 | "\n", 263 | "### Task 2a: Element-Wise Operations\n", 264 | "\n", 265 | "1. Using the `Age` variable from BES data, calculate the age of each respondent rounded down to the nearest multiple of 5. Try writing this both using a defined function and with a `lambda` function.\n", 266 | "2. Recode the column `y09` as 0 and 1.\n", 267 | "3. Write a function that gets the lower bound from the income bounds reported in column `y01`, and returns it as an integer.\n", 268 | "\n", 269 | "\n", 270 | "### Task 2b: Grouped Functions\n", 271 | "\n", 272 | "1. Calculate the summary statistics on `Age` for each region, and each region/constituency.\n", 273 | "2. Calculate the median income bracket (`y01`) per region and region/constituency.\n", 274 | "3. Calculate the most commonly given answer to `a02` per region and region/income bracket.\n", 275 | "4. Calculate the most commonly given answer to `a02` and `y06` per region." 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 7, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "df = pd.read_csv(\"../data/bes_exercise_data.csv\")" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 8, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "0 20.0\n", 296 | "1 50.0\n", 297 | "2 55.0\n", 298 | "3 65.0\n", 299 | "4 65.0\n", 300 | " ... 
\n", 301 | "2189 55.0\n", 302 | "2190 45.0\n", 303 | "2191 50.0\n", 304 | "2192 80.0\n", 305 | "2193 85.0\n", 306 | "Name: Age, Length: 2194, dtype: float64" 307 | ] 308 | }, 309 | "execution_count": 8, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "df['Age'].apply(lambda x: x//5*5) # // integer division\n", 316 | "\n", 317 | "def myround(x, base=5):\n", 318 | " return x//base*base\n", 319 | "\n", 320 | "df['Age'].apply(myround)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 9, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "0 1\n", 332 | "1 0\n", 333 | "2 0\n", 334 | "3 1\n", 335 | "4 1\n", 336 | " ..\n", 337 | "2189 1\n", 338 | "2190 1\n", 339 | "2191 1\n", 340 | "2192 1\n", 341 | "2193 0\n", 342 | "Name: y09, Length: 2194, dtype: category\n", 343 | "Categories (2, int64): [0 < 1]" 344 | ] 345 | }, 346 | "execution_count": 9, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "df['y09'].apply(lambda x: int(x=='Female'))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 10, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 363 | "['GBP 5,200 - GBP 10,399',\n", 364 | " 'GBP 2,600 - GBP 5,199',\n", 365 | " 'GBP 36,400 - GBP 39,999',\n", 366 | " 'GBP 40,000 - GBP 44,999',\n", 367 | " 'Don`t know',\n", 368 | " 'GBP 10,400 - GBP 15,599',\n", 369 | " 'GBP 50,000 - GBP 59,999',\n", 370 | " 'GBP 31,200 - GBP 36,399',\n", 371 | " 'GBP 26,000 - GBP 31,199',\n", 372 | " 'GBP 60,000 - GBP 74,999',\n", 373 | " 'GBP 15,600 - GBP 20,799',\n", 374 | " 'Refused',\n", 375 | " 'GBP 75,000 - GBP 99,999',\n", 376 | " 'GBP 45,000 - GBP 49,999',\n", 377 | " 'GBP 100,000 or more',\n", 378 | " 'GBP 20,800 - GBP 25,999',\n", 379 | " 'Under GBP 2,600']" 380 | ] 381 | }, 382 | "execution_count": 10, 383 | "metadata": {}, 384 | "output_type": 
"execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "df['y01'].unique().tolist()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 11, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "0 5200.0\n", 400 | "1 2600.0\n", 401 | "2 5200.0\n", 402 | "3 36400.0\n", 403 | "4 40000.0\n", 404 | " ... \n", 405 | "2189 60000.0\n", 406 | "2190 75000.0\n", 407 | "2191 5200.0\n", 408 | "2192 15600.0\n", 409 | "2193 45000.0\n", 410 | "Name: y01, Length: 2194, dtype: float64" 411 | ] 412 | }, 413 | "execution_count": 11, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | } 417 | ], 418 | "source": [ 419 | "def get_lower_income_bound(x):\n", 420 | " if x == 'Under GBP 2,600':\n", 421 | " return 0\n", 422 | " elif x == 'GBP 100,000 or more':\n", 423 | " return 100000\n", 424 | " elif x in ['Don`t know', 'Refused']:\n", 425 | " return pd.np.nan\n", 426 | " else:\n", 427 | " return int(x.split(\" - \")[0].split(\"GBP \")[1].replace(\",\", \"\"))\n", 428 | "\n", 429 | "df['y01'].apply(get_lower_income_bound)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 12, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/html": [ 440 | "
\n", 441 | "\n", 454 | "\n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | "
countmeanstdmin25%50%75%max
region
East Midlands155.054.90322617.22229519.042.5057.066.0094.0
Eastern226.054.07079618.42955018.041.0055.068.0096.0
London203.046.89655218.67582118.032.0041.061.0089.0
North East112.054.27678620.31340520.036.0055.569.2591.0
North West304.051.38815817.94621618.037.0050.067.0095.0
Scotland191.053.10994816.99670118.040.5054.065.0097.0
South East282.051.97163118.33591018.036.0052.067.0091.0
South West166.054.56024119.45389219.039.0056.570.0099.0
Wales126.051.26984120.51006118.033.2551.567.7589.0
West Midlands226.054.45132717.96712618.041.0056.068.7595.0
Yorkshire & Humber184.053.15217417.72123818.041.7552.566.0090.0
\n", 603 | "
" 604 | ], 605 | "text/plain": [ 606 | " count mean std min 25% 50% 75% \\\n", 607 | "region \n", 608 | "East Midlands 155.0 54.903226 17.222295 19.0 42.50 57.0 66.00 \n", 609 | "Eastern 226.0 54.070796 18.429550 18.0 41.00 55.0 68.00 \n", 610 | "London 203.0 46.896552 18.675821 18.0 32.00 41.0 61.00 \n", 611 | "North East 112.0 54.276786 20.313405 20.0 36.00 55.5 69.25 \n", 612 | "North West 304.0 51.388158 17.946216 18.0 37.00 50.0 67.00 \n", 613 | "Scotland 191.0 53.109948 16.996701 18.0 40.50 54.0 65.00 \n", 614 | "South East 282.0 51.971631 18.335910 18.0 36.00 52.0 67.00 \n", 615 | "South West 166.0 54.560241 19.453892 19.0 39.00 56.5 70.00 \n", 616 | "Wales 126.0 51.269841 20.510061 18.0 33.25 51.5 67.75 \n", 617 | "West Midlands 226.0 54.451327 17.967126 18.0 41.00 56.0 68.75 \n", 618 | "Yorkshire & Humber 184.0 53.152174 17.721238 18.0 41.75 52.5 66.00 \n", 619 | "\n", 620 | " max \n", 621 | "region \n", 622 | "East Midlands 94.0 \n", 623 | "Eastern 96.0 \n", 624 | "London 89.0 \n", 625 | "North East 91.0 \n", 626 | "North West 95.0 \n", 627 | "Scotland 97.0 \n", 628 | "South East 91.0 \n", 629 | "South West 99.0 \n", 630 | "Wales 89.0 \n", 631 | "West Midlands 95.0 \n", 632 | "Yorkshire & Humber 90.0 " 633 | ] 634 | }, 635 | "execution_count": 12, 636 | "metadata": {}, 637 | "output_type": "execute_result" 638 | } 639 | ], 640 | "source": [ 641 | "df.groupby(['region'])['Age'].describe()" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 13, 647 | "metadata": {}, 648 | "outputs": [ 649 | { 650 | "data": { 651 | "text/html": [ 652 | "
\n", 653 | "\n", 666 | "\n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " 
\n", 819 | "
countmeanstdmin25%50%75%max
regionConstit_Code
East MidlandsAshfield9.056.88888918.92382421.053.0065.066.0083.0
Bassetlaw10.046.00000023.59849323.028.7539.054.7593.0
Bolsover8.050.37500012.87231227.043.0053.058.7565.0
Broxtowe6.055.83333312.22156635.050.7559.065.0067.0
Charnwood11.060.81818214.64799136.051.5062.070.0080.0
..............................
Yorkshire & HumberSheffield12.044.00000016.81449619.029.0044.556.0071.0
Sheffield,26.055.03846218.28765922.042.5051.070.5090.0
Skipton an9.054.44444423.55372122.034.0054.073.0084.0
York Centr9.052.77777819.93600919.044.0054.067.0077.0
York Outer10.060.80000014.60441346.050.7554.071.0086.0
\n", 820 | "

218 rows × 8 columns

\n", 821 | "
" 822 | ], 823 | "text/plain": [ 824 | " count mean std min 25% \\\n", 825 | "region Constit_Code \n", 826 | "East Midlands Ashfield 9.0 56.888889 18.923824 21.0 53.00 \n", 827 | " Bassetlaw 10.0 46.000000 23.598493 23.0 28.75 \n", 828 | " Bolsover 8.0 50.375000 12.872312 27.0 43.00 \n", 829 | " Broxtowe 6.0 55.833333 12.221566 35.0 50.75 \n", 830 | " Charnwood 11.0 60.818182 14.647991 36.0 51.50 \n", 831 | "... ... ... ... ... ... \n", 832 | "Yorkshire & Humber Sheffield 12.0 44.000000 16.814496 19.0 29.00 \n", 833 | " Sheffield, 26.0 55.038462 18.287659 22.0 42.50 \n", 834 | " Skipton an 9.0 54.444444 23.553721 22.0 34.00 \n", 835 | " York Centr 9.0 52.777778 19.936009 19.0 44.00 \n", 836 | " York Outer 10.0 60.800000 14.604413 46.0 50.75 \n", 837 | "\n", 838 | " 50% 75% max \n", 839 | "region Constit_Code \n", 840 | "East Midlands Ashfield 65.0 66.00 83.0 \n", 841 | " Bassetlaw 39.0 54.75 93.0 \n", 842 | " Bolsover 53.0 58.75 65.0 \n", 843 | " Broxtowe 59.0 65.00 67.0 \n", 844 | " Charnwood 62.0 70.00 80.0 \n", 845 | "... ... ... ... \n", 846 | "Yorkshire & Humber Sheffield 44.5 56.00 71.0 \n", 847 | " Sheffield, 51.0 70.50 90.0 \n", 848 | " Skipton an 54.0 73.00 84.0 \n", 849 | " York Centr 54.0 67.00 77.0 \n", 850 | " York Outer 54.0 71.00 86.0 \n", 851 | "\n", 852 | "[218 rows x 8 columns]" 853 | ] 854 | }, 855 | "execution_count": 13, 856 | "metadata": {}, 857 | "output_type": "execute_result" 858 | } 859 | ], 860 | "source": [ 861 | "df.groupby(['region', 'Constit_Code'])['Age'].describe()" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": 14, 867 | "metadata": {}, 868 | "outputs": [ 869 | { 870 | "data": { 871 | "text/plain": [ 872 | "region Constit_Code\n", 873 | "East Midlands Ashfield 7800.0\n", 874 | " Bassetlaw 33800.0\n", 875 | " Bolsover 31200.0\n", 876 | " Broxtowe 31200.0\n", 877 | " Charnwood 55000.0\n", 878 | " ... 
\n", 879 | "Yorkshire & Humber Sheffield 28600.0\n", 880 | " Sheffield, 26000.0\n", 881 | " Skipton an 15600.0\n", 882 | " York Centr 20800.0\n", 883 | " York Outer 40700.0\n", 884 | "Name: lower_income_bound, Length: 218, dtype: float64" 885 | ] 886 | }, 887 | "execution_count": 14, 888 | "metadata": {}, 889 | "output_type": "execute_result" 890 | } 891 | ], 892 | "source": [ 893 | "df['lower_income_bound'] = df['y01'].apply(get_lower_income_bound)\n", 894 | "df.groupby(['region'])['lower_income_bound'].median()\n", 895 | "df.groupby(['region', 'Constit_Code'])['lower_income_bound'].median()" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": 15, 901 | "metadata": {}, 902 | "outputs": [ 903 | { 904 | "data": { 905 | "text/plain": [ 906 | "region Constit_Code\n", 907 | "East Midlands Ashfield Don`t know\n", 908 | " Bassetlaw None/No party\n", 909 | " Bolsover None/No party\n", 910 | " Broxtowe None/No party\n", 911 | " Charnwood Conservatives\n", 912 | " ... \n", 913 | "Yorkshire & Humber Sheffield Labour\n", 914 | " Sheffield, Conservatives\n", 915 | " Skipton an Don`t know\n", 916 | " York Centr Don`t know\n", 917 | " York Outer Conservatives\n", 918 | "Name: a02, Length: 218, dtype: object" 919 | ] 920 | }, 921 | "execution_count": 15, 922 | "metadata": {}, 923 | "output_type": "execute_result" 924 | } 925 | ], 926 | "source": [ 927 | "df.groupby(['region'])['a02'].apply(lambda x: pd.Series.mode(x)[0])\n", 928 | "df.groupby(['region', 'Constit_Code'])['a02'].apply(lambda x: pd.Series.mode(x)[0])" 929 | ] 930 | } 931 | ], 932 | "metadata": { 933 | "kernelspec": { 934 | "display_name": "ds3", 935 | "language": "python", 936 | "name": "ds3" 937 | }, 938 | "language_info": { 939 | "codemirror_mode": { 940 | "name": "ipython", 941 | "version": 3 942 | }, 943 | "file_extension": ".py", 944 | "mimetype": "text/x-python", 945 | "name": "python", 946 | "nbconvert_exporter": "python", 947 | "pygments_lexer": "ipython3", 948 | "version": "3.11.4" 
949 | } 950 | }, 951 | "nbformat": 4, 952 | "nbformat_minor": 4 953 | } 954 | -------------------------------------------------------------------------------- /figures/pandas_base.html: -------------------------------------------------------------------------------- 1 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 |
student_idmathenglishhistorybiologyart
5a019597808186
5a127891898681
5b058586948882
5b109089879991
5e048890858884
81 | -------------------------------------------------------------------------------- /figures/pandas_columns.html: -------------------------------------------------------------------------------- 1 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 |
 student_idmathenglishhistorybiologyart
01019597808186
11027891898681
21038586948882
31049089879991
41058890858884
93 | -------------------------------------------------------------------------------- /figures/pandas_index.html: -------------------------------------------------------------------------------- 1 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 |
 student_idmathenglishhistorybiologyart
01019597808186
11027891898681
21038586948882
31049089879991
41058890858884
81 | -------------------------------------------------------------------------------- /figures/pandas_loc_cell.html: -------------------------------------------------------------------------------- 1 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 |
student_idmathenglishhistorybiologyart
5a019597808186
5a127891898681
5b058586948882
5b109089879991
5e048890858884
757 | -------------------------------------------------------------------------------- /figures/pandas_loc_col.html: -------------------------------------------------------------------------------- 1 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 |
student_idmathenglishhistorybiologyart
5a019597808186
5a127891898681
5b058586948882
5b109089879991
5e048890858884
250 | -------------------------------------------------------------------------------- /figures/pandas_loc_multi_col.html: -------------------------------------------------------------------------------- 1 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 |
student_idmathenglishhistorybiologyart
5a019597808186
5a127891898681
5b058586948882
5b109089879991
5e048890858884
651 | -------------------------------------------------------------------------------- /figures/pandas_loc_multi_row.html: -------------------------------------------------------------------------------- 1 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 |
student_idmathenglishhistorybiologyart
5a019597808186
5a127891898681
5b058586948882
5b109089879991
5e048890858884
415 | -------------------------------------------------------------------------------- /figures/pandas_loc_new_index.html: -------------------------------------------------------------------------------- 1 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 |
 mathenglishhistorybiologyart
1019597808186
1027891898681
1038586948882
1049089879991
1058890858884
121 | -------------------------------------------------------------------------------- /figures/pandas_loc_row.html: -------------------------------------------------------------------------------- 1 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 |
student_idmathenglishhistorybiologyart
5a019597808186
5a127891898681
5b058586948882
5b109089879991
5e048890858884
142 | -------------------------------------------------------------------------------- /notebooks/01_tech_check.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e110c140-b066-405d-9423-6ade0fa20967", 6 | "metadata": {}, 7 | "source": [ 8 | "# Colab Notebook\n", 9 | "\n", 10 | "Hello! This is an online coding notebook hosted by Google.\n", 11 | "\n", 12 | "If you don't know how to use it, watch me demonstrate.\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "58afce63-1725-4642-a999-a6c378dfe799", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "print(\"Hello Colab!\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "1e385560-e9c9-4a91-a4a4-ee74374a9336", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3 (ipykernel)", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.11.4" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 5 55 | } 56 | -------------------------------------------------------------------------------- /slides/01_intro.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Introduction to Python 9 | 10 | 11 | 12 | 13 | 14 | 31 | 32 | 33 | 34 | 35 |
36 |
37 | 38 |
39 |

Introduction to Python

40 |

Dr Musashi Jacobs-Harukawa

41 |

DDSS @ Princeton University

42 |

15 Aug 2023

43 |
44 | 45 |
46 |
47 |

First Things

48 | 49 |
50 |
51 |

Ground Rules

52 |
    53 |
  • Asking questions
  • 54 |
  • Contacting me
  • 55 |
56 |
57 |
58 |

Tech Check

59 |
    60 |
  • Open the following link and follow the 61 | instructions.
  • 62 |
  • Once you have run the command in the notebook, come 63 | back to me.
  • 64 |
65 |
66 |
67 |

Format

68 |
    69 |
  • Lecture
  • 70 |
  • Notebooks with same material as slides
  • 71 |
  • Additional exercises in repo
  • 72 |
73 |
74 |
75 |

About Me

76 |
    77 |
  • Postdoc Data-Driven Social Science Initiative, 78 | Princeton University
  • 79 |
  • Research: Applied ML/Social Science Methodology 80 | (primarily NLP)
  • 81 |
  • Previously: DPhil University of Oxford
  • 82 |
  • Previously previously: Data Scientist in 83 | Finance
  • 84 |
85 |
86 |
87 |

Learning Objectives

88 |
    89 |
  • How is Python used in (social science) 90 | research?
  • 91 |
  • Basic base Python programming
  • 92 |
  • Basic data analysis in Python
  • 93 |
94 |
95 |
96 |
97 |

Python for Research

98 | 99 |
100 |
101 |

For Beginners: A Description

102 |

Python is an open-source, general-purpose scripting 103 | language.

104 |
105 |
106 |

Open-Source

107 |
    108 |
  • Built by a community
  • 109 |
  • Maintained by a community
  • 110 |
  • Free to use for all
  • 111 |
112 |
113 |
114 |

General-Purpose

115 |
    116 |
  • If you’re doing it on a computer and there’s some 117 | repetitive element, then you can automate it in Python.
  • 118 |
  • Python isn’t limited to Data Science, but it’s very 119 | popular with data scientists!
  • 120 |
121 |
122 |
123 |

Scripting

124 |
    125 |
  • No strict definition for what a “script” is.
  • 126 |
  • Series of commands to automate some task.
  • 127 |
  • Like a pipeline: takes some inputs, does some 128 | things to these inputs, and gives back some outputs.
  • 129 |
130 |
131 |
132 |

Language

133 |
    134 |
  • Python is a language, and not an application.
  • 135 |
  • Practical difference for you: 136 |
      137 |
    • most applications provide you options to select 138 | from.
    • 139 |
    • languages require you to generate commands from 140 | accepted rules.
    • 141 |
  • 142 |
  • Upshot is that you can do nearly anything with 143 | Python!
  • 144 |
145 |
146 |
148 |

For Researchers: what can we use Python for?

149 |
    150 |
  • Generally: any repetitive task done on a 151 | computer can be automated with Python.
  • 152 |
  • Ways I use it: 153 |
      154 |
    • Data collection (web scraping)
    • 155 |
    • Data cleaning/analysis
    • 156 |
    • Data visualization
    • 157 |
    • Machine learning
    • 158 |
    • Deep learning
    • 159 |
  • 160 |
161 |
162 |
164 |

For Engineers: how is research different from development?

165 |
    166 |
  • Research: 167 |
      168 |
    • Usage: scripting, interactive usage
    • 169 |
    • Concerns: ease of use, time-efficiency
    • 170 |
  • 171 |
  • Development: 172 |
      173 |
    • Use: application development
    • 174 |
    • Concerns: portability, deployment, resource 175 | efficiency
    • 176 |
  • 177 |
178 |
179 |
180 |

Python vs Alternatives

181 |
    182 |
  • Python and R are popular 183 | languages for data analysis
  • 184 |
  • My observations: 185 |
      186 |
    • Functionality
    • 187 |
    • Contribution
    • 188 |
    • Standards
    • 189 |
    • Robustness
    • 190 |
  • 191 |
  • Open source or nothing
  • 192 |
193 |
194 |
195 |

Python Research Tools

196 |

Suggestions:

197 |
    198 |
  • IDE: VSCode, Jupyter
  • 199 |
  • Package Management: Conda/Mamba
  • 200 |
201 |
202 |
203 |
204 |

Before we start coding

205 | 206 |
207 |
208 |

Why Automate?

209 |
    210 |
  • Advantages of automation: cost, scale, and 211 | scope.
  • 212 |
  • To harness computational methods, need to represent 213 | our observations in a way that algorithms and programs can utilise.
  • 214 |
  • Process of quantifying and structuring our 215 | observations usually entails loss of some information.
  • 216 |
217 |
218 |
    219 |
  • As social scientists, our goal when automating
  • 220 |
  • This automation is not limited to collecting data. 221 | Running a regression or sorting your responses by data is the automation 222 | of doing this by hand.
  • 223 |
  • Some qualitative scholars I speak to 224 | contend that the validity of the quantitative endeavour 225 | ends there. 226 |
      227 |
    • Are there unquantifiable things?
    • 228 |
  • 229 |
  • I’m more optimistic about what is possible, and 230 | think that the key to having valid quantitative inferences is to be 231 | extremely clear on the connection between the data in your analysis and 232 | the actual events you are measuring.
  • 233 |
234 |
235 |
236 |
237 |

Representing Information

238 |
    239 |
  • Choosing a representation of your information that 240 | retains relevant properties is key.
  • 241 |
  • To read more about this particular debate, a good 242 | starting point is Stevens 244 | (1946).
  • 245 |
246 |
247 |
248 |

Data Types

249 |

Data types are concerned with the representation of individual data 250 | points, or observations.

251 |
    252 |
  • Logical
  • 253 |
  • Numerical
  • 254 |
  • Categorical
  • 255 |
  • Text
  • 256 |
  • Date and time
  • 257 |
258 |
259 |
260 |

Data Structures

261 |

Data structures are concerned with the relations between 262 | observations.

263 |
    264 |
  • Are the data points members of the same set?
  • 265 |
  • Are the data points members of the same 266 | sequence?
  • 267 |
  • Are the data points different features of a single 268 | empirical unit?
  • 269 |
270 |
271 |
272 |

Representing Data on a Computer

273 |
    274 |
  • Good news: Python, like most modern 275 | programming languages, has ways to represent each of the data types 276 | listed above.
  • 277 |
  • Bad news: At a fundamental level, this is 278 | being stored as 0’s and 1’s.
  • 279 |
  • Take away: Take the time to understand the 280 | relationship between: 281 |
      282 |
    • your empirical observations,
    • 283 |
    • the abstracted representation of them in your 284 | mathematical model,
    • 285 |
    • the approximation of this in your computational 286 | model.
    • 287 |
  • 288 |
289 |
290 |
291 |
292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 453 | 454 | 455 | -------------------------------------------------------------------------------- /slides/01_intro.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to Python" 3 | author: "Dr Musashi Jacobs-Harukawa" 4 | institute: "DDSS @ Princeton University" 5 | date: "15 Aug 2023" 6 | revealjs-url: "../reveal.js" 7 | section-titles: false 8 | aspectratio: 1610 9 | mainfont: "IBM Plex Sans-Light" 10 | theme: "white" 11 | css: "minimal-theme.css" 12 | --- 13 | 14 | # First Things 15 | 16 | ## Tech Check 17 | 18 | - Open the following link and follow the instructions: 19 | 20 | https://colab.research.google.com/github/mmjh-dev/ds3-intro-python/blob/main/notebooks/01_tech_check.ipynb 21 | 22 | 23 | ## Ground Rules 24 | 25 | - Asking questions 26 | - Contacting me 27 | 28 | 29 | ## Format 30 | 31 | - Lecture: slides, explanation, video recording 32 | - Notebooks: 33 | - Execute code along with me if you like 34 | - Ask me questions about what's happening 35 | - Additional exercises in repo: 36 | - If there's time, we can work on them together 37 | 38 | 39 | ## About Me 40 | 41 | - Postdoc Data-Driven Social Science Initiative, Princeton University 42 | - Research: Applied ML/Social Science Methodology (primarily NLP) 43 | - Previously: DPhil University of Oxford 44 | - Previously previously: Data Scientist in Finance 45 | 46 | 47 | ## Learning Objectives 48 | 49 | - How is Python used in (social science) research? 50 | - Basic base Python programming 51 | - Basic data analysis in Python 52 | 53 | 54 | # Python for Research 55 | 56 | ## For Beginners: A Description 57 | 58 | Python is an _open-source, general-purpose scripting language_. 
59 | 60 | ## Open-Source 61 | 62 | - Built by a community 63 | - Maintained by a community 64 | - Free to use for all 65 | 66 | ## General-Purpose 67 | 68 | - If you're doing it on a computer and there's some repetitive element, then you can automate it in Python. 69 | - Python isn't limited to Data Science, but it's very popular with data scientists! 70 | 71 | 72 | ## Scripting 73 | 74 | - No strict definition for what a "script" is. 75 | - Series of commands to automate some task. 76 | - Like a pipeline: takes some inputs, does some things to these inputs, and gives back some outputs. 77 | 78 | 79 | ## Language 80 | 81 | - Python is a language, and not an application. 82 | - Practical difference for you: 83 | - most applications provide you options to select from. 84 | - languages require you to generate commands from accepted rules. 85 | - Upshot is that you can do nearly anything with Python! 86 | 87 | 88 | ## For Researchers: what can we use Python for? 89 | 90 | - Generally: _any repetitive task done on a computer can be automated with Python_. 91 | - Ways I use it: 92 | - Data collection (web scraping) 93 | - Data cleaning/analysis 94 | - Data visualization 95 | - Machine learning 96 | - Deep learning 97 | 98 | ## For Engineers: how is research different from development?
99 | 100 | - Research: 101 | - Usage: scripting, interactive usage 102 | - Concerns: ease of use, time-efficiency 103 | - Development: 104 | - Use: application development 105 | - Concerns: portability, deployment, resource efficiency 106 | 107 | 108 | ## Python vs Alternatives 109 | 110 | - `Python` and `R` are popular languages for data analysis 111 | - My observations on differences in: 112 | - Functionality 113 | - Contribution 114 | - Standards 115 | - Robustness 116 | - Open source or nothing 117 | 118 | ## Python Research Tools 119 | 120 | Suggestions: 121 | 122 | - IDE: VSCode, Jupyter 123 | - Package Management: Conda/Mamba 124 | 125 | 126 | # Before we start coding 127 | 128 | ## Why Automate? 129 | 130 | - Advantages of automation: cost, scale, and scope. 131 | - To harness computational methods, need to represent our observations in a way that algorithms and programs can utilise. 132 | - Process of quantifying and structuring our observations usually entails loss of some information. 133 | 134 | ::: .notes 135 | - As social scientists, our goal when automating 136 | - This automation is not limited to collecting data. Running a regression or sorting your responses by data is the automation of doing this by hand. 137 | - Some qualitative scholars I speak to **contend** that the validity of the quantitative endeavour ends there. 138 | - Are there unquantifiable things? 139 | - I'm more optimistic about what is possible, and think that the key to having valid quantitative inferences is to be extremely clear on the connection between the data in your analysis and the actual events you are measuring. 140 | ::: 141 | 142 | ## Representing Information 143 | 144 | - Choosing a representation of your information that retains relevant properties is key. 145 | - To read more about this particular debate, a good starting point is [Stevens (1946)](https://pdfs.semanticscholar.org/2680/6102a45a6104489872dd3241b6e8030bbc40.pdf).
146 | 147 | ## Data Types 148 | 149 | Data types are concerned with the representation of individual data points, or observations. 150 | 151 | - Logical 152 | - Numerical 153 | - Categorical 154 | - Text 155 | - Date and time 156 | 157 | ## Data Structures 158 | 159 | Data structures are concerned with the relations between observations. 160 | 161 | - Are the data points members of the same set? 162 | - Are the data points members of the same sequence? 163 | - Are the data points different features of a single empirical unit? 164 | 165 | ## Representing Data on a Computer 166 | 167 | - _Good news:_ Python, like most modern programming languages, has ways to represent each of the data types listed above. 168 | - _Bad news:_ At a fundamental level, this is being stored as 0's and 1's. 169 | - _Take away:_ Take the time to understand the relationship between: 170 | - your empirical observations, 171 | - the abstracted representation of them in your mathematical model, 172 | - the approximation of this in your computational model.
173 | 174 | # Coding Tutorial 175 | 176 | ## Part 1: Base Python 177 | 178 | https://colab.research.google.com/github/mmjh-dev/ds3-intro-python/blob/main/notebooks/02_base_python.ipynb 179 | 180 | ## Exercises for Part 1 181 | 182 | https://colab.research.google.com/github/mmjh-dev/ds3-intro-python/blob/main/exercises/part1_problems.ipynb 183 | 184 | ## Part 2: Intro Pandas 185 | 186 | https://colab.research.google.com/github/mmjh-dev/ds3-intro-python/blob/main/notebooks/03_pandas.ipynb 187 | 188 | ## Exercises for Part 2 189 | 190 | https://colab.research.google.com/github/mmjh-dev/ds3-intro-python/blob/main/exercises/part2_problems.ipynb 191 | -------------------------------------------------------------------------------- /slides/02_base_python.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part 1: Base Python" 3 | --- 4 | 5 | # Crash Course 6 | 7 | ## "Base" 8 | 9 | - Like many languages, Python is enhanced by a great number of "libraries". 10 | - Libraries extend the functionality of a language, e.g. for data analysis. 11 | 12 | ## How much base Python do I need to know? 13 | 14 | - Varies with application 15 | - This course covers minimum needed for data analysis workflow 16 | - Deep learning or app development requires more 17 | - Improving knowledge of core functionality helps you write elegant code. 18 | 19 | ## Course Outline 20 | 21 | 1. Variables 22 | 2. Data Types 23 | 3. Data Structures 24 | 4. Control Flow 25 | 5. Functions 26 | 27 | # Values, Variables and Types 28 | 29 | ## Workspace 30 | 31 | _A mental image if this is your first time programming_ 32 | 33 | - When we start the Python "session", imagine we create an empty box. 34 | - We use commands, written into the cells of the notebook, to interact with this box. 35 | - We can create objects that persist inside the box. 36 | - We can modify objects inside the box.
37 | - We use the `print()` function to display these objects in the output. 38 | 39 | 40 | ## First Command: "Hello World!" 41 | 42 | Write the following command into the first cell: 43 | 44 | . . . 45 | 46 | ```{python} 47 | #| output-location: column-fragment 48 | print("Hello World!") 49 | ``` 50 | 51 | 52 | ## Variable Assignment 53 | 54 | - Variables are names that point to particular objects in the box. 55 | - These objects can be values, functions, etc. 56 | - We assign variables with the `=` operator. 57 | 58 | . . . 59 | 60 | ```{python} 61 | #| output-location: column-fragment 62 | x = 1 63 | y = 2 64 | print(x) 65 | print(y) 66 | ``` 67 | 68 | ## Aside: Variable Assignment Rules 69 | 70 | There are some rules for variable assignment: 71 | 72 | - Variable names cannot contain spaces 73 | 74 | . . . 75 | ```{python} 76 | #| output-location: default 77 | a variable = "this will give an error" 78 | ``` 79 | 80 | - The first letter of the variable cannot be a number or symbol 81 | 82 | . . . 83 | 84 | ```{python} 85 | #| output-location: default 86 | 1st_variable = "this will also give an error" 87 | ``` 88 | . . . 89 | 90 | Both of these return `SyntaxError` because the interpreter cannot understand these commands. 91 | 92 | 93 | # Data Types 94 | 95 | Here are four data types in base Python: 96 | 97 | - Integer `int`: whole numbers 98 | - Boolean `bool`: true/false values 99 | - Float `float`: rational numbers 100 | - String `str`: character sequences 101 | 102 | ## Integers 103 | 104 | Integers (`int`) are whole numbers, positive or negative. 105 | 106 | - Created by writing a number without a `.` 107 | 108 | ```{python} 109 | #| eval: false 110 | 42 111 | 0 112 | -15343 113 | ``` 114 | 115 | 116 | ## Integer Operations 117 | 118 | - As seen, integers are assigned with the `=` operator. 119 | 120 | . . . 
121 | 122 | ```{python} 123 | #| output-location: default 124 | a = 5 125 | b = 13 126 | ``` 127 | 128 | - Arithmetic operations are as expected: `+`, `-`, `*`, `/` 129 | 130 | . . . 131 | 132 | ```{python} 133 | #| output-location: column-fragment 134 | print(a + b) 135 | print(a - b) 136 | print(a * b) 137 | ``` 138 | 139 | - `==` tests equivalence. 140 | 141 | . . . 142 | 143 | ```{python} 144 | #| output-location: column-fragment 145 | print(a == b) 146 | ``` 147 | 148 | ## Booleans 149 | 150 | - Booleans are True/False values. 151 | - You saw them on the previous slide! 152 | - There are two Booleans: `True` and `False`. 153 | 154 | 155 | ## Boolean Operators 156 | 157 | - `and`: logical _conjunction_ 158 | 159 | . . . 160 | 161 | ```{python} 162 | #| output-location: column-fragment 163 | print(True and True) 164 | print(True and False) 165 | print(False and True) 166 | print(False and False) 167 | ``` 168 | 169 | - `or`: logical _disjunction_ 170 | 171 | . . . 172 | 173 | ```{python} 174 | #| output-location: column-fragment 175 | print(True or True) 176 | print(True or False) 177 | print(False or True) 178 | print(False or False) 179 | ``` 180 | 181 | ## Aside: Bools and 0, 1 182 | 183 | Booleans behave like the integers 0 and 1. 184 | 185 | . . . 186 | 187 | ```{python} 188 | #| output-location: column 189 | True + True 190 | ``` 191 | 192 | . . . 193 | 194 | ```{python} 195 | #| output-location: column 196 | True * False 197 | ``` 198 | 199 | . . . 200 | 201 | ```{python} 202 | #| output-location: column-fragment 203 | #| error: true 204 | # What do you think will happen? 205 | False / False 206 | ``` 207 | 208 | ## Floats 209 | 210 | - Floats are, technically, the representation of real numbers ($\mathbb{R}$) in base 2. 211 | - For our purposes, all non-whole numbers are represented by floats. 212 | - Floats can be constructed using the `.` in a number: 213 | 214 | . . . 
215 | 216 | ```{python} 217 | x = 1.0 218 | y = 3.4 219 | ``` 220 | 221 | ## Integer Division 222 | 223 | Python also automatically converts the output of integer division to a float: 224 | 225 | . . . 226 | 227 | ```{python} 228 | #| output-location: column-fragment 229 | print(1 / 3) 230 | print(3) 231 | print(3 / 1) 232 | ``` 233 | 234 | ## Warning!! 235 | 236 | Floats can behave unpredictably. 237 | 238 | . . . 239 | 240 | Note the following examples: 241 | 242 | . . . 243 | 244 | ```{python} 245 | #| output-location: column-fragment 246 | 10 * 0.1 * 3 == 3 247 | ``` 248 | 249 | . . . 250 | 251 | ```{python} 252 | #| output-location: column-fragment 253 | 0.1 * 3 * 10 == 3 254 | ``` 255 | 256 | . . . 257 | 258 | ```{python} 259 | #| output-location: column-fragment 260 | 0.1 * 3 == 0.3 261 | ``` 262 | 263 | . . . 264 | 265 | ```{python} 266 | #| output-location: column-fragment 267 | 0.1 * 3 268 | ``` 269 | 270 | . . . 271 | 272 | [0.30000000000000004.com](https://0.30000000000000004.com/) 273 | 274 | ## Strings 275 | 276 | Strings (`str`) are a _sequence of characters_. 277 | 278 | ## Creation 279 | 280 | - Strings are created by writing a sequence of letters between _single **or** double_ quotes. 281 | 282 | . . . 283 | 284 | ```{python} 285 | first_word = "Hello" 286 | second_word = 'World' 287 | first_punct = "!" 288 | ``` 289 | 290 | ## Equivalence 291 | 292 | - We can test equivalence using the `==` operator. 293 | 294 | . . . 295 | 296 | ```{python} 297 | #| output-location: column-fragment 298 | print(first_word == first_word) 299 | print(first_word == second_word) 300 | ``` 301 | 302 | ## Concatenation 303 | 304 | Combine _strings_ with the `+` operator: 305 | 306 | . . . 307 | 308 | ```{python} 309 | print(first_word + second_word + first_punct) 310 | ``` 311 | 312 | . . . 313 | 314 | You can use variables and values together: 315 | 316 | . . . 
317 | 318 | ```{python} 319 | print(first_word + " " + second_word + first_punct) 320 | ``` 321 | 322 | ## Indexing 323 | 324 | - Strings are a _sequence_. 325 | - Strings can be indexed by using the `[]` operator. 326 | - _Note that python counts from zero!_ 327 | 328 | . . . 329 | 330 | ```{python} 331 | the_word = "bird" 332 | 333 | print(the_word + " is the word") 334 | print(the_word[0] + " is the first letter of the word") 335 | print(the_word[1] + " is the second letter of the word") 336 | ``` 337 | 338 | ## Slices 339 | 340 | - Use the `:` to return a range of values. 341 | - The range `[m:n]` returns from the (m+1)th letter to the nth letter. 342 | 343 | . . . 344 | 345 | ```{python} 346 | print(the_word[0:2] + " is the first two letters of the word") 347 | print(the_word[1:3] + " is the second and third letter of the word") 348 | ``` 349 | 350 | ## Slices 351 | 352 | . . . 353 | 354 | ```{python} 355 | #| output-location: column-fragment 356 | # What does this return? 357 | the_word[0:3] + the_word[0] 358 | ``` 359 | 360 | 361 | ## Negative Indexing 362 | 363 | - `[-1]` returns the last element of a sequence; in this case, the last character in a word. 364 | 365 | . . . 366 | ```{python} 367 | #| output-location: column-fragment 368 | print(the_word[-1]) 369 | ``` 370 | 371 | - We can use this to get the $n$ last values of a sequence: 372 | 373 | . . . 374 | ```{python} 375 | #| output-location: column-fragment 376 | print(the_word[-3:]) 377 | ``` 378 | 379 | ## Checking and Coercing Types 380 | 381 | - `type()` returns the type of an object. 382 | 383 | . . . 384 | 385 | ```{python} 386 | #| output-location: column-fragment 387 | type(15) 388 | ``` 389 | 390 | - The type of an object can be coerced by calling a different type on the object. 391 | 392 | . . .
393 | 394 | ```{python} 395 | #| output-location: column-fragment 396 | type(float(15)) 397 | ``` 398 | 399 | ## Checking and Coercing Types 400 | 401 | - Sometimes coercion is not possible: 402 | 403 | . . . 404 | 405 | ```{python} 406 | #| output-location: column-fragment 407 | int('meow') 408 | ``` 409 | 410 | - And sometimes it is unpredictable 411 | 412 | . . . 413 | 414 | 415 | ```{python} 416 | #| output-location: column-fragment 417 | print(bool('meow')) 418 | print(int(-0.99)) 419 | print(str(False)==False) 420 | ``` 421 | 422 | 423 | # Data Structures 424 | 425 | - **Data types** concern the representation of individual data points, or observations. 426 | - **Data structures** concern the relations between observations. 427 | - Are the data points members of the same set? 428 | - Are the data points members of the same sequence? 429 | - Are the data points different features of single empirical unit? 430 | 431 | ## Two Python Data Structures 432 | 433 | - Lists (`list`): ordered array 434 | - Dictionaries (`dict`): key-value mapping 435 | 436 | ## Lists 437 | 438 | Lists are a type of data container in base Python. Lists are: 439 | 440 | - 1-dimensional 441 | - Ordered and indexed 442 | - Mutable 443 | - Able to contain any (type of) object 444 | 445 | . . . 446 | 447 | To create a list, write a sequence of values separated by commas between square brackets: 448 | 449 | . . . 450 | 451 | ```{python} 452 | my_list = [1, 2, 3] # only integers 453 | my_list = [1, 2.0, '3'] # integer, float and string 454 | ``` 455 | 456 | ## Indexing and Slicing Lists 457 | 458 | - Lists can be indexed in the same way as strings, using the `list[m:n]` notation. 459 | - `list[0]` returns the first value of the list 460 | 461 | . . . 
462 | 463 | ```{python} 464 | #| output-location: column-fragment 465 | #| code-line-numbers: "2" 466 | my_list = [1, 2.0, '3'] 467 | print(my_list[0]) 468 | 469 | ``` 470 | 471 | - `list[1:3]` returns a list containing the second and third value of the list 472 | 473 | . . . 474 | 475 | ```{python} 476 | #| output-location: column-fragment 477 | #| code-line-numbers: "2" 478 | print(my_list[1:3]) 479 | ``` 480 | 481 | 482 | ## Growing Lists 483 | 484 | - Use `len()` to get the length of a `list` 485 | 486 | . . . 487 | 488 | ```{python} 489 | #| output-location: column-fragment 490 | len(my_list) 491 | ``` 492 | 493 | - Use `.append()` to add an element at the end 494 | 495 | . . . 496 | 497 | ```{python} 498 | #| output-location: column-fragment 499 | my_list.append(4) 500 | print(my_list) 501 | ``` 502 | 503 | - Adding lists concatenates them: 504 | 505 | . . . 506 | 507 | ```{python} 508 | #| output-location: column-fragment 509 | my_list + ['fivesixseven8'] 510 | ``` 511 | 512 | ## Removing Items 513 | 514 | - Use `.pop()` to remove the final element 515 | - NB: `.pop()` also returns the final element 516 | 517 | . . . 518 | 519 | ```{python} 520 | #| output-location: column-fragment 521 | last_element = my_list.pop() 522 | print(last_element) 523 | print(my_list) 524 | ``` 525 | 526 | ## Changing Values in Lists 527 | 528 | . . . 529 | 530 | We can assign or reassign values using `=` and indexers: 531 | 532 | . . . 533 | 534 | ```{python} 535 | #| output-location: column-fragment 536 | x = ['a', 't', 'e'] 537 | print(x) 538 | ``` 539 | 540 | . . . 541 | 542 | ```{python} 543 | #| output-location: column-fragment 544 | x[0] = 'A' 545 | x[1] = 'T' 546 | print(x) 547 | ``` 548 | 549 | . . . 550 | 551 | We can also replace slices: 552 | 553 | ```{python} 554 | #| output-location: column-fragment 555 | x[0:2] = ['a', 't'] 556 | print(x) 557 | ``` 558 | 559 | ## Dictionaries 560 | 561 | - Dictionaries are an unordered mapping of _keys_ to _values_. 
562 | - Created by writing a list of `key:value` pairs separated by commas between `{}`. 563 | 564 | . . . 565 | 566 | ```{python} 567 | favourite_icecream = { 568 | "Musashi": "chocolate", 569 | "Maria": "dulce de leche" 570 | } 571 | ``` 572 | 573 | ## Retrieving Values 574 | 575 | - Values are accessed with `dict_name[key]` 576 | 577 | . . . 578 | 579 | ```{python} 580 | print(favourite_icecream['Musashi']) 581 | print(favourite_icecream['Maria']) 582 | ``` 583 | 584 | ## Modifying Dictionaries 585 | 586 | - New key-value pairs can be added or modified with `=` 587 | 588 | . . . 589 | 590 | ```{python} 591 | favourite_icecream['Chris'] = 'pistachio' 592 | favourite_icecream['Musashi'] = 'green tea' 593 | 594 | print(favourite_icecream['Musashi']) 595 | print(favourite_icecream['Chris']) 596 | ``` 597 | 598 | ## Accessing All Keys or Values 599 | 600 | To view all of the keys or values in a dictionary, you can use the following _methods_: 601 | 602 | . . . 603 | 604 | ```{python} 605 | #| output-location: column-fragment 606 | print(favourite_icecream.keys()) 607 | ``` 608 | 609 | . . . 610 | 611 | ```{python} 612 | #| output-location: column-fragment 613 | print(favourite_icecream.values()) 614 | ``` 615 | 616 | . . . 617 | 618 | ```{python} 619 | #| output-location: column-fragment 620 | print(favourite_icecream.items()) 621 | ``` 622 | 623 | # Control Flow 624 | 625 | - Control flow structures specify whether and in what order to run blocks of our code. 626 | - We focus on two: 627 | - Conditional execution 628 | - Iteration (For loops) 629 | 630 | ## Conditional Execution 631 | 632 | - In Python, to execute code conditionally, we use the following syntax: 633 | 634 | . . . 635 | 636 | ```{python} 637 | #| eval: false 638 | #| output-location: default 639 | if CONDITION: 640 | RUN_COMMAND 641 | ``` 642 | 643 | - If `CONDITION` is `True`, then `RUN_COMMAND` is executed. 644 | 645 | ## Absolute Value 646 | 647 | - Suppose we want the absolute value of a number. 
648 | - "if number is negative, then multiply by -1" 649 | - We can write it as follows: 650 | 651 | . . . 652 | 653 | ```{python} 654 | #| code-line-numbers: "3,4" 655 | number = -2 656 | 657 | if number < 0: 658 | number = number * -1 659 | 660 | print("Absolute value is", number) 661 | ``` 662 | 663 | 664 | ## If/Else 665 | 666 | - We specify what happens if `CONDITION` evaluates `False` using the `else` command. 667 | 668 | . . . 669 | 670 | ```{python} 671 | #| eval: true 672 | #| code-line-numbers: "|1|3|4" 673 | language_to_learn = "Python" 674 | 675 | if language_to_learn == "Python": 676 | print("You are in the right place!") 677 | else: 678 | print("You might be lost!") 679 | ``` 680 | 681 | ## If/Elif/Else 682 | 683 | - `elif` evaluates if the previous `if` or `elif` is `False` 684 | - and executes if the statement evaluates `True`. 685 | 686 | . . . 687 | 688 | ```{python} 689 | #| eval: true 690 | #| code-line-numbers: "|1|3|5|6" 691 | language_to_learn = "Julia" 692 | 693 | if language_to_learn == "Python": 694 | print("You are in the right place!") 695 | elif language_to_learn == "Julia": 696 | print("Hertie should offer Julia!") 697 | elif language_to_learn == "R": 698 | print("You should learn a better coding language!") 699 | else: 700 | print("You might be lost!") 701 | ``` 702 | 703 | 704 | ## For Loops 705 | 706 | - A for-loop is a logical structure composed of two parts: an iterable and an action. 707 | - In python, they are written as follows: 708 | 709 | . . . 
710 | 711 | ```{python} 712 | #| eval: false 713 | #| code-line-numbers: "|1|2" 714 | for i in ITERABLE: 715 | RUN_COMMAND 716 | ``` 717 | 718 | 719 | ## Simple Example 720 | 721 | ```{python} 722 | for number in [1, 2, 3, 4, 5, 6, 7]: 723 | print(number, "+", number, "=", number+number) 724 | ``` 725 | 726 | ## Incrementing a Counter 727 | 728 | ```{python} 729 | counter = 1 730 | for number in [1, 2, 3, 4, 5, 6, 7]: 731 | print(number, "*", counter, "=", number*counter) 732 | counter = counter*number 733 | ``` 734 | 735 | # Functions 736 | 737 | ## Defining Functions 738 | 739 | Here's a simple function that adds 1 to the input: 740 | 741 | . . . 742 | 743 | ```{python} 744 | def add_one(x): 745 | y = x+1 746 | return y 747 | ``` 748 | 749 | - The command `def` followed by a space tells Python that you are defining a function. 750 | - This function is given the name following `def`; in this case `add_one`. 751 | - The _arguments_ of the function are given after the function name, inside `()`. 752 | - The `:` says that the definition line is done. The following line must be indented by four spaces. 753 | 754 | ## Namespaces 755 | 756 | - Python has _namespaces_ for variables 757 | - Multiple levels of namespace: _local_ and _global_ 758 | 759 | ## Namespaces cont. 760 | 761 | - _Local_: Variables defined _within_ a function are created within the _local_ namespace of that function. 762 | - This means that they are only accessible from within the function. 763 | - _Global_: Variables defined _outside_ a function are created within the _global_ namespace. 764 | - _Priority_: If a function contains a reference to a variable, it will first check to see whether the variable exists in the _local_ namespace, and then the _global_ one.
765 | 766 | ## Namespace Examples 767 | 768 | _Clearing the environment first_ 769 | 770 | ```{python} 771 | %reset -f 772 | ``` 773 | 774 | ## Local Variables not Accessible Globally 775 | 776 | The following code will result in an error: 777 | 778 | . . . 779 | 780 | ```{python} 781 | #| error: true 782 | def f(x): 783 | y = 5 784 | return x + y 785 | 786 | print(y) 787 | ``` 788 | 789 | ## Local Accessed Before Global 790 | 791 | The following code will return the _local_ value of `y`, thus returning 10. 792 | 793 | . . . 794 | 795 | ```{python} 796 | y = 0 797 | 798 | def f(x): 799 | y = 5 800 | return x + y 801 | 802 | print(f(5)) 803 | ``` 804 | 805 | ## Functions Reading from Global Variables 806 | 807 | The following code uses `y`, which is defined globally. Therefore it returns 7. 808 | 809 | . . . 810 | 811 | ```{python} 812 | y = 2 813 | 814 | def f(x): 815 | return x + y 816 | 817 | print(f(5)) 818 | ``` 819 | 820 | ## `Lambda` Functions 821 | 822 | - Python has _lambda functions_. These are essentially a way to define a function in-line. 823 | - Below, the function `f` is equivalent to the line `lambda x: x+1`. 824 | 825 | . . . 826 | 827 | ```{python} 828 | #| eval: false 829 | def f(x): 830 | return x+1 831 | 832 | lambda x: x+1 833 | ``` 834 | 835 | 836 | # Recap 837 | 838 | ## We covered a lot! 839 | 840 | - 4 Data Types 841 | - `int`: whole numbers 842 | - `float`: rational numbers. Unexpected behavior. 843 | - `bool`: True/False values 844 | - `str`: Character sequence. Behaves like sequence. 845 | - 2 data structures 846 | - Lists: ordered sequence of values 847 | - Dictionaries: mapping of key to value 848 | 849 | ## cont.
850 | 851 | - 2 control flow structures 852 | - Conditional execution: `if`/`elif`/`else` 853 | - Iteration: for-loops 854 | - How to write functions 855 | 856 | ## Question Time 857 | 858 | # Break -------------------------------------------------------------------------------- /slides/03_pandas.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part 2: Intro Pandas" 3 | --- 4 | 5 | # Preface 6 | 7 | ## 🐼 ? 8 | 9 | - Popular library for analyzing _tabular data_. 10 | - Tabular: rectangular, has rows and columns. 11 | - _Expressive_ and _full-featured_. 12 | - Readable syntax 13 | - Lots of convenience functions 14 | 15 | . . . 16 | 17 | 🐼: _panel data, apparently._ ^[https://www.dlr.de/sc/Portaldata/15/Resources/dokumente/pyhpc2011/submissions/pyhpc2011_submission_9.pdf] 18 | 19 | 20 | ## Pros and Cons 21 | 22 | :::: {.columns} 23 | 24 | ::: {.column} 25 | **Reasons to use** 26 | 27 | - Support for **many** file types 28 | - Integrated into data analysis ecosystem 29 | - Balance of verbosity and function 30 | - Complex time-series and hierarchically indexed data functionality 31 | ::: 32 | 33 | ::: {.column} 34 | **Reasons not to use** 35 | 36 | - Struggles with larger datasets (>1M rows) 37 | - Significant overhead 38 | - Memory-intensive 39 | - CPU-only 40 | 41 | ::: 42 | 43 | :::: 44 | 45 | ## Running Scenario 46 | 47 | - Analyzing student scores and sending summary reports 48 | - Two datasets: 49 | - `data/scores.csv`: students' scores in 5 subjects 50 | - `data/contacts.csv`: students' contact details 51 | 52 | ## Functionality Covered 53 | 54 | - How do I read in data from files? 55 | - How do I select and filter values? 56 | - How do I perform calculations on data? 57 | - How do I merge/combine data from multiple sources? 
58 | 59 | # Importing Libraries 60 | 61 | _A brief aside on a first step that tends to get overlooked._ 62 | 63 | ## Libraries 64 | 65 | - Functionality of base Python is limited for data analysis workflow 66 | - Python language is extensible, benefits from enormous ecosystem 67 | - `pandas` is a _library_ for data analysis 68 | 69 | ## `import` 70 | 71 | - To use a library, we need to `import` it: 72 | 73 | . . . 74 | 75 | ```{python} 76 | #| output-location: column 77 | import pandas 78 | ``` 79 | 80 | - In Jupyter, we can use `%who` to list objects in the global namespace 81 | 82 | . . . 83 | 84 | ```{python} 85 | #| output-location: column 86 | %who 87 | ``` 88 | 89 | - The methods within pandas are now accessible as _methods_ of the imported _module_. 90 | 91 | . . . 92 | 93 | ```{python} 94 | #| output-location: column 95 | pandas.__version__ 96 | ``` 97 | 98 | ## `from` 99 | 100 | - We can use the `from` command to import objects and methods directly from modules into the global namespace. 101 | 102 | . . . 103 | 104 | ```{python} 105 | #| output-location: column 106 | from pandas import __version__ 107 | print(__version__) 108 | ``` 109 | 110 | ## `as` 111 | 112 | - The default approach in Python is to keep fewer objects in the global namespace. 113 | - This means always prefacing the function or class we want to use with the prefix name. 114 | - This can get verbose, so we often abbreviate library names with `as`: 115 | 116 | . . . 117 | 118 | ```{python} 119 | #| output-location: column 120 | import pandas as pd 121 | print(pd) 122 | print(pd.DataFrame) 123 | ``` 124 | 125 | ## Contrast with `R` 126 | 127 | - In `R`, the default approach is to import all methods from a library into the global namespace. 128 | - If we did this with `pandas` it would look like this: 129 | 130 | . . . 131 | 132 | ```{python} 133 | from pandas import * 134 | %who 135 | ``` 136 | 137 | ## Cleanup 138 | 139 | NB: `%`-commands only work in Jupyter (IPython). 
140 | 141 | ```{python} 142 | %reset -f 143 | import pandas as pd # normal way of importing 144 | ``` 145 | 146 | 147 | # Getting Your Data 148 | 149 | ## Manual `DataFrame` 150 | 151 | - Can be constructed manually from a `dict` of equal-length `list`s. 152 | 153 | . . . 154 | 155 | ```{python} 156 | #| output-location: default 157 | scores_dict = { 158 | 'student_id': ['5a01', '5a12', '5b05', '5b10', '5e04'], 159 | 'math': [95, 78, 85, 90, 88], 160 | 'english': [97, 91, 86, 89, 90], 161 | 'history': [80, 89, 94, 87, 85], 162 | 'biology': [81, 86, 88, 99, 88], 163 | 'art': [86, 81, 82, 91, 84] 164 | } 165 | ``` 166 | 167 | ## Manual `DataFrame` 168 | 169 | ```{python} 170 | df = pd.DataFrame(data=scores_dict) 171 | df 172 | ``` 173 | 174 | 175 | ## Reading from a file 176 | 177 | `pandas` comes with functions for reading and writing to all kinds of data formats. A quick list can be viewed using tab completion: 178 | 179 | ```{python} 180 | #| eval: false 181 | #| output-location: default 182 | pd.read_ 183 | ``` 184 | . . . 185 | 186 | read_clipboard() read_gbq() read_parquet() read_sql_query() 187 | read_csv() read_hdf() read_pickle() read_sql_table() 188 | read_excel() read_html() read_sas() read_stata() 189 | read_feather() read_json() read_spss() read_table() 190 | read_fwf() read_orc() read_sql() read_xml() 191 | 192 | ## Data IO: `read_csv` 193 | 194 | . . . 195 | 196 | ```{python} 197 | #| eval: false 198 | #| output-location: default 199 | df = pd.read_csv("../data/scores.csv") 200 | df 201 | ``` 202 | . . . 203 | 204 | ```{python} 205 | #| echo: false 206 | #| output-location: default 207 | df = pd.read_csv("../data/scores.csv") 208 | from IPython.display import HTML 209 | HTML("../figures/pandas_base.html") 210 | ``` 211 | 212 | 213 | # Selecting Data 214 | 215 | - "What were everyone's `math` scores?" 216 | - "What did student `5a12` get on all subjects?" 217 | - "What did `5a12` and `5e04` get on `history` and `art`?" 
218 | 219 | 220 | ## Columns and Index 221 | 222 | - Entries indexed by columns and rows 223 | 224 | . . . 225 | 226 | ```{python} 227 | #| echo: false 228 | df 229 | ``` 230 | 231 | ## `df.columns` and `df.index` 232 | 233 | - These can be accessed through the following: 234 | 235 | . . . 236 | 237 | ```{python } 238 | #| output-location: column-fragment 239 | df.columns 240 | ``` 241 | . . . 242 | 243 | ```{python} 244 | #| output-location: column-fragment 245 | df.index 246 | ``` 247 | 248 | - By default dataframes have a numerical index. 249 | 250 | ## Setting the Index 251 | 252 | - `df.set_index()` returns a dataframe with a new index 253 | 254 | . . . 255 | 256 | ```{python} 257 | #| output-location: column-fragment 258 | #| code-line-numbers: "1" 259 | df = df.set_index('student_id') 260 | df.index 261 | ``` 262 | 263 | . . . 264 | 265 | ```{python} 266 | #| echo: false 267 | from IPython.display import HTML 268 | HTML("../figures/pandas_base.html") 269 | ``` 270 | 271 | 272 | ## `.loc` 273 | 274 | - Use `.loc` for name-based indexing. 275 | - General syntax: `.loc[<rows>, <columns>]` 276 | - `<rows>` and `<columns>` correspond to the index and column names. 277 | - They can be a single value, a list, or `:` to indicate "all". 278 | - Let's learn by example: 279 | 280 | ## Row 281 | 282 | - "What did `5a01` get on all (`:`) exams?" 283 | 284 | . . . 285 | 286 | ```{python} 287 | #| eval: false 288 | #| output-location: default 289 | df.loc['5a01', :] 290 | ``` 291 | 292 | . . . 293 | 294 | ```{python} 295 | #| echo: false 296 | #| output-location: default 297 | HTML("../figures/pandas_loc_row.html") 298 | ``` 299 | 300 | ## Column 301 | 302 | - "What did everyone (`:`) get on `history`?" 303 | 304 | . . . 305 | 306 | ```{python} 307 | #| eval: false 308 | #| output-location: default 309 | df.loc[:, 'history'] 310 | ``` 311 | 312 | . . .
313 | 314 | ```{python} 315 | #| echo: false 316 | #| output-location: default 317 | HTML("../figures/pandas_loc_col.html") 318 | ``` 319 | 320 | ## Multiple Rows 321 | 322 | - "What did `5a01` and `5a12` (`['5a01', '5a12']`) get on all (`:`) exams?" 323 | 324 | . . . 325 | 326 | ```{python} 327 | #| eval: false 328 | #| output-location: default 329 | df.loc[['5a01', '5a12'], :] 330 | ``` 331 | 332 | . . . 333 | 334 | ```{python} 335 | #| echo: false 336 | #| output-location: default 337 | HTML("../figures/pandas_loc_multi_row.html") 338 | ``` 339 | 340 | ## Multiple Columns 341 | 342 | - What did everyone (`:`) get on `art` and `history` (`['art', 'history']`)? 343 | 344 | . . . 345 | 346 | ```{python} 347 | #| eval: false 348 | #| output-location: default 349 | df.loc[:, ['art', 'history']] 350 | ``` 351 | 352 | . . . 353 | 354 | ```{python} 355 | #| echo: false 356 | #| output-location: default 357 | HTML("../figures/pandas_loc_multi_col.html") 358 | ``` 359 | 360 | ## Label Order 361 | 362 | Note that the order of the labels determines the order of the returned columns: 363 | 364 | . . . 365 | 366 | ```{python} 367 | df.loc[:, ['art', 'history']] 368 | ``` 369 | 370 | ## Cell 371 | 372 | - What did `5a01` get in `history`? 373 | 374 | . . . 375 | 376 | ```{python} 377 | #| eval: false 378 | #| output-location: default 379 | df.loc['5a01', 'history'] 380 | ``` 381 | 382 | . . . 383 | 384 | ```{python} 385 | #| echo: false 386 | #| output-location: default 387 | HTML("../figures/pandas_loc_cell.html") 388 | ``` 389 | 390 | ## Multiple Values 391 | 392 | - What did `5a12` get in `history` and `art`? 393 | 394 | . . . 395 | 396 | ```{python} 397 | #| eval: false 398 | #| output-location: default 399 | df.loc['5a12', ['history', 'art']] 400 | ``` 401 | 402 | . . . 403 | 404 | ```{python} 405 | #| echo: false 406 | #| output-location: default 407 | HTML("../figures/pandas_loc_multi1.html") 408 | ``` 409 | 410 | 411 | ## cont.
412 | 413 | - What did `5a01`, `5a12`, and `5b05` get in `biology` and `art`? 414 | 415 | . . . 416 | 417 | ```{python} 418 | #| eval: false 419 | #| output-location: default 420 | df.loc[['5a01', '5a12', '5b05'], ['biology', 'art']] 421 | ``` 422 | 423 | . . . 424 | 425 | ```{python} 426 | #| echo: false 427 | #| output-location: default 428 | HTML("../figures/pandas_loc_multi2.html") 429 | ``` 430 | 431 | ## Filtering 432 | 433 | "The overall scores of students who got _90 or higher in math_" 434 | 435 | - Create a _boolean array_ 436 | 437 | . . . 438 | 439 | ```{python} 440 | #| output-location: column-fragment 441 | df.loc[:, 'math'] >= 90 442 | ``` 443 | 444 | ## `.loc` Filtering 445 | 446 | - `.loc` can take boolean arrays: 447 | 448 | . . . 449 | 450 | ```{python} 451 | #| code-line-numbers: "2" 452 | good_math = df.loc[:, 'math'] >= 90 453 | df.loc[good_math, :] 454 | ``` 455 | 456 | - Shorter syntax: 457 | 458 | . . . 459 | 460 | ```{python} 461 | #| eval: false 462 | df.loc[df['math'].ge(90), :] 463 | ``` 464 | 465 | ## Combining Indexers and Filters 466 | 467 | - `history` scores where both `math` and `art` $\ge$ 85 468 | 469 | . . . 470 | 471 | ```{python} 472 | df.loc[(df.loc[:, 'math']>=85) & 473 | (df.loc[:, 'art']>=85), 474 | 'history'] 475 | ``` 476 | 477 | ## `.iloc`: locational indexing 478 | 479 | - First 2 rows (`:2` in Python) 480 | - Last 3 columns (`-3:`) 481 | 482 | . . . 483 | 484 | ```{python} 485 | df.iloc[:2, -3:] 486 | ``` 487 | 488 | # Operations 489 | 490 | - "Is this score over 90?" 491 | - "What was the average `math` score?" 492 | - "What was `5b10`'s maximum score?" 493 | - "What was the lowest score in each exam?"
494 | 495 | ## Some Terminology 496 | 497 | Clarifying what I mean in this lecture when I say: 498 | 499 | - _Scalar_: a single value 500 | - _Non-scalar_: a data structure capable of containing multiple data points 501 | - _Argument(s)_: the input(s) to a function 502 | 503 | ## `Series` and `DataFrame` 504 | 505 | - `pd.Series` are 1-dimensional 506 | - `pd.DataFrame` are 2-dimensional 507 | 508 | ## `Series` 509 | 510 | - Passing a scalar to an indexer (`.loc`) on a `DataFrame` returns a `Series` 511 | 512 | . . . 513 | 514 | ```{python} 515 | df.loc[:, 'biology'] 516 | ``` 517 | 518 | ## `DataFrame` 519 | 520 | - Passing a `list` returns a `DataFrame` 521 | 522 | . . . 523 | 524 | ```{python} 525 | df.loc[:, ['biology']] 526 | ``` 527 | 528 | ## Arguments at Three Levels 529 | 530 | - Scalar: "Is this score over 90?" 531 | 532 | . . . 533 | 534 | ```{python} 535 | #| output-location: default 536 | def greater_than_90(x): 537 | return x > 90 538 | ``` 539 | 540 | - Series: "What is the maximum score on one exam?" 541 | 542 | . . . 543 | 544 | ```{python} 545 | #| output-location: default 546 | def maximum(scores): 547 | return max(scores) 548 | ``` 549 | 550 | - Dataframe: "How many individual scores?" 551 | 552 | . . . 553 | 554 | ```{python} 555 | #| output-location: default 556 | def how_many_elements(df): 557 | rows, cols = df.shape 558 | return rows*cols 559 | ``` 560 | 561 | ## Convenience Functions 562 | 563 | - `Series` and `DataFrame` objects have many common operations built-in: 564 | - $\ne$ (`.ne()`), $\gt$ (`.gt()`), ... 565 | - mean (`.mean()`), median (`.median()`), standard deviation (`.std()`), ... 566 | - These tend to be optimized. 567 | - See [documentation](https://pandas.pydata.org/docs/user_guide/basics.html#descriptive-statistics) for list. 568 | 569 | ## "What was the average `math` score?" 570 | 571 | . . . 
572 | 573 | ```{python} 574 | df.loc[:, 'math'].mean() 575 | ``` 576 | 577 | ## `apply` and `applymap` 578 | 579 | - Custom functions can be applied to `Series` and `DataFrame`s using their `.apply` method. 580 | - `pd.Series.apply`: functions with _scalar arguments_ 581 | - `pd.DataFrame.apply`: functions with `pd.Series` as an argument 582 | - Specify `axis`: 0 applies the function to each column, 1 applies it to each row 583 | - `pd.DataFrame.applymap`: functions with _scalar arguments_ 584 | 585 | ## "What was the lowest score for each exam and student?" 586 | 587 | - Lowest per exam (`axis=0`) 588 | 589 | . . . 590 | 591 | ```{python} 592 | #| output-location: column-fragment 593 | df.min(axis=0) 594 | ``` 595 | 596 | - Lowest per student (`axis=1`, using built-in `min`): 597 | 598 | . . . 599 | 600 | ```{python} 601 | #| output-location: column-fragment 602 | df.apply(min, axis=1) 603 | ``` 604 | 605 | ## Convert Scores to A-F Ranking 606 | 607 | ```{python} 608 | #| output-location: default 609 | def convert_score(x): 610 | score = '' 611 | if x >= 90: 612 | score = 'A' 613 | elif x >= 80: 614 | score = 'B' 615 | elif x >= 70: 616 | score = 'C' 617 | elif x >= 60: 618 | score = 'D' 619 | else: 620 | score = 'F' 621 | return score 622 | ``` 623 | 624 | ## Convert Scores to A-F Ranking 625 | 626 | ```{python} 627 | df.applymap(convert_score) 628 | ``` 629 | 630 | # Combining Data 631 | 632 | - How do we combine data from multiple sources? 633 | 634 | ## Two Ways 635 | 636 | - Concatenating: sticking it together 637 | - Joining: merging on a common "key" 638 | 639 | ## We forgot two students! 640 | 641 | ```{python} 642 | df_extra = pd.read_csv('../data/scores_extra.csv', 643 | index_col='student_id') 644 | df_extra 645 | ``` 646 | 647 | ## Concatenation 648 | 649 | ```{python} 650 | pd.concat([df, df_extra], axis=0) 651 | ``` 652 | 653 | ## Different Columns 654 | 655 | - `.reset_index()` moves the index into a column: 656 | 657 | . . .
658 | 659 | ```{python} 660 | df_extra.reset_index() 661 | ``` 662 | 663 | ## Different Columns 664 | 665 | - Concatenating with different columns creates NA values: 666 | 667 | . . . 668 | 669 | ```{python} 670 | pd.concat([df_extra, df_extra.reset_index()]) 671 | ``` 672 | 673 | 674 | ## Contact Details 675 | 676 | - We want to match scores to students' first names 677 | 678 | . . . 679 | 680 | ```{python} 681 | pd.read_csv('../data/contacts.csv') 682 | 683 | ``` 684 | 685 | ## Preparing data for joining 686 | 687 | - The two dataframes have one column in common: the student ID. 688 | - In one it's `StudentID`, in the other `student_id`. 689 | - For visual simplicity, I move `student_id` back to the columns 690 | 691 | . . . 692 | 693 | ```{python} 694 | df_contact = pd.read_csv('../data/contacts.csv') 695 | df_scores = df.reset_index() 696 | ``` 697 | 698 | ## Key 699 | 700 | - Note that the two dataframes do not have the exact same values of student ID! 701 | 702 | . . . 703 | 704 | ```{python} 705 | #| output-location: column 706 | df_contact['StudentID'] 707 | ``` 708 | 709 | ```{python} 710 | #| output-location: column 711 | df_scores['student_id'] 712 | ``` 713 | 714 | ## Merge Syntax 715 | 716 | ```{python} 717 | #| eval: false 718 | pd.merge( 719 | how= 720 | left=df_contact[['FirstName', 'StudentID']], 721 | right=df_scores[['student_id', 'history']], 722 | left_on='StudentID', 723 | right_on='student_id') 724 | ``` 725 | 726 | - We specify the two dataframes as `left` and `right` 727 | - We specify the common key for each dataframe 728 | - Note if they were the same, we could use the argument `on` 729 | - `how` is easiest to explain visually 730 | 731 | 732 | ## `how='inner'` 733 | 734 | ```{python} 735 | #| echo: false 736 | pd.merge( 737 | how='inner', 738 | left=df_contact[['FirstName', 'StudentID']], 739 | right=df_scores[['student_id', 'history']], 740 | left_on='StudentID', 741 | right_on='student_id') 742 | ``` 743 | 744 | ## `how='left'` 745 | 
746 | ```{python} 747 | #| echo: false 748 | pd.merge( 749 | how='left', 750 | left=df_contact[['FirstName', 'StudentID']], 751 | right=df_scores[['student_id', 'history']], 752 | left_on='StudentID', 753 | right_on='student_id') 754 | ``` 755 | 756 | ## `how='right'` 757 | 758 | ```{python} 759 | #| echo: false 760 | pd.merge( 761 | how='right', 762 | left=df_contact[['FirstName', 'StudentID']], 763 | right=df_scores[['student_id', 'history']], 764 | left_on='StudentID', 765 | right_on='student_id') 766 | ``` 767 | 768 | ## `how='outer'` 769 | 770 | ```{python} 771 | #| echo: false 772 | pd.merge( 773 | how='outer', 774 | left=df_contact[['FirstName', 'StudentID']], 775 | right=df_scores[['student_id', 'history']], 776 | left_on='StudentID', 777 | right_on='student_id') 778 | ``` 779 | 780 | # Exploratory Analysis 781 | 782 | ## British Election Study Data 783 | 784 | ```{python} 785 | link = 'http://github.com/muhark/dpir-intro-python/raw/master/Week2/data/bes_data.csv' 786 | bes_df = pd.read_csv(link) 787 | ``` 788 | ## First-Look Functions 789 | 790 | - When working with data, your first step should always be _getting to know the data_ 791 | - Manually inspect samples of the data. 792 | - Check dimensions: are they expected? 793 | - Check data types: are they expected? 794 | - Tabulate variables: what are the levels? 795 | 796 | 797 | ## Inspect first 5 rows 798 | 799 | . . . 800 | 801 | ```{python} 802 | bes_df.head() 803 | ``` 804 | 805 | 806 | ## What are the dimensions of the dataset? 807 | 808 | . . . 809 | 810 | ```{python} 811 | bes_df.shape 812 | ``` 813 | 814 | 815 | ## What data types are each of the columns? 816 | 817 | . . . 818 | 819 | ```{python} 820 | bes_df.info() 821 | ``` 822 | 823 | ## What unique values do each column contain? 824 | 825 | - `pd.Series.value_counts()` for tabulation 826 | 827 | . . . 828 | 829 | ```{python} 830 | #| output-location: column-fragment 831 | bes_df['female'].value_counts() 832 | ``` 833 | 834 | . . . 
835 | 836 | ```{python} 837 | #| output-location: column-fragment 838 | bes_df['region'].value_counts() 839 | ``` 840 | 841 | # Recap 842 | 843 | ## `pandas` 844 | 845 | - File I/O (reading/writing data formats) 846 | - Indexing, slicing, filtering 847 | - Operations on data 848 | - Combining/merging data 849 | - Exploratory look 850 | 851 | 852 | # Additional Resources 853 | 854 | ## Textbook 855 | 856 | The following sections of _Python for Data Analysis: Data Wrangling with Pandas, NumPy and IPython, 2nd edition_ are relevant to this lecture: 857 | 858 | - 5.*: Getting Started with pandas 859 | - 6.*: Data Loading, Storage and File Formats 860 | - 7.1-2: Data Cleaning and Preparation 861 | - 7.3: String Manipulation 862 | - 12.1: Categorical Data 863 | 864 | ## Blogs, Docs 865 | 866 | - [`pandas` guide to combining dataframes](https://pandas.pydata.org/docs/user_guide/merging.html) 867 | - Pandas Data: https://pbpython.com/pandas_dtypes.html 868 | -------------------------------------------------------------------------------- /slides/_quarto.yml: -------------------------------------------------------------------------------- 1 | format: 2 | revealjs: 3 | incremental: true 4 | jupyter: ds3 5 | execute: 6 | cache: true 7 | echo: true 8 | eval: true 9 | error: true 10 | output-location: fragment 11 | highlight-style: ayu 12 | -------------------------------------------------------------------------------- /slides/minimal-theme.css: -------------------------------------------------------------------------------- 1 | /** 2 | * A simple theme for reveal.js presentations, similar 3 | * to the default theme. The accent color is darkblue. 4 | * 5 | * This theme is Copyright (C) 2012 Owen Versteeg, https://github.com/StereotypicalApps. It is MIT licensed. 
6 | * reveal.js is Copyright (C) 2011-2012 Hakim El Hattab, http://hakim.se 7 | */ 8 | @import url(https://fonts.googleapis.com/css?family=News+Cycle:400,700); 9 | @import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic); 10 | section.has-dark-background, section.has-dark-background h1, section.has-dark-background h2, section.has-dark-background h3, section.has-dark-background h4, section.has-dark-background h5, section.has-dark-background h6 { 11 | color: #fff; } 12 | 13 | /********************************************* 14 | * GLOBAL STYLES 15 | *********************************************/ 16 | body { 17 | background: #fff; 18 | background-color: #fff; } 19 | 20 | .reveal { 21 | font-family: "Lato", sans-serif; 22 | font-size: 35px; 23 | font-weight: normal; 24 | color: #000; } 25 | 26 | ::selection { 27 | color: #000; 28 | background: rgba(0, 0, 0, 0.99); 29 | text-shadow: none; } 30 | 31 | ::-moz-selection { 32 | color: #fff; 33 | background: rgba(0, 0, 0, 0.99); 34 | text-shadow: none; } 35 | 36 | .reveal .slides section, 37 | .reveal .slides section > section { 38 | line-height: 1.3; 39 | font-weight: inherit; } 40 | 41 | /********************************************* 42 | * HEADERS 43 | *********************************************/ 44 | .reveal h1, 45 | .reveal h2, 46 | .reveal h3, 47 | .reveal h4, 48 | .reveal h5, 49 | .reveal h6 { 50 | margin: 0 0 20px 0; 51 | color: #000; 52 | font-family: "Lato", sans-serif; 53 | font-weight: bold; 54 | line-height: 1.2; 55 | letter-spacing: normal; 56 | text-transform: none; 57 | text-shadow: none; 58 | word-wrap: break-word; } 59 | 60 | .reveal h1 { 61 | font-size: 1.65em; 62 | margin: 5px 30px 5px 0px; 63 | text-align: left; 64 | text-shadow: none; } 65 | 66 | .reveal h2 { 67 | font-size: 1.05em; 68 | text-align: left; } 69 | 70 | .reveal h3 { 71 | font-size: 1.025em; } 72 | 73 | .reveal h4 { 74 | font-size: 1em; } 75 | 76 | /********************************************* 77 | * OTHER 78 | 
*********************************************/ 79 | .reveal p { 80 | margin: 5px 0; 81 | line-height: 1.15; 82 | text-align: left; } 83 | 84 | .reveal .title { 85 | margin: 5px 30px 5px 30px; 86 | text-align: left; } 87 | 88 | .reveal .subtitle { 89 | font-size: 1.25em; 90 | margin: 5px 30px 5px 30px; 91 | line-height: 1.15; 92 | text-align: left; } 93 | 94 | .reveal .author { 95 | margin: 30px 30px 5px 30px; 96 | line-height: 1.15; 97 | text-align: left; } 98 | 99 | .reveal .institute { 100 | margin: 5px 30px 5px 30px; 101 | line-height: 1.15; 102 | text-align: left; } 103 | 104 | .reveal .date { 105 | margin: 5px 30px 5px 30px; 106 | line-height: 1.15; 107 | text-align: left; } 108 | 109 | /* Ensure certain elements are never larger than the slide itself */ 110 | .reveal img, 111 | .reveal video, 112 | .reveal iframe { 113 | max-width: 95%; 114 | max-height: 95%; } 115 | 116 | .reveal strong, 117 | .reveal b { 118 | font-weight: bold; } 119 | 120 | .reveal em { 121 | font-style: italic; } 122 | 123 | .reveal ol, 124 | .reveal dl, 125 | .reveal ul { 126 | text-align: left; 127 | display: block; 128 | margin: 0 0 0 1.5em; } 129 | 130 | .reveal ol { 131 | list-style-type: decimal; } 132 | 133 | .reveal ul { 134 | display: block; 135 | list-style-type: disc; } 136 | 137 | .reveal ul ul { 138 | list-style-type: square; } 139 | 140 | .reveal ul ul ul { 141 | list-style-type: circle; } 142 | 143 | .reveal ul ul, 144 | .reveal ul ol, 145 | .reveal ol ol, 146 | .reveal ol ul { 147 | display: block; 148 | margin-left: 1.5em; } 149 | 150 | .reveal dt { 151 | font-weight: bold; } 152 | 153 | .reveal dd { 154 | margin-left: 30px; } 155 | 156 | .reveal blockquote { 157 | display: block; 158 | position: relative; 159 | width: 70%; 160 | margin: 20px auto; 161 | padding: 5px; 162 | font-style: italic; 163 | background: rgba(255, 255, 255, 0.05); 164 | box-shadow: 0px 0px 2px rgba(0, 0, 0, 0.2); } 165 | 166 | .reveal blockquote p:first-child, 167 | .reveal blockquote p:last-child 
{ 168 | display: inline-block; } 169 | 170 | .reveal q { 171 | font-style: italic; } 172 | 173 | .reveal pre { 174 | display: block; 175 | position: relative; 176 | width: 90%; 177 | margin: 20px auto; 178 | text-align: left; 179 | font-size: 0.55em; 180 | font-family: monospace; 181 | line-height: 1.2em; 182 | word-wrap: break-word; 183 | box-shadow: 0px 5px 15px rgba(0, 0, 0, 0.15); } 184 | 185 | /*** 186 | .reveal code { 187 | font-family: monospace; 188 | text-transform: none; 189 | padding: 2px 4px; 190 | font-size: 90%; 191 | color: #c7254e; 192 | background-color: #f9f2f4; 193 | vertical-align: baseline; 194 | white-space: pre-wrap; 195 | border-radius: 2px; 196 | 197 | } 198 | 199 | .reveal pre code { 200 | display: block; 201 | padding: 5px; 202 | overflow: auto; 203 | max-height: 400px; 204 | word-wrap: normal; } 205 | ***/ 206 | 207 | .reveal table { 208 | margin: auto; 209 | border-collapse: collapse; 210 | border-spacing: 0; } 211 | 212 | .reveal table th { 213 | font-weight: bold; } 214 | 215 | .reveal table th, 216 | .reveal table td { 217 | text-align: left; 218 | padding: 0.2em 0.5em 0.2em 0.5em; 219 | border-bottom: 1px solid; } 220 | 221 | .reveal table th[align="center"], 222 | .reveal table td[align="center"] { 223 | text-align: center; } 224 | 225 | .reveal table th[align="right"], 226 | .reveal table td[align="right"] { 227 | text-align: right; } 228 | 229 | .reveal table tbody tr:last-child th, 230 | .reveal table tbody tr:last-child td { 231 | border-bottom: none; } 232 | 233 | .reveal sup { 234 | vertical-align: super; 235 | font-size: smaller; } 236 | 237 | .reveal sub { 238 | vertical-align: sub; 239 | font-size: smaller; } 240 | 241 | .reveal small { 242 | display: inline-block; 243 | font-size: 0.6em; 244 | line-height: 1.2em; 245 | vertical-align: top; } 246 | 247 | .reveal small * { 248 | vertical-align: top; } 249 | 250 | /********************************************* 251 | * LINKS 252 | 
*********************************************/ 253 | .reveal a { 254 | color: #00008B; 255 | text-decoration: none; 256 | -webkit-transition: color .15s ease; 257 | -moz-transition: color .15s ease; 258 | transition: color .15s ease; } 259 | 260 | .reveal a:hover { 261 | color: #0000f1; 262 | text-shadow: none; 263 | border: none; } 264 | 265 | .reveal .roll span:after { 266 | color: #fff; 267 | background: #00003f; } 268 | 269 | /********************************************* 270 | * IMAGES 271 | *********************************************/ 272 | .reveal section img { 273 | margin: 15px 0px; 274 | background: rgba(255, 255, 255, 0.12); 275 | border: 4px solid #000; 276 | box-shadow: 0 0 10px rgba(0, 0, 0, 0.15); } 277 | 278 | .reveal section img.plain { 279 | border: 0; 280 | box-shadow: none; } 281 | 282 | .reveal a img { 283 | -webkit-transition: all .15s linear; 284 | -moz-transition: all .15s linear; 285 | transition: all .15s linear; } 286 | 287 | .reveal a:hover img { 288 | background: rgba(255, 255, 255, 0.2); 289 | border-color: #00008B; 290 | box-shadow: 0 0 20px rgba(0, 0, 0, 0.55); } 291 | 292 | /********************************************* 293 | * NAVIGATION CONTROLS 294 | *********************************************/ 295 | .reveal .controls { 296 | color: #00008B; } 297 | 298 | /********************************************* 299 | * PROGRESS BAR 300 | *********************************************/ 301 | .reveal .progress { 302 | background: rgba(0, 0, 0, 0.2); 303 | color: #00008B; } 304 | 305 | .reveal .progress span { 306 | -webkit-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985); 307 | -moz-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985); 308 | transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985); } 309 | 310 | /********************************************* 311 | * PRINT BACKGROUND 312 | *********************************************/ 313 | @media print { 314 | .backgrounds { 315 | background-color: #fff; } } 
316 | -------------------------------------------------------------------------------- /slides/presentation.yaml: -------------------------------------------------------------------------------- 1 | slide-level: 2 2 | revealjs-url: "../reveal.js" 3 | progress: true 4 | section-titles: false 5 | aspectratio: 1610 6 | mainfont: "IBM Plex Sans-Light" 7 | theme: "white" 8 | css: "minimal-theme.css" 9 | verbosity: INFO 10 | --------------------------------------------------------------------------------