├── .gitignore
├── .gitmodules
├── InstallationGuide.md
├── InstallationGuide.pdf
├── README.md
├── Week1
    ├── examples_instructor.ipynb
    ├── examples_student.ipynb
    ├── exercises.ipynb
    ├── lecture.html
    ├── lecture.md
    ├── lecture.pdf
    ├── planning.md
    └── solutions.ipynb
├── Week2
    ├── data
    │   ├── BES-2017-F2F-codebook.pdf
    │   ├── bes_data.csv
    │   ├── bes_data.feather
    │   ├── bes_data.pickle
    │   ├── bes_data_full_week2.csv
    │   ├── bes_data_full_week2.feather
    │   ├── bes_data_full_week2.json
    │   ├── bes_data_subset_week2.csv
    │   ├── bes_data_subset_week2.feather
    │   ├── bes_data_subset_week2.json
    │   ├── bes_f2f_2017_v1.3.dta
    │   ├── bes_relabelling.R
    │   ├── data_prep.py
    │   └── data_week2.zip
    ├── examples_instructor.ipynb
    ├── examples_student.ipynb
    ├── exercises.ipynb
    ├── lecture.html
    ├── lecture.md
    ├── lecture.pdf
    ├── planning.md
    └── solutions.ipynb
├── Week3
    ├── examples.ipynb
    ├── examples_instructor.ipynb
    ├── examples_student.ipynb
    ├── exercises.ipynb
    ├── groupby_example.png
    ├── lecture.html
    ├── lecture.md
    ├── lecture.pdf
    ├── planning.md
    ├── solutions.ipynb
    └── test.ipynb
├── Week4
    ├── crosstab_heatmap.py
    ├── examples_instructor.ipynb
    ├── examples_student.ipynb
    ├── exercises.ipynb
    ├── extra_challenge.png
    ├── figures.py
    ├── figures
    │   ├── lecture_box1.png
    │   ├── lecture_emptysubplot1.png
    │   ├── lecture_emptysubplot2.png
    │   ├── lecture_emptysubplot3.png
    │   ├── lecture_fig1.png
    │   ├── lecture_heatmap1.png
    │   ├── lecture_hist1.png
    │   ├── lecture_hist2.png
    │   ├── lecture_line1.png
    │   ├── lecture_linescatter1.png
    │   ├── lecture_linescatter2.png
    │   ├── lecture_linescatter3.png
    │   ├── lecture_linescatter4.png
    │   ├── lecture_rotated_labels1.png
    │   ├── lecture_scatter1.png
    │   ├── lecture_swarm1.png
    │   ├── lecture_swarm2.png
    │   └── lecture_violin1.png
    ├── latex_table.tex
    ├── lecture.html
    ├── lecture.md
    ├── lecture.pdf
    ├── planning.md
    ├── solutions.ipynb
    └── test.ipynb
├── Week5
    ├── examples_instructor.ipynb
    ├── examples_student.ipynb
    ├── exercises.ipynb
    ├── lecture.html
    ├── lecture.md
    ├── lecture.pdf
    ├── local_plot_utils.py
    ├── planning.md
    └── solutions.ipynb
├── Week6
    ├── examples.ipynb
    ├── examples_instructor.ipynb
    ├── examples_student.ipynb
    ├── exercises.ipynb
    ├── iris-TD-2.svg
    ├── lecture.html
    ├── lecture.md
    └── lecture.pdf
├── Week7
    ├── examples_instructor.ipynb
    ├── examples_student.ipynb
    ├── exercises.ipynb
    ├── lecture.html
    ├── lecture.md
    ├── lecture.pdf
    └── planning.md
├── Week8
    ├── examples_selenium.ipynb
    ├── examples_twitter.ipynb
    ├── lecture.html
    ├── lecture.md
    ├── lecture.pdf
    └── lecture_planning.md
├── Week8Old
    ├── examples.ipynb
    ├── figure1.dot
    ├── figure1.png
    ├── figure1.svg
    ├── lecture.html
    ├── lecture.md
    ├── lecture.pdf
    ├── planning.md
    └── wmd_fig1.png
├── _config.yml
├── dpir-intro-theme.css
├── images
    ├── anaconda_navigator_environments.png
    ├── anaconda_navigator_screenshot.png
    ├── atom_editor.png
    ├── jupyter_lab_editor.png
    └── jupyter_lab_launcher.png
├── index.md
├── ipynb_slideify.py
├── minimal-theme.css
├── misc_presentations
    ├── ConfoundedMeasurement.png
    ├── Measurement Bias-Confounding.png
    ├── Measurement Bias-Relationship as Indicator.png
    ├── Measurement Bias-Simple Diagram.png
    ├── Measurement Bias.drawio
    ├── Measurement Bias.xml
    ├── MeasurementFig1.png
    ├── a7.png
    ├── cess-mt21-pres.html
    ├── cess-mt21-pres.md
    ├── compgov_revision.html
    ├── comptext-pres.html
    ├── draft4.pdf
    ├── epsa.html
    ├── fig_bernoulli.png
    ├── figures.py
    ├── figures
    │   ├── allocations.pdf
    │   ├── effect_of_targeting_presentation.png
    │   ├── effect_of_targeting_total.pdf
    │   ├── feature_importance.pdf
    │   ├── heterogeneity_presentation.png
    │   └── predicted_favorability_f_pid.pdf
    ├── knn.png
    ├── minimal-theme.css
    ├── pip-colloquium.html
    ├── planning.md
    ├── presentation.html
    ├── presentation.md
    └── presentation_updated.md
├── syllabus.md
├── syllabus.pdf
└── teaching.yaml


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | #   For a library or package, you might want to ignore these files since the code is
 86 | #   intended to run in multiple environments; otherwise, check them in:
 87 | # .python-version
 88 | 
 89 | # pipenv
 90 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 91 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 92 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 93 | #   install all needed dependencies.
 94 | #Pipfile.lock
 95 | 
 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 97 | __pypackages__/
 98 | 
 99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 | 
103 | # SageMath parsed files
104 | *.sage.py
105 | 
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 | 
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 | 
119 | # Rope project settings
120 | .ropeproject
121 | 
122 | # mkdocs documentation
123 | /site
124 | 
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 | 
130 | # Pyre type checker
131 | .pyre/
132 | 
133 | # pytype static type analyzer
134 | .pytype/
135 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "reveal.js"]
2 | 	path = reveal.js
3 | 	url = https://github.com/hakimel/reveal.js
4 | 


--------------------------------------------------------------------------------
/InstallationGuide.md:
--------------------------------------------------------------------------------
 1 | # Python Installation Guide
 2 | _Musashi Harukawa, DPIR_
 3 | 
 4 | # Installing Anaconda
 5 | 
 6 | Anaconda is a freely-available data science software and environment manager. It can be used to manage versions and editors for Python, `R`, and other popular data science languages (such as `Julia`).
 7 | 
 8 | In order to simplify installing all the relevant packages and software, we will be using Anaconda for this class. The instructions on how to install Anaconda can be found at:
 9 | 
10 | - **Windows**: https://docs.anaconda.com/anaconda/install/windows/
11 | - **MacOS**: https://docs.anaconda.com/anaconda/install/mac-os/
12 | - **Linux**: https://docs.anaconda.com/anaconda/install/linux/
13 | 
14 | Follow the instructions contained within the guide. Note the following:
15 | 
16 | - Install Python version 3.7 or later. Do NOT install Python 2.7.
17 | - Unless you know what you are doing, I recommend that you install anaconda to the default location.
18 | - You do not need to install as admin.
19 | - You do not need PyCharm, but if you have a experience with Sublime or other industry-standard IDEs, you may prefer to use it (please note that you will still need to use Jupyter).
20 | - (Windows only): You do not need to add Anaconda to your PATH environment variable.
21 | - You do not need Anaconda Cloud.
22 | 
23 | If you run into trouble, _first try Googling the answer_ (or whatever your preferred non-invasive search engine is), and then ask me. Chances are that somebody has run into the same problem as you, and the answer exists on the Internet. If the problem persists, then feel free to get in touch.
24 | 
25 | # Verifying your Installation
26 | 
27 | Once you have installed Anaconda, check that everything works.
28 | 
29 | Open up Anaconda Navigator (like you would any other application). You should see a menu with a number of items including JupyterLab and Jupyter Notebooks. Try opening up either, and navigating to the directory (folder) where you will be keeping all of your notes for this course.
30 | 
31 | In this directory, create a Python Notebook and rename it to `my_first_notebook`. The default file ending should be `.ipynb`. Then in the first cell of this notebook, type the following command:
32 | 
33 | ```{python}
34 | print("Hello World!")
35 | ```
36 | 
37 | Click on "Run Selected Cell" in the notebook menu. You should get the following output:
38 | 
39 | ```
40 | > Hello World!
41 | ```
42 | 
43 | If all of this works without issue, then you are ready to come to class for week 1! If this does not work, and you are unable to troubleshoot the issue, please contact me prior to coming to class (so we can minimize the amount of time we spend in class installing software and troubleshooting indivdiual issues).
44 | 
45 | See you on Wednesday!
46 | 


--------------------------------------------------------------------------------
/InstallationGuide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/InstallationGuide.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Introduction to Python for Social Science
 2 | 
 3 | _Musashi Jacobs-Harukawa, Department of Politics and International Relations_
 4 | 
 5 | ## Course Description
 6 | 
 7 | _Introduction to Python for Social Science_ is an 8-week optional methods module aimed at social science researchers seeking to learn programming skills for their research. There will be weekly lectures, lasting 60 to 90 minutes, followed by a workshop, and supplemented by weekly office hours. All of the above will be conducted on Teams.
 8 | 
 9 | The aim of this course is two-fold. The first goal is to teach students essential _data analysis_ and _scripting_ skills so that they are able to put together short programs and run their own analyses. The second aim is to give a introduction to the numerous techniques and technologies that researchers can integrate into their own research, and to provide incentives to invest in computational methods and skills. Some of the techniques that will be taught include:
10 | 
11 | - Using Python as a Research and Development Tool
12 | - Data Cleaning and Merging with `pandas`
13 | - Static Data Visualisation with `matplotlib` and `seaborn`
14 | - Introduction to Machine Learning with `scikit-learn`
15 | - Introduction to Web Scraping with `beautifulsoup` and `selenium`
16 | 
17 | Note that this course is not a course in _programming_. Students will learn how to use Python for data analysis and research, but the primary focus is on teaching them about the available methods and the bare minimum level of programming to implement these methods. Also note that this course is optional, and there will be no marked assignments, but there will be weekly tasks designed to aid learning. Students are encouraged to complete these tasks, and to ask questions about them during the workshop and clinic.
18 | 
19 | This course is aimed at complete beginners, although experience with other programming languages (such as `R`) may provide some useful reference points. As spaces are limited, priority will be given to students without prior experience using Python, and those who have a use case for computational tools in their research.
20 | 
21 | ## Using this Repository
22 | 
23 | This repository contains all of the code, lecture slides, and jupyter notebooks for the course. You are welcome to clone this repository/browse the material here, but I've also made the effort to let you browse the slides in the browser at [`muhark.github.io/dpir-intro-python`](muhark.github.io/dpir-intro-python). I am also working on Google colab integration to allow students to work with the notebooks interactively from the website.
24 | 


--------------------------------------------------------------------------------
/Week1/exercises.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Week 1 Lecture Exercises\n",
  8 |     "\n",
  9 |     "_Refer to the lecture notes, examples, and the Internet to help you complete these tasks.<br>\n",
 10 |     "Model solutions will be posted next week._"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "## Task 1: Animal Sounds\n",
 18 |     "\n",
 19 |     "1. Create a dictionary of animals and their sounds and call it `animal_sounds`.\n",
 20 |     "    - Each value should be the corresponding sound for the animal.\n",
 21 |     "    - If your native language is not English, use the sounds from your language!\n",
 22 |     "2. Use a for loop to print the statement \"In my language, the **ANIMAL** makes the sound **SOUND**\" for each key-value pair in your dictionary.\n",
 23 |     "\n",
 24 |     "**_Extra Challenge_**:\n",
 25 |     "\n",
 26 |     "1. Create two separate lists, `animals` and `sounds`.\n",
 27 |     "    - `animals` should be the list of the animals used in the previous task.\n",
 28 |     "    - `sounds` should be the list of corresponding sounds.\n",
 29 |     "    - Also: Make sure the `type` of `animals` and `sounds` is `list`!\n",
 30 |     "2. Create an empty dictionary called `animal_sounds`.\n",
 31 |     "    - Hint: This can be done with `{}`\n",
 32 |     "3. Use a for loop to populate the dictionary with the information from animals and sounds.\n",
 33 |     "    - _In the same for-loop, print the same statements as in the previous section_."
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Task 2: Writing a Menu\n",
 41 |     "\n",
 42 |     "A menu typically consists of the following information:\n",
 43 |     "\n",
 44 |     "- Course\n",
 45 |     "- Dish\n",
 46 |     "- Description\n",
 47 |     "- Price\n",
 48 |     "\n",
 49 |     "In this exercise, you will experiment with different ways of representing this information.\n",
 50 |     "\n",
 51 |     "1. Dictionary of Dictionaries (Nested Hierarchy)\n",
 52 |     "    - Create a dictionary called `menu1`.\n",
 53 |     "    - For each dish, create a second dictionary with the keys `'course'`, `'price'`,  and `'description'`. Fill these in accordingly.\n",
 54 |     "    \n",
 55 |     "2. Dictionary of Lists\n",
 56 |     "    - Create a dictionary called `menu2`.\n",
 57 |     "        - For each of the keys `'dish'`, `'course'`, `'description'` and `'price'`, write a list of all of the values.\n",
 58 |     "        - Hint: `'course'` will contain many repeated values.\n",
 59 |     "\n",
 60 |     "**_Extra Challenge_**:\n",
 61 |     "\n",
 62 |     "- For both methods, find a way to iterate over the dictionary to print out a menu.\n",
 63 |     "- The fancier the better!\n",
 64 |     "    - Note that you can get the length of a string using the `len` function. You can use this to create aligned columns!"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "markdown",
 69 |    "metadata": {},
 70 |    "source": [
 71 |     "# Task 3:\n",
 72 |     "\n",
 73 |     "Similar to the exercise of making a sentence from the fewest letters possible. \n",
 74 |     "\n",
 75 |     "- Create a list of five letters and a space, call it `letters`.\n",
 76 |     "- Figure out the longest sentence you can make from those letters.\n",
 77 |     "- Use the indices of the list to write a sentence.\n",
 78 |     "- Create a new sentence using a for loop and the `join` function.\n",
 79 |     "\n",
 80 |     "**_Extra Challenge_**:\n",
 81 |     "\n",
 82 |     "There are other, smarter ways of doing this with dictionaries and lists. See if you can find a better method than the one below!"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "# Task 4 (Bonus):\n",
 90 |     "\n",
 91 |     "A prime number is a natural number ($\\mathbb{N}$) that is greater than 1 and is not the product of two smaller natural numbers.\n",
 92 |     "\n",
 93 |     "Write code that prints all prime numbers less than 10000\n",
 94 |     "\n",
 95 |     "For an additional challenge, write `%%timeit` at the top of the codeblock to see how long your code takes to execute. See how fast you can make your code."
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": []
104 |   }
105 |  ],
106 |  "metadata": {
107 |   "kernelspec": {
108 |    "display_name": "teaching",
109 |    "language": "python",
110 |    "name": "teaching"
111 |   },
112 |   "language_info": {
113 |    "codemirror_mode": {
114 |     "name": "ipython",
115 |     "version": 3
116 |    },
117 |    "file_extension": ".py",
118 |    "mimetype": "text/x-python",
119 |    "name": "python",
120 |    "nbconvert_exporter": "python",
121 |    "pygments_lexer": "ipython3",
122 |    "version": "3.7.6"
123 |   }
124 |  },
125 |  "nbformat": 4,
126 |  "nbformat_minor": 4
127 | }
128 | 


--------------------------------------------------------------------------------
/Week1/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week1/lecture.pdf


--------------------------------------------------------------------------------
/Week1/planning.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Intro to Python Course Planning
 3 | ---
 4 | 
 5 | _From the syllabus_:
 6 | 
 7 | ### _Week 1_: Introduction to Python and the Development Environment
 8 | 
 9 | **Learning Aims**:
10 | 
11 | 1. What is Python and what can I use it for?
12 | 2. What are the tools I can use to write Python code?
13 | 3. Writing your first Python script
14 | 
15 | There are three learning goals in the first week. The first relates to what Python is, and how it can be useful for social science researchers. Students will learn about the various use cases for Python, and come up with ways that it may help them achieve their research aims.
16 | 
17 | The second learning goal is to gain familiarity with the tools used to code in Python and present their research. These include Jupyter notebooks, IDEs and the terminal. Students will primarily use Jupyter notebooks in this course, but are welcome to use alternative development tools.
18 | 
19 | The final goal is to write their first program in Python. Commands and operators such as `print`, `+`, `&` etc. will be introduced.
20 | 
21 | ### Coding Goals
22 | 
23 | - print()
24 | - import, from, as
25 | - int, float, str (and bool)
26 | - basic arithmetic
27 | - basic string operations
28 | - lists and dicts
29 | 
30 | ### Next Week
31 | 
32 | - Data I/O (requires knowledge of strings, paths)
33 | - Constructing pandas dataframes from dicts and lists
34 | - Selecting
35 | 
36 | 
37 | ### Lecture
38 | 
39 | The lecture begins with a few administrative points, and then goes into the following parts:
40 | 
41 | - _What is Python and what can I use it for_?
42 |     - What is Python?
43 |         - _General purpose scripting language with large data science community_
44 |             - What is a script? For our purposes, it automates some task.
45 |                 - Usually some inputs and an output, but sometimes it can just generate things (from some external data source, e.g. the web).
46 |                 - Good to keep this input-output idea in your head. Each script should take some inputs, and give some outputs.
47 |     - What can I use Python for?
48 |         - List possible applications of Python for social science research. My primary goal is to motivate students, but also to try and illustrate the broad possibilities.
49 |             - Aside: what do I use Python for?
50 |                 - General scripting
51 |                 - Quick data visualisation
52 |                 - Data Cleaning
53 |                 - Natural Language Processing
54 |                 - Web Scraping/Data Collection
55 |         - If possible, try and find a number of papers that have used Python. This isn't always obvious, and `R` tends to be more popular in the computational political science community, whereas python is more popular amongst the engineering/hard sciences.
56 |     - Quick Aside: Python vs `R`
57 |         - This question comes up frequently. My two cents on the debate is that whereas `R` is a language specifically for statistical computing, python is a general-purpose programming language popular with the data science community. There is a lot of overlap in the functionality between the two languages, and learning either one will enable you to do many things anyway.
58 | - _Basic Coding Tools_
59 |     - _Aim of this is to understand a few things about the coding toolkit and interface_.
60 |     - Anaconda: An environment manager. Python has many _libraries_ and _versions_; this software helps you keep them tidy.
61 |     - Jupyter: Code editor (and executor). Takes the form of terminals, notebooks and lab.
62 |         - How to start Jupyter (and why is this running in a browser??)
63 |         - Navigating the Jupyter Lab interface
64 |     - Some other IDEs (for nerds):
65 |         - Atom: my preferred tool, for anyone who is interested.
66 |         - PyCharm: most commonly used tool for development, good for people in the class with prior experience coding in other languages.
67 |         - vim: if you're hardcore
68 |         - Takeaway: Python is a language, which is separate from the tools you use to write and execute it.
69 | - _First Steps in Python_: (to be done in RISE)
70 |     1. `print()`
71 |         - Note: notebook returns output of last command, if this is just a variable then it returns that. In general if you want print statements, then
72 |     2. variable assignment
73 |     3. binary operators: +, -, ==
74 |     4. 4 basic data types
75 |     5. lists and dicts
76 | 


--------------------------------------------------------------------------------
/Week2/data/BES-2017-F2F-codebook.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week2/data/BES-2017-F2F-codebook.pdf


--------------------------------------------------------------------------------
/Week2/data/bes_data.feather:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week2/data/bes_data.feather


--------------------------------------------------------------------------------
/Week2/data/bes_data.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week2/data/bes_data.pickle


--------------------------------------------------------------------------------
/Week2/data/bes_data_full_week2.feather:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week2/data/bes_data_full_week2.feather


--------------------------------------------------------------------------------
/Week2/data/bes_data_subset_week2.feather:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week2/data/bes_data_subset_week2.feather


--------------------------------------------------------------------------------
/Week2/data/bes_f2f_2017_v1.3.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week2/data/bes_f2f_2017_v1.3.dta


--------------------------------------------------------------------------------
/Week2/data/bes_relabelling.R:
--------------------------------------------------------------------------------
  1 | library("FactoRMine")
  2 | library("haven")
  3 | library("plyr")
  4 | library("dplyr")
  5 | 
  6 | options(
  7 |   summary.stats.lm = c(
  8 |     "R-squared",
  9 |     "p",
 10 |     "Deviance",
 11 |     "AIC",
 12 |     "BIC"
 13 |   )
 14 | )
 15 | 
 16 | data <- read_stata("/home/lunayneko/Documents/Teaching/dpir-intro-python/Week2/data/bes_f2f_2017_v1.3.dta")
 17 | 
 18 | #' B: RESPONDENT'S ELECTORAL BEHAVIOUR
 19 | #'
 20 | #' B1: Did you vote in 2017?
 21 | #' B2: Who did you vote for?
 22 | #' B4: If you had voted, who would you have voted for?
 23 | #' ^^ The above two can likely be combined to a single preferred party variable.
 24 | #' B6: Why did you vote the way you did?
 25 | #' ^^ Answers (3, 4) indicate strategic voting, B6a checks actual preferred
 26 | #'    party.
 27 | #' B12-B13: How closely do Tories(12)/Labour(13) look after the interests of:
 28 | #'          - BAME
 29 | #'          - Trade unions
 30 | #'          - Middle class people
 31 | #'          - Big business
 32 | #'          - Working class people
 33 | #'          - People who are unemployed or on benefits
 34 | #'    1: Very closely<>Not at all closely :4
 35 | #'
 36 | #' C: ATTITUDES TOWARD VOTING
 37 | #'
 38 | #' C1: How interested were you in the general election? (1 very)
 39 | #' C2_2: _It is every citizen's duty to vote in an election_ (1 strong disagree)
 40 | #'
 41 | #' D: PARTY ID
 42 | #'
 43 | #' D1: Which party do you think of yourself as?
 44 | #' -> If no, D2: do you think of yourelf as a little closer to one?
 45 | #'    ->  If yes, D3: which party is that?
 46 | #' D4: Strength of party identification. Can add option (4) based on non-
 47 | #'     identifaction based on answer to D1/D2
 48 | #'
 49 | #' E: LEFT-RIGHT
 50 | #'
 51 | #' E1: Left-Right self-placement
 52 | #' ^^ Would be interesting to compare correlation of PCA with LR
 53 | #'
 54 | #' F1: Range of political statements:
 55 | #'  01 Ordinary working people get their fair share of the nation's wealth
 56 | #'  02 There is one law for the rich and one for the poor
 57 | #'  03 Young people today don't have enough respect for traditional British
 58 | #'     values
 59 | #'  04 Censorship of films and magazines is necessary to uphold moral standards
 60 | #'  05 There is no need for strong trade unions to protect
 61 | #'     employees' working conditions and wages
 62 | #'  06 Private enterprise is the best way to solve Britain's economic problems
 63 | #'  07 Major public services and industries ought to be in state ownership
 64 | #'  08 It is the government's responsibility to provide a job for everyone who
 65 | #'     wants one
 66 | #'  09 People should be allowed to organise public meetings to protest against
 67 | #'     the government
 68 | #'  10 People in Britain should be more tolerant of those who lead
 69 | #'    unconventional lives
 70 | #'  11 For some crimes, the death penalty is the most appropriate sentence
 71 | #'  12 People who break the law should be given stiffer sentences
 72 | #'
 73 | #' P: Europe 0-10, self and parties
 74 | #'
 75 | #' P1: How did you vote in the Brexit referendum?
 76 | #' P2: How would you have voted in the Brexit referendum?
 77 | #' P3_1: Own view, European integration
 78 | #'
 79 | #'
 80 | #' W: CLASS
 81 | #'
 82 | #' W1: Class self-identification (1 middle, 2 working, 3 other)
 83 | #' -> If not W1(1, 2), W2: if you had to choose, middle or working (upper is
 84 | #'                         option but not mentioned).
 85 | #'
 86 | #' Y: DEMOGRAPHICS
 87 | #'
 88 | #' Y09: Gender
 89 | #' Y13A: Highest education achieved
 90 | #'
 91 | # Let's predict Brexit vote and whether they voted Tory.
 92 | 
 93 | vars <- c(
 94 |   "b01", "b02",
 95 |   "c01", "c02_2",
 96 |   "d01", "d03", "d02", "d04",
 97 |   "e01",
 98 |   "f01_1", "f01_2", "f01_3", "f01_4", "f01_5",
 99 |   "f01_6", "f01_7", "f01_8", "f01_9", "f01_10",
100 |   "f01_11", "f01_12",
101 |   "p01", "p02", "p03_1",
102 |   "w01", "w02",
103 |   "y09", "edlevel", "region")
104 | 
105 | df <- data %>% select(vars)
106 | 
107 | # Construct variables that are split over columns
108 | 
109 | f_batt <- c(
110 |   "f01_1", "f01_2", "f01_3", "f01_4", "f01_5",
111 |   "f01_6", "f01_7", "f01_8", "f01_9", "f01_10",
112 |   "f01_11", "f01_12")
113 | f_pca <- PCA(select(df, all_of(f_batt)), ncp=2)
114 | 
115 | 
116 | df <- df %>% transmute(
117 |   voted = factor(as.integer(b01==1), levels=c(0,1)),
118 |   tory_vote = factor(tidyr::replace_na(b02==2, 0), levels=c(0,1)),
119 | #  lab_vote = factor(replace_na(b02==1, 0), levels=c(0,1)),
120 |   election_interest = as.integer(c01),
121 |   civic_duty = as.integer(ifelse(c02_2==-1, 3, c02_2)),
122 |   party_id = select(df, c("d01", "d03")) %>%
123 |     apply(function(x){ifelse(x[1]<=0 & !is.na(x[2]),x[2],x[1])}, MARGIN=1) %>%
124 |     mapvalues(attr(data$d01, 'labels'), labels(attr(data$d01, 'labels'))) %>%
125 |     factor(),
126 |   ideo_lr = as.integer(ifelse(df$e01<0, 5, df$e01)),
127 |   ideo_pc1 = f_pca$ind$coord[,"Dim.1"],
128 |   ideo_pc2 = f_pca$ind$coord[,"Dim.2"],
129 |   vote_leave = factor(as.integer(p01==2)),
130 |   class = factor(ifelse(df$w01!=1&df$w01!=2, df$w02, df$w01)),
131 |   female = factor(as.integer(y09==2)),
132 |   edlevel = factor(tidyr::replace_na(edlevel, 0)),
133 |   region = factor(region) %>% relevel(ref="London")
134 | )
135 | 
136 | feather::write_feather("bes_data.feather")
137 | write.csv("bes_data.csv")
138 | 
139 | 


--------------------------------------------------------------------------------
/Week2/data/data_prep.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # This script contains the commands used to create the BES data subsets we use
 4 | # for this week's lecture from the original data download.
 5 | # It's in the repo for future reference, or in case you were curious where the
 6 | # files actually come from.
 7 | 
 8 | import pandas as pd
 9 | import re
10 | import numpy as np
11 | 
12 | # Reading in source data file, in stata format >:(
13 | df = pd.read_stata("bes_f2f_2017_v1.3.dta")
14 | 
15 | # Choosing subset of columns for week 2.
16 | cols = df.columns.tolist()
17 | pattern = re.compile(r"^[aekxy][0-1][0-9]$")
18 | subset = cols[0:1]+cols[337:338]+cols[340:346]+cols[355:356]+['Age'] + \
19 |          [col for col in cols if re.match(pattern, col)]
20 | 
21 | # Fixing data for Feather conversion
22 | 
23 | df.loc[:, 'Age'] = df['Age'].replace({"Refused": np.nan}).astype(np.float)
24 | df.loc[:, 'q25_cses'] = df['q25_cses'].replace(
25 |     {'Not stated': np.nan}).astype(np.float)
26 | 
27 | 
28 | # Creating a new dataframe with just these columns.
29 | week2 = df[subset].copy()
30 | 
31 | 
32 | # Saving to `csv`, `json`, `feather`; `hdf` requires too many dependencies
33 | df.to_csv("bes_data_full_week2.csv", index=False)
34 | df.to_json("bes_data_full_week2.json")
35 | df.to_feather("bes_data_full_week2.feather")
36 | week2.to_csv("bes_data_subset_week2.csv", index=False)
37 | week2.to_json("bes_data_subset_week2.json")
38 | week2.to_feather("bes_data_subset_week2.feather")
39 | # week2.to_hdf("bes_data_subset_week2.hdf", key="a_meta")
40 | 


--------------------------------------------------------------------------------
/Week2/data/data_week2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week2/data/data_week2.zip


--------------------------------------------------------------------------------
/Week2/exercises.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Week 2 Lecture Exercises\n",
  8 |     "\n",
  9 |     "\n",
 10 |     "We'll be working with the BES 2017 face-to-face cross-sectional survey extensively in this course.\n",
 11 |     "\n",
 12 |     "- You can download a zip folder containing the data from the website: https://muhark.github.io/dpir-intro-python/Week2/data/data_week2.zip\n",
 13 |     "- For these exercises, you can either use `bes_data_full_week2` or `bes_data_subset_week2`.\n",
 14 |     "- I've included the codebook (`BES-2017-F2F-codebook.pdf`). You'll need this to interpret the columns."
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "### Step 0: Read in the Data\n",
 22 |     "\n",
 23 |     "I've taken this first step for you because I'm hosting the data files online. Normally you would write a filepath to the location the file is being kept relative to where the script is being executed."
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "import pandas as pd\n",
 33 |     "\n",
 34 |     "link = 'http://github.com/muhark/dpir-intro-python/raw/master/Week2/data/bes_data_subset_week2.feather'\n",
 35 |     "bes_df = pd.read_feather(link)"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "## Exercise 1: First Look at the Data\n",
 43 |     "\n",
 44 |     "_Answer the following questions about the dataset_:\n",
 45 |     "\n",
 46 |     "- How many observations in the dataset?\n",
 47 |     "- How many variables?\n",
 48 |     "- How many variables contain numeric values?\n",
 49 |     "- How many variables are open-ended response?\n",
 50 |     "- How many categorical variables?"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": []
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "# Exercise 2: Clean Up Labels\n",
 65 |     "\n",
 66 |     "_It's annoying to have to always refer to the codebook. Choose a few sections from the survey (i.e. questions a, questions b, etc.) and give the columns short, meaningful titles._\n",
 67 |     "\n",
 68 |     "For this part of the assignment, \n",
 69 |     "\n",
 70 |     "For instance, `a01` asks \"First, I'd like to ask you a few questions about the issues and problems facing Britain today. As far as you're concerned, what is the single most important issue facing the country at the present time?\". I might rename this question `most_important_issue`, or even `top_issue`.\n",
 71 |     "\n",
 72 |     "Another example: `y01` could be renamed `income` or `annual_income`.\n",
 73 |     "\n",
 74 |     "To keep your code neat, I recommend that you first create a dictionary called something like `col_name_dict`, put the original and replacements in there, and then use the `df.rename()` function to substitute the column names.\n"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": []
 83 |   },
 84 |   {
 85 |    "cell_type": "markdown",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "## Exercise 3: Cursory Statistics\n",
 89 |     "\n",
 90 |     "There are a few things you can calculate fairly easily. For instance:\n",
 91 |     "\n",
 92 |     "- How many responses per region? per constituency?\n",
 93 |     "- (If using section y:) Median income bracket? Modal religion? Mean/median age?\n",
 94 |     "\n",
 95 |     "Here you want to be creative. What questions would you ask of your data? What would a reviewer or a client be likely to want to know?\n",
 96 |     "\n",
 97 |     "For an additional challenge, calculate each of the statistics per-region, e.g. median income bracket per-region."
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": []
106 |   }
107 |  ],
108 |  "metadata": {
109 |   "kernelspec": {
110 |    "display_name": "Python 3",
111 |    "language": "python",
112 |    "name": "python3"
113 |   },
114 |   "language_info": {
115 |    "codemirror_mode": {
116 |     "name": "ipython",
117 |     "version": 3
118 |    },
119 |    "file_extension": ".py",
120 |    "mimetype": "text/x-python",
121 |    "name": "python",
122 |    "nbconvert_exporter": "python",
123 |    "pygments_lexer": "ipython3",
124 |    "version": "3.7.9"
125 |   }
126 |  },
127 |  "nbformat": 4,
128 |  "nbformat_minor": 4
129 | }
130 | 


--------------------------------------------------------------------------------
/Week2/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week2/lecture.pdf


--------------------------------------------------------------------------------
/Week2/planning.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Intro to Python Course Planning
 3 | ---
 4 | 
 5 | 
 6 | # Lecture Structure
 7 | 
 8 | - Recap
 9 | - This Week
10 | - Data Structures (Theoretical)
11 |     - Importance of Understanding Data
12 |     - Value and Relation
13 |     - Data Structures (Graph, Hierarchical, Tabular)
14 | - Data Formats
15 |     - `csv`, `xls(x)`, `html`
16 | - Data I/O
17 |     - `read_csv`
18 |     - `read_xlsx`
19 |     - `read_html`
20 | - The `pandas.DataFrame`
21 |     - `.info()`
22 |         - numpy dtypes
23 | - Slicing and indexing your dataframe
24 |     - `[]`
25 |     - `.loc[]`
26 |     - `.iloc[]`
27 |     - Views vs copies
28 | - Understanding your data
29 |     - `.head()`
30 |     - `.describe()`
31 |     - `.unique()`
32 |     - `.value_counts()`
33 | - Summary functions
34 |     - `.mean()`
35 |     - `.sum()`
36 | 


--------------------------------------------------------------------------------
/Week3/exercises.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Week 3 Exercises\n",
  8 |     "\n",
  9 |     "This week we learned how to do the following tasks:\n",
 10 |     "\n",
 11 |     "- Write functions.\n",
 12 |     "- Apply functions element-wise, cumulatively.\n",
 13 |     "- Calculate point and grouped summaries.\n",
 14 |     "- Concatenate and Merge Datasets\n"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "## Task 1: Functions\n",
 22 |     "\n",
 23 |     "### Task 1a: Numeric Functions\n",
 24 |     "\n",
 25 |     "In this exercise you write functions whose domain are either scalar numbers or numeric vectors.\n",
 26 |     "\n",
 27 |     "#### Scalar Functions\n",
 28 |     "\n",
 29 |     "- One Input: Absolute value\n",
 30 |     "- Two Inputs: Calculate the difference between the first input and the largest multiple of the second input that is less than the first input. Therefore, if the inputs are (41, 10), the function should calculate 41 - 4\\*10 = 1.\n",
 31 |     "- Challenge: Write a function that returns the factors of the input. For example, 132 = 2\\*2\\*3\\*11, so $f(132) = \\{2, 2, 3, 11\\}$\n",
 32 |     "\n",
 33 |     "#### Vector Functions\n",
 34 |     "\n",
 35 |     "- One Input: Write a summary statistics function. Given a vector, this function should return the following statistics in a `pd.Series` object with corresponding index labels: number of elements, sum, mean, median, variance, standard deviation, and any other statistics that you think are helpful.\n",
 36 |     "- Two Inputs: Write a function that given two equal-length inputs, determines whether each element in the first is divisible by the second. The output should be a vector of equal length to the inputs, indicating with True/False values whether the arguments of the first vector were divisible by the corresponding element in the second. CHALLENGE: Allow the function to take either a scalar or vector input as its second argument.\n",
 37 |     "\n",
 38 |     "### Task 1b: String Functions\n",
 39 |     "\n",
 40 |     "#### Scalar Functions\n",
 41 |     "\n",
 42 |     "- One Input: Write a function that divides a string into a list of words. Note: the `str.split()` function is useful here.\n",
 43 |     "- Two Inputs: Write a function that calculates the number of times the second argument occurs in the first. e.g. \"How many times does the letter e occur in this sentence?\"\n",
 44 |     "\n",
 45 |     "#### Vector Function\n",
 46 |     "\n",
 47 |     "- One Input: Write a function that, given a vector/list/series of strings, returns a series where the index is are the unique words in the input, and the values are the number of times that unique word occurs in the entire input. Therefore, if I took a list containing all of the State of the Union Address, I want a function that tells me a) what the unique words in the collection of all Addresses is, and b) how many times those words occur in the total collection.\n"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "markdown",
 52 |    "metadata": {},
 53 |    "source": [
 54 |     "## Task 2: Apply\n",
 55 |     "\n",
 56 |     "### Task 2a: Element-Wise Operations\n",
 57 |     "\n",
 58 |     "1. Using the `Age` variable from the BES dataset, calculate the age of each respondent rounded down to the nearest multiple of 5. Try writing this both using a defined function and with a `lambda` function.\n",
 59 |     "2. Recode the column `y09` as 0 and 1.\n",
 60 |     "3. Write a function that gets the lower bound from the income bounds reported in column `y01`, and returns it as an integer.\n",
 61 |     "\n",
 62 |     "\n",
 63 |     "### Task 2b: Grouped Functions\n",
 64 |     "\n",
 65 |     "1. Calculate the summary statistics on `Age` for each region, and each region/constituency.\n",
 66 |     "2. Calculate the median income bracket (`y01`) per region and region/constituency.\n",
 67 |     "3. Calculate the most commonly given answer to `a02` per region and region/income bracket.\n",
 68 |     "4. Calculate the most commonly given answer to `a02` and `y06` per region."
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": []
 77 |   }
 78 |  ],
 79 |  "metadata": {
 80 |   "kernelspec": {
 81 |    "display_name": "teaching",
 82 |    "language": "python",
 83 |    "name": "teaching"
 84 |   },
 85 |   "language_info": {
 86 |    "codemirror_mode": {
 87 |     "name": "ipython",
 88 |     "version": 3
 89 |    },
 90 |    "file_extension": ".py",
 91 |    "mimetype": "text/x-python",
 92 |    "name": "python",
 93 |    "nbconvert_exporter": "python",
 94 |    "pygments_lexer": "ipython3",
 95 |    "version": "3.7.6"
 96 |   }
 97 |  },
 98 |  "nbformat": 4,
 99 |  "nbformat_minor": 4
100 | }
101 | 


--------------------------------------------------------------------------------
/Week3/groupby_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week3/groupby_example.png


--------------------------------------------------------------------------------
/Week3/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week3/lecture.pdf


--------------------------------------------------------------------------------
/Week3/planning.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Week 3 Planning - Data Structures and Pandas II
 3 | author: Musashi Harukawa
 4 | ---
 5 | 
 6 | <!-- # From the Syllabus
 7 | 
 8 | **Learning Aims**:
 9 | 
10 | 1. Writing Python functions
11 | 2. Vectorize with `apply`
12 | 3. Split-apply-combine with `groupby`
13 | 4. Working with datetime data
14 | 
15 | The third week builds on students' knowledge of `pandas`, introducing two key tools in data analysis: `apply` and `groupby`. Students will also learn how to write functions and be introduced to the idea of namespaces.
16 | 
17 | By the end of this week, students should have a sufficient grounding in handling tabular data with base Python and `pandas` to deal with most data cleaning and reshaping tasks they use in their own research.
18 | 
19 | # Lecture Structure
20 | 
21 | - Recap from last week
22 |     - Series and DataFrames
23 |     - Slicing and Indexing
24 |     - Summary functions
25 | - Writing functions
26 |     - We want to be able to apply any summary function
27 |     - How to write functions in Python
28 |         - `def`
29 |         - inputs and outputs
30 |         - naming conventions
31 |         - Extra: `lambda` functions
32 |     - Aside: when to use functions (if you find yourself copy pasting code and editing, then use a function).
33 | - pandas `apply`
34 |     - `pd.Series.apply`
35 |     - `pd.DataFrame.apply`
36 |     - `apply` vs `itertuples` or `iterrows`
37 | - Split-apply-combine
38 |     - Frequent use case: apply function to groups within the data. For example, get a per-state average.
39 |     - Diagrammatic explanation
40 |     - A few examples
41 |     - The issue with multi-indexes, the easy fix.
42 | - Working with datetime data (is a nightmare)
43 |     - The nightmare of datetimes
44 |     - `strptime` and `strftime`
45 |     - python native `datetime`, pandas `Timestamp` and numpy `datetime64`
46 |     - `pd.Series.dt` functions, and how they can be used with `groupby` -->
47 | 
48 | # Methods Covered:
49 | 
50 | - Writing functions
51 | - Column/DataFrame apply
52 | - Groupby-Apply
53 | - Append, concat and merge
54 | - Melt and pivot
55 | 
56 | # Methods/Theory
57 | 
58 | - What is a function?
59 | - Applying functions to vectors:
60 |     - transformation, pointwise, and summaries
61 |     - grouped summary
62 | - Combining datasets
63 |     - Append vs concat vs merge
64 |     - A bit of set theory: union, etc.
65 | - Long vs wide-form data
66 | 
67 | 
68 | # Computational Aspect
69 | 
70 | - Functions:
71 |     - Tools for control flow + generalizability
72 |     - Namespaces
73 | - Apply:
74 | - Groupby-Apply:
75 |     - Vectorization and performance
76 | - Append, concat and merge
77 |     - Performance, accessibility over indexed data
78 | -
79 | 
80 | 
81 | Scrap:
82 | 
83 | $$
84 | f(X_{i, 1}) = \begin{bmatrix}
85 |                     f(x_{1, 1}) \\
86 |                     f(x_{2, 1}) \\
87 |                     \vdots \\
88 |                     f(x_{N, 1})
89 |                 \end{bmatrix}
90 | $$
91 | 


--------------------------------------------------------------------------------
/Week4/crosstab_heatmap.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import re
 3 | import seaborn as sns
 4 | import matplotlib
 5 | import matplotlib.pyplot as plt
 6 | 
 7 | df = pd.read_feather("../Week2/data/bes_data_subset_week2.feather")
 8 | matplotlib.rcParams["text.usetex"] = True
 9 | 
10 | 
11 | def crosstab_heatmap(df, col1, col2, col_dict):
12 |     # Produce crosstab
13 |     xtab = pd.crosstab(df[col1], df[col2])
14 |     # Right-column should show total number of answers
15 |     if isinstance(xtab.columns, pd.CategoricalIndex):
16 |         xtab.columns = xtab.columns.add_categories("Total")
17 |     if isinstance(xtab.index, pd.CategoricalIndex):
18 |         xtab.index = xtab.index.add_categories("Avg Prop")
19 |     xtab.loc[:, "Total"] = xtab.sum(axis=1)
20 |     xtab = xtab.loc[xtab['Total'] >= xtab.shape[1]*5]
21 |     props = xtab.iloc[:, :-1].div(xtab['Total'], axis=0)
22 |     props.loc['Avg Prop', :] = props.mean(axis=0)
23 |     xtab.loc['Avg Prop', :] = ""
24 |     props.loc[:, 'Total'] = ""
25 | 
26 |     annot = xtab.applymap(
27 |         lambda x: "{:.4g}\n".format(int(x)) if isinstance(x, float) else ""
28 |         ).add(
29 |             props.applymap(
30 |                 lambda x: "({:.3g}\%)".format(
31 |                     100*x) if isinstance(x, float) else ""
32 |                 ))
33 | 
34 |     annot.loc['Avg Prop', 'Total'] = "{:.5g}".format(xtab.iloc[:-1, -1].sum())
35 |     annot.loc['Avg Prop', :] = annot.loc['Avg Prop',
36 |                                          :].str.replace(re.compile(r"[\(\)]"), "")
37 |     annot.loc[:, 'Total'] = annot['Total'].str.replace("\n", "")
38 |     diffs = props.iloc[:, :-1] - props.iloc[-1, :-1]
39 |     diffs['Total'] = float(0)
40 |     diffs = diffs.applymap(lambda x: 100*x)
41 | 
42 |     fig_title = "Crosstab Heatmap of \\textbf{"+col_dict[col1]+"} by \\textbf{"+col_dict[col2]
43 | 
44 |     f, ax = plt.subplots(1, 1, figsize=annot.shape)
45 |     sns.heatmap(
46 |         diffs.T,
47 |         annot=annot.T,
48 |         fmt="s",
49 |         ax=ax,
50 |         cmap="RdBu_r",
51 |         center=0,
52 |         cbar_kws={"label": "Difference from Avg. Proportion"},
53 |     )
54 |     ax.set_title(fig_title)
55 |     ax.xaxis.tick_top()
56 |     ax.xaxis.set_label_position("top")
57 |     ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="left")
58 |     ax.set_yticklabels(ax.get_yticklabels())
59 |     f.axes[1].set_yticklabels(
60 |         [lab.get_text() + "\%" for lab in f.axes[1].get_yticklabels()]
61 |     )
62 |     ax.axvline(diffs.shape[0] - 1, color="k")
63 |     ax.axhline(diffs.shape[1] - 1, color="k")
64 |     ax.set_xlabel("")
65 |     ax.set_ylabel("")
66 |     ax.xaxis.set_label_position("bottom")
67 |     return f
68 | 
69 | 
70 | 
71 | col_dict = {'a02': 'Best Party', 'y09': 'Gender'}
72 | f = crosstab_heatmap(df, 'a02', 'y09', col_dict)
73 | f.savefig("extra_challenge.png", bbox_inches="tight")
74 | 


--------------------------------------------------------------------------------
/Week4/exercises.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Week 4 Exercises: Data Visualisation\n",
 8 |     "\n",
 9 |     "This week in particular does not have correct \"solutions\". However, I encourage you to attempt the following:\n",
10 |     "\n",
11 |     "- Make your figures as complete and professional as possible. This means adding titles, legends, axis labels, and making them aesthetically pleasing.\n",
12 |     "- Write the solutions as generalisable functions. As much as possible you should be able to substitute the inputs and still get a complete and correctly-labelled figure.\n",
13 |     "\n",
14 |     "Additional code examples can be found in: https://github.com/muhark/dpir-intro-python/blob/master/Week4/figures.py"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "## Task 1: Simple Figure\n",
22 |     "\n",
23 |     "Using the BES data:\n",
24 |     "\n",
25 |     "- Create a figure with a single subplot.\n",
26 |     "- Plot the answer to item 'a02' (party best suited to tackle the biggest issue in Britain) as a function of age.\n",
27 |     "\n",
28 |     "As an additional challenge, try using only functions from `matplotlib` for this figure."
29 |    ]
30 |   },
31 |   {
32 |    "cell_type": "markdown",
33 |    "metadata": {},
34 |    "source": [
35 |     "## Task 2: Panelling\n",
36 |     "\n",
37 |     "Recreate the same figure as above, but with a separate subplot for each region.\n",
38 |     "\n",
39 |     "- Challenge 1: Use a for-loop\n",
40 |     "- Challenge 2: Write this as a function where 'a02' and 'region' can be substituted for other categorical variables.\n",
41 |     "- Challenge 3: Make the figure size dynamic, i.e. a function of the number of subplots.\n",
42 |     "- Challenge 4: Limit the number of subplot columns to 4; if there are more than 4 categories, the function should add an additional row to fit them."
43 |    ]
44 |   },
45 |   {
46 |    "cell_type": "markdown",
47 |    "metadata": {},
48 |    "source": [
49 |     "## Task 3: Color Palettes and Mapping\n",
50 |     "\n",
51 |     "Seaborn has a lot of resources for customising the color palettes in your figures. See: https://seaborn.pydata.org/tutorial/color_palettes.html\n",
52 |     "\n",
53 |     "A useful tool when creating figures is creating a column of color values. In other words, given some categorical column, you want to be able to create a column where each category is replaced by a unique color.\n",
54 |     "\n",
55 |     "Let's try doing this:\n",
56 |     "\n",
57 |     "- First, create a color palette with a number of colors equal to the number of categories in your column. To do this, you will need the `sns.color_palette()`, `pd.Series.unique()`, and `dict(zip())`.\n",
58 |     "    - Given two lists, `zip` will combine them into a list of pairs. e.g. `zip([1, 2, 3], ['a', 'b', 'c'])` will return `[(1, 'a'), (2, 'b'), (3, 'c')]`. Passing this to `dict`, i.e. `dict(zip([1, 2, 3], ['a', 'b', 'c']))`, will return `{1: 'a', 2: 'b', 3: 'c'}`.\n",
59 |     "- Apply the dictionary to your column. You should get back a column of RGB values (a triplet of red, green and blue defining a color)."
60 |    ]
61 |   },
62 |   {
63 |    "cell_type": "markdown",
64 |    "metadata": {},
65 |    "source": [
66 |     "## Extra Challenge: Crosstab Heatmap\n",
67 |     "\n",
68 |     "This one is extremely hard. See the following figure: https://github.com/muhark/dpir-intro-python/blob/master/Week4/extra_challenge.png\n",
69 |     "\n",
70 |     "Do your best to create a similar figure. You will need the `pd.crosstab` function, and the `sns.heatmap` function.\n",
71 |     "\n",
72 |     "(Solution is in the github, but give it a go!)"
73 |    ]
74 |   }
75 |  ],
76 |  "metadata": {
77 |   "kernelspec": {
78 |    "display_name": "teaching",
79 |    "language": "python",
80 |    "name": "teaching"
81 |   },
82 |   "language_info": {
83 |    "codemirror_mode": {
84 |     "name": "ipython",
85 |     "version": 3
86 |    },
87 |    "file_extension": ".py",
88 |    "mimetype": "text/x-python",
89 |    "name": "python",
90 |    "nbconvert_exporter": "python",
91 |    "pygments_lexer": "ipython3",
92 |    "version": "3.7.9"
93 |   }
94 |  },
95 |  "nbformat": 4,
96 |  "nbformat_minor": 4
97 | }
98 | 


--------------------------------------------------------------------------------
/Week4/extra_challenge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/extra_challenge.png


--------------------------------------------------------------------------------
/Week4/figures.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import matplotlib
  5 | import seaborn as sns
  6 | 
  7 | # sns.set_style("darkgrid")
  8 | 
  9 | # Figure 1: Summary Statistics vs DistPlot
 10 | 
 11 | x1 = list(np.random.normal(0, 1, 150))+list(np.random.normal(8, 1, 150))
 12 | x2 = np.random.normal(4, 4, 300)
 13 | x3 = np.random.multinomial(1, [1/6]*6, size=50).ravel()
 14 | data = pd.DataFrame({
 15 |     "x1": x1,
 16 |     "x2": x2,
 17 |     "x3": x3
 18 |     })
 19 | data.describe().to_html()
 20 | df = data.melt()
 21 | 
 22 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
 23 | sns.distplot(data['x1'], label='x1')
 24 | sns.distplot(data['x2'], label='x2')
 25 | ax.legend()
 26 | f.savefig("figures/lecture_fig1.png", bbox_inches="tight")
 27 | 
 28 | bes_df = pd.read_feather("../Week2/data/bes_data_subset_week2.feather")
 29 | 
 30 | # Anatomy of a Figure
 31 | 
 32 | matplotlib.rcdefaults()
 33 | f = plt.figure(figsize=(8, 4))
 34 | 
 35 | matplotlib.rc('axes', edgecolor='r', lw=5)
 36 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
 37 | f.suptitle("This is a figure with a subplot")
 38 | ax.set_title("This is a subplot", color="r")
 39 | f.savefig("figures/lecture_emptysubplot1.png", bbox_inches="tight")
 40 | matplotlib.rcdefaults()
 41 | 
 42 | 
 43 | #matplotlib.rc('axes', edgecolor='r', lw=5)
 44 | f, ax = plt.subplots(1, 2, figsize=(8, 4))
 45 | f.suptitle("This is a figure with two subplots")
 46 | ax[0].set_title("This is a subplot", color="r")
 47 | ax[1].set_title("This is another subplot", color="r")
 48 | f.savefig("figures/lecture_emptysubplot2.png", bbox_inches="tight")
 49 | matplotlib.rcdefaults()
 50 | 
 51 | #matplotlib.rc('axes', edgecolor='r', lw=5)
 52 | f, ax = plt.subplots(2, 2, figsize=(8, 4))
 53 | f.suptitle("This is a figure with four subplots")
 54 | for i in range(2):
 55 |     for j in range(2):
 56 |         ax[i][j].set_title(f"Subplot [{i}][{j}]", color="r")
 57 | f.savefig("figures/lecture_emptysubplot3.png", bbox_inches="tight")
 58 | matplotlib.rcdefaults()
 59 | 
 60 | 
 61 | 
 62 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
 63 | ax.scatter(data['x2'], data['x1'], color='r')
 64 | # data['x3'].apply(lambda x: dict(zip(range(1, 6), sns.color_palette(n_colors=5)))[x])
 65 | f.savefig("figures/lecture_scatter1.png", bbox_inches="tight")
 66 | 
 67 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
 68 | ax.plot(np.linspace(0, 10, 100), np.linspace(0, 5, 100), color='r')
 69 | f.savefig("figures/lecture_line1.png", bbox_inches="tight")
 70 | 
 71 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
 72 | ax.scatter(data['x2'], data['x1'], color='r', s=3)
 73 | ax.plot(np.linspace(-10, 20, 150), np.linspace(-3.5, 4, 150)**2)
 74 | ax.axhline(0, color='k', alpha=0.5, ls="--")
 75 | ax.axvline(0, color='k', alpha=0.5, ls="--")
 76 | f.savefig("figures/lecture_linescatter1.png", bbox_inches="tight")
 77 | 
 78 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
 79 | ax.scatter(data['x2'], data['x1'], color='g', s=3)
 80 | ax.plot(np.linspace(-10, 20, 150), np.linspace(-3.5, 4, 150)**2)
 81 | ax.axhline(0, color='k', alpha=0.5, ls="--")
 82 | ax.axvline(0, color='k', alpha=0.5, ls="--")
 83 | ax.xaxis.set_label_text("X-Axis Label", color='r')
 84 | ax.yaxis.set_label_text("Y-Axis Label", color='r')
 85 | f.savefig("figures/lecture_linescatter2.png", bbox_inches="tight")
 86 | 
 87 | f, ax = plt.subplots(1, 1, figsize=(8, 3.5))
 88 | ax.scatter(data['x2'], data['x1'], color='g', s=3)
 89 | ax.plot(np.linspace(-10, 20, 150), np.linspace(-3.5, 4, 150)**2)
 90 | ax.axhline(0, color='k', alpha=0.5, ls="--")
 91 | ax.axvline(0, color='k', alpha=0.5, ls="--")
 92 | ax.xaxis.set_label_text("X-Axis Label")
 93 | ax.yaxis.set_label_text("Y-Axis Label")
 94 | ax.xaxis.set_ticks(range(-10, 40, 10))
 95 | ax.yaxis.set_ticks(range(-4, 25, 2))
 96 | f.savefig("figures/lecture_linescatter3.png", bbox_inches="tight")
 97 | 
 98 | f, ax = plt.subplots(1, 1, figsize=(8, 3.5))
 99 | ax.scatter(data['x2'], data['x1'], color='g', s=3)
100 | ax.plot(np.linspace(-10, 20, 150), np.linspace(-3.5, 4, 150)**2)
101 | ax.axhline(0, color='k', alpha=0.5, ls="--")
102 | ax.axvline(0, color='k', alpha=0.5, ls="--")
103 | ax.xaxis.set_label_text("X-Axis Label")
104 | ax.yaxis.set_label_text("Y-Axis Label")
105 | ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(base=3))
106 | ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(base=2))
107 | f.savefig("figures/lecture_linescatter4.png", bbox_inches="tight")
108 | 
109 | 
110 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
111 | sns.boxenplot(bes_df['region'], bes_df['Age'], ax=ax)
112 | ax.xaxis.set_ticklabels(ax.xaxis.get_ticklabels(), rotation=30)
113 | f.savefig("figures/lecture_rotated_labels1.png", bbox_inches="tight")
114 | 
115 | # Gallery
116 | 
117 | sns.set_style('darkgrid')
118 | 
119 | # Histogram (One Category)
120 | 
121 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
122 | sns.distplot(bes_df['Age'].dropna(), kde=False, ax=ax)
123 | ax.set_title("Age Distribution of BES Respondents")
124 | f.savefig("figures/lecture_hist1.png")
125 | 
126 | # Histogram (Two Categories)
127 | 
128 | f, ax = plt.subplots(1, 1, figsize=(8, 6))
129 | sns.distplot(bes_df[bes_df['y09'] == 'Male']
130 |              ['Age'].dropna(), kde=False, label='Male')
131 | sns.distplot(bes_df[bes_df['y09'] == 'Female']
132 |              ['Age'].dropna(), kde=False, label='Female')
133 | ax.legend()
134 | ax.set_title("Age Distribution of BES by Gender")
135 | f.savefig("figures/lecture_hist2.png", bbox_inches="tight")
136 | 
137 | # Box and Whisker Plot
138 | 
139 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
140 | sns.boxplot(bes_df['Age'].dropna(), bes_df['region'])
141 | ax.set_title("BES Age Distribution by Region")
142 | f.savefig("figures/lecture_box1.png", bbox_inches="tight")
143 | 
144 | # Swarm Plot (One Category)
145 | 
146 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
147 | sns.swarmplot(bes_df['Age'])
148 | ax.set_title("BES Age Swarm Plot")
149 | f.savefig("figures/lecture_swarm1.png", bbox_inches="tight")
150 | 
151 | 
152 | # Swarm Plot (Multiple Categories)
153 | 
154 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
155 | sns.swarmplot(bes_df['Age'], bes_df['y01'])
156 | ax.set_title("BES Age Swarm Plot by Income Group")
157 | f.savefig("figures/lecture_swarm2.png", bbox_inches="tight")
158 | 
159 | 
160 | # Violin Plot (One Category)
161 | 
162 | f, ax = plt.subplots(1, 1, figsize=(8, 4))
163 | sns.violinplot(bes_df['Age'])
164 | ax.set_title("BES Age Violin Plot")
165 | f.savefig("figures/lecture_violin1.png", bbox_inches="tight")
166 | 
167 | # Heatmap
168 | 
169 | f, ax=plt.subplots(1, 1, figsize=(15, 8))
170 | sns.heatmap(pd.crosstab(bes_df['y01'], bes_df['region']), cmap="RdBu_r")
171 | f.savefig("figures/lecture_heatmap1.png", bbox_inches="tight")
172 | # This is an issue with this particular version of matplotlib
173 | 
174 | # Not implented in beamer?
175 | data.describe().to_latex("latex_table.tex")
176 | 


--------------------------------------------------------------------------------
/Week4/figures/lecture_box1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_box1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_emptysubplot1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_emptysubplot1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_emptysubplot2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_emptysubplot2.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_emptysubplot3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_emptysubplot3.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_fig1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_heatmap1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_heatmap1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_hist1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_hist1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_hist2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_hist2.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_line1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_line1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_linescatter1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_linescatter1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_linescatter2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_linescatter2.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_linescatter3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_linescatter3.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_linescatter4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_linescatter4.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_rotated_labels1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_rotated_labels1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_scatter1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_scatter1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_swarm1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_swarm1.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_swarm2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_swarm2.png


--------------------------------------------------------------------------------
/Week4/figures/lecture_violin1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/figures/lecture_violin1.png


--------------------------------------------------------------------------------
/Week4/latex_table.tex:
--------------------------------------------------------------------------------
 1 | \begin{tabular}{lrrr}
 2 | \toprule
 3 | {} &          x1 &          x2 &          x3 \\
 4 | \midrule
 5 | count &  300.000000 &  300.000000 &  300.000000 \\
 6 | mean  &    3.986588 &    4.256635 &    0.166667 \\
 7 | std   &    4.070890 &    4.146357 &    0.373301 \\
 8 | min   &   -2.529208 &   -9.047706 &    0.000000 \\
 9 | 25\%   &    0.010607 &    1.759387 &    0.000000 \\
10 | 50\%   &    3.960120 &    4.333940 &    0.000000 \\
11 | 75\%   &    7.860896 &    6.635001 &    0.000000 \\
12 | max   &   10.472298 &   18.649113 &    1.000000 \\
13 | \bottomrule
14 | \end{tabular}
15 | 


--------------------------------------------------------------------------------
/Week4/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week4/lecture.pdf


--------------------------------------------------------------------------------
/Week4/planning.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Week 4 Planning
 3 | ---
 4 | 
 5 | # Data Visualisation
 6 | 
 7 | ## Teaching Aims
 8 | 
 9 | - Methodological: Systematising and understanding summarising and conveying information graphically.
10 |     - Key Questions:
11 |         - How many variables (dimensions)?
12 |         - What type of variables? Discrete vs. Continuous, Ordered?
13 |         - What kind of comparison?
14 | - Implementation: Understanding the implicit model behind graphing software like matplotlib (and seaborn)
15 |     - Figures
16 |         - Title
17 |         - Spacing
18 |         - Axes (subplots)
19 |             - Title
20 |             - Graphical Objects
21 |                 - Lines
22 |                 - Dots
23 |                 - Shapes
24 |                 - Text
25 |             - axes (xaxis and yaxis)
26 |                 - Ticks
27 |                     - Tick intervals
28 |                     - Tick labels
29 |                 - Label
30 | 
31 | 
32 | ## Methodological Aspect
33 | 
34 | - Figures, by and large, serve a similar function to statistics. They convey a great deal of relevant information about a dataset without requiring one to inspect the individual values.
35 |     - For instance, a histogram or KDE plot says a lot more about the shape of a distribution than most statisitics can.
36 |     - It's easier to understand the functional shape of a time trend by plotting it than just eyeballing the numbers.
37 |     - Also, they look pretty.
38 | - Data visualisation is useful at two steps in the data analysis process:
39 |     - Exploratory Analysis: here, being able to quickly construct a graph that shows what you need is key.
40 |     - Presentation of Results: knowing a lot about how to customise a plot to exactly match your requirements matters more here.
41 | - There are dozens of types of figures:
42 |     - Distribution:
43 |         - Histogram, KDE, rugplot, swarmplot, violinplot, box-and-whiskers
44 |     - Unorderable Frequencies:
45 |         - Bar, Grouped bars,
46 | - Use Cases:
47 |     - 1 dimensional:
48 |         - Orderable:
49 |             - Histogram, kernel density estimate
50 |         - Unorderable:
51 |             - Pie chart (if proportions), bar chart (frequencies)
52 |     - 2 dimensional:
53 |         - Orderable * Unorderable:
54 |             -
55 | 
56 | 
57 | ## The Anatomy of a Figure
58 | 
59 | - The figure
60 | - The subplots (axes)
61 | - The axes (labels, ticks, etc)
62 | - The graphical elements
63 | 


--------------------------------------------------------------------------------
/Week5/examples_student.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "slideshow": {
  7 |      "slide_type": "subslide"
  8 |     }
  9 |    },
 10 |    "source": [
 11 |     "# Coding Tutorial 5: Unsupervised Learning\n",
 12 |     "\n",
 13 |     "In this coding tutorial, we learn how to do the following for `k-means` clustering and principal components analysis:\n",
 14 |     "\n",
 15 |     "- Import models from `scikit-learn`\n",
 16 |     "- Prepare a pandas dataframe for analysis with `scikit-learn`\n",
 17 |     "- Instantiate and fit a model to data\n",
 18 |     "- Visualise the results of the model"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {
 24 |     "slideshow": {
 25 |      "slide_type": "subslide"
 26 |     }
 27 |    },
 28 |    "source": [
 29 |     "# Importing Models from Scikit-Learn\n",
 30 |     "\n",
 31 |     "`scikit-learn` is actually a collection of modules, so you will need to find which sub-module contains the model you want to use."
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "metadata": {
 38 |     "slideshow": {
 39 |      "slide_type": "subslide"
 40 |     }
 41 |    },
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# standard imports\n",
 45 |     "import pandas as pd\n",
 46 |     "import numpy as np\n",
 47 |     "import matplotlib.pyplot as plt\n",
 48 |     "import seaborn as sns\n",
 49 |     "\n",
 50 |     "# scikit-learn imports\n",
 51 |     "from sklearn.preprocessing import StandardScaler\n",
 52 |     "from sklearn.cluster import KMeans\n",
 53 |     "from sklearn.decomposition import PCA"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {
 60 |     "slideshow": {
 61 |      "slide_type": "subslide"
 62 |     }
 63 |    },
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "# import the data\n",
 67 |     "link = 'http://github.com/muhark/dpir-intro-python/raw/master/Week2/data/bes_data_subset_week2.feather'\n",
 68 |     "df = pd.read_feather(link)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {
 74 |     "slideshow": {
 75 |      "slide_type": "subslide"
 76 |     }
 77 |    },
 78 |    "source": [
 79 |     "# Data Pre-Processing\n",
 80 |     "\n",
 81 |     "There are four steps for preparing data for analysis:\n",
 82 |     "\n",
 83 |     "1. Feature Selection\n",
 84 |     "2. Accounting for NAs\n",
 85 |     "3. One Hot Encoding\n",
 86 |     "4. Conversion to numpy ndarray"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {
 92 |     "slideshow": {
 93 |      "slide_type": "subslide"
 94 |     }
 95 |    },
 96 |    "source": [
 97 |     "## Feature Selection\n",
 98 |     "\n",
 99 |     "Here we just choose which columns we are going to use. If your data has a lot of NAs, it may be worthwhile to prefer columns with fewer NAs."
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {
106 |     "slideshow": {
107 |      "slide_type": "fragment"
108 |     }
109 |    },
110 |    "outputs": [],
111 |    "source": [
112 |     "features = ['region', 'Age', 'a02', 'a03', 'e01',\n",
113 |     "            'k01', 'k02', 'k11', 'k13', 'k06', 'k08',\n",
114 |     "            'y01', 'y03', 'y06', 'y08', 'y09', 'y11', 'y17']"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {
120 |     "slideshow": {
121 |      "slide_type": "subslide"
122 |     }
123 |    },
124 |    "source": [
125 |     "## Accounting for NAs"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {
132 |     "slideshow": {
133 |      "slide_type": "fragment"
134 |     }
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "# Can check for na's with:\n",
139 |     "# df[features].isna().sum()\n",
140 |     "df = df[features].dropna()"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {
146 |     "slideshow": {
147 |      "slide_type": "subslide"
148 |     }
149 |    },
150 |    "source": [
151 |     "## One-Hot Encoding\n",
152 |     "\n",
153 |     "We can do a one-hot encoding using the `pd.get_dummies()` function."
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {
160 |     "slideshow": {
161 |      "slide_type": "fragment"
162 |     }
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "data = pd.get_dummies(df)\n",
167 |     "print(df.shape, data.shape)"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "markdown",
172 |    "metadata": {
173 |     "slideshow": {
174 |      "slide_type": "subslide"
175 |     }
176 |    },
177 |    "source": [
178 |     "## Normalization and Conversion to `numpy`\n",
179 |     "\n",
180 |     "We call the `StandardScaler().fit_transform()` function on the `.values` argument of the dataframe"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {
187 |     "slideshow": {
188 |      "slide_type": "fragment"
189 |     }
190 |    },
191 |    "outputs": [],
192 |    "source": [
193 |     "X = data.values\n",
194 |     "scaler = StandardScaler()\n",
195 |     "X_norm = scaler.fit_transform(X)"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {
201 |     "slideshow": {
202 |      "slide_type": "subslide"
203 |     }
204 |    },
205 |    "source": [
206 |     "# Instantiating and Fitting `k-means`\n",
207 |     "\n",
208 |     "We first create an instance of the model, where we provide parameters, and then we pass data to it."
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": null,
214 |    "metadata": {
215 |     "slideshow": {
216 |      "slide_type": "fragment"
217 |     }
218 |    },
219 |    "outputs": [],
220 |    "source": [
221 |     "kmeans = KMeans(n_clusters=5, random_state=634)"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {
228 |     "slideshow": {
229 |      "slide_type": "fragment"
230 |     }
231 |    },
232 |    "outputs": [],
233 |    "source": [
234 |     "kmeans.fit(X_norm)"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "markdown",
239 |    "metadata": {
240 |     "slideshow": {
241 |      "slide_type": "subslide"
242 |     }
243 |    },
244 |    "source": [
245 |     "We can extract the labels using the `.labels_` method, and then assign them to a column."
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {
252 |     "slideshow": {
253 |      "slide_type": "subslide"
254 |     }
255 |    },
256 |    "outputs": [],
257 |    "source": [
258 |     "df['labels_'] = kmeans.labels_\n",
259 |     "df['labels_'] = df['labels_'].astype(str)"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "metadata": {
265 |     "slideshow": {
266 |      "slide_type": "subslide"
267 |     }
268 |    },
269 |    "source": [
270 |     "# Visualising the Results\n",
271 |     "\n",
272 |     "This is a bit difficult with so many variables. Let's look at age."
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": null,
278 |    "metadata": {
279 |     "slideshow": {
280 |      "slide_type": "subslide"
281 |     }
282 |    },
283 |    "outputs": [],
284 |    "source": [
285 |     "f, ax = plt.subplots(1, 1, figsize=(15, 8))\n",
286 |     "sns.histplot(df[['labels_', 'Age']].sort_values('labels_'),\n",
287 |     "             x='Age', ax=ax, kde=True, hue='labels_');"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "metadata": {
294 |     "slideshow": {
295 |      "slide_type": "subslide"
296 |     }
297 |    },
298 |    "outputs": [],
299 |    "source": [
300 |     "# We can appropriate this function\n",
301 |     "def grouped_barplot(data, var1, var2):\n",
302 |     "    \"\"\"\n",
303 |     "    Creates a grouped bar plot of the distribution of `var2` within each group of `var2`.\n",
304 |     "    \"\"\"\n",
305 |     "    temp = data.groupby([var1, var2]).apply(len).reset_index().rename({0: 'Count'}, axis=1)\n",
306 |     "    f, ax = plt.subplots(1, 1, figsize=(len(data[var1].unique())*len(data[var1].unique())/5, 10))\n",
307 |     "    sns.barplot(data=temp, x=var1, y='Count', hue=var2)\n",
308 |     "    ax.set_title(f\"BES Sample {var2} per {var1}\")\n",
309 |     "    ax.xaxis.set_ticklabels(ax.xaxis.get_ticklabels(), rotation=30)"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "metadata": {
316 |     "slideshow": {
317 |      "slide_type": "subslide"
318 |     }
319 |    },
320 |    "outputs": [],
321 |    "source": [
322 |     "grouped_barplot(df, 'a02','labels_') "
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {
329 |     "slideshow": {
330 |      "slide_type": "subslide"
331 |     }
332 |    },
333 |    "outputs": [],
334 |    "source": [
335 |     "grouped_barplot(df, 'region','labels_')"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "markdown",
340 |    "metadata": {
341 |     "slideshow": {
342 |      "slide_type": "subslide"
343 |     }
344 |    },
345 |    "source": [
346 |     "## Instantiating and Fitting PCA"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": null,
352 |    "metadata": {
353 |     "slideshow": {
354 |      "slide_type": "fragment"
355 |     }
356 |    },
357 |    "outputs": [],
358 |    "source": [
359 |     "pca = PCA(n_components=2, random_state=634)\n",
360 |     "pca = pca.fit(X_norm)\n",
361 |     "reduced_X = pca.fit_transform(X_norm)"
362 |    ]
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": null,
367 |    "metadata": {
368 |     "slideshow": {
369 |      "slide_type": "fragment"
370 |     }
371 |    },
372 |    "outputs": [],
373 |    "source": [
374 |     "sns.scatterplot(x=reduced_X[:, 0], y=reduced_X[:, 1]);"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "markdown",
379 |    "metadata": {
380 |     "slideshow": {
381 |      "slide_type": "subslide"
382 |     }
383 |    },
384 |    "source": [
385 |     "## Combining PCA and `k-means`\n",
386 |     "\n",
387 |     "We can fit k-means to PCA-reduced data:"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "code",
392 |    "execution_count": null,
393 |    "metadata": {
394 |     "slideshow": {
395 |      "slide_type": "subslide"
396 |     }
397 |    },
398 |    "outputs": [],
399 |    "source": [
400 |     "pcakmeans = KMeans(n_clusters=5, random_state=634)\n",
401 |     "pcakmeans.fit(reduced_X)\n",
402 |     "df['pcakmeans_labels'] = pcakmeans.labels_"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": null,
408 |    "metadata": {
409 |     "slideshow": {
410 |      "slide_type": "subslide"
411 |     }
412 |    },
413 |    "outputs": [],
414 |    "source": [
415 |     "sns.set_style('darkgrid')\n",
416 |     "f, ax = plt.subplots(1, 1, figsize=(15, 8))\n",
417 |     "sns.scatterplot(x=reduced_X[:, 0], y=reduced_X[:, 1],\n",
418 |     "                hue=pcakmeans.labels_,\n",
419 |     "                palette=sns.color_palette(palette='colorblind', n_colors=5));"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": null,
425 |    "metadata": {
426 |     "slideshow": {
427 |      "slide_type": "subslide"
428 |     }
429 |    },
430 |    "outputs": [],
431 |    "source": [
432 |     "grouped_barplot(df, 'a02', 'pcakmeans_labels')"
433 |    ]
434 |   },
435 |   {
436 |    "cell_type": "code",
437 |    "execution_count": null,
438 |    "metadata": {},
439 |    "outputs": [],
440 |    "source": [
441 |     "pd.DataFrame(pca.components_, columns=data.columns)"
442 |    ]
443 |   }
444 |  ],
445 |  "metadata": {
446 |   "kernelspec": {
447 |    "display_name": "Python 3",
448 |    "language": "python",
449 |    "name": "python3"
450 |   },
451 |   "language_info": {
452 |    "codemirror_mode": {
453 |     "name": "ipython",
454 |     "version": 3
455 |    },
456 |    "file_extension": ".py",
457 |    "mimetype": "text/x-python",
458 |    "name": "python",
459 |    "nbconvert_exporter": "python",
460 |    "pygments_lexer": "ipython3",
461 |    "version": "3.7.9"
462 |   }
463 |  },
464 |  "nbformat": 4,
465 |  "nbformat_minor": 4
466 | }
467 | 


--------------------------------------------------------------------------------
/Week5/exercises.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Coding Exercises Week 5: Unsupervised Learning\n",
 8 |     "\n",
 9 |     "The task for this week is a little bit different. Using the code in the examples as a guide, do the following:\n",
10 |     "\n",
11 |     "- Fit a kmeans, pca, and pca+kmeans model for a selection of the variables from the BES data.\n",
12 |     "- Visualise the clusterings.\n",
13 |     "- Interpret the principal components. To do this, you will need to inspect how each component \"weights\" each of the features. The higher the weight, the proportion of the variance of the component is being derived from this weight.\n",
14 |     "- Interpret the clusters: what kind of people are assigned to each cluster? Does this make sense? How does this match up to your understanding of the division within British society?"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "## Bonus Challenge\n",
22 |     "\n",
23 |     "As mentioned, `sklearn.cluster` contains many other clustering algorithms. Read through the documentation at https://scikit-learn.org/stable/modules/clustering.html and choose one that you think you understand best (Agglomerative Clustering is the most intuitive, IMHO). Cluster the BES data with this algorithm, and compare results between the models."
24 |    ]
25 |   },
26 |   {
27 |    "cell_type": "code",
28 |    "execution_count": null,
29 |    "metadata": {},
30 |    "outputs": [],
31 |    "source": []
32 |   }
33 |  ],
34 |  "metadata": {
35 |   "kernelspec": {
36 |    "display_name": "teaching",
37 |    "language": "python",
38 |    "name": "teaching"
39 |   },
40 |   "language_info": {
41 |    "codemirror_mode": {
42 |     "name": "ipython",
43 |     "version": 3
44 |    },
45 |    "file_extension": ".py",
46 |    "mimetype": "text/x-python",
47 |    "name": "python",
48 |    "nbconvert_exporter": "python",
49 |    "pygments_lexer": "ipython3",
50 |    "version": "3.7.9"
51 |   }
52 |  },
53 |  "nbformat": 4,
54 |  "nbformat_minor": 4
55 | }
56 | 


--------------------------------------------------------------------------------
/Week5/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week5/lecture.pdf


--------------------------------------------------------------------------------
/Week5/local_plot_utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import re
 3 | import seaborn as sns
 4 | import matplotlib
 5 | import matplotlib.pyplot as plt
 6 | 
 7 | matplotlib.rcParams["text.usetex"] = True
 8 | 
 9 | 
10 | def crosstab_heatmap(df, col1, col2):
11 |     # Produce crosstab
12 |     xtab = pd.crosstab(df[col1], df[col2])
13 |     # Right-column should show total number of answers
14 |     if isinstance(xtab.columns, pd.CategoricalIndex):
15 |         xtab.columns = xtab.columns.add_categories("Total")
16 |     if isinstance(xtab.index, pd.CategoricalIndex):
17 |         xtab.index = xtab.index.add_categories("Avg Prop")
18 |     xtab.loc[:, "Total"] = xtab.sum(axis=1)
19 |     xtab = xtab.loc[xtab['Total'] >= xtab.shape[1]*5]
20 |     props = xtab.iloc[:, :-1].div(xtab['Total'], axis=0)
21 |     props.loc['Avg Prop', :] = props.mean(axis=0)
22 |     xtab.loc['Avg Prop', :] = ""
23 |     props.loc[:, 'Total'] = ""
24 | 
25 |     annot = xtab.applymap(
26 |         lambda x: "{:.4g}\n".format(int(x)) if isinstance(x, float) else ""
27 |         ).add(
28 |             props.applymap(
29 |                 lambda x: "({:.3g}\%)".format(
30 |                     100*x) if isinstance(x, float) else ""
31 |                 ))
32 | 
33 |     annot.loc['Avg Prop', 'Total'] = "{:.5g}".format(xtab.iloc[:-1, -1].sum())
34 |     annot.loc['Avg Prop', :] = annot.loc['Avg Prop',
35 |                                          :].str.replace(re.compile(r"[\(\)]"), "")
36 |     annot.loc[:, 'Total'] = annot['Total'].str.replace("\n", "")
37 |     diffs = props.iloc[:, :-1] - props.iloc[-1, :-1]
38 |     diffs['Total'] = float(0)
39 |     diffs = diffs.applymap(lambda x: 100*x)
40 |             
41 |     f, ax = plt.subplots(1, 1, figsize=annot.shape)
42 |     sns.heatmap(
43 |         diffs.T,
44 |         annot=annot.T,
45 |         fmt="s",
46 |         ax=ax,
47 |         cmap="RdBu_r",
48 |         center=0,
49 |         cbar_kws={"label": "Difference from Avg. Proportion"},
50 |     )
51 |     ax.xaxis.tick_top()
52 |     ax.xaxis.set_label_position("top")
53 |     ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="left")
54 |     ax.set_yticklabels(ax.get_yticklabels())
55 |     f.axes[1].set_yticklabels(
56 |         [lab.get_text() + "\%" for lab in f.axes[1].get_yticklabels()]
57 |     )
58 |     ax.axvline(diffs.shape[0] - 1, color="k")
59 |     ax.axhline(diffs.shape[1] - 1, color="k")
60 |     ax.set_xlabel("")
61 |     ax.set_ylabel("")
62 |     ax.xaxis.set_label_position("bottom")
63 |     return f


--------------------------------------------------------------------------------
/Week5/planning.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Week 5 Planning: Machine Learning
 3 | ---
 4 | 
 5 | # Unsupervised Methods
 6 | 
 7 | ## Introduction
 8 | 
 9 | - Need a definition for ML. My understanding is that it's a collection of numeric/algorithmic methods.
10 | - First lesson looks at unsupervised methods (only X, no Y)
11 | - We focus specifically on two use cases:
12 |     - Clustering
13 |     - Dimensionality reduction
14 | 
15 | ## Clustering
16 | 
17 | - _Motivation_: what are some things that we may wish to cluster in political science?
18 |     - Sometimes, we may have an intuition that certain groups exist, and are seeking to discover them. Other times, we have no a priori expectation of groupings, and are exploring how the data can cluster.
19 | - _Method_: Given a $j$-dimensional space, and matrix $X_{ij}$ of $i$ length-$j$ vectors, assign each vector $x_i$ to a cluster $k \in K$.
20 | - Usually sense that members of the same cluster will be similar, and members of different clusters will be dissimilar.
21 |     - Question: What metrics of (dis)similarity exist?
22 | - Two examples: `k-means`, agglomerative pair-wise.
23 |     - (_Implementation_): Review each algorithm, highlighting what makes one more efficient/scalable than the other.
24 | - Clustering diagnostic metrics.
25 | - Useful summary: https://www.cc.gatech.edu/~isbell/reading/papers/berkhin02survey.pdf
26 | 
27 | 
28 | ## Dimensionality Reduction
29 | 
30 | - _Motivation_: when would you reduce dimensionality in political science?
31 |     - You have too many variables in your model, and are seeking to drop some.
32 |     - You are aiming to visualise/understand some high-dimensional space.
33 |     - You are seeking to recover some latent dimensions within your data that are not captured by your existing variables.
34 | - _Method_: Given a matrix $X_{ij}$ of $i$ length-$j$ vectors, reduce $X_{ij}$ to $X_{ik}$ where $K \le J$.
35 |     - In some variants, $\forall k \subset J$, in others, $\exists k \not\subset J$.
36 | - Again, parametric vs non-parametric methods.
37 | 


--------------------------------------------------------------------------------
/Week6/exercises.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Coding Exercise Week 6\n",
  8 |     "\n",
  9 |     "My main challenge to you this week is to improve on the 0.5066162570888468 correct out-of-sample prediction rate the lecture example RF achieved.\n",
 10 |     "\n",
 11 |     "To ensure that you have the same train-test split, I've given you the beginning of the code. Your goal is to build a model that provides a better prediction of `y_test` using `X_test` as inputs than the one in the lecture.\n",
 12 |     "\n",
 13 |     "Some ideas for how you might improve beyond the model in the lecture:\n",
 14 |     "\n",
 15 |     "- Using a GridCVSearch instead of RandomCVSearch to further fine-tune the hyperparameters\n",
 16 |     "- Add additional features to the model (make sure you have the same splits!)\n",
 17 |     "- Using a different prediction algorithm, for example a Support Vector Machine or a Neural Network."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 1,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "import pandas as pd\n",
 27 |     "import numpy as np\n",
 28 |     "\n",
 29 |     "from sklearn.model_selection import train_test_split\n",
 30 |     "\n",
 31 |     "# Set random seed\n",
 32 |     "np.random.seed(634)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 3,
 38 |    "metadata": {},
 39 |    "outputs": [
 40 |     {
 41 |      "name": "stdout",
 42 |      "output_type": "stream",
 43 |      "text": [
 44 |       "(1585, 128) (1585,)\n",
 45 |       "(529, 128) (529,)\n"
 46 |      ]
 47 |     }
 48 |    ],
 49 |    "source": [
 50 |     "link = 'http://github.com/muhark/dpir-intro-python/raw/master/Week2/data/bes_data_subset_week2.feather'\n",
 51 |     "df = pd.read_feather(link)\n",
 52 |     "# Refactoring e01: partisan self-id\n",
 53 |     "df.loc[:, 'e01'] = df['e01'].apply(\n",
 54 |     "    lambda x: int(x.split(' ')[0]) if x[0] in ''.join(list(map(str, list(range(10))))) else None)\n",
 55 |     "# Let's predict 'a02' as a function of the rest\n",
 56 |     "features = ['region', 'Age', 'a03', 'e01', 'k01',\n",
 57 |     "            'k02', 'k11', 'k06', 'k08', 'y01',\n",
 58 |     "            'y03', 'y06', 'y08', 'y09', 'y11', 'y17']\n",
 59 |     "labels = 'a02'\n",
 60 |     "\n",
 61 |     "# Prep data\n",
 62 |     "df = df[features+[labels]].dropna()\n",
 63 |     "temp = pd.get_dummies(df[features])\n",
 64 |     "feature_names = temp.columns.tolist()\n",
 65 |     "X = temp.values\n",
 66 |     "y = df[labels].values\n",
 67 |     "\n",
 68 |     "# Train-test split\n",
 69 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
 70 |     "    X, y, test_size=0.25)\n",
 71 |     "\n",
 72 |     "print(X_train.shape, y_train.shape)\n",
 73 |     "print(X_test.shape, y_test.shape)"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "col_dict = {\n",
 83 |     "    'region': 'Region',\n",
 84 |     "    'Age': 'Age',\n",
 85 |     "    'a02': 'Which party is best able to handle this issue?',\n",
 86 |     "    'a03': 'How interested are you in politics?',\n",
 87 |     "    'e01': 'Left-Right Self-Placement',\n",
 88 |     "    'k01': 'Attention to Politics',\n",
 89 |     "    'k02': 'Reads politics news',\n",
 90 |     "    'k11': 'Contacted by canvasser',\n",
 91 |     "    'k06': 'Uses Twitter',\n",
 92 |     "    'k08': 'Uses Facebook',\n",
 93 |     "    'y01': 'Income bracket',\n",
 94 |     "    'y03': 'Housing type',\n",
 95 |     "    'y06': 'Religion',\n",
 96 |     "    'y08': 'Trade Union Membership',\n",
 97 |     "    'y09': 'Gender',\n",
 98 |     "    'y11': 'Ethnicity',\n",
 99 |     "    'y17': 'Employment type'\n",
100 |     "}"
101 |    ]
102 |   }
103 |  ],
104 |  "metadata": {
105 |   "kernelspec": {
106 |    "display_name": "teaching",
107 |    "language": "python",
108 |    "name": "teaching"
109 |   },
110 |   "language_info": {
111 |    "codemirror_mode": {
112 |     "name": "ipython",
113 |     "version": 3
114 |    },
115 |    "file_extension": ".py",
116 |    "mimetype": "text/x-python",
117 |    "name": "python",
118 |    "nbconvert_exporter": "python",
119 |    "pygments_lexer": "ipython3",
120 |    "version": "3.7.9"
121 |   }
122 |  },
123 |  "nbformat": 4,
124 |  "nbformat_minor": 4
125 | }
126 | 


--------------------------------------------------------------------------------
/Week6/lecture.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Introduction to Python for Social Science
  3 | subtitle: Lecture 6 - Machine Learning II
  4 | author: Musashi Harukawa, DPIR
  5 | date: 6th Week Hilary 2021
  6 | ---
  7 | 
  8 | # Recap
  9 | 
 10 | ## Last Week
 11 | 
 12 | - Unsupervised Machine Learning
 13 |     - Clustering with `k-means`
 14 |     - Dimensionality Reduction with `PCA`
 15 | 
 16 | ## This Week
 17 | 
 18 | - Supervised Machine Learning
 19 |     - In depth: Decision Trees
 20 | - Ensemble Methods
 21 |     - Forests
 22 |     - Meta-Learners
 23 | - Optimising Your Model
 24 |     - Cross Validation Methods
 25 |     - Hyperparameter Tuning
 26 | 
 27 | # Supervised ML
 28 | 
 29 | ## Supervised Learning: Use X to infer Y
 30 | 
 31 | - Supervised Learning starts with a dataset containing both _features_ ($X$) and _labels_ ($y$).
 32 | - They then construct a "rule" relating $X$ to $y$, so that given some combination of values for $X$, they can "predict" a value of $y$.
 33 | - In other terms, supervised learning finds $f$ in $y=f(X)$.
 34 |     - If $y$ is discrete/categorical, then the task is called _classification_.
 35 |     - If $y$ is continuous, then the task is called _regression_.
 36 | 
 37 | 
 38 | ## Supervised Learning Models
 39 | 
 40 | Some general classes of supervised models include:
 41 | 
 42 | - Linear Models
 43 | - Support Vector Machines (SVMs)
 44 | - Naive Bayes
 45 | - Tree-based Estimators
 46 | - (Supervised) Neural Networks
 47 | 
 48 | ## Decision Trees
 49 | 
 50 | Although radically distinct to linear estimators such as OLS, decision trees offer a simple and intuitive approach to estimating values of $y$ based on $X$.
 51 | 
 52 | - If you have played the game twenty questions, then you should be familiar with the idea behind decision trees.
 53 | - Constructs a series of binary questions (nodes) regarding your features, and eventually at the end of the resulting branches gives a prediction (leaf) of your label.
 54 | 
 55 | ## Understanding Decision Trees
 56 | 
 57 | A decision tree can be understood as a mapping from the multi-dimensional feature space, $X_{ij}$, to the label space $y_i$.
 58 | 
 59 | - Each question partitions the $X_{ij}$-space.
 60 | - Each leaf maps one of these partitions to a value (or range) in the $y$-space.
 61 | - The algorithm necessarily sets some convergence threshold so that there are fewer leafs than observations.
 62 | 
 63 | ## Impurity
 64 | 
 65 | - Given that the algorithm knows the values of $y$:
 66 |     - Its goal is to split the $X$-space in such a way that each partition does not contain more than one distinct value of $y$.
 67 |     - In essence, it wants to split the $X$-space in way that increases the "purity" of each partition.
 68 |     - A partition containing more than one distinct value of $y$ will necessarily lead to at least one erroneous prediction.
 69 | - There are various measures of impurity:
 70 |     - GINI: $H(X_m) = \sum_k p_{mk} (1 - p_{mk})$
 71 |     - Entropy: $H(X_m) = - \sum_k p_{mk} \log(p_{mk})$
 72 | 
 73 | ## Visualising Trees
 74 | 
 75 | ![Source: https://github.com/parrt/dtreeviz](iris-TD-2.svg)
 76 | 
 77 | ## Tree Trade-offs
 78 | 
 79 | Advantages:
 80 | 
 81 | - Excels at capturing conditional dependencies
 82 | - Arguably more intuitive than OLS.
 83 | - Provides an metric of feature importance that has a substantive interpretation.
 84 | 
 85 | ::: {.fragment}
 86 | Disadvantages:
 87 | :::
 88 | 
 89 | - **Extremely** prone to _over-fitting_.
 90 | - Does not provide a linear marginal effect estimate.
 91 | 
 92 | ## Choosing Your Supervised Algorithm
 93 | 
 94 | These are some of the criteria you may want to consider when choosing an algorithm:
 95 | 
 96 | - _Prediction Accuracy_: Algorithms vary in their ability to predict unseen data. We will discuss this more during cross validation.
 97 | - _Minimum Data_: Some models are able to do more with less. This is especially true if the model makes certain parametric assumptions about the nature or distribution of the data.
 98 | - _Interpretability_: Not all methods provide insight into _how_ they formulate their predictions. Methods range from extremely intuitive, such as decision trees, to complete black boxes, such as neural networks. When seeking to _explain_ and not _predict_, one should take this into account.
 99 | 
100 | ## This brings me to...
101 | 
102 | # Ensemble Methods
103 | 
104 | ## Managing Shortcomings by Working Together
105 | 
106 | - There is no single model or algorithm that performs best across all criteria in all scenarios.
107 | - Ensemble methods, which is really a fancy way of saying using more than one method, are often devised to address this issue.
108 | - I group ensemble methods into two types: _aggregating_ and _sequential_ ensembles.
109 |     - _Aggregating Ensembles_ train on and estimate predicted values of the same data, and then use a meta-learner to aggregate these predictions.
110 |     - _Sequential Ensembles_ use the output of one algorithm (often unsupervised) as features to train another. PCA+kmeans is an example of this.
111 | 
112 | ## Aggregating Trees: Random Forests
113 | 
114 | There are various algorithms that aggregate decision trees, but here I outline the logic behind the most straightforward and common one: Random Forests (RFs).
115 | 
116 | - Construct $N$ decision trees.
117 | - For each split in each tree, randomly select a subset of features. This split can only be made over these features.
118 | - To predict, the same input array is passed to all the constituent trees, and the algorithm either returns mean prediction (continuous data) or modal prediction (categorical data).
119 | - The noteworthy improvement on this algorithm is Bayesian Additive Regression Trees (BARTs).
120 | 
121 | ## Aggregating Learners: Meta-Learners
122 | 
123 | A number of papers have been published recently that use ensemble methods to estimate heterogeneous treatment effects:
124 | 
125 | - [Grimmer \& Westwood, _Political Analysis_ 2017](https://www.cambridge.org/core/journals/political-analysis/article/estimating-heterogeneous-treatment-effects-and-the-effects-of-heterogeneous-treatments-with-ensemble-methods/C7E3EA00D0AD83429CBE73F4F0C6652C)
126 | - [Kunzel et al, _PNAS_ 2019](https://arxiv.org/abs/1706.03461)
127 | 
128 | <p class="fragment">These papers both focus on innovating on the _meta-learner_.</p>
129 | 
130 | # Optimising Your Model
131 | 
132 | ## Machine Learning is not just Algorithms
133 | 
134 | - Another contribution of machine learning to econometrics, in my opinion, has been the development of strategies to test and evaluate models.
135 | - Epistemologically, machine learning frequently takes a more agnostic view on trying to find a specific functional specification of a theoretical model.
136 |     - This means that the "correct" model is the one that does the best job of matching _empirics_, and not a particular theory.
137 |     - The cost of this is the unsuitability of many machine learning algorithms to theory testing in the traditional econometric sense.
138 | 
139 | ## Cross Validation
140 | 
141 | Cross validation is one such of these strategies. It consists of dividing the data into _training_ and _test_ sets:
142 | 
143 | 1. The model is fit using the _training_ data: $y_{train} = f(X_{train}) + \epsilon \rightarrow \hat{f}(X)$
144 | 2. The fitted model is applied to the _test features_ to generate _predicted values_: $\hat{y} = \hat{f}(X_{test})$
145 | 3. The difference between the _predicted values_ and the _test labels_ is used as a measure of the predictive accuracy of the model: $\hat{e} = y_{test} - \hat{y}$
146 | 
147 | ::: {.fragment}
148 | There are multiple aggregate measures of prediction error, but a common one is _mean squared (prediction) error_, calculated as the sum of squared differences between prediction and test label.
149 | :::
150 | 
151 | ## k-fold Cross Validation
152 | 
153 | - There are some obvious shortcomings to dividing the data into a training at test set just once.
154 | - A slightly more advanced method for train-test splitting is known a k-fold CV, which consists of splitting the training data randomly into $k$ bins, and then iteratively using the $k$th bin as a test set for all bins not $k$.
155 | 
156 | ## Cross Validation Visualised
157 | 
158 | ![K-Fold Cross Validation](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)
159 | 
160 | ## Choosing Parameters
161 | 
162 | Another strategy for improving the predictive accuracy of algorithms relates to choosing the right _parameters_.
163 | 
164 | Most, if not all algorithms have some parameters that affect predictions in very unobvious ways. For example:
165 | 
166 | - `k-means`: number of clusters
167 | - Decision Tree: min/max number of splits
168 | - Random Forest: proportion of features to use in each subset
169 | - LASSO/Ridge/EN: $\beta$
170 | 
171 | ## Hyperparameter Tuning
172 | 
173 | - Hyperparameter tuning is the practice of choosing model parameters by maximising an _objective function_. Some possible objective functions include:
174 |     - _Mean Absolute Prediction Error_: Combine with train-test splits.
175 |     - _Goodness-of-Fit_: Measures such as R-squared, AIC, etc.
176 |     - _Coherence/Entropy Measures_: Most algorithms have a measure of the complexity/information tradeoff, which can be optimised.
177 | - Hyperparameter tuning is computationally costly, but also easily parallelisable.
178 | <!-- - _Remember_: The optimal parameters are closely related to the choice of objective function. Whether they are correct for the task you are trying to accomplish depends therefore on the objective function. -->
179 | 
180 | # Machine Learning Recap
181 | 
182 | ## Key Terms
183 | 
184 | - _Unsupervised Learning_: No $y$, explore $X$
185 | - _Supervised Learning_: Learn relationship between features and labels.
186 | - _Clustering_: Split observations into groups.
187 | - _Dimensionality Redution_: Reduce $j$, the number of features.
188 | - _Classification vs Regression_: Depends on structure of $y$
189 | - _Cross Validation_: Train-test split data to optimise supervised learner.
190 | - _Hyperparameter Tuning_: Systematically choose optimal parameters for algorithm.
191 | - _Objective Function_: An optimisable aspect of the data used to measure goodness-of-fit.
192 | 
193 | ## Trade-offs
194 | 
195 | These trade-offs are not linear, but generally hold:
196 | 
197 | - _Explanatory vs predictive power_
198 | - _Flexibility vs efficiency_
199 | - _Information vs time_
200 | 
201 | ## Readings
202 | 
203 | Ensemble Methods:
204 | 
205 | - [Grimmer \& Westwood, _Political Analysis_ 2017](https://www.cambridge.org/core/journals/political-analysis/article/estimating-heterogeneous-treatment-effects-and-the-effects-of-heterogeneous-treatments-with-ensemble-methods/C7E3EA00D0AD83429CBE73F4F0C6652C)
206 | - [Kunzel et al, _PNAS_ 2019](https://arxiv.org/abs/1706.03461)
207 | 
208 | Elements of Statistical Learning:
209 | 
210 | - 9.2: Tree-Based Methods
211 | - 15: Random Forests
212 | - 16: Ensemble Learning
213 | 
214 | 


--------------------------------------------------------------------------------
/Week6/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week6/lecture.pdf


--------------------------------------------------------------------------------
/Week7/exercises.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Week 7 Coding Exercises\n",
  8 |     "\n",
  9 |     "This week we learned three main skills:\n",
 10 |     "\n",
 11 |     "- Regular Expressions\n",
 12 |     "- Pulling Webpages\n",
 13 |     "- Scraping Webpages\n",
 14 |     "\n",
 15 |     "I want you to go further with the code we started developing in the coding tutorial.\n",
 16 |     "\n",
 17 |     "By the end of the tutorial, we had code to get all of the books, along with their price, rating, and a link to their dedicated page.\n",
 18 |     "\n",
 19 |     "My next challenge for you is to do the following\n",
 20 |     "\n",
 21 |     "1. On each of the dedicated pages, there is a description of the book. Write code that will scrape the description for each book and add it as a column on the product_info dataframe.\n",
 22 |     "2. Currently, this scraper can only access the first page of results. Modify the scraper so that it iterates through pages. (Hint: There is a link close to the bottom of each page, but the title will be different each time (page-3, page-4, etc.). You will need to work in a solution that finds the link each time, or automatically increments the page at each iteration."
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "import requests\n",
 32 |     "import pandas as pd\n",
 33 |     "import re\n",
 34 |     "from bs4 import BeautifulSoup\n",
 35 |     "\n",
 36 |     "url = \"http://books.toscrape.com\"\n",
 37 |     "session = requests.Session()\n",
 38 |     "page = session.get(url)\n",
 39 |     "\n",
 40 |     "soup = BeautifulSoup(page.text, 'html.parser')\n",
 41 |     "product_pods = soup.find_all('article', class_=\"product_pod\")\n",
 42 |     "\n",
 43 |     "def get_product_info(product_pod):\n",
 44 |     "    # Title can be accessed from img alt\n",
 45 |     "    image_elem = product_pod.div.a.img\n",
 46 |     "    title = image_elem['alt']\n",
 47 |     "    # Rating can be accessed from class (css) on star-rating\n",
 48 |     "    rating_elem = product_pod.find('p', class_=re.compile(r\"star-rating .*\"))\n",
 49 |     "    rating = rating_elem['class'][1]+\"/Five\" # Second class attribute\n",
 50 |     "    price_elem = product_pod.find('div', class_=\"product_price\")\n",
 51 |     "    price = re.search(re.compile(\"[0-9\\.]+\"), price_elem.text)[0]\n",
 52 |     "    link = product_pod.find('a', href=True)['href']\n",
 53 |     "    return title, rating, price, link\n",
 54 |     "\n",
 55 |     "product_info = []\n",
 56 |     "for pod in product_pods:\n",
 57 |     "    product_info.append(get_product_info(pod))\n",
 58 |     "\n",
 59 |     "product_info = pd.DataFrame(product_info, columns=[\"Title\", \"Rating\", \"Price\", 'Link'])\n",
 60 |     "product_info.loc[:, 'Link'] = product_info['Link'].apply(lambda x: url+\"/\"+x)"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "Regex Bonus Challenge:\n",
 68 |     "\n",
 69 |     "Full disclaimer: I wasn't able to figure out a solution to this, but I thought you might enjoy the challenge.\n",
 70 |     "\n",
 71 |     "Given the lyrics in the box below, find a way to match the full phrase after each \"million\".\n",
 72 |     "\n",
 73 |     "Therefore you should get back the matches:\n",
 74 |     "\n",
 75 |     "```\n",
 76 |     "['bags of the best Sligo rags',\n",
 77 |     " 'barrels of stone',\n",
 78 |     " 'sides of old blind horses hides',\n",
 79 |     " 'barrels of bones',\n",
 80 |     " 'hogs',\n",
 81 |     " 'dogs',\n",
 82 |     " 'barrels of porter',\n",
 83 |     " 'bails of old nanny goats']\n",
 84 |     " ```\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "The closest I was able to come was: `million ([\\w ]+)`, but this matches the `and seven million dogs`, which I want separately."
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "lyrics = \"\"\"\n",
 97 |     "On the Fourth of July, 1806\n",
 98 |     "We set sail from the sweet Cove of Cork\n",
 99 |     "We were sailing away with a cargo of bricks\n",
100 |     "For the Grand City Hall in New York\n",
101 |     "'Twas a wonderful craft, she was rigged fore and aft\n",
102 |     "And oh, how the wild wind drove her\n",
103 |     "She stood several blasts, she had twenty seven masts\n",
104 |     "And they called her The Irish Rover\n",
105 |     "We had one million bags of the best Sligo rags\n",
106 |     "We had two million barrels of stone\n",
107 |     "We had three million sides of old blind horses hides\n",
108 |     "We had four million barrels of bones\n",
109 |     "We had five million hogs and six million dogs\n",
110 |     "Seven million barrels of porter\n",
111 |     "We had eight million bails of old nanny goats' tails\n",
112 |     "In the hold of the Irish Rover\n",
113 |     "\"\"\"\n",
114 |     "\n",
115 |     "r = re.compile(r\"million ([\\w ]+)\") # This solution doesn't work\n",
116 |     "\n",
117 |     "re.findall(r, lyrics)"
118 |    ]
119 |   }
120 |  ],
121 |  "metadata": {
122 |   "kernelspec": {
123 |    "display_name": "teaching",
124 |    "language": "python",
125 |    "name": "teaching"
126 |   },
127 |   "language_info": {
128 |    "codemirror_mode": {
129 |     "name": "ipython",
130 |     "version": 3
131 |    },
132 |    "file_extension": ".py",
133 |    "mimetype": "text/x-python",
134 |    "name": "python",
135 |    "nbconvert_exporter": "python",
136 |    "pygments_lexer": "ipython3",
137 |    "version": "3.7.6"
138 |   }
139 |  },
140 |  "nbformat": 4,
141 |  "nbformat_minor": 4
142 | }
143 | 


--------------------------------------------------------------------------------
/Week7/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week7/lecture.pdf


--------------------------------------------------------------------------------
/Week7/planning.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Planning Week 7 - Web Scraping
 3 | ---
 4 | 
 5 | # Some Thoughts
 6 | 
 7 | This lesson should cover:
 8 | 
 9 | - the way that the Internet and webpages work
10 | - specific tools to navigate webpages (regex, requests, beautifulsoup)
11 | - a more general understanding of automation and "deployment"
12 | 
13 | For good measure, I need to also include:
14 | 
15 | - the legal grey area that is web scraping, and good etiquette
16 | - a discussion of time-saving/practicality
17 | 
18 | In addition to the commands and libraries mentioned above, I'll need to cover:
19 | 
20 | - Writing python scripts (instead of notebooks)
21 | - `try/except` loops and error handling
22 | - `while` loops
23 | 
24 | # Structure
25 | 
26 | - **Roadmap**
27 | - How does the Internet work? (short version)
28 |     - Your computer sends a `GET` request with some routing information (url, ip address) to an intermediary server (DNS).
29 |     - The DNS forwards the request to the desired destination.
30 |     - The webpage host server receives the request, and sends back the requested information (via DNS, etc.). This information is usually a mixture of `html`, `css`, `javascript` and maybe `php`.
31 |         - `html`: The skeleton and text.
32 |         - `css`: The aesthetic styling elements.
33 |         - `javascript`: Locally-executed interactive elements.
34 |         - `php`: Host-side interactive elements.
35 |     - Your computer receives the information, and a specialised program known as a "browser" renders the information as a webpage.
36 | - What kind of information is stored in webpages that we, as social scientists, might want to use?
37 |     - News articles (news websites) (*ALL* news articles since _x_ date.)
38 |     - Press releases (corporate, political) (*ALL* press release by politician _x_)
39 |     - Government resources/archives (all parliamentary transcripts, or of a specific sub committee)
40 |     - Tweets? (This lesson discusses APIs last)
41 | - How can we use Python to assist us with conducting this collection on a large scale?
42 |     - This is a task of locating, filtering and extracting information from a largely _unstructured_ dataset. For this we use:
43 |         - `requests` to get webpages
44 |         - `beautifulsoup` to clean up, structure and work with `html`
45 |         - `regex` to apply flexible patterns to character-string data.
46 |     - Added challenges are rate limiting, not knowing the format of webpages, and so on.
47 |     - Some glaring omissions:
48 |         - `scrapy`: A library for building deployable web crawlers.
49 |         - `selenium`: For dealing with `javascript`/creating a scraper that behaves more like a human.
50 | - When is it (in)appropriate to scrape?
51 |     - Web servers have limited resources for serving requests; if they try to send too much data, then they will slow down/break.
52 |         - Most web servers have DDoS protection measures; if they see that they are receiving a large volume of requests from a particular IP, then they will block/throttle that address.
53 |         - Even if the server does not have these measures, be considerate, and do not accidentally cyberattack somebody.
54 |     - Scraping is not *usually* included in the ToS of a website, but may be prohibited by your ISP etc. In most cases, it is in a legal grey area.
55 |         - Companies and governments have the option of sending a cease-and-desist, in which case scraping does become illegal.
56 |         - Obviously you should not do anything illegal.
57 | 


--------------------------------------------------------------------------------
/Week8/examples_selenium.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "opponent-colorado",
  6 |    "metadata": {
  7 |     "slideshow": {
  8 |      "slide_type": "subslide"
  9 |     }
 10 |    },
 11 |    "source": [
 12 |     "# Browser Automation with Selenium\n",
 13 |     "\n",
 14 |     "This notebook contains a short tutorial for scraping with the Selenium toolkit.\n",
 15 |     "\n",
 16 |     "We will be scraping `quotes.toscrape.com`, a wonderful page for practicing more advanced scraping techniques."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": null,
 22 |    "id": "julian-canon",
 23 |    "metadata": {
 24 |     "slideshow": {
 25 |      "slide_type": "subslide"
 26 |     }
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "# imports\n",
 31 |     "import requests\n",
 32 |     "from selenium import webdriver\n",
 33 |     "from selenium.webdriver.common.by import By"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "id": "finished-mixer",
 39 |    "metadata": {
 40 |     "slideshow": {
 41 |      "slide_type": "subslide"
 42 |     }
 43 |    },
 44 |    "source": [
 45 |     "## When static scraping fails:\n",
 46 |     "\n",
 47 |     "The following webpage is generated dynamically by `javascript`.\n",
 48 |     "We can see the script source in this page, but this is often not the case:"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "id": "straight-columbia",
 55 |    "metadata": {
 56 |     "slideshow": {
 57 |      "slide_type": "subslide"
 58 |     }
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "from bs4 import BeautifulSoup\n",
 63 |     "\n",
 64 |     "url = \"https://quotes.toscrape.com/js/\"\n",
 65 |     "page = requests.get(url)\n",
 66 |     "print(BeautifulSoup(page.text).body.prettify())"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "id": "collect-finnish",
 72 |    "metadata": {
 73 |     "slideshow": {
 74 |      "slide_type": "subslide"
 75 |     }
 76 |    },
 77 |    "source": [
 78 |     "## Instantiating the WebDriver\n",
 79 |     "\n",
 80 |     "When we call the `webdriver.Chrome()` method, if we have the webdriver properly installed, an automated Chrome instance should appear!\n"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "id": "julian-nightlife",
 87 |    "metadata": {
 88 |     "slideshow": {
 89 |      "slide_type": "subslide"
 90 |     }
 91 |    },
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "driver = webdriver.Chrome()\n",
 95 |     "driver.get(url)"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "id": "intimate-edgar",
101 |    "metadata": {
102 |     "slideshow": {
103 |      "slide_type": "subslide"
104 |     }
105 |    },
106 |    "source": [
107 |     "Let's select all of the quote-boxes that have the tag \"life\"."
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "id": "white-interstate",
114 |    "metadata": {
115 |     "slideshow": {
116 |      "slide_type": "subslide"
117 |     }
118 |    },
119 |    "outputs": [],
120 |    "source": [
121 |     "# This returns a list of elements that have the CSS class 'quote'\n",
122 |     "quote_boxes = driver.find_elements(\n",
123 |     "    By.CLASS_NAME, 'quote')"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": null,
129 |    "id": "living-bundle",
130 |    "metadata": {
131 |     "slideshow": {
132 |      "slide_type": "subslide"
133 |     }
134 |    },
135 |    "outputs": [],
136 |    "source": [
137 |     "# Let's navigate the first element to recognize a pattern\n",
138 |     "# Selecting the first div\n",
139 |     "quote_box = quote_boxes[0]\n",
140 |     "# Selecting the container div for the tags\n",
141 |     "tags = quote_box.find_element(By.CLASS_NAME, 'tags')\n",
142 |     "# Getting the tag names\n",
143 |     "[\n",
144 |     "    tag.text for tag\n",
145 |     "    in tags.find_elements(By.TAG_NAME, 'a')\n",
146 |     "]"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "id": "impressive-diesel",
153 |    "metadata": {
154 |     "slideshow": {
155 |      "slide_type": "subslide"
156 |     }
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "# Some crazy list filtering\n",
161 |     "life_quotes = [\n",
162 |     "    quote for quote in quote_boxes if                     # unpack quote_boxes\n",
163 |     "    'life' in [tag.text for tag in                        # check if 'life' is in\n",
164 |     "               quote.find_element(By.CLASS_NAME, 'tags'). # the list of tags\n",
165 |     "               find_elements(By.TAG_NAME, 'a')]           # like we obtained before\n",
166 |     "]\n",
167 |     "life_quotes"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "id": "young-emergency",
174 |    "metadata": {
175 |     "slideshow": {
176 |      "slide_type": "subslide"
177 |     }
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "# Let's put that into a function\n",
182 |     "def filter_quotes_by_tag(driver, tag):\n",
183 |     "    quote_boxes = driver.find_elements(By.CLASS_NAME, 'quote')\n",
184 |     "    tagged_quotes = [\n",
185 |     "    quote for quote in quote_boxes if                     # unpack quote_boxes\n",
186 |     "        tag in [t.text for t in                           # check if tag is in\n",
187 |     "               quote.find_element(By.CLASS_NAME, 'tags'). # the list of tags\n",
188 |     "               find_elements(By.TAG_NAME, 'a')]           # like we obtained before\n",
189 |     "    ]\n",
190 |     "    return tagged_quotes"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "id": "binding-wheel",
196 |    "metadata": {
197 |     "slideshow": {
198 |      "slide_type": "subslide"
199 |     }
200 |    },
201 |    "source": [
202 |     "## Simulating Clicks\n",
203 |     "\n",
204 |     "We can use the `.click()` property of any element to 'click' on it.\n",
205 |     "\n",
206 |     "Let's proceed to the next page of quotes."
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "id": "aging-voltage",
213 |    "metadata": {
214 |     "slideshow": {
215 |      "slide_type": "subslide"
216 |     }
217 |    },
218 |    "outputs": [],
219 |    "source": [
220 |     "# Get the \"next\" element\n",
221 |     "next_button = driver.find_element(By.PARTIAL_LINK_TEXT, 'Next')\n",
222 |     "print(driver.current_url)\n",
223 |     "next_button.click()\n",
224 |     "print(driver.current_url)"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "markdown",
229 |    "id": "still-gasoline",
230 |    "metadata": {
231 |     "slideshow": {
232 |      "slide_type": "subslide"
233 |     }
234 |    },
235 |    "source": [
236 |     "## Sending Keys\n",
237 |     "\n",
238 |     "Let's try to log in!"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "id": "controversial-jackson",
245 |    "metadata": {
246 |     "slideshow": {
247 |      "slide_type": "subslide"
248 |     }
249 |    },
250 |    "outputs": [],
251 |    "source": [
252 |     "login_box = driver.find_element(By.LINK_TEXT, 'Login')\n",
253 |     "login_box.click()"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": null,
259 |    "id": "august-purpose",
260 |    "metadata": {
261 |     "slideshow": {
262 |      "slide_type": "subslide"
263 |     }
264 |    },
265 |    "outputs": [],
266 |    "source": [
267 |     "# Entering username and password\n",
268 |     "username_box = driver.find_element(By.ID, 'username')\n",
269 |     "password_box = driver.find_element(By.ID, 'password')"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": null,
275 |    "id": "flush-minutes",
276 |    "metadata": {
277 |     "slideshow": {
278 |      "slide_type": "subslide"
279 |     }
280 |    },
281 |    "outputs": [],
282 |    "source": [
283 |     "username_box.send_keys('username')\n",
284 |     "password_box.send_keys('password')"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": null,
290 |    "id": "double-boring",
291 |    "metadata": {
292 |     "slideshow": {
293 |      "slide_type": "subslide"
294 |     }
295 |    },
296 |    "outputs": [],
297 |    "source": [
298 |     "# Using XPATH to get the login button\\\n",
299 |     "# https://www.w3schools.com/xml/xpath_syntax.asp\n",
300 |     "login_button = driver.find_element(\n",
301 |     "    By.XPATH, r\"//input[(@type='submit')]\")\n",
302 |     "login_button.click()"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "markdown",
307 |    "id": "correct-speaking",
308 |    "metadata": {
309 |     "slideshow": {
310 |      "slide_type": "subslide"
311 |     }
312 |    },
313 |    "source": [
314 |     "## Race Conditions\n",
315 |     "\n",
316 |     "Usually the page will take time to load.\n",
317 |     "\n",
318 |     "If you are running Selenium from a script, it will execute the commands sequentially\n",
319 |     "as fast as possible. This causes problems."
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": null,
325 |    "id": "quality-israel",
326 |    "metadata": {
327 |     "slideshow": {
328 |      "slide_type": "subslide"
329 |     }
330 |    },
331 |    "outputs": [],
332 |    "source": [
333 |     "url = \"https://quotes.toscrape.com/js-delayed/\"\n",
334 |     "driver.get(url)\n",
335 |     "filter_quotes_by_tag(driver, 'life')"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "markdown",
340 |    "id": "average-scottish",
341 |    "metadata": {
342 |     "slideshow": {
343 |      "slide_type": "subslide"
344 |     }
345 |    },
346 |    "source": [
347 |     "Selenium does provide more sophisticated \"wait\" functionality,\n",
348 |     "where you can define some condition that it will test until\n",
349 |     "it becomes true.\n",
350 |     "\n",
351 |     "I'll demonstrate a simpler (and less reliable) solution, which\n",
352 |     "is to just use a timed wait."
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": null,
358 |    "id": "frozen-forest",
359 |    "metadata": {
360 |     "slideshow": {
361 |      "slide_type": "fragment"
362 |     }
363 |    },
364 |    "outputs": [],
365 |    "source": [
366 |     "from time import sleep\n",
367 |     "url = \"https://quotes.toscrape.com/js-delayed/\"\n",
368 |     "driver.get(url)\n",
369 |     "sleep(10) # I happen to know the length of the delay\n",
370 |     "filter_quotes_by_tag(driver, 'life')"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "id": "weekly-kingdom",
377 |    "metadata": {},
378 |    "outputs": [],
379 |    "source": [
380 |     "driver.quit()"
381 |    ]
382 |   }
383 |  ],
384 |  "metadata": {
385 |   "kernelspec": {
386 |    "display_name": "scrape",
387 |    "language": "python",
388 |    "name": "scrape"
389 |   },
390 |   "language_info": {
391 |    "codemirror_mode": {
392 |     "name": "ipython",
393 |     "version": 3
394 |    },
395 |    "file_extension": ".py",
396 |    "mimetype": "text/x-python",
397 |    "name": "python",
398 |    "nbconvert_exporter": "python",
399 |    "pygments_lexer": "ipython3",
400 |    "version": "3.8.5"
401 |   }
402 |  },
403 |  "nbformat": 4,
404 |  "nbformat_minor": 5
405 | }
406 | 


--------------------------------------------------------------------------------
/Week8/lecture.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Introduction to Python for Social Science
  3 | subtitle: Lecture 8 - APIs and Selenium
  4 | author: Musashi Harukawa, DPIR
  5 | date: 8th Week Hilary 2021
  6 | ---
  7 | 
  8 | # Lecture Roadmap
  9 | 
 10 | ## Last Week
 11 | 
 12 | - HTTP requests and Internet fundamentals
 13 | - Regular Expressions
 14 | 
 15 | ## This Week
 16 | 
 17 | - APIs
 18 | 	- Twitter's Academic Track
 19 | - Browser Automation
 20 | 
 21 | # APIs
 22 | 
 23 | ## What is an API?
 24 | 
 25 | - _Application Programming Interface_
 26 | - _Interface_: Specialized endpoint
 27 | 	- Specific query syntax
 28 | 	- Returns defined data packets
 29 | - We are interested in _Web APIs_
 30 | 
 31 | ## Web API Examples
 32 | 
 33 | - Twitter
 34 | - Reddit
 35 | - NY Times
 36 | - The Guardian
 37 | - Spotify
 38 | - Netflix
 39 | 
 40 | ## API Mechanics
 41 | 
 42 | - REST vs SOAP
 43 | - RESTful APIs loosely based on HTTP methods
 44 | 	- Accept HTTP-like requests to access server-side assets
 45 | 	- Return the payload usually as JSON or XML
 46 | 	- _Stateless_: no server-side session information
 47 | 
 48 | ::: notes
 49 | - Most of the APIs I have come across are REST; all I know about SOAP is that it mandates XML payloads.
 50 | - Loosely-based: depending on the API, may allow for header or body parameters that do not typically exist in HTTP requests.
 51 | - Payload: the actual data packet. Sounds dramatic, it's just the thing you wanted (versus the header, which basically says what it is and where it should go).
 52 | - Stateless: remember that the server does not remember who its speaking to. That means your credentials need to be sent with each request, and importantly for paginated requests, a "next page" token. We'll come back to that later.
 53 | :::
 54 | 
 55 | ## Twitter's API
 56 | 
 57 | - Many different Twitter APIs and endpoints (Standard, Premium, Enterprise, and **Academic**)
 58 | - **Academic Research product track** has following endpoints:
 59 | 	- _Full-archive search_: (Almost) everything back to 2006!
 60 | 	- _Recent search_: Last 7 days, higher volumes
 61 | 	- _Filtered stream_: Real-time filtered stream, capped at 1% of total volume
 62 | 	- _Sampled stream_: $~1\%$ of all new Tweets in real-time
 63 | 	- _Tweet and User Lookup_: Look up user/tweet by id
 64 | 	- and [more](https://developer.twitter.com/en/solutions/academic-research/products-for-researchers)
 65 | 
 66 | ## Applying for Access
 67 | 
 68 | - The Academic Research track has the following criteria:
 69 | 	- Master's student or above (doctoral candidate, post-doc, faculty, researcher, etc.)
 70 | 	- Clearly defined research objective and specific plans for how you will use the Twitter data
 71 | 	- Non-commercial use
 72 | - You can apply [here](https://developer.twitter.com/en/portal/petition/academic/is-it-right-for-you)
 73 | 
 74 | ## Using the API (with Python)
 75 | 
 76 | - We can use Python to generate requests to interact with Twitter's API
 77 | - Twitter provides a "wrapper" package: `searchtweets-v2`
 78 | - Documentation provided [here](https://pypi.org/project/searchtweets-v2/) and [here](https://developer.twitter.com/en/docs/twitter-api/tweets/search/introduction)
 79 | 
 80 | ## Managing Credentials
 81 | 
 82 | - Once you are granted access, you will be given a set of credentials for your project/application.
 83 | - Store these securely, i.e. do not post them somewhere public.
 84 | - Place them in a credentials `yaml` file that looks like the following:
 85 | 
 86 | :::{.fragment}
 87 | ```{yaml}
 88 | search_tweets_v2:
 89 |   endpoint:  https://api.twitter.com/2/tweets/search/all
 90 |   consumer_key: <CONSUMER_KEY>
 91 |   consumer_secret: <CONSUMER_SECRET>
 92 |   bearer_token: <BEARER_TOKEN>
 93 | ```
 94 | :::
 95 | 
 96 | ## Writing and Sending Requests
 97 | 
 98 | - To be discussed in the coding tutorial
 99 | 
100 | # Browser Automation
101 | 
102 | ## When does static scraping fail?
103 | 
104 | - Sometimes the information you need is not contained in the `html` returned by a request.
105 | - Obtaining that information may require interaction with the web app.
106 | 	- Log in
107 | 	- Dynamic elements
108 | - Some web servers block suspicious activity
109 | 
110 | ## Static vs Dynamic Webpages
111 | 
112 | - Interactive $\not\to$ Dynamic
113 | - Dynamic page source generation
114 | 	- Server-side: `php`
115 | 	- Client-side: JavaScript
116 | 
117 | ::: notes
118 | - `html` supports moderate interactivity, through drop-down menus and text boxes
119 | - What we're talking about here is pages with dynamic source, i.e. when you interact with an element, the source of the page itself transforms.
120 | :::
121 | 
122 | ## Browser Automation
123 | 
124 | - [Selenium Browser Automation Framework](https://www.selenium.dev/)
125 | - Designed for testing, but useful for scraping!
126 | - Any and all browser actions can be emulated/automated.
127 | 
128 | ## Using Selenium
129 | 
130 | - Actions are methods of a "WebDriver" object.
131 | - Many similar methods to `BeautifulSoup` for navigating DOM.
132 | 	- Search for elements by id, regex, xpath, etc.
133 | - Selenium IDE allows you to record your own usage and codify it afterwards.
134 | 
135 | ## Considerations
136 | 
137 | - Race conditions—"wait"s are your friend!
138 | - Overhead/overkill
139 | - Human-like automation
140 | 
141 | # Course Recap
142 | 
143 | ## Topics
144 | 
145 | 1. Introduction to Python and the Development Environment
146 | 2. Data Structures and `pandas` I
147 | 3. Data Structures and `pandas` II
148 | 4. Data Visualisation
149 | 5. Machine Learning with `scikit-learn` I
150 | 6. Machine Learning with `scikit-learn` II
151 | 7. Web Scraping with `BeautifulSoup` and `regex`
152 | 8. Web Scraping with `Selenium`
153 | 
154 | ## You can now:
155 | 
156 | - Write and execute code using Google Colab notebooks
157 | - Read, write, clean, and analyze different tabular data formats
158 | - Generate summary statistics and run simple linear algebra operations
159 | - Create static, 2D data-based visuals
160 | - Implement, fine-tune and interpret a range of ML models
161 | - Flexibly search text with regex
162 | - Scrape static and dynamic webpages, and use APIs
163 | 


--------------------------------------------------------------------------------
/Week8/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week8/lecture.pdf


--------------------------------------------------------------------------------
/Week8/lecture_planning.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Lecture Notes and Planning
 3 | ---
 4 | 
 5 | # Overview
 6 | 
 7 | - Advanced web scraping topics:
 8 | 	- APIs
 9 | 	- Browser automation
10 | 
11 | - APIs
12 | 	- Overview
13 | 		- What is an API?
14 | 		- Examples of APIs
15 | 		- How do they work?
16 | 			- REST
17 | 	- Twitter's API
18 | 		- What does it provide?
19 | 		- Gaining access
20 | 		- Python wrapper
21 | 		- Credentials
22 | 
23 | - Browser automation
24 | 	- When does static web scraping fail?
25 | 		- Dynamic assets and JavaScript
26 | 		- Bot detection
27 | 	- Selenium
28 | 		- Tool for automating browsers
29 | 		- Designed explicitly for testing web apps.
30 | 		- Allows you to automate any/all browser actions.
31 | 	- Considerations when using Selenium
32 | 		- Race conditions
33 | 		- Overhead/overkill
34 | 		- Emulating human behaviour
35 | 
36 | 
37 | 	
38 | 


--------------------------------------------------------------------------------
/Week8Old/examples.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Coding Tutorial Week 8\n",
  8 |     "\n",
  9 |     "In this tutorial, we look at ways to combine `spaCy`, `regex`, `pandas`, `matplotlib` and `seaborn` to analyse the text column of the BES data."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import pandas as pd\n",
 19 |     "import seaborn as sns\n",
 20 |     "import matplotlib.pyplot as plt\n",
 21 |     "import re\n",
 22 |     "import numpy as np\n",
 23 |     "from collections import Counter\n",
 24 |     "from sklearn.cluster import AgglomerativeClustering\n",
 25 |     "\n",
 26 |     "bes_df = pd.read_feather(\"../Week2/data/bes_data_subset_week2.feather\")"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "We now import spacy and our language model."
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import spacy"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "nlp = spacy.load(\"en_core_web_md\") # if this doesn't work for you\n",
 52 |     "# open ipython terminal\n",
 53 |     "# >>> import spacy\n",
 54 |     "# >>> nlp = spacy.load(\"en_core_web_sm\")\n",
 55 |     "# >>> nlp._path\n",
 56 |     "# PosixPath('<COPY THIS>')"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "To get an idea of what spacy can do, let's use it on one of the short responses."
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "doc = nlp(bes_df.loc[1216, 'a01'])\n",
 73 |     "doc"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "doc.print_tree()"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "[token.pos_ for token in doc]"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "def view_spacy_data(doc):\n",
101 |     "    \"View various aspects of the language model.\"\n",
102 |     "    data = []\n",
103 |     "    for token in doc:\n",
104 |     "        data.append([token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n",
105 |     "                     token.shape_, token.is_alpha, token.is_stop])\n",
106 |     "    columns = ['text', 'lemma_', 'pos_', 'tag_',\n",
107 |     "               'dep_', 'shape_', 'is_alpha', 'is_stop']\n",
108 |     "    df = pd.DataFrame(data, columns=columns)\n",
109 |     "    return df"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "view_spacy_data(doc)"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "Let's do a bit of pre-processing before we apply the language model to the rest of the answers.\n",
126 |     "\n",
127 |     "- NA removal\n",
128 |     "- lowercase everything\n",
129 |     "- remove consecutive spaces"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "bes_df['a01'].value_counts().head(20) # Looks like '-1' is a na value\n",
139 |     "                                      # Let's drop all rows that are na here"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "df = bes_df.loc[bes_df['a01']!='-1', :].reset_index(drop=True)"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "df['a01'].apply(lambda x: type(x)).unique()"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "df.loc[:, 'a01'] = df['a01'].str.lower().str.replace(re.compile(r\"\\s{2,}\"), \" \")"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "df['nlp'] = df['a01'].apply(lambda x: nlp(x))"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "def get_nouns(doc):\n",
185 |     "    nouns = [token.lemma_ for token in doc if\n",
186 |     "             token.pos_ in ('PROPN', 'NOUN') and\n",
187 |     "             token.is_stop==False]\n",
188 |     "    return nouns"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "metadata": {},
195 |    "outputs": [],
196 |    "source": [
197 |     "df['nouns'] = df['nlp'].apply(get_nouns)"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": null,
203 |    "metadata": {},
204 |    "outputs": [],
205 |    "source": [
206 |     "noun_frequencies = pd.Series(Counter(df['nouns'].sum())).sort_values(ascending=False)"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "f, ax = plt.subplots(1,1, figsize=(15, 9))\n",
216 |     "\n",
217 |     "ax.set_title(\"Top 50 Most Common Nouns in Item a01\")\n",
218 |     "sns.barplot(noun_frequencies.head(50).index, noun_frequencies.head(50), ax=ax)\n",
219 |     "ax.xaxis.set_ticklabels(ax.xaxis.get_ticklabels(), rotation=-90)\n",
220 |     "None"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "# Hierarchical Cosine Distance Clustering\n",
228 |     "\n",
229 |     "This model will take more time than we have in class to evaluate, but here's a bit of code that can show you how to conduct cosine clustering on your document vectors."
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": null,
235 |    "metadata": {},
236 |    "outputs": [],
237 |    "source": [
238 |     "df['vector'] = df['nlp'].apply(lambda x: x.vector)\n",
239 |     "df = df.loc[df['vector'].apply(lambda x: np.any(x)), :] # Dropping zero vectors"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": null,
245 |    "metadata": {},
246 |    "outputs": [],
247 |    "source": [
248 |     "doc_vectors = np.vstack(df['vector'].values)"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "df['vector'][0].shape"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": null,
263 |    "metadata": {},
264 |    "outputs": [],
265 |    "source": [
266 |     "df['vector'].apply(lambda x: type(x))"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "cosine_cluster = AgglomerativeClustering(n_clusters=30, affinity=\"cosine\", linkage=\"single\")"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": null,
281 |    "metadata": {},
282 |    "outputs": [],
283 |    "source": [
284 |     "cosine_cluster.fit(doc_vectors)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": null,
290 |    "metadata": {},
291 |    "outputs": [],
292 |    "source": [
293 |     "df['cos_labs'] = cosine_cluster.labels_\n",
294 |     "df.loc[:, 'cos_labs'] = df['cos_labs'].astype(pd.CategoricalDtype())"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": null,
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": [
303 |     "df['cos_labs'].value_counts()"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "for label in range(10):\n",
313 |     "    n_samp = min(3, sum(df['cos_labs']==label))\n",
314 |     "    sample = df.loc[df['cos_labs']==label, 'a01'].sample(n_samp)\n",
315 |     "    print(\"###### CLUSTER \"+str(label)+\" ######\")\n",
316 |     "    for item in sample.iteritems():\n",
317 |     "        print(item[1])\n",
318 |     "    print(\"\\n\")"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "markdown",
323 |    "metadata": {},
324 |    "source": [
325 |     "Obviously there's a lot of work to be done here."
326 |    ]
327 |   }
328 |  ],
329 |  "metadata": {
330 |   "kernelspec": {
331 |    "display_name": "teaching",
332 |    "language": "python",
333 |    "name": "teaching"
334 |   },
335 |   "language_info": {
336 |    "codemirror_mode": {
337 |     "name": "ipython",
338 |     "version": 3
339 |    },
340 |    "file_extension": ".py",
341 |    "mimetype": "text/x-python",
342 |    "name": "python",
343 |    "nbconvert_exporter": "python",
344 |    "pygments_lexer": "ipython3",
345 |    "version": "3.7.6"
346 |   }
347 |  },
348 |  "nbformat": 4,
349 |  "nbformat_minor": 4
350 | }
351 | 


--------------------------------------------------------------------------------
/Week8Old/figure1.dot:
--------------------------------------------------------------------------------
 1 | digraph G {
 2 |   rankdir=LR;
 3 |   /* Entities */
 4 |   a [label="I"];
 5 |   b [label="will"];
 6 |   c [label="not"];
 7 |   d [label="endorse"];
 8 |   e [label="Sanders"];
 9 |   f [label="Biden"];
10 |   g [label=".", shape="doublecircle"];
11 |   /* Relationships */
12 |   a -> b [label="p(will|I)"]
13 |   b -> c
14 |   b -> d
15 |   c -> d
16 |   d -> e
17 |   d -> f
18 |   e -> g
19 |   f -> g
20 | }
21 | 


--------------------------------------------------------------------------------
/Week8Old/figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week8Old/figure1.png


--------------------------------------------------------------------------------
/Week8Old/figure1.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
  3 |  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
  4 | <!-- Generated by graphviz version 2.40.1 (0)
  5 |  -->
  6 | <!-- Title: G Pages: 1 -->
  7 | <svg width="648pt" height="104pt"
  8 |  viewBox="0.00 0.00 648.28 104.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
  9 | <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 100)">
 10 | <title>G</title>
 11 | <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-100 644.278,-100 644.278,4 -4,4"/>
 12 | <!-- a -->
 13 | <g id="node1" class="node">
 14 | <title>a</title>
 15 | <ellipse fill="none" stroke="#000000" cx="27" cy="-44" rx="27" ry="18"/>
 16 | <text text-anchor="middle" x="27" y="-40.3" font-family="Times,serif" font-size="14.00" fill="#000000">I</text>
 17 | </g>
 18 | <!-- b -->
 19 | <g id="node2" class="node">
 20 | <title>b</title>
 21 | <ellipse fill="none" stroke="#000000" cx="173.2976" cy="-44" rx="27.0966" ry="18"/>
 22 | <text text-anchor="middle" x="173.2976" y="-40.3" font-family="Times,serif" font-size="14.00" fill="#000000">will</text>
 23 | </g>
 24 | <!-- a&#45;&gt;b -->
 25 | <g id="edge1" class="edge">
 26 | <title>a&#45;&gt;b</title>
 27 | <path fill="none" stroke="#000000" d="M54.1746,-44C77.1247,-44 110.2284,-44 135.7169,-44"/>
 28 | <polygon fill="#000000" stroke="#000000" points="135.7721,-47.5001 145.7721,-44 135.772,-40.5001 135.7721,-47.5001"/>
 29 | <text text-anchor="middle" x="100" y="-47.8" font-family="Times,serif" font-size="14.00" fill="#000000">p(will|I)</text>
 30 | </g>
 31 | <!-- c -->
 32 | <g id="node3" class="node">
 33 | <title>c</title>
 34 | <ellipse fill="none" stroke="#000000" cx="264.5952" cy="-78" rx="27" ry="18"/>
 35 | <text text-anchor="middle" x="264.5952" y="-74.3" font-family="Times,serif" font-size="14.00" fill="#000000">not</text>
 36 | </g>
 37 | <!-- b&#45;&gt;c -->
 38 | <g id="edge2" class="edge">
 39 | <title>b&#45;&gt;c</title>
 40 | <path fill="none" stroke="#000000" d="M197.2849,-52.9331C207.6639,-56.7983 219.9878,-61.3878 231.236,-65.5768"/>
 41 | <polygon fill="#000000" stroke="#000000" points="230.1752,-68.9165 240.768,-69.1266 232.6182,-62.3566 230.1752,-68.9165"/>
 42 | </g>
 43 | <!-- d -->
 44 | <g id="node4" class="node">
 45 | <title>d</title>
 46 | <ellipse fill="none" stroke="#000000" cx="376.6909" cy="-44" rx="48.1917" ry="18"/>
 47 | <text text-anchor="middle" x="376.6909" y="-40.3" font-family="Times,serif" font-size="14.00" fill="#000000">endorse</text>
 48 | </g>
 49 | <!-- b&#45;&gt;d -->
 50 | <g id="edge3" class="edge">
 51 | <title>b&#45;&gt;d</title>
 52 | <path fill="none" stroke="#000000" d="M200.7281,-44C230.6891,-44 279.5594,-44 318.2038,-44"/>
 53 | <polygon fill="#000000" stroke="#000000" points="318.2981,-47.5001 328.2981,-44 318.298,-40.5001 318.2981,-47.5001"/>
 54 | </g>
 55 | <!-- c&#45;&gt;d -->
 56 | <g id="edge4" class="edge">
 57 | <title>c&#45;&gt;d</title>
 58 | <path fill="none" stroke="#000000" d="M289.195,-70.5386C300.9155,-66.9836 315.3486,-62.6059 329.0886,-58.4384"/>
 59 | <polygon fill="#000000" stroke="#000000" points="330.3687,-61.7076 338.9223,-55.4557 328.3369,-55.009 330.3687,-61.7076"/>
 60 | </g>
 61 | <!-- e -->
 62 | <g id="node5" class="node">
 63 | <title>e</title>
 64 | <ellipse fill="none" stroke="#000000" cx="510.5323" cy="-72" rx="48.9926" ry="18"/>
 65 | <text text-anchor="middle" x="510.5323" y="-68.3" font-family="Times,serif" font-size="14.00" fill="#000000">Sanders</text>
 66 | </g>
 67 | <!-- d&#45;&gt;e -->
 68 | <g id="edge5" class="edge">
 69 | <title>d&#45;&gt;e</title>
 70 | <path fill="none" stroke="#000000" d="M419.0391,-52.8594C431.3669,-55.4384 444.9865,-58.2876 457.8673,-60.9823"/>
 71 | <polygon fill="#000000" stroke="#000000" points="457.4953,-64.4802 468.0001,-63.1022 458.9288,-57.6285 457.4953,-64.4802"/>
 72 | </g>
 73 | <!-- f -->
 74 | <g id="node6" class="node">
 75 | <title>f</title>
 76 | <ellipse fill="none" stroke="#000000" cx="510.5323" cy="-18" rx="37.8943" ry="18"/>
 77 | <text text-anchor="middle" x="510.5323" y="-14.3" font-family="Times,serif" font-size="14.00" fill="#000000">Biden</text>
 78 | </g>
 79 | <!-- d&#45;&gt;f -->
 80 | <g id="edge6" class="edge">
 81 | <title>d&#45;&gt;f</title>
 82 | <path fill="none" stroke="#000000" d="M419.4071,-35.702C434.0766,-32.8523 450.5394,-29.6542 465.3948,-26.7684"/>
 83 | <polygon fill="#000000" stroke="#000000" points="466.2317,-30.1713 475.3807,-24.8285 464.8968,-23.2998 466.2317,-30.1713"/>
 84 | </g>
 85 | <!-- g -->
 86 | <g id="node7" class="node">
 87 | <title>g</title>
 88 | <ellipse fill="none" stroke="#000000" cx="618.278" cy="-44" rx="18" ry="18"/>
 89 | <ellipse fill="none" stroke="#000000" cx="618.278" cy="-44" rx="22" ry="22"/>
 90 | <text text-anchor="middle" x="618.278" y="-40.3" font-family="Times,serif" font-size="14.00" fill="#000000">.</text>
 91 | </g>
 92 | <!-- e&#45;&gt;g -->
 93 | <g id="edge7" class="edge">
 94 | <title>e&#45;&gt;g</title>
 95 | <path fill="none" stroke="#000000" d="M550.6528,-61.5738C562.6546,-58.4549 575.6323,-55.0824 586.979,-52.1337"/>
 96 | <polygon fill="#000000" stroke="#000000" points="588.0679,-55.4671 596.8661,-49.5643 586.3072,-48.6921 588.0679,-55.4671"/>
 97 | </g>
 98 | <!-- f&#45;&gt;g -->
 99 | <g id="edge8" class="edge">
100 | <title>f&#45;&gt;g</title>
101 | <path fill="none" stroke="#000000" d="M544.3281,-26.1552C558,-29.4544 573.685,-33.2393 587.0815,-36.472"/>
102 | <polygon fill="#000000" stroke="#000000" points="586.2666,-39.8758 596.8087,-38.8193 587.9087,-33.0711 586.2666,-39.8758"/>
103 | </g>
104 | </g>
105 | </svg>
106 | 


--------------------------------------------------------------------------------
/Week8Old/lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week8Old/lecture.pdf


--------------------------------------------------------------------------------
/Week8Old/planning.md:
--------------------------------------------------------------------------------
 1 | # Week 8 Lecture Planning
 2 | 
 3 | _Introduction to Natural Language Processing_
 4 | 
 5 | ## _Week 8_: Introduction to Natural Language Processing
 6 | 
 7 | 1. What is NLP?
 8 | 2. Syntactic Parsing with `spaCy`
 9 | 3. Entity recognition and document similarity with `spacy`
10 | 
11 | Students will be introduced to an advanced and increasingly popular natural language processing library: `spaCy`, and discuss how it can be useful in their own research. Students will learn basic concepts in natural language processing, and the challenges of operationalising text. References for more advanced models will also be included in the readings.
12 | 
13 | ## Points to Cover
14 | 
15 | - What is natural language processing?
16 |     - Interdisciplinary field drawing on linguistics, machine learning, computer science, etc. innovating methods in analysing natural language.
17 |     - NLP models used in social sciences tend to be more descriptive than generative.
18 | - Considerations about language as a data source
19 |     - The language generating process (i.e. the generative model)
20 |     - Sources of variance within the language generating process
21 |     - Complex information structures within language
22 |         - Simplistically, automatons
23 | - Representations of Language and Related Metrics
24 |     - Word-frequency approaches
25 |         - Bag of Words
26 |         - _tf-idf_
27 |     - Embedding/Vector Representation
28 |         - Word embeddings
29 |             - word2vec
30 |             - GloVE
31 |         - Document embeddings
32 |             - doc2vec
33 |     - Language Metrics
34 |         - Why Euclidean distance is problematic
35 |         - Cosine Distance
36 |         - Clustering
37 | -
38 | 


--------------------------------------------------------------------------------
/Week8Old/wmd_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/Week8Old/wmd_fig1.png


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-minimal
2 | 


--------------------------------------------------------------------------------
/dpir-intro-theme.css:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * A simple theme for reveal.js presentations, similar
  3 |  * to the default theme. The accent color is darkblue.
  4 |  *
  5 |  * This theme is Copyright (C) 2012 Owen Versteeg, https://github.com/StereotypicalApps. It is MIT licensed.
  6 |  * reveal.js is Copyright (C) 2011-2012 Hakim El Hattab, http://hakim.se
  7 |  */
  8 | @import url(https://fonts.googleapis.com/css?family=News+Cycle:400,700);
  9 | @import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic);
 10 | section.has-dark-background, section.has-dark-background h1, section.has-dark-background h2, section.has-dark-background h3, section.has-dark-background h4, section.has-dark-background h5, section.has-dark-background h6 {
 11 |   color: #fff; }
 12 | 
 13 | /*********************************************
 14 |  * GLOBAL STYLES
 15 |  *********************************************/
 16 | body {
 17 |   background: #fff;
 18 |   background-color: #fff; }
 19 | 
 20 | .reveal {
 21 |   font-family: "Lato", sans-serif;
 22 |   font-size: 35px;
 23 |   font-weight: normal;
 24 |   color: #000; }
 25 | 
 26 | ::selection {
 27 |   color: #000;
 28 |   background: rgba(0, 0, 0, 0.99);
 29 |   text-shadow: none; }
 30 | 
 31 | ::-moz-selection {
 32 |   color: #fff;
 33 |   background: rgba(0, 0, 0, 0.99);
 34 |   text-shadow: none; }
 35 | 
 36 | .reveal .slides section,
 37 | .reveal .slides section > section {
 38 |   line-height: 1.3;
 39 |   font-weight: inherit; }
 40 | 
 41 | /*********************************************
 42 |  * HEADERS
 43 |  *********************************************/
 44 | .reveal h1,
 45 | .reveal h2,
 46 | .reveal h3,
 47 | .reveal h4,
 48 | .reveal h5,
 49 | .reveal h6 {
 50 |   margin: 0 0 20px 0;
 51 |   color: #000;
 52 |   font-family: "Lato", sans-serif;
 53 |   font-weight: bold;
 54 |   line-height: 1.2;
 55 |   letter-spacing: normal;
 56 |   text-transform: none;
 57 |   text-shadow: none;
 58 |   word-wrap: break-word; }
 59 | 
 60 | .reveal h1 {
 61 |   font-size: 1.65em;
 62 |   text-align: left; }
 63 | 
 64 | .reveal h2 {
 65 |   font-size: 1.05em;
 66 |   text-align: left; }
 67 | 
 68 | .reveal h3 {
 69 |   font-size: 1.025em; }
 70 | 
 71 | .reveal h4 {
 72 |   font-size: 1em; }
 73 | 
 74 | .reveal h1 {
 75 |   text-shadow: none; }
 76 | 
 77 | /*********************************************
 78 |  * OTHER
 79 |  *********************************************/
 80 | .reveal p {
 81 |   margin: 5px 0;
 82 |   line-height: 1.15;
 83 |   text-align: left; }
 84 | 
 85 | /* Ensure certain elements are never larger than the slide itself */
 86 | .reveal img,
 87 | .reveal video,
 88 | .reveal iframe {
 89 |   max-width: 95%;
 90 |   max-height: 95%; }
 91 | 
 92 | .reveal strong,
 93 | .reveal b {
 94 |   font-weight: bold; }
 95 | 
 96 | .reveal em {
 97 |   font-style: italic; }
 98 | 
 99 | .reveal ol,
100 | .reveal dl,
101 | .reveal ul {
102 |   text-align: left;
103 |   margin: 0 0 0 1.5em; }
104 | 
105 | .reveal ol {
106 |   list-style-type: decimal; }
107 | 
108 | .reveal ul {
109 |   list-style-type: disc; }
110 | 
111 | .reveal ul ul {
112 |   list-style-type: square; }
113 | 
114 | .reveal ul ul ul {
115 |   list-style-type: circle; }
116 | 
117 | .reveal ul ul,
118 | .reveal ul ol,
119 | .reveal ol ol,
120 | .reveal ol ul {
121 |   display: block;
122 |   margin-left: 1.5em; }
123 | 
124 | .reveal dt {
125 |   font-weight: bold; }
126 | 
127 | .reveal dd {
128 |   margin-left: 30px; }
129 | 
130 | .reveal blockquote {
131 |   display: block;
132 |   position: relative;
133 |   width: 70%;
134 |   margin: 20px auto;
135 |   padding: 5px;
136 |   font-style: italic;
137 |   background: rgba(255, 255, 255, 0.05);
138 |   box-shadow: 0px 0px 2px rgba(0, 0, 0, 0.2); }
139 | 
140 | .reveal blockquote p:first-child,
141 | .reveal blockquote p:last-child {
142 |   display: inline-block; }
143 | 
144 | .reveal q {
145 |   font-style: italic; }
146 | 
147 | .reveal pre {
148 |   display: block;
149 |   position: relative;
150 |   width: 90%;
151 |   margin: 20px auto;
152 |   text-align: left;
153 |   font-size: 0.55em;
154 |   font-family: monospace;
155 |   line-height: 1.2em;
156 |   word-wrap: break-word;
157 |   box-shadow: 0px 5px 15px rgba(0, 0, 0, 0.15); }
158 | 
159 | .reveal code {
160 |   font-family: monospace;
161 |   text-transform: none;
162 |   padding: 2px 4px;
163 |   font-size: 90%;
164 |   color: #c7254e;
165 |   background-color: #f9f2f4;
166 |   vertical-align: baseline;
167 |   white-space: pre-wrap;
168 |   border-radius: 2px;
169 | 
170 | }
171 | 
172 | .reveal pre code {
173 |   display: block;
174 |   padding: 5px;
175 |   overflow: auto;
176 |   max-height: 400px;
177 |   word-wrap: normal; }
178 | 
179 | .reveal table {
180 |   margin: auto;
181 |   border-collapse: collapse;
182 |   border-spacing: 0; }
183 | 
184 | .reveal table th {
185 |   font-weight: bold; }
186 | 
187 | .reveal table th,
188 | .reveal table td {
189 |   text-align: left;
190 |   padding: 0.2em 0.5em 0.2em 0.5em;
191 |   border-bottom: 1px solid; }
192 | 
193 | .reveal table th[align="center"],
194 | .reveal table td[align="center"] {
195 |   text-align: center; }
196 | 
197 | .reveal table th[align="right"],
198 | .reveal table td[align="right"] {
199 |   text-align: right; }
200 | 
201 | .reveal table tbody tr:last-child th,
202 | .reveal table tbody tr:last-child td {
203 |   border-bottom: none; }
204 | 
205 | .reveal sup {
206 |   vertical-align: super;
207 |   font-size: smaller; }
208 | 
209 | .reveal sub {
210 |   vertical-align: sub;
211 |   font-size: smaller; }
212 | 
213 | .reveal small {
214 |   display: inline-block;
215 |   font-size: 0.6em;
216 |   line-height: 1.2em;
217 |   vertical-align: top; }
218 | 
219 | .reveal small * {
220 |   vertical-align: top; }
221 | 
222 | /*********************************************
223 |  * LINKS
224 |  *********************************************/
225 | .reveal a {
226 |   color: #00008B;
227 |   text-decoration: none;
228 |   -webkit-transition: color .15s ease;
229 |   -moz-transition: color .15s ease;
230 |   transition: color .15s ease; }
231 | 
232 | .reveal a:hover {
233 |   color: #0000f1;
234 |   text-shadow: none;
235 |   border: none; }
236 | 
237 | .reveal .roll span:after {
238 |   color: #fff;
239 |   background: #00003f; }
240 | 
241 | /*********************************************
242 |  * IMAGES
243 |  *********************************************/
244 | .reveal section img {
245 |   margin: 15px 0px;
246 |   background: rgba(255, 255, 255, 0.12);
247 |   border: 4px solid #000;
248 |   box-shadow: 0 0 10px rgba(0, 0, 0, 0.15); }
249 | 
250 | .reveal section img.plain {
251 |   border: 0;
252 |   box-shadow: none; }
253 | 
254 | .reveal a img {
255 |   -webkit-transition: all .15s linear;
256 |   -moz-transition: all .15s linear;
257 |   transition: all .15s linear; }
258 | 
259 | .reveal a:hover img {
260 |   background: rgba(255, 255, 255, 0.2);
261 |   border-color: #00008B;
262 |   box-shadow: 0 0 20px rgba(0, 0, 0, 0.55); }
263 | 
264 | /*********************************************
265 |  * NAVIGATION CONTROLS
266 |  *********************************************/
267 | .reveal .controls {
268 |   color: #00008B; }
269 | 
270 | /*********************************************
271 |  * PROGRESS BAR
272 |  *********************************************/
273 | .reveal .progress {
274 |   background: rgba(0, 0, 0, 0.2);
275 |   color: #00008B; }
276 | 
277 | .reveal .progress span {
278 |   -webkit-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
279 |   -moz-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
280 |   transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985); }
281 | 
282 | /*********************************************
283 |  * PRINT BACKGROUND
284 |  *********************************************/
285 | @media print {
286 |   .backgrounds {
287 |     background-color: #fff; } }
288 | 


--------------------------------------------------------------------------------
/images/anaconda_navigator_environments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/images/anaconda_navigator_environments.png


--------------------------------------------------------------------------------
/images/anaconda_navigator_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/images/anaconda_navigator_screenshot.png


--------------------------------------------------------------------------------
/images/atom_editor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/images/atom_editor.png


--------------------------------------------------------------------------------
/images/jupyter_lab_editor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/images/jupyter_lab_editor.png


--------------------------------------------------------------------------------
/images/jupyter_lab_launcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/images/jupyter_lab_launcher.png


--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Introduction to Python for Social Science
 3 | author: Musashi Harukawa
 4 | ---
 5 | 
 6 | # Introduction to Python for Social Science
 7 | 
 8 | Welcome to the official course website for the Introduction to Python for Social Science optional methods module at the Department of Politics and International Relations, University of Oxford.
 9 | 
10 | I will be posting all slides, workbooks, practice sets and solutions on this website. Recorded lectures will be hosted on Canvas/Panopto. Information regarding office hours will be sent out in the coming days.
11 | 
12 | # Course Details
13 | 
14 | Lectures will be held weekly at time/place tbd. Office hours will be discussed in the first lectures.
15 | 
16 | You can find an older syllabus for the course [here](/dpir-intro-python/syllabus.pdf).
17 | 
18 | # Lecture Slides
19 | 
20 | I'm using [`reveal.js`](https://revealjs.com/#/) for the lecture slides, so they should be viewable in the browser on any device. `pdf` (beamer) format slides are also available lower down the page.
21 | 
22 | You might note that the previous year's slides are still accessible via the linked GitHub. Please note that there are some substantial changes being made to the course, and the slides in the GitHub may not reflect this yet.
23 | 
24 | 1. [Introduction to Python and the Development Environment](/dpir-intro-python/Week1/lecture.html)
25 | 2. [Data Structures and `pandas` I](/dpir-intro-python/Week2/lecture.html)
26 | 3. [Data Structures and `pandas` II](/dpir-intro-python/Week3/lecture.html)
27 | 4. [Data Visualisation](/dpir-intro-python/Week4/lecture.html)
28 | 5. [Machine Learning with `scikit-learn` I](/dpir-intro-python/Week5/lecture.html)
29 | 6. [Machine Learning with `scikit-learn` II](/dpir-intro-python/Week6/lecture.html)
30 | 7. [Mining the Web I](/dpir-intro-python/Week7/lecture.html)
31 | 8. [Mining the Web II](/dpir-intro-python/Week8/lecture.html)
32 | 
33 | 
34 | # Materials
35 | 
36 | | Week | `reveal.js`                                     | Code Examples                                                                                                               | Code Exercises                                        | Code Solutions                                        | Data                                                |
37 | | ---- | ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ----------------------------------------------------- | --------------------------------------------------- |
38 | | 1    | [slides](/dpir-intro-python/Week1/lecture.html) | [examples](/dpir-intro-python/Week1/examples_student.ipynb)                                                                 | [exercises](/dpir-intro-python/Week1/exercises.ipynb) | [solutions](/dpir-intro-python/Week1/solutions.ipynb) | None                                                |
39 | | 2    | [slides](/dpir-intro-python/Week2/lecture.html) | [examples](/dpir-intro-python/Week2/examples_student.ipynb)                                                                 | [exercises](/dpir-intro-python/Week2/exercises.ipynb) | [solutions](/dpir-intro-python/Week2/solutions.ipynb) | [BES](/dpir-intro-python/Week2/data/data_week2.zip) |
40 | | 3    | [slides](/dpir-intro-python/Week3/lecture.html) | [examples](/dpir-intro-python/Week3/examples_student.ipynb)                                                                 | [exercises](/dpir-intro-python/Week3/exercises.ipynb) | [solutions](/dpir-intro-python/Week3/solutions.ipynb) | None                                                |
41 | | 4    | [slides](/dpir-intro-python/Week4/lecture.html) | [examples](/dpir-intro-python/Week4/examples_student.ipynb)                                                                 | [exercises](/dpir-intro-python/Week4/exercises.ipynb) | [solutions](/dpir-intro-python/Week4/solutions.ipynb) | None                                                |
42 | | 5    | [slides](/dpir-intro-python/Week5/lecture.html) | [examples](/dpir-intro-python/Week5/examples_student.ipynb)                                                                 | [exercises](/dpir-intro-python/Week5/exercises.ipynb) | [solutions](/dpir-intro-python/Week5/solutions.ipynb) | None                                                |
43 | | 6    | [slides](/dpir-intro-python/Week6/lecture.html) | [examples](/dpir-intro-python/Week6/examples_student.ipynb)                                                                 | [exercises](/dpir-intro-python/Week6/exercises.ipynb) | [solutions](/dpir-intro-python/Week6/solutions.ipynb) | None                                                |
44 | | 7    | [slides](/dpir-intro-python/Week7/lecture.html) | [examples](/dpir-intro-python/Week7/examples_student.ipynb)                                                                 | [exercises](/dpir-intro-python/Week7/exercises.ipynb) | [solutions](/dpir-intro-python/Week7/solutions.ipynb) | None                                                |
45 | | 8    | [slides](/dpir-intro-python/Week8/lecture.html) | [Twitter API](/dpir-intro-python/Week8/examples_twitter.ipynb) [Selenium](/dpir-intro-python/Week8/examples_selenium.ipynb) | None                                                  | None                                                  | None                                                |
46 | 
47 | 
48 | 
49 | #### About Me
50 | 
51 | My name is Musashi Harukawa, I am a DPhil Politics student at the University of Oxford. Prior to returning to academia, I worked as a quantitative analyst/data scientist at a stock exchange in Tokyo and an English teacher in Moscow.
52 | 
53 | I research computational methods for unstructured (i.e. text or image) data and digital political campaigning. I also have [a blog!](https://muhark.github.io) 
54 | 
55 | #### Some presentations etc.
56 | 
57 | ["Estimating the Micro_Targeting Effect: Evidence from a Survey Experiment During the 2020 U.S. Presidential Election", _Under Review_](https://muhark.github.io/static/docs/harukawa-2021-microtargeting.pdf), [Link to Presentation](/misc_presentations/cess-mt21-pres.html)
58 | 
59 | ["Comparative Government Revision Class"](/misc_presentations/compgov_revision.html)
60 | 
61 | 


--------------------------------------------------------------------------------
/ipynb_slideify.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import json
 3 | 
 4 | file = sys.argv[1]
 5 | 
 6 | with open(file, "r") as notebook:
 7 |     nb = json.loads(notebook.read())
 8 | 
 9 | for cell in nb['cells']:
10 |     try:
11 |         if cell['source'][0]=='#':
12 |             cell['metadata'] = {'slideshow': {'slide_type': 'slide'}}
13 |         else:
14 |             cell['metadata'] = {'slideshow': {'slide_type': 'subslide'}}
15 |     except IndexError:
16 |         cell['metadata'] = {'slideshow': {'slide_type': 'subslide'}}
17 | 
18 | 
19 | with open(sys.argv[2], "w+") as out_nb:
20 |     json.dump(nb, out_nb)
21 | 


--------------------------------------------------------------------------------
/minimal-theme.css:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * A simple theme for reveal.js presentations, similar
  3 |  * to the default theme. The accent color is darkblue.
  4 |  *
  5 |  * This theme is Copyright (C) 2012 Owen Versteeg, https://github.com/StereotypicalApps. It is MIT licensed.
  6 |  * reveal.js is Copyright (C) 2011-2012 Hakim El Hattab, http://hakim.se
  7 |  */
  8 | @import url(https://fonts.googleapis.com/css?family=News+Cycle:400,700);
  9 | @import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic);
 10 | section.has-dark-background, section.has-dark-background h1, section.has-dark-background h2, section.has-dark-background h3, section.has-dark-background h4, section.has-dark-background h5, section.has-dark-background h6 {
 11 |   color: #fff; }
 12 | 
 13 | /*********************************************
 14 |  * GLOBAL STYLES
 15 |  *********************************************/
 16 | body {
 17 |   background: #fff;
 18 |   background-color: #fff; }
 19 | 
 20 | .reveal {
 21 |   font-family: "Lato", sans-serif;
 22 |   font-size: 35px;
 23 |   font-weight: normal;
 24 |   color: #000; }
 25 | 
 26 | ::selection {
 27 |   color: #000;
 28 |   background: rgba(0, 0, 0, 0.99);
 29 |   text-shadow: none; }
 30 | 
 31 | ::-moz-selection {
 32 |   color: #fff;
 33 |   background: rgba(0, 0, 0, 0.99);
 34 |   text-shadow: none; }
 35 | 
 36 | .reveal .slides section,
 37 | .reveal .slides section > section {
 38 |   line-height: 1.3;
 39 |   font-weight: inherit; }
 40 | 
 41 | /*********************************************
 42 |  * HEADERS
 43 |  *********************************************/
 44 | .reveal h1,
 45 | .reveal h2,
 46 | .reveal h3,
 47 | .reveal h4,
 48 | .reveal h5,
 49 | .reveal h6 {
 50 |   margin: 0 0 20px 0;
 51 |   color: #000;
 52 |   font-family: "Lato", sans-serif;
 53 |   font-weight: bold;
 54 |   line-height: 1.2;
 55 |   letter-spacing: normal;
 56 |   text-transform: none;
 57 |   text-shadow: none;
 58 |   word-wrap: break-word; }
 59 | 
 60 | .reveal h1 {
 61 |   font-size: 1.65em;
 62 |   margin: 5px 30px 5px 30px;
 63 |   text-align: left; }
 64 | 
 65 | .reveal h2 {
 66 |   font-size: 1.05em;
 67 |   text-align: left; }
 68 | 
 69 | .reveal h3 {
 70 |   font-size: 1.025em; }
 71 | 
 72 | .reveal h4 {
 73 |   font-size: 1em; }
 74 | 
 75 | .reveal h1 {
 76 |   text-shadow: none; }
 77 | 
 78 | /*********************************************
 79 |  * OTHER
 80 |  *********************************************/
 81 | .reveal p {
 82 |   margin: 5px 0;
 83 |   line-height: 1.15;
 84 |   text-align: left; }
 85 | 
 86 | .reveal .subtitle {
 87 |   font-size: 1.25em;
 88 |   margin: 5px 30px 5px 30px;
 89 |   line-height: 1.15;
 90 |   text-align: left; }
 91 | 
 92 | .reveal .author {
 93 |   margin: 30px 30px 5px 30px;
 94 |   line-height: 1.15;
 95 |   text-align: left; }
 96 | 
 97 | .reveal .institute {
 98 |   margin: 5px 30px 5px 30px;
 99 |   line-height: 1.15;
100 |   text-align: left; }
101 | 
102 | .reveal .date {
103 |   margin: 5px 30px 5px 30px;
104 |   line-height: 1.15;
105 |   text-align: left; }
106 | 
107 | /* Ensure certain elements are never larger than the slide itself */
108 | .reveal img,
109 | .reveal video,
110 | .reveal iframe {
111 |   max-width: 95%;
112 |   max-height: 95%; }
113 | 
114 | .reveal strong,
115 | .reveal b {
116 |   font-weight: bold; }
117 | 
118 | .reveal em {
119 |   font-style: italic; }
120 | 
121 | .reveal ol,
122 | .reveal dl,
123 | .reveal ul {
124 |   text-align: left;
125 |   margin: 0 0 0 1.5em; }
126 | 
127 | .reveal ol {
128 |   list-style-type: decimal; }
129 | 
130 | .reveal ul {
131 |   list-style-type: disc; }
132 | 
133 | .reveal ul ul {
134 |   list-style-type: square; }
135 | 
136 | .reveal ul ul ul {
137 |   list-style-type: circle; }
138 | 
139 | .reveal ul ul,
140 | .reveal ul ol,
141 | .reveal ol ol,
142 | .reveal ol ul {
143 |   display: block;
144 |   margin-left: 1.5em; }
145 | 
146 | .reveal dt {
147 |   font-weight: bold; }
148 | 
149 | .reveal dd {
150 |   margin-left: 30px; }
151 | 
152 | .reveal blockquote {
153 |   display: block;
154 |   position: relative;
155 |   width: 70%;
156 |   margin: 20px auto;
157 |   padding: 5px;
158 |   font-style: italic;
159 |   background: rgba(255, 255, 255, 0.05);
160 |   box-shadow: 0px 0px 2px rgba(0, 0, 0, 0.2); }
161 | 
162 | .reveal blockquote p:first-child,
163 | .reveal blockquote p:last-child {
164 |   display: inline-block; }
165 | 
166 | .reveal q {
167 |   font-style: italic; }
168 | 
169 | .reveal pre {
170 |   display: block;
171 |   position: relative;
172 |   width: 90%;
173 |   margin: 20px auto;
174 |   text-align: left;
175 |   font-size: 0.55em;
176 |   font-family: monospace;
177 |   line-height: 1.2em;
178 |   word-wrap: break-word;
179 |   box-shadow: 0px 5px 15px rgba(0, 0, 0, 0.15); }
180 | 
181 | .reveal code {
182 |   font-family: monospace;
183 |   text-transform: none;
184 |   padding: 2px 4px;
185 |   font-size: 90%;
186 |   color: #c7254e;
187 |   background-color: #f9f2f4;
188 |   vertical-align: baseline;
189 |   white-space: pre-wrap;
190 |   border-radius: 2px;
191 | 
192 | }
193 | 
194 | .reveal pre code {
195 |   display: block;
196 |   padding: 5px;
197 |   overflow: auto;
198 |   max-height: 400px;
199 |   word-wrap: normal; }
200 | 
201 | .reveal table {
202 |   margin: auto;
203 |   border-collapse: collapse;
204 |   border-spacing: 0; }
205 | 
206 | .reveal table th {
207 |   font-weight: bold; }
208 | 
209 | .reveal table th,
210 | .reveal table td {
211 |   text-align: left;
212 |   padding: 0.2em 0.5em 0.2em 0.5em;
213 |   border-bottom: 1px solid; }
214 | 
215 | .reveal table th[align="center"],
216 | .reveal table td[align="center"] {
217 |   text-align: center; }
218 | 
219 | .reveal table th[align="right"],
220 | .reveal table td[align="right"] {
221 |   text-align: right; }
222 | 
223 | .reveal table tbody tr:last-child th,
224 | .reveal table tbody tr:last-child td {
225 |   border-bottom: none; }
226 | 
227 | .reveal sup {
228 |   vertical-align: super;
229 |   font-size: smaller; }
230 | 
231 | .reveal sub {
232 |   vertical-align: sub;
233 |   font-size: smaller; }
234 | 
235 | .reveal small {
236 |   display: inline-block;
237 |   font-size: 0.6em;
238 |   line-height: 1.2em;
239 |   vertical-align: top; }
240 | 
241 | .reveal small * {
242 |   vertical-align: top; }
243 | 
244 | /*********************************************
245 |  * LINKS
246 |  *********************************************/
247 | .reveal a {
248 |   color: #00008B;
249 |   text-decoration: none;
250 |   -webkit-transition: color .15s ease;
251 |   -moz-transition: color .15s ease;
252 |   transition: color .15s ease; }
253 | 
254 | .reveal a:hover {
255 |   color: #0000f1;
256 |   text-shadow: none;
257 |   border: none; }
258 | 
259 | .reveal .roll span:after {
260 |   color: #fff;
261 |   background: #00003f; }
262 | 
263 | /*********************************************
264 |  * IMAGES
265 |  *********************************************/
266 | .reveal section img {
267 |   margin: 15px 0px;
268 |   background: rgba(255, 255, 255, 0.12);
269 |   border: 4px solid #000;
270 |   box-shadow: 0 0 10px rgba(0, 0, 0, 0.15); }
271 | 
272 | .reveal section img.plain {
273 |   border: 0;
274 |   box-shadow: none; }
275 | 
276 | .reveal a img {
277 |   -webkit-transition: all .15s linear;
278 |   -moz-transition: all .15s linear;
279 |   transition: all .15s linear; }
280 | 
281 | .reveal a:hover img {
282 |   background: rgba(255, 255, 255, 0.2);
283 |   border-color: #00008B;
284 |   box-shadow: 0 0 20px rgba(0, 0, 0, 0.55); }
285 | 
286 | /*********************************************
287 |  * NAVIGATION CONTROLS
288 |  *********************************************/
289 | .reveal .controls {
290 |   color: #00008B; }
291 | 
292 | /*********************************************
293 |  * PROGRESS BAR
294 |  *********************************************/
295 | .reveal .progress {
296 |   background: rgba(0, 0, 0, 0.2);
297 |   color: #00008B; }
298 | 
299 | .reveal .progress span {
300 |   -webkit-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
301 |   -moz-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
302 |   transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985); }
303 | 
304 | /*********************************************
305 |  * PRINT BACKGROUND
306 |  *********************************************/
307 | @media print {
308 |   .backgrounds {
309 |     background-color: #fff; } }
310 | 


--------------------------------------------------------------------------------
/misc_presentations/ConfoundedMeasurement.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/ConfoundedMeasurement.png


--------------------------------------------------------------------------------
/misc_presentations/Measurement Bias-Confounding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/Measurement Bias-Confounding.png


--------------------------------------------------------------------------------
/misc_presentations/Measurement Bias-Relationship as Indicator.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/Measurement Bias-Relationship as Indicator.png


--------------------------------------------------------------------------------
/misc_presentations/Measurement Bias-Simple Diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/Measurement Bias-Simple Diagram.png


--------------------------------------------------------------------------------
/misc_presentations/Measurement Bias.drawio:
--------------------------------------------------------------------------------
1 | <mxfile host="app.diagrams.net" modified="2020-05-27T01:40:16.714Z" agent="5.0 (X11)" etag="2C11pOZfTOo2byJ8qXKr" version="13.1.3" type="device" pages="3"><diagram name="Simple Diagram" id="7R-koVz-8RwIILDvUgEx">7Vpbc9o6EP41zPQ8JOM75hEMOaEnTTIhndP0zWDFqDWWK+QA+fWVbMnYWMaEcE3LC9JaWsv77afVrt3Qncn8X+xG4y/IA0FDU7x5Q+82NE01VJv+MckilTSbRirwMfT4oKVgAF8BFypcGkMPTAsDCUIBgVFROEJhCEakIHMxRrPisGcUFO8auT4oCQYjNyhL/4ceGadSW2su5dcA+mNxZ9VqpVeG7uinj1Ec8vs1NP05+aWXJ67QxR90OnY9NMuJ9F5DdzBCJG1N5g4ImG2F2aznhY/sz9dK/9aALb/1SozvF6myq7dMyZ4Qg5DsWLWe6n5xgxgIM1gBvUtnSBs+azg37cGgf9XvPdCBbcf5+tB2nrJRWAwTErqC4VKWGI4sBFjUhhFrRhiNwJQi35mNIQGDyB0x8Yy6KJWNySSgPZXpz0C6i0kAQ8DlU4LRT+CgAOFEs64kP3rlGQZBTn6V/LK1vABMwHzFc2rsqmZgUxIBNAEEL+g8rsVscv/g/NEU3p8tvVEVPjTOeaLFZS4ngJ+pXqJIGxzIt4DaqgTVgy+reGVgfem1B18fenIc16Bd0LmCN7U1KQLqBtAPaXtEbQ4oRh2GCKR0bvMLE+h5bHoHgyl8dYeJKgZshGBIEluZnYbZZbpigqbphqTuE+FWEWHVNEoI2xKAtX0BzOaXAK5g2ijGwaKDKY0AqadbwjTgbU0yjIhLIGI4tpQ9QmJpK5BoZdLJOKfuAhN7fL2w4JMznF1/Dn79Rwz79epCLUMAPBqoeDdEISjYl1kHYTJGPgrd4AahiBv9ByBkwcMsc/AiQGAOybdc+4mpujR5rzvnmpPOQnRC+oDf8p10lmaK/nJe0hMT0ydij7EFdtQUKMYjsM6POY+Ii32wVqEu9wYMAupuL8Xl7Z5uRn2QHNz3es41HaS+MzJ6aBRPEvPWhsbUl26GQnA6MVG3zdqYKNsxdxES5exUSwY/dXYenZzWeZDTegM5tb/klJCzaR6ZnErJ4KdOzubR2WmfBzvtN7BTXIHiSjdHTrhRLvKHUthQDxdfpcvVZRlJZcr5jOizU3RELcn6FbMiSkddNiVZag5NMYopukjVtNn2bkbzsorLy8t1OS2Vpev58xJbQym6kW6V3cjYU2IrjQQb5LWsGBRVGoQXGblxG9lK32mojEwi3ZQYSmtJLGXvrcZTvbGuq/H0b7t9p/1491DDiPNy/WpfqkRUVTY79OyvhqPW+/oZ1XC2gUAv7j40z74sgyKt4ig7OK/Ij6LVqb4IWnVBSNVkQSh3tnmvimnkhu/Q4vQaHS1H9VRdaQconbcq4uQ0zvYPNa81Hp7GBsLWzFMCVdujN684s2lJfFk/ZCxtluy+pwxKO8/6BrdPbQKVVv8OkEBJUbRLKJ56anw2uFrHxLV1RFybHxxY+0DArlvluoLH/d1N+6H/vf3Yv7vdqHZRPNHVHP82CnBb5EBZziPKhpJXbgd+z72BrW/aj73bRzrIubt1evePG9n79DKcHeAle2stxWt/KY/yoVKebeoI6gomx39tXc2hUgawmwzonenLQZOgCnFNHqRV5UGy5Eq+9dT4+1ZJzTZ15pXPLDRzswKhvjd3rS57/XXXLd21+2HdVZcU+Q7qrtKIt+Ku0aeGfvXPW6x9woWTbb6uE8U8sccYm+0xW4RE2l1+rptcy30Trfd+Aw==</diagram><diagram name="Relationship as Indicator" id="tiDqENzYom1s-jQ-Ps2L">7Vpbd6o4FP41rjXzoAsIID4ipVNn2dalnpl23lCiZg4aT4hVz6+fBMIdC71g7en4ItkkG9jfty/Z0ALW+vAHcbarW+xCr6VI7qEFrlqKIquywf645BhKul01FCwJcsWkRDBBP6EQSkK6Qy70MxMpxh5F26xwjjcbOKcZmUMI3menLbCXverWWcKCYDJ3vKL0b+TSVSg1lG4iv4FouYquLOu98MzMmX9fErzbiOu1FLAIfuHptRPpEg/qrxwX71MiYLeARTCm4dH6YEGP2zYy2+z6r+ntXiKr9p+A+j/UH+v9QztUdv2SJfETErih76saiEejx8ic0GXWFcMN3rC/fmAjyNVIbIQJXeEl3jjeEOMtE8pM+C+k9Ci44ewoZqIVXXviLDwg+pA6fuSqOpoYXR2E5mBwjAYbSo4P6UFqFR8my4JRtM6nDqEm51XyAIHsGnlevMKNZsw9x/fRPBSKKXKwhODvMaG4JLQUN0+OdxWoiHk+3pE5fGae4Cu71SV8Tp+ixtxjPg3xGrLHZwsJ9ByKnrJ35wjnWsbzEoawA0GSFxBG/QSE6WhymjNtqSNXkCYYjSBBzEiQfBUmAfkjmaTVYdKHBpsq1jRGkPmOPAXuc0lsUXpvZEuwlBnDOaYmbDHaUD+lecQFbEJUn/RE9hXViWpIOeaFGhMexrf2emoKkzw53g5GtYHuMeP0Z+xgyQ9G90NzPPjHnA7u76KT7Frx+QK5k5jIQd2vEIWTrRMgs2eFWZbD4vqQUHh4BepFkIQWNWtMRRLjfVI4yVG5s0oVTbrUUBCQQbWpraE5mQyuB/aYTTQt69vYtB7jWSSxeA0UWA235YdbgufQ96uRSIrE+x31EPfxlNNa2MMk0Ayk4MfOLJhnp+TXwa9BTHUtB6rx4aDW8J+hObXvpmySdX9n2aNpLfSY5WgWHsdDyw2PrcyCPHH3uX0R2xyY4sQaua4X1CTQRz+dWaCKwyQCD9Or9VvaFdfFUokfZpUz+qCsqfXgUhqDyzgJl4ue8u4VozO4uxpY5vR+XA5dLMvo+BUBBT2tElHjrID2XgXorW1Ovo3t5+Asjba/PMB6t9plzwpwpDgD8IlMx2pJ79gnLI3x6q4q3WWrlJcnOYIpKwYxx7EnNQhJriqMA2YKkbIYKjcHifzVIVGVbBgEWi1IYijfHxLlq0PSlS7OTc7bcJQ6nJXpPkCPFac1u0iKDlqZ1qOkv6QhkGz30z2B4oYfiFvIt59Ke1KthpsBishszXcD3kYj9WSFE9csk5FtWzdskvzGvaKL57t1YN7KzWLI2+Esi/Ql7BLzGbNs619WwzS2SVRq9f/eMRLk4gAAdeOAlokC5wwBjXu7/jm8XX+Btyv/ezurvYxL8/bux3p7RdZ/vdcmYaILjFy5YFTEisvJ+sa54kB5k1/NtTCBnONheGNiVY6K79DvV053wIoBJjqDojNXqfiCanVMvkgU0qRLi0KnG2PJix1zPH1kc4Zm3x6+EkV2K2jrw2rwLgerfMYAWj2stKawAp+hx9UGepP7d0XLtlTKHKj8VUFjqJS1uapby4FTteMQOraHwYvTyc1g9FneHqRRb7RpI6s50HtloJ+z3QzKGmknQV/gILX70YeC+o8d/0KuLyeHJTxJ4RrN4oraoRqTF/ba9lBU0el0KhgU3s/lEqspHql6LvvKxYaselYalb3rPxHRFx48iHq8/BOaFDYcYLEjkJk9+p4zg14/fm9fI7Tralc3umXJAMgq0NS3luBFjFIYaM9k1TfW1VruXTOQelkV4c6gUFcXFOlKhaKGC3Rwut9XrLgjwSKS+LtYRlMRICX+7eH3iiiCLiNY5LneVPDQCh+V6E0FDzZMvqEOCZN8qA7s/wA=</diagram><diagram id="gy_MzscAscurrOvGj_Jd" name="Confounding">5VxZd6JIFP41njPzoAcoFn1ExG7P2NHWdHfSL3OIlMo0UjZgovn1U0CxrxpAk/YlcGuh6t7vrhTpAGl3/GQq++0XpEK9w1DqsQNGHYahWbqP/ziUk0cRBNYjbExNJZ1CwlJ7hYRIEepBU6EV62gjpNvaPk5cIcOAKztGU0wTvcS7rZEef+pe2cAUYblS9DT1h6baW4/aZ4SQ/hlqm63/ZJofeC1PyurXxkQHgzyvw4C1+/Oad4o/F9motVVU9BIhAbkDJBMh27vaHSWoO7z12eaNG+e0Bus2oWFXGSCpX9ndj9f1g0EvONSfGN93wy4gm7Hsk88QqGL+kFsDGfjP0N0ldOah8B0y7S3aIEPRpwjtMZHGxP+gbZ+IdJWDjTBpa+900gqPmv3gDO9x5O4x0jI6kpndm5N/Y9jmKTLIuX2MtoXD3Dt/nGUrpi06yMCEla5YlrbyyWNN95eUZh/hqIUO5goW8MxHqWJuoF3Qj/H6OfyMPIAI5xNEO4iXjTuYUFds7TmOR4XAehP0C4bOkYaXzFC+ClIEYEQBWV+z/Cm8DZFRIUDwRWQZIcmFzRkQEphWIURHABTCqQxCMQCFeKoCIbKBKH68EWoaZJgYgZhlm+hXYFXaAR1L1Y26GF7OBgd4B+DocXQUH12qR5cAxL2bQ1PDXILmB0ANd1uoYaug5qpGpAwhjYFhdTCfXVVpBxl87fbEHYr3rZwiHfaOd7IKnNwg4eT6VAJl3oz1ejauCgxjAqkJk1Ujoz/GrdG1G6jLYCgkYMi0AEOyiWdFP5BtdRhex9waPuGLjXMxn03FxeSneD+Z3fmN+FlBewrHocd15Pey1Wy43CuuqF5wnhdHaK54n6Fpw2OhPEgrG+cb48esL2HKRfuJ0jaSbvFUvgTf5GKYjxe1Xq6FzNW0sKoM9ZfV+Oen8V7qLr/M5CNLrXnY5d+BCHtsMrZkLoktdeUJ6sOg4iAhHZnulgHl/gKznNWyxrY7QidVCkxHhk22TTMBgBLCbRZRXuB5LQAJhQAKsSKH1OvgqRXxZ0SW9WGh9rIHoBMuhU9M4WGvjrLH4Onrw2T2Q94I2vM/6393v8Fe871yBDs3nY9UQlB5eHjjKKuc0LQdOIJkka6N/IXJDRxV7TkMDROxpDS7G8++3Y3kRXYkGdAyJomQYG/Tww8fYZaalmafgueYuU8+53m3Fc+y/LXjWRqUZwnSVFwuJ+OJI1lKlKRvC1F6zJdLYQJhbZW9c7k30QpaVjnTw9cls4OtawbsRLPJChZl7P7qER/PJZxH/9ryEyoFs3XVGuLvYQIXUxbLMkLsTQzdo2jhWjWHzDD5cvfhpzZXKLBf5lDoQQzBoJWCWHHA3GBBrCwoagxmqmJt6yi3NlO/SsGC4+OGDTAJg1VfVFy4z7Njju/iYiLey8s6Yg4R85jB66dmprbRjLYiD6rcCV7F2QH+1oIV/3BFUbAyxWi4u8edcDwqyfP7SlEJ5ogd57iiaxvD0XjMP8dNDB2+aStFF0nDTlNVL7GHlvaqPLlTORIhFhjPyw073MiZC5sqi6QyjYSRNMdWk0xSq+uTTP8i/Z3cjSaSeD87K2V457IDA65UeP1WZTe4SHZfZHH5bSEXSS4zN/hIsuSFckVsVZaAzZBlTgqG4y39NDRxfuUErmUuKJ4pn++QTGTjAAU5IhsUhtzVuZ94txtYvAjzs4wg3Rz3s6KYj8p9lonbMX/zJdwPpFY/9/k/iPsCdWvg59OV7WaPhDoAjGZ6A6+oUKkgwYNoQYLqUfw59Yj806HxRBCQJSTLDTXXIPxo8AqHct6EGH/dReH8ci7L0mfciX5jwVFFq8POZW1pxdGD6PTpqtlX0rtlHX3ICi0aS774tDVt78g31m4Aqmp3E8e+qyn2G3T4egfr3qbDFU4ZBTrM/GE67BdXb0eH2z1WndLhEg99uS6Gyi+AfsK190ssQBsemrmadmfXe9nEiyyQ/Pij4Xqvz5BKZsNv0fyWUcRqaJWKDh/PtnDUrdmWCtHcXFzcP+I+U3EoTy8UGF6KtrdguZxuwuQDrppYuKbEwr6HilDXeddQT9jMxasSWWqRXRlvTACXvdpyVaUb2MCFPHVPaS8/T+bvpVgeFXBddQ+aTch3kCXfNkuubP4Jmgz5OsfaHJH43zPzvw/Oh7xDOrzMgEREhH4vZ6KuN43ohNbc/pieotfrlYDFW8/tYqgGyLCJ95gMna5Usg0hJvP4cIUqsXPiaZ+z0eCbdsLHTrDSFJvyoweQ4EmGGmW+uAAN8SRLiXK81FqHR5IkZJ/NiCAzeYq0wnHWhLviWYHvC1kODtAs4NgAo4WnUfNxkBZRRARcQaRwaQZwypZkztfeqcwhmdImpqkvb8hkWVYwk0gOUp507VOsQ0CzI7YuQv7r4e8Se6ndhlnM/gQjw1RcCsLgnwQ0ZSrxbfg/LDx4hP8oBMj/Aw==</diagram></mxfile>


--------------------------------------------------------------------------------
/misc_presentations/Measurement Bias.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <mxfile host="app.diagrams.net" modified="2020-05-26T19:28:46.212Z" agent="5.0 (X11)" etag="hbMhMalLoKBvHE87VISf" version="13.1.3" type="device">
  3 |   <diagram id="gy_MzscAscurrOvGj_Jd" name="Page-1">
  4 |     <mxGraphModel dx="1418" dy="792" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" background="#ffffff" math="0" shadow="0">
  5 |       <root>
  6 |         <mxCell id="0" />
  7 |         <mxCell id="1" parent="0" />
  8 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-39" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;startArrow=classic;startFill=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-1" target="CdQ4mWzfXn1R5o8InVmB-2" edge="1">
  9 |           <mxGeometry relative="1" as="geometry">
 10 |             <mxPoint x="100" y="410" as="sourcePoint" />
 11 |           </mxGeometry>
 12 |         </mxCell>
 13 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-72" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;startArrow=none;startFill=0;endArrow=classic;endFill=1;strokeWidth=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-1" target="CdQ4mWzfXn1R5o8InVmB-40" edge="1">
 14 |           <mxGeometry relative="1" as="geometry" />
 15 |         </mxCell>
 16 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-73" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.51;entryY=-0.1;entryDx=0;entryDy=0;entryPerimeter=0;startArrow=none;startFill=0;endArrow=classic;endFill=1;strokeWidth=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-1" target="CdQ4mWzfXn1R5o8InVmB-45" edge="1">
 17 |           <mxGeometry relative="1" as="geometry" />
 18 |         </mxCell>
 19 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-74" style="edgeStyle=none;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=1;entryDx=0;entryDy=0;startArrow=none;startFill=0;endArrow=classic;endFill=1;strokeWidth=1;curved=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-1" target="CdQ4mWzfXn1R5o8InVmB-60" edge="1">
 20 |           <mxGeometry relative="1" as="geometry">
 21 |             <Array as="points">
 22 |               <mxPoint x="190" y="480" />
 23 |             </Array>
 24 |           </mxGeometry>
 25 |         </mxCell>
 26 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-75" style="edgeStyle=none;curved=1;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=1;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;startArrow=none;startFill=0;endArrow=classic;endFill=1;strokeWidth=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-1" target="CdQ4mWzfXn1R5o8InVmB-15" edge="1">
 27 |           <mxGeometry relative="1" as="geometry">
 28 |             <Array as="points">
 29 |               <mxPoint x="170" y="420" />
 30 |             </Array>
 31 |           </mxGeometry>
 32 |         </mxCell>
 33 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-1" value="&lt;b&gt;POLARIZATION&lt;/b&gt;" style="rounded=1;whiteSpace=wrap;html=1;" parent="1" vertex="1">
 34 |           <mxGeometry x="40" y="200" width="120" height="60" as="geometry" />
 35 |         </mxCell>
 36 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-22" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-2" target="CdQ4mWzfXn1R5o8InVmB-15" edge="1">
 37 |           <mxGeometry relative="1" as="geometry" />
 38 |         </mxCell>
 39 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-2" value="&lt;div&gt;&lt;b&gt;CONFOUNDER&lt;/b&gt;&lt;/div&gt;&lt;div&gt;e.g. Diversity&lt;br&gt;&lt;b&gt;&lt;/b&gt;&lt;/div&gt;" style="rounded=1;whiteSpace=wrap;html=1;" parent="1" vertex="1">
 40 |           <mxGeometry x="40" y="460" width="120" height="60" as="geometry" />
 41 |         </mxCell>
 42 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-69" style="rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;startArrow=classic;startFill=1;endArrow=none;endFill=0;strokeWidth=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-13" edge="1">
 43 |           <mxGeometry relative="1" as="geometry">
 44 |             <mxPoint x="520" y="280" as="targetPoint" />
 45 |           </mxGeometry>
 46 |         </mxCell>
 47 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-70" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;startArrow=classic;startFill=1;endArrow=none;endFill=0;strokeWidth=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-13" edge="1">
 48 |           <mxGeometry relative="1" as="geometry">
 49 |             <mxPoint x="420" y="290" as="targetPoint" />
 50 |           </mxGeometry>
 51 |         </mxCell>
 52 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-71" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;startArrow=classic;startFill=1;endArrow=none;endFill=0;strokeWidth=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-13" edge="1">
 53 |           <mxGeometry relative="1" as="geometry">
 54 |             <mxPoint x="370" y="320" as="targetPoint" />
 55 |           </mxGeometry>
 56 |         </mxCell>
 57 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-13" value="&lt;b&gt;CLASSIFIER ACCURACY&lt;br&gt;&lt;/b&gt;" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;strokeColor=#000000;fillColor=#FFFFFF;" parent="1" vertex="1">
 58 |           <mxGeometry x="650" y="250" width="120" height="60" as="geometry" />
 59 |         </mxCell>
 60 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-76" style="edgeStyle=none;curved=1;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=0;exitDx=0;exitDy=0;entryX=0.275;entryY=1.017;entryDx=0;entryDy=0;startArrow=classic;startFill=1;endArrow=classic;endFill=1;strokeWidth=1;entryPerimeter=0;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-15" target="CdQ4mWzfXn1R5o8InVmB-40" edge="1">
 61 |           <mxGeometry relative="1" as="geometry">
 62 |             <Array as="points">
 63 |               <mxPoint x="319" y="380" />
 64 |             </Array>
 65 |           </mxGeometry>
 66 |         </mxCell>
 67 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-77" style="edgeStyle=none;curved=1;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;startArrow=none;startFill=0;endArrow=classic;endFill=1;strokeWidth=1;dashed=1;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-15" target="CdQ4mWzfXn1R5o8InVmB-13" edge="1">
 68 |           <mxGeometry relative="1" as="geometry" />
 69 |         </mxCell>
 70 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-15" value="&lt;div&gt;&lt;b&gt;COVARIATES&lt;/b&gt;&lt;/div&gt;&lt;div&gt;e.g. Age, Origin&lt;br&gt;&lt;b&gt;&lt;/b&gt;&lt;/div&gt;" style="rounded=0;whiteSpace=wrap;html=1;strokeColor=#000000;fillColor=#FFFFFF;" parent="1" vertex="1">
 71 |           <mxGeometry x="360" y="460" width="120" height="60" as="geometry" />
 72 |         </mxCell>
 73 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-27" value="&lt;b&gt;LATENT CONCEPT&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;" parent="1" vertex="1">
 74 |           <mxGeometry x="40" y="154" width="120" height="20" as="geometry" />
 75 |         </mxCell>
 76 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-28" value="&lt;div&gt;&lt;b&gt;INDICATOR&lt;/b&gt;&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;" parent="1" vertex="1">
 77 |           <mxGeometry x="395" y="154" width="80" height="20" as="geometry" />
 78 |         </mxCell>
 79 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-29" value="&lt;div&gt;&lt;b&gt;MEASURE&lt;/b&gt;&lt;br&gt;&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;" parent="1" vertex="1">
 80 |           <mxGeometry x="670" y="154" width="80" height="20" as="geometry" />
 81 |         </mxCell>
 82 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-34" value="" style="shape=curlyBracket;whiteSpace=wrap;html=1;rounded=1;strokeColor=#000000;fillColor=#FFFFFF;rotation=90;" parent="1" vertex="1">
 83 |           <mxGeometry x="90" y="120" width="20" height="120" as="geometry" />
 84 |         </mxCell>
 85 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-35" value="" style="shape=curlyBracket;whiteSpace=wrap;html=1;rounded=1;strokeColor=#000000;fillColor=#FFFFFF;rotation=90;" parent="1" vertex="1">
 86 |           <mxGeometry x="425" y="35" width="20" height="290" as="geometry" />
 87 |         </mxCell>
 88 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-36" value="" style="shape=curlyBracket;whiteSpace=wrap;html=1;rounded=1;strokeColor=#000000;fillColor=#FFFFFF;rotation=90;" parent="1" vertex="1">
 89 |           <mxGeometry x="700" y="120" width="20" height="120" as="geometry" />
 90 |         </mxCell>
 91 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-61" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.425;exitY=0.917;exitDx=0;exitDy=0;entryX=0.263;entryY=0.06;entryDx=0;entryDy=0;startArrow=classic;startFill=1;strokeWidth=3;exitPerimeter=0;entryPerimeter=0;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-40" target="CdQ4mWzfXn1R5o8InVmB-60" edge="1">
 92 |           <mxGeometry relative="1" as="geometry" />
 93 |         </mxCell>
 94 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-40" value="&lt;b&gt;SPEECH 1&lt;br&gt;&lt;/b&gt;" style="shape=document;whiteSpace=wrap;html=1;boundedLbl=1;strokeColor=#000000;fillColor=#FFFFFF;" parent="1" vertex="1">
 95 |           <mxGeometry x="290" y="200" width="80" height="60" as="geometry" />
 96 |         </mxCell>
 97 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-62" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=0.933;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;startArrow=classic;startFill=1;strokeWidth=3;exitPerimeter=0;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-41" target="CdQ4mWzfXn1R5o8InVmB-60" edge="1">
 98 |           <mxGeometry relative="1" as="geometry" />
 99 |         </mxCell>
100 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-41" value="&lt;b&gt;SPEECH 2&lt;br&gt;&lt;/b&gt;" style="shape=document;whiteSpace=wrap;html=1;boundedLbl=1;strokeColor=#000000;fillColor=#FFFFFF;" parent="1" vertex="1">
101 |           <mxGeometry x="380" y="200" width="80" height="60" as="geometry" />
102 |         </mxCell>
103 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-63" style="edgeStyle=none;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=0.917;exitDx=0;exitDy=0;startArrow=classic;startFill=1;strokeWidth=3;entryX=0.738;entryY=0.08;entryDx=0;entryDy=0;exitPerimeter=0;entryPerimeter=0;" parent="1" source="CdQ4mWzfXn1R5o8InVmB-42" target="CdQ4mWzfXn1R5o8InVmB-60" edge="1">
104 |           <mxGeometry relative="1" as="geometry">
105 |             <mxPoint x="450" y="310" as="targetPoint" />
106 |           </mxGeometry>
107 |         </mxCell>
108 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-42" value="&lt;b&gt;SPEECH &lt;i&gt;D&lt;/i&gt;&lt;br&gt;&lt;/b&gt;" style="shape=document;whiteSpace=wrap;html=1;boundedLbl=1;strokeColor=#000000;fillColor=#FFFFFF;" parent="1" vertex="1">
109 |           <mxGeometry x="500" y="200" width="80" height="60" as="geometry" />
110 |         </mxCell>
111 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-60" value="&lt;b&gt;PARTY LABEL&lt;br&gt;&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;strokeColor=#000000;fillColor=#FFFFFF;" parent="1" vertex="1">
112 |           <mxGeometry x="380" y="350" width="80" height="50" as="geometry" />
113 |         </mxCell>
114 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-44" value="" style="shape=curlyBracket;whiteSpace=wrap;html=1;rounded=1;strokeColor=#000000;fillColor=#FFFFFF;rotation=-360;" parent="1" vertex="1">
115 |           <mxGeometry x="255" y="200" width="20" height="200" as="geometry" />
116 |         </mxCell>
117 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-45" value="&lt;div&gt;&lt;b&gt;PARTY-SPEECH RELATIONSHIP&lt;/b&gt;&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;rotation=-90;" parent="1" vertex="1">
118 |           <mxGeometry x="145" y="290" width="200" height="20" as="geometry" />
119 |         </mxCell>
120 |         <mxCell id="CdQ4mWzfXn1R5o8InVmB-43" value="&lt;div&gt;&lt;font size=&quot;1&quot;&gt;&lt;b style=&quot;font-size: 25px&quot;&gt;...&lt;/b&gt;&lt;/font&gt;&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;" parent="1" vertex="1">
121 |           <mxGeometry x="460" y="215" width="40" height="20" as="geometry" />
122 |         </mxCell>
123 |       </root>
124 |     </mxGraphModel>
125 |   </diagram>
126 | </mxfile>
127 | 


--------------------------------------------------------------------------------
/misc_presentations/MeasurementFig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/MeasurementFig1.png


--------------------------------------------------------------------------------
/misc_presentations/a7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/a7.png


--------------------------------------------------------------------------------
/misc_presentations/cess-mt21-pres.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Does Microtargeting Work?"
  3 | subtitle: "Evidence from a Survey Experiment during the 2020 US Election"
  4 | author: "Musashi Jacobs-Harukawa"
  5 | institute: "Department of Politics and International Relations, University of Oxford"
  6 | date: "CESS Colloquium, 3 Nov 2021"
  7 | section-titles: false
  8 | aspectratio: 1610
  9 | mainfont: "IBM Plex Sans-Light"
 10 | ---
 11 | 
 12 | # tl;dr
 13 | 
 14 | - **Objective**: estimate effect of political microtargeting.
 15 | - **Design**: Two-stage survey experiment with allocation mechanism as treatment.
 16 | - **Result**: Among unaligned respondents who had not pre-voted, targeting:
 17 |     - **Increased proportion anti-Biden by 8.7 percentage points**.
 18 |     - **Decreased proportion intending to vote Biden by 7.1 percentage points**.
 19 | 
 20 | ## Scope 
 21 | 
 22 | - **Tailoring**: constructing a message so that it appeals to a specific audience.
 23 | - **Targeting**: delivering the message so only the intended audience sees it.
 24 | 	- **Micro-**: on the basis of individual characteristics.
 25 | 
 26 | ## Context
 27 | 
 28 | 1. Rise of data-driven political campaigning in the past decade (Fowler et al 2021)
 29 | 	- Creates various legal (Wood and Ravel 2017) and ethical (e.g. Burkell \& Regan 2019) issues.
 30 | 2. Many of these arguments presume that micro-targeting _works_:
 31 |     - "micro-targeting of voters can pay very handsome electoral dividends for a relatively modest investment" (Krotoszynski Jr. 2020).
 32 | 
 33 | ## Puzzle
 34 | 
 35 | 3. Research on _psychometric profiling_ indicates that improvements should be possible (Zarouali et al 2020)
 36 | 4. But extant political science research casts doubt (Nickerson and Rogers 2020)
 37 | 	- Decade of political science research finds largely small/null effects of campaigns (Kalla and Broockman 2018)
 38 | 	- Coppock et al (2020) find **lack of heterogeneity that leaves little room for targeting to operate**.
 39 | 
 40 | ## Question
 41 | 
 42 | - _If ad effects are small and homogeneous, then how can targeting yield any benefits?_
 43 | - _Does (micro)targeting work?_
 44 | 
 45 | ::: {.notes}
 46 | Note that in this context, I take "micro" to mean on the basis of individual traits, as opposed to group-level data.
 47 | :::
 48 | 
 49 | 
 50 | # Research Design
 51 | 
 52 | ## Design Summarized
 53 | 
 54 | ::: {.columns}
 55 | :::: {.column width="30%"}
 56 | **Stage 1**
 57 | 
 58 | - Control
 59 | - N = 1500
 60 | - 5 ads
 61 | - Random allocation
 62 | ::::
 63 | :::: {.column width="40%"}
 64 | **Switch-Over**
 65 | 
 66 | - 5 algorithms trained on stage 1 data learn:
 67 | - Biden fav. <br>= f(ad, traits)
 68 | - Best algorithm uploaded
 69 | ::::
 70 | :::: {.column width="30%"}
 71 | **Stage 2**
 72 | 
 73 | - Treatment
 74 | - N = 900
 75 | - Allocate ad that minimizes Biden favorability
 76 | ::::
 77 | :::
 78 | 
 79 | ## Advertisements
 80 | 
 81 | | Title                           | Description                                 |
 82 | | ------------------------------- | ------------------------------------------- |
 83 | | "They Mock Us"                  | In-group: Clinton and Biden mocking         |
 84 | | "Why did Biden let him do it?"  | Hunter Biden's ostensible corruption        |
 85 | | "Biden will come for your guns" | 2A; Biden will steal guns                   |
 86 | | "Insult"                        | Biden: Black Trump supporters not Black     |
 87 | | "Real Leadership"               | Obama/Biden caused wars, neglected veterans |
 88 | 
 89 | ## Covariates
 90 | 
 91 | ::: {.columns}
 92 | :::: {.column}
 93 | **Demographic (5 items)**
 94 | 
 95 | - Age
 96 | - Gender
 97 | - Race
 98 | - Income Group
 99 | - State
100 | ::::
101 | :::: {.column}
102 | **Political (4 items)**
103 | 
104 | - News interest
105 | - Is country on right track?
106 | - Partisanship (1-7 Dem/Rep)
107 | - Ideology (1-5 L/R)
108 | ::::
109 | :::
110 | 
111 | :::{.fragment}
112 | _Note_: expectation that online advertisers (e.g. Facebook) able to infer these traits with high degree of accuracy.
113 | :::
114 | 
115 | ## Five Candidate Algorithms
116 | 
117 | Chosen for speed and ability to learn highly conditional relationships:
118 | 
119 | - \*Random Forest (RF)
120 | - AdaBoost
121 | - Gradient Boosted Decision Trees (GBDT)
122 | - Multi-Layer Perceptron Regressor (MLPR)
123 | - Support Vector Machine (SVM)
124 | 
125 | ## Outcomes
126 | 
127 | 1. Trump and Biden Favorability (1-5)
128 | 2. Voting Intention:
129 | 	- Trump/Biden/Other
130 | 	- Already voted (Trump/Biden/Other)
131 | 	- Do not intend to vote
132 | 
133 | ## Hypotheses
134 | 
135 | - Targeting anti-Biden ads decreases:
136 | 	- Biden favorability
137 | 	- Intent to vote Biden
138 | 	- Intent to vote (turnout
139 | - Versus group that receives ad at random
140 | 
141 | - All hypotheses tested conditional on partisanship (motivated reasoning)
142 | 
143 | # Results
144 | 
145 | ## {data-background="figures/effect_of_targeting_presentation.png" data-background-size="contain"}
146 | 
147 | ## Robustness
148 | 
149 | - Pre-treatment covariate balance check.
150 | - Multiple comparisons correction (Holm, Benjamini-Hochberg).
151 | - Variety of operationalizations of outcome (linear, binary, ordered categorical)
152 | - Pre-experimental power check using Coppock et al (2020) data (along with permutation test for bias in mechanism).
153 | 
154 | ## Decomposition
155 | 
156 | - Proportion of ads varies between stages.
157 | - Effect can be decomposed:
158 | 	- "Better Allocations": leveraging within-respondent heterogeneity to increase average effect
159 | 	- "Better Ads": more of better ads shown
160 | - Targeting vs simple A/B testing.
161 | 
162 | 
163 | ## {data-background="figures/heterogeneity_presentation.png" data-background-size="contain"} 
164 | 
165 | 
166 | # Discussion
167 | 
168 | ## Limitations
169 | 
170 | - Survey response vs vote choice
171 | - Convenience sample
172 | - Possible confounding due to sequential assignment.
173 | - "Black box" approach
174 | 
175 | ## Envelope Calculation
176 | 
177 | - 7.1pp decrease in unaligned voters voting Biden
178 | - Sufficient to change outcome in Arizona (35.1% unaligned, 0.3% margin)
179 | 	- Unrepresentative sample
180 | 	- Decay
181 | 
182 | ## Contradictory?
183 | 
184 | - I argue consistent with results of Coppock et al. (and others).
185 | - Difference may be due to ML search of covariate space to maximise effect.
186 | - In future, necessary to consider within-respondent heterogeneity?
187 | 
188 | ## What We Learned
189 | 
190 | - _Possible_ to increase the effectiveness of ad campaigns with (micro)targeting.
191 | 	- Creates bad incentives for data harvesting
192 | 	- Targeting negative ads = manipulative?
193 | 
194 | # What's Next
195 | 
196 | ## Follow-Up: Fixes
197 | 
198 | - **Setting**: US 2022 mid-term elections
199 | - **Fixes**:
200 | 	- Stage 1 as training-only (avoid confounding)
201 | 	- General appeal and GOTV ads as control.
202 | 	- Increase $N$, exclude partisan respondents.
203 | 
204 | ## Follow-Up: Additions
205 | 
206 | - How does priming respondents to presence of targeting moderate effect of targeting?
207 | 	- Do existing "warnings" have any effect?
208 | 	- Would a stronger disclosure have an effect?
209 | 	- How can this be used for better regulation of online ads?
210 | 
211 | ## Other Applications
212 | 
213 | - Dynamic optimal allocation:
214 | 	- Continuously updating adaptive treatment design?
215 | - Idea: maximising sample efficiency for estimating HTEs in multi-intervention studies.
216 | 
217 | ## Bonus: Technical Implementation
218 | 
219 | - Built/hosted website on AWS Lightsail instance running Linux/Nginx/PHP/MariaDB (LEMP) stack.
220 | - Responses sent real-time to to server-side kernel and db.
221 | - Python kernel API modified Jupyter interface.
222 | - Kernel hosts pre-trained algorithm, sends best ad back.
223 | - PHP generates webpage to contain assigned video.
224 | - Source code available: `https://github.com/muhark/dotas-design`
225 | 


--------------------------------------------------------------------------------
/misc_presentations/draft4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/draft4.pdf


--------------------------------------------------------------------------------
/misc_presentations/fig_bernoulli.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/fig_bernoulli.png


--------------------------------------------------------------------------------
/misc_presentations/figures.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import matplotlib.pyplot as plt
 3 | import matplotlib
 4 | import seaborn as sns
 5 | import numpy as np
 6 | 
 7 | matplotlib.rcParams["text.usetex"] = False
 8 | 
 9 | # Figure 1: Bernoulli Entropy
10 | 
11 | q = np.linspace(0.000000001, 0.9999999999, 1000)
12 | H_x = - q*np.log2(q) - (1-q)*np.log2(1-q)
13 | 
14 | sns.set_style('whitegrid')
15 | 
16 | f, ax = plt.subplots(1, 1, figsize=(9, 5))
17 | ax.plot(q, H_x, color='cornflowerblue')
18 | ax.axvline(0, lw=2, color='k', alpha=0.8)
19 | ax.axhline(0, lw=2, color='k', alpha=0.8)
20 | ax.axvline(0.5, ls="--", color='darkgrey', alpha=0.8)
21 | ax.text(0.49, -0.1, "0.5")
22 | ax.set_xlim(-0.05, 1.05)
23 | ax.set_ylim(-0.05, 1.0025)
24 | matplotlib.rcParams["text.usetex"] = True
25 | ax.set_xlabel("$q$")
26 | ax.set_ylabel("$H_X$")
27 | matplotlib.rcParams["text.usetex"] = False
28 | f.savefig("fig_bernoulli.png", bbox_inches="tight")
29 | 


--------------------------------------------------------------------------------
/misc_presentations/figures/allocations.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/figures/allocations.pdf


--------------------------------------------------------------------------------
/misc_presentations/figures/effect_of_targeting_presentation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/figures/effect_of_targeting_presentation.png


--------------------------------------------------------------------------------
/misc_presentations/figures/effect_of_targeting_total.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/figures/effect_of_targeting_total.pdf


--------------------------------------------------------------------------------
/misc_presentations/figures/feature_importance.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/figures/feature_importance.pdf


--------------------------------------------------------------------------------
/misc_presentations/figures/heterogeneity_presentation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/figures/heterogeneity_presentation.png


--------------------------------------------------------------------------------
/misc_presentations/figures/predicted_favorability_f_pid.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/figures/predicted_favorability_f_pid.pdf


--------------------------------------------------------------------------------
/misc_presentations/knn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/misc_presentations/knn.png


--------------------------------------------------------------------------------
/misc_presentations/minimal-theme.css:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * A simple theme for reveal.js presentations, similar
  3 |  * to the default theme. The accent color is darkblue.
  4 |  *
  5 |  * This theme is Copyright (C) 2012 Owen Versteeg, https://github.com/StereotypicalApps. It is MIT licensed.
  6 |  * reveal.js is Copyright (C) 2011-2012 Hakim El Hattab, http://hakim.se
  7 |  */
  8 | @import url(https://fonts.googleapis.com/css?family=News+Cycle:400,700);
  9 | @import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic);
 10 | section.has-dark-background, section.has-dark-background h1, section.has-dark-background h2, section.has-dark-background h3, section.has-dark-background h4, section.has-dark-background h5, section.has-dark-background h6 {
 11 |   color: #fff; }
 12 | 
 13 | /*********************************************
 14 |  * GLOBAL STYLES
 15 |  *********************************************/
 16 | body {
 17 |   background: #fff;
 18 |   background-color: #fff; }
 19 | 
 20 | .reveal {
 21 |   font-family: "Lato", sans-serif;
 22 |   font-size: 35px;
 23 |   font-weight: normal;
 24 |   color: #000; }
 25 | 
 26 | ::selection {
 27 |   color: #000;
 28 |   background: rgba(0, 0, 0, 0.99);
 29 |   text-shadow: none; }
 30 | 
 31 | ::-moz-selection {
 32 |   color: #fff;
 33 |   background: rgba(0, 0, 0, 0.99);
 34 |   text-shadow: none; }
 35 | 
 36 | .reveal .slides section,
 37 | .reveal .slides section > section {
 38 |   line-height: 1.3;
 39 |   font-weight: inherit; }
 40 | 
 41 | /*********************************************
 42 |  * HEADERS
 43 |  *********************************************/
 44 | .reveal h1,
 45 | .reveal h2,
 46 | .reveal h3,
 47 | .reveal h4,
 48 | .reveal h5,
 49 | .reveal h6 {
 50 |   margin: 0 0 20px 0;
 51 |   color: #000;
 52 |   font-family: "Lato", sans-serif;
 53 |   font-weight: bold;
 54 |   line-height: 1.2;
 55 |   letter-spacing: normal;
 56 |   text-transform: none;
 57 |   text-shadow: none;
 58 |   word-wrap: break-word; }
 59 | 
 60 | .reveal h1 {
 61 |   font-size: 1.65em;
 62 |   text-align: left; }
 63 | 
 64 | .reveal h2 {
 65 |   font-size: 1.05em;
 66 |   text-align: left; }
 67 | 
 68 | .reveal h3 {
 69 |   font-size: 1.025em; }
 70 | 
 71 | .reveal h4 {
 72 |   font-size: 1em; }
 73 | 
 74 | .reveal h1 {
 75 |   text-shadow: none; }
 76 | 
 77 | /*********************************************
 78 |  * OTHER
 79 |  *********************************************/
 80 | .reveal p {
 81 |   margin: 5px 0;
 82 |   line-height: 1.15;
 83 |   text-align: left; }
 84 | 
 85 | /* Ensure certain elements are never larger than the slide itself */
 86 | .reveal img,
 87 | .reveal video,
 88 | .reveal iframe {
 89 |   max-width: 95%;
 90 |   max-height: 95%; }
 91 | 
 92 | .reveal strong,
 93 | .reveal b {
 94 |   font-weight: bold; }
 95 | 
 96 | .reveal em {
 97 |   font-style: italic; }
 98 | 
 99 | .reveal ol,
100 | .reveal dl,
101 | .reveal ul {
102 |   text-align: left;
103 |   margin: 0 0 0 1.5em; }
104 | 
105 | .reveal ol {
106 |   list-style-type: decimal; }
107 | 
108 | .reveal ul {
109 |   list-style-type: disc; }
110 | 
111 | .reveal ul ul {
112 |   list-style-type: square; }
113 | 
114 | .reveal ul ul ul {
115 |   list-style-type: circle; }
116 | 
117 | .reveal ul ul,
118 | .reveal ul ol,
119 | .reveal ol ol,
120 | .reveal ol ul {
121 |   display: block;
122 |   margin-left: 1.5em; }
123 | 
124 | .reveal dt {
125 |   font-weight: bold; }
126 | 
127 | .reveal dd {
128 |   margin-left: 30px; }
129 | 
130 | .reveal blockquote {
131 |   display: block;
132 |   position: relative;
133 |   width: 70%;
134 |   margin: 20px auto;
135 |   padding: 5px;
136 |   font-style: italic;
137 |   background: rgba(255, 255, 255, 0.05);
138 |   box-shadow: 0px 0px 2px rgba(0, 0, 0, 0.2); }
139 | 
140 | .reveal blockquote p:first-child,
141 | .reveal blockquote p:last-child {
142 |   display: inline-block; }
143 | 
144 | .reveal q {
145 |   font-style: italic; }
146 | 
147 | .reveal pre {
148 |   display: block;
149 |   position: relative;
150 |   width: 90%;
151 |   margin: 20px auto;
152 |   text-align: left;
153 |   font-size: 0.55em;
154 |   font-family: monospace;
155 |   line-height: 1.2em;
156 |   word-wrap: break-word;
157 |   box-shadow: 0px 5px 15px rgba(0, 0, 0, 0.15); }
158 | 
159 | .reveal code {
160 |   font-family: monospace;
161 |   text-transform: none;
162 |   padding: 2px 4px;
163 |   font-size: 90%;
164 |   color: #c7254e;
165 |   background-color: #f9f2f4;
166 |   vertical-align: baseline;
167 |   white-space: pre-wrap;
168 |   border-radius: 2px;
169 | 
170 | }
171 | 
172 | .reveal pre code {
173 |   display: block;
174 |   padding: 5px;
175 |   overflow: auto;
176 |   max-height: 400px;
177 |   word-wrap: normal; }
178 | 
179 | .reveal table {
180 |   margin: auto;
181 |   border-collapse: collapse;
182 |   border-spacing: 0; }
183 | 
184 | .reveal table th {
185 |   font-weight: bold; }
186 | 
187 | .reveal table th,
188 | .reveal table td {
189 |   text-align: left;
190 |   padding: 0.2em 0.5em 0.2em 0.5em;
191 |   border-bottom: 1px solid; }
192 | 
193 | .reveal table th[align="center"],
194 | .reveal table td[align="center"] {
195 |   text-align: center; }
196 | 
197 | .reveal table th[align="right"],
198 | .reveal table td[align="right"] {
199 |   text-align: right; }
200 | 
201 | .reveal table tbody tr:last-child th,
202 | .reveal table tbody tr:last-child td {
203 |   border-bottom: none; }
204 | 
205 | .reveal sup {
206 |   vertical-align: super;
207 |   font-size: smaller; }
208 | 
209 | .reveal sub {
210 |   vertical-align: sub;
211 |   font-size: smaller; }
212 | 
213 | .reveal small {
214 |   display: inline-block;
215 |   font-size: 0.6em;
216 |   line-height: 1.2em;
217 |   vertical-align: top; }
218 | 
219 | .reveal small * {
220 |   vertical-align: top; }
221 | 
222 | /*********************************************
223 |  * LINKS
224 |  *********************************************/
225 | .reveal a {
226 |   color: #00008B;
227 |   text-decoration: none;
228 |   -webkit-transition: color .15s ease;
229 |   -moz-transition: color .15s ease;
230 |   transition: color .15s ease; }
231 | 
232 | .reveal a:hover {
233 |   color: #0000f1;
234 |   text-shadow: none;
235 |   border: none; }
236 | 
237 | .reveal .roll span:after {
238 |   color: #fff;
239 |   background: #00003f; }
240 | 
241 | /*********************************************
242 |  * IMAGES
243 |  *********************************************/
244 | .reveal section img {
245 |   margin: 15px 0px;
246 |   background: rgba(255, 255, 255, 0.12);
247 |   border: 4px solid #000;
248 |   box-shadow: 0 0 10px rgba(0, 0, 0, 0.15); }
249 | 
250 | .reveal section img.plain {
251 |   border: 0;
252 |   box-shadow: none; }
253 | 
254 | .reveal a img {
255 |   -webkit-transition: all .15s linear;
256 |   -moz-transition: all .15s linear;
257 |   transition: all .15s linear; }
258 | 
259 | .reveal a:hover img {
260 |   background: rgba(255, 255, 255, 0.2);
261 |   border-color: #00008B;
262 |   box-shadow: 0 0 20px rgba(0, 0, 0, 0.55); }
263 | 
264 | /*********************************************
265 |  * NAVIGATION CONTROLS
266 |  *********************************************/
267 | .reveal .controls {
268 |   color: #00008B; }
269 | 
270 | /*********************************************
271 |  * PROGRESS BAR
272 |  *********************************************/
273 | .reveal .progress {
274 |   background: rgba(0, 0, 0, 0.2);
275 |   color: #00008B; }
276 | 
277 | .reveal .progress span {
278 |   -webkit-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
279 |   -moz-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
280 |   transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985); }
281 | 
282 | /*********************************************
283 |  * PRINT BACKGROUND
284 |  *********************************************/
285 | @media print {
286 |   .backgrounds {
287 |     background-color: #fff; } }
288 | 


--------------------------------------------------------------------------------
/misc_presentations/planning.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: DPhil Seminar Planning
  3 | author: Musashi Harukawa
  4 | date: 14 May 2020
  5 | ---
  6 | 
  7 | # Overview
  8 | 
  9 | Presenting two/three papers:
 10 | 
 11 | - An Information Theoretic Approach to Measurement Inference
 12 | - Micro-targeted Political Campaigns:
 13 |     - Does it work?
 14 |     - Does the effect persist when subjects are informed of the targeting?
 15 |     - Which covariates provide the most information about the predicted effectiveness of an ad?
 16 | 
 17 | Present in above order.
 18 | 
 19 | # Paper 1: Mutual Information as a Concept Measure
 20 | 
 21 | Slides:
 22 | 
 23 | - Context: Peterson & Spirling (2018), Goet (2019). Classifier Accuracy as a Quantity of Substantive Interest
 24 |     - Outline core of paper: interpretation of classifier accuracy as measure of polarization in parliament.
 25 |     - What I like about this: non-linear approaches to measurement, measurement from complex high-dimensional data
 26 | - Shortcomings:
 27 |     - Measurement Theory: explain link between concepts, indicators and measures
 28 |     - Show possibility for erroneous measurement inference "OVB".
 29 |     - Show that ideally we would minimally want to be able to "partial out" the link between confounding concepts and our measures.
 30 | - Solution:
 31 |     - Brief outline of solution.
 32 |     - A few slides on information theory.
 33 |     - Explanation of how "partialling out" information works.
 34 |     - Argue that mutual information may be a better metric in their implicit measurement model than classifier accuracy.
 35 | - Empirical Application:
 36 |     - Apply this model to their data, as well as related papers (Dutch parliament, US Congress).
 37 | - Broader relevance:
 38 |     - Frequent claims (implicitly) of the form "what does knowing X tell us about the value of Y". Information Theory gives a formalisation of these relationships that is functionally agnostic.
 39 | 
 40 | # Paper 2 & 3: Micro-targeting
 41 | 
 42 | Papers 2 and 3 share the same broader context, and will rely on the same experiment.
 43 | 
 44 | Slides:
 45 | 
 46 | ## Context
 47 | 
 48 | - Multiple strands of related literature:
 49 |     - Law: (Big Data, etc.)
 50 |     - Psychology: (2 refs)
 51 |     - Negative Campaigning (Ansolabhere, Fridkin and Kenney AJPS 2011)
 52 | - Too many papers simply assert that micro-targeting simply works.
 53 |     - Closest thing to a test uses ABM.
 54 | 
 55 | ## Three Research Questions
 56 | 
 57 | - _Does micro-targeting work?_
 58 | - Many people are aware that they are being targeted, at some level; _how do we engage critical and skeptical engagement?_
 59 | - _What information is most useful to political campaigns when seeking to micro-target?_
 60 | 
 61 | ## Experiment
 62 | 
 63 | - Two-stage design:
 64 |     - Stage 1: Randomly assign treatment to one of five ads. Determine MATE of each ad as function of all covariates, and get five fitted models predicting treatment effect as a function of covariates.
 65 |     - Stage 2: Treatment now defined as being targeted. Control gets random ad, treatment gets ad predicted to have strongest effect. Two more treatment groups where subject is told beforehand that ad is selected for them.
 66 | - What does this show?
 67 |     - Stage 2: T1 - C = Microtargeting Effect
 68 |     - Stage 2: T3 - T1 = Skeptical Engagement
 69 |     - Stage 2: dMI(X_i, Y)/dX_i = Marginal Mutual Information Gain of Covariate
 70 | 
 71 | ## Normative Territory
 72 | 
 73 | A few conclusions that may give us pause for thought:
 74 | 
 75 | - _Political campaigns are able to more effectively persuade voters when they know more about them_.
 76 | - _This effect is conditional on voters not being fully aware of the extent to which this message is being tailored specifically for them_.
 77 | - _Certain information allows campaigns to be more effective at persuading_.
 78 | 
 79 | My views:
 80 | 
 81 | - The use of personal data in campaigning should be heavily regulated, especially when combined with negative campaigning.
 82 | - Individuals should always be aware of the basis upon and extent to which a message has been tailored for them.
 83 | - Certain kinds of information about individuals should not be stored or leveraged by private or public organisations. There are a multitude of ethical concerns about privacy, etc.; I think a consequentialist view that some kinds of information allow for easier manipulation should be cause for regulating their usage.
 84 | 
 85 | ## Outstanding Issues
 86 | 
 87 | - Negative campaign ads only?
 88 | - When and where? Is corona an issue?
 89 | 
 90 | 
 91 | # Specific Section Planning
 92 | 
 93 | ## Primer on Measurement Theory
 94 | 
 95 | Measurement theory is an area of social science methodology concerned with the construction of _measures_.
 96 | 
 97 | Useful distinction between _concepts_, _indicators_ and _measures_.
 98 | 
 99 | - Concepts are (often latent) theoretical constructs. Will not provide an ontology of concepts here (ha) but Goertz is a good resource.
100 | - Indicators are empirical phenomena, and therefore realizable, regular and measurable.
101 | - Measures are constructs that systematize our observations of indicators, and make specific relational claims about the comparability of realizations of the indicators.
102 | 
103 | We usually have some theoretical reasons for believing that concepts are linked to indicators (maybe even causally!)
104 | 
105 | Can be generalised to the following diagram [include measurement diagram]
106 | 
107 | [Needs to be prefaced by article intro, so I can give example with polarization measure]
108 | 
109 | There are many kinds of biases and erroneous inferences that can be made; in this case I am most worried about a kind of confounding; the indicators and therefore the measure are systematically affected by a confounding factor that affects both partisan label and speech, or the link between the two. (e.g. diversity)
110 | 
111 | ## Primer on Information Theory
112 | 
113 | Information Theory is a field of statisitcs/mathematics that, among other things, formalises the _information_ contained in random processes.
114 | 
115 | Some key concepts:
116 | 
117 | - Entropy (also joint and conditional)
118 | - Mutual Information
119 | - KL Divergence (maybe necessary?)
120 | 
121 | Entropy:
122 | 
123 | _The entropy $H_X$ of a discrete random variable $X$ with probability distribution $p(x)$ is defined as:_
124 | 
125 | $$
126 | H_X \equiv - \sum_{x \in \mathfrak{X}}{p(x)log_2p(x)} = \mathbb{E}log_{2}[\frac{1}{p(x)}]
127 | $$
128 | 
129 | Some examples:
130 | 
131 | - A fair coin
132 | 


--------------------------------------------------------------------------------
/syllabus.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Introduction to Python for Social Science Course Syllabus"
  3 | author: "Musashi Harukawa"
  4 | geometry: margin=2cm
  5 | ---
  6 | 
  7 | ## Course Description
  8 | 
  9 | _Introduction to Python for Social Science_ is an 8-week optional methods module aimed at social science researchers seeking to learn programming skills for their research. There will be weekly lectures, lasting 60 to 90 minutes, followed by a workshop, and supplemented by weekly office hours. All of the above will be conducted on Teams and communicated on Canvas.
 10 | 
 11 | This course begins with an extremely brief introduction to the Python programming language, then teaches four key skills for social science research:
 12 | 
 13 | - Data Cleaning and Merging with `pandas`
 14 | - Data Visualization with `matplotlib` and `seaborn`
 15 | - Introductory Machine Learning with `scikit-learn`
 16 | - Automated Data Collection with `beautifulsoup` and `selenium`
 17 | 
 18 | As an optional course, there will be no marked assignments, but there are weekly problem sets and readings designed to aid learning. Students are encouraged to complete these tasks, and will have the opportunity to discuss them during the workshop or office hours.
 19 | 
 20 | This course is aimed at complete beginners, although experience with other programming languages (such as `R`) may provide some useful reference points. Note that those intending to attend the Trinity Term text-as-data module are **strongly** encouraged to attend this course, as the skills taught here will be assumed knowledge.
 21 | 
 22 | ## Information Regarding Attendance
 23 | 
 24 | _Please read this information prior to attending, as I will not have time to help with problems that arise during lectures or the workshop._
 25 | 
 26 | Students will require the means to read, write, and execute code in Jupyter notebooks. For the purposes of the course, [`Google Colab`](https://colab.research.google.com/) is sufficient, but students who desire to have a local installation should read the attached [installation guide](/dpir-intro-python/InstallationGuide.pdf).
 27 | 
 28 | ## Readings
 29 | 
 30 | The course does not follow any particular textbook. However, readings will be assigned primarily from the following books:
 31 | 
 32 | - Week 1: [_Automate the Boring Stuff with Python_](https://automatetheboringstuff.com/) by Al Sweigart. Excellent introduction to Python and what it can do for you. Free from website.
 33 | - Weeks 2 and 3: [_Python for Data Analysis: Data Wrangling with Pandas, NumPy and IPython, 2nd edition_](http://solo.bodleian.ox.ac.uk/permalink/f/89vilt/oxfaleph021507068) by Wes McKinney. Available for free via SOLO.
 34 | - Weeks 5 and 6: [_Elements of Statistical Learning, 2nd Edition_](https://web.stanford.edu/~hastie/ElemStatLearn/) by Hastie, Tibshirani and Friedman. This book provides an excellent and comprehensive introduction to machine learning. Free from website.
 35 | - Weeks 7 and 8: [_Web Scraping with Python, 2nd Edition_](https://www.oreilly.com/library/view/web-scraping-with/9781491985564/) by Ryan Mitchell, 2018. Can be accessed for free via SOLO.
 36 | 
 37 | For further questions regarding course specifics or accessibility requirements, please feel free to write to me at [musashi.harukawa@merton.ox.ac.uk](musashi.harukawa@merton.ox.ac.uk).
 38 | 
 39 | 
 40 | \newpage
 41 | 
 42 | ## Course Outline
 43 | 
 44 | _Please note that do to my own evolving situation, this syllabus may be subject to change._
 45 | 
 46 | 
 47 | ### _Week 1_: Introduction to Python and the Development Environment
 48 | 
 49 | **Learning Aims**:
 50 | 
 51 | 1. What is Python and what can I use it for?
 52 | 2. What are the tools I can use to write Python code?
 53 | 3. Writing your first Python script
 54 | 
 55 | There are three learning goals in the first week. The first relates to what Python is, and how it can be useful for social science researchers. Students will learn about the various use cases for Python, and come up with ways that it may help them achieve their research aims.
 56 | 
 57 | The second learning goal is to gain familiarity with the tools used to code in Python and present their research. These include Jupyter notebooks, IDEs and the terminal. Students will primarily use Jupyter notebooks in this course, but are welcome to use alternative development tools.
 58 | 
 59 | The final goal is to write their first program in Python. Commands and operators such as `print`, `+`, `&` etc. will be introduced.
 60 | 
 61 | ### _Week 2_: Data Structures and Pandas I
 62 | 
 63 | **Learning Aims**:
 64 | 
 65 | 1. Recap: Data Structures in Base Python
 66 | 2. Data I/O with `pandas`
 67 | 3. Selecting, filtering and indexing data in `pandas`
 68 | 4. Summary statistics in `pandas`
 69 | 5. `NumPy` Data Types
 70 | 
 71 | The second week recaps some basic data structures in base Python from the previous week and then introduces a key library for data analysis: `pandas`. In base Python, students will learn about lists and dictionaries.
 72 | 
 73 | In `pandas`, students will learn how to read in various data formats, clean and index data, and produce summary statistics. Students will also be introduced to data types.
 74 | 
 75 | The goal of this week is to be able to use `pandas` to open `csv`, `html`, `xls`, or `dta` files, to slice and filter them, and then to produce summary statistics.
 76 | 
 77 | ### _Week 3_: Data Structures and Pandas II
 78 | 
 79 | **Learning Aims**:
 80 | 
 81 | 1. Writing Python functions
 82 | 2. Vectorize with `apply`
 83 | 3. Split-apply-combine with `groupby`
 84 | 4. Working with datetime data
 85 | 
 86 | The third week builds on students' knowledge of `pandas`, introducing two key tools in data analysis: `apply` and `groupby`. Students will also learn how to write functions and be introduced to the idea of namespaces.
 87 | 
 88 | By the end of this week, students should have a sufficient grounding in handling tabular data with base Python and `pandas` to deal with most data cleaning and reshaping tasks they use in their own research.
 89 | 
 90 | ### _Week 4_: Data Visualization
 91 | 
 92 | **Learning Aims**:
 93 | 
 94 | 1. The "philosophy" of `matplotlib`
 95 | 2. Figures, subplots, axes, legends
 96 | 3. Plotting uni- and bivariate datasets in `matplotlib`
 97 | 4. The convenience of `seaborn`
 98 | 5. Customizing your plots
 99 | 
100 | The fourth week introduces two key libraries for data visualization: `matplotlib` and `seaborn`. Students will learn the philosophy behind data visualization, and how to create a number of useful 2D graphs.
101 | 
102 | ### _Week 5_: Machine Learning I
103 | 
104 | 1. Introduction to Machine Learning
105 | 2. Introduction to `scikit-learn`
106 | 3. kmeans clustering with `scikit-learn`
107 | 4. Presenting your kmeans clustering results
108 | 
109 | The fifth week introduces to students to machine learning with the `scikit-learn` library. After discussing the aims and developments within the field, students learn about unsupervised clustering with the k-means algorithm.
110 | 
111 | ### _Week 6_: Machine Learning II
112 | 
113 | 1. Random Forest regression and classification with `scikit-learn`
114 | 2. k-fold cross validation with `scikit-learn`
115 | 3. Hyperparameter Tuning with `Grid` and `RandomizedSearchCV`
116 | 4. When not to use linear models
117 | 
118 | The sixth week introduces supervised machine learning with the random forest algorithm. Students then learn about cross-validation techniques and their implementation in `scikit-learn`. Finally students learn about hyperparameters, and how to choose the optimal initialising parameters for the model. The lesson ends with a discussion about the difference between prediction and explanation.
119 | 
120 | 
121 | ### _Week 7_: Web Scraping I
122 | 
123 | 1. The Structure of Websites: `html`
124 | 2. Requesting webpages with `requests` or `urllib`
125 | 3. Parsing `html` with `beautifulsoup`
126 | 4. Introduction to regular expressions (`regex`)
127 | 
128 | Students will learn the fundamentals of writing a script to automate web-based data collection. This will include a discussion of the legality and ethics of the method, when and how it should be employed, and the potential consequences of inappropriately applying it.
129 | 
130 | Students will learn the basics of the structure of every webpage; `html`, and how the library `beautifulsoup` can help them parse and navigate this in order to extract data from webpages. Students will also learn a basic introduction to regular expressions with the `re` library.
131 | 
132 | ### _Week 8_: Web Scraping II
133 | 
134 | 1. Working with APIs
135 | 2. Browser automation with `Selenium`
136 | 
137 | Students will learn further techniques in web scraping, using APIs and browser automation to interact with a further variety of data sources on the web.
138 | 


--------------------------------------------------------------------------------
/syllabus.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muhark/dpir-intro-python/e48dab681abe86dae75b1c21889974fbf42789ab/syllabus.pdf


--------------------------------------------------------------------------------
/teaching.yaml:
--------------------------------------------------------------------------------
  1 | name: teaching
  2 | channels:
  3 |   - conda-forge
  4 |   - defaults
  5 | dependencies:
  6 |   - _libgcc_mutex=0.1=main
  7 |   - attrs=19.3.0=py_0
  8 |   - backcall=0.1.0=py37_0
  9 |   - blas=1.0=mkl
 10 |   - bleach=3.1.0=py37_0
 11 |   - ca-certificates=2019.11.28=hecc5488_0
 12 |   - certifi=2019.11.28=py37_0
 13 |   - cycler=0.10.0=py37_0
 14 |   - dbus=1.13.12=h746ee38_0
 15 |   - decorator=4.4.1=py_0
 16 |   - defusedxml=0.6.0=py_0
 17 |   - entrypoints=0.3=py37_0
 18 |   - expat=2.2.6=he6710b0_0
 19 |   - fontconfig=2.13.0=h9420a91_0
 20 |   - freetype=2.9.1=h8a8886c_1
 21 |   - glib=2.63.1=h5a9c865_0
 22 |   - gmp=6.1.2=h6c8ec71_1
 23 |   - gst-plugins-base=1.14.0=hbbd80ab_1
 24 |   - gstreamer=1.14.0=hb453b48_1
 25 |   - icu=58.2=h9c2bf20_1
 26 |   - importlib_metadata=1.3.0=py37_0
 27 |   - intel-openmp=2019.4=243
 28 |   - ipykernel=5.1.3=py37h39e3cac_0
 29 |   - ipython=7.10.2=py37h39e3cac_0
 30 |   - ipython_genutils=0.2.0=py37_0
 31 |   - ipywidgets=7.5.1=py_0
 32 |   - jedi=0.15.1=py37_0
 33 |   - jinja2=2.10.3=py_0
 34 |   - joblib=0.14.1=py_0
 35 |   - jpeg=9b=h024ee3a_2
 36 |   - jsonschema=3.2.0=py37_0
 37 |   - jupyter=1.0.0=py37_7
 38 |   - jupyter_client=5.3.4=py37_0
 39 |   - jupyter_console=5.2.0=py37_1
 40 |   - jupyter_core=4.6.1=py37_0
 41 |   - kiwisolver=1.1.0=py37he6710b0_0
 42 |   - libedit=3.1.20181209=hc058e9b_0
 43 |   - libffi=3.2.1=hd88cf55_4
 44 |   - libgcc-ng=9.1.0=hdf63c60_0
 45 |   - libgfortran-ng=7.3.0=hdf63c60_0
 46 |   - libpng=1.6.37=hbc83047_0
 47 |   - libsodium=1.0.16=h1bed415_0
 48 |   - libstdcxx-ng=9.1.0=hdf63c60_0
 49 |   - libuuid=1.0.3=h1bed415_2
 50 |   - libxcb=1.13=h1bed415_1
 51 |   - libxml2=2.9.9=hea5a465_1
 52 |   - markupsafe=1.1.1=py37h7b6447c_0
 53 |   - matplotlib=3.1.1=py37h5429711_0
 54 |   - mistune=0.8.4=py37h7b6447c_0
 55 |   - mkl=2019.4=243
 56 |   - mkl-service=2.3.0=py37he904b0f_0
 57 |   - mkl_fft=1.0.15=py37ha843d7b_0
 58 |   - mkl_random=1.1.0=py37hd6b4f25_0
 59 |   - more-itertools=8.0.2=py_0
 60 |   - nbconvert=5.6.1=py37_0
 61 |   - nbformat=4.4.0=py37_0
 62 |   - ncurses=6.1=he6710b0_1
 63 |   - notebook=6.0.2=py37_0
 64 |   - numpy=1.17.4=py37hc1035e2_0
 65 |   - numpy-base=1.17.4=py37hde5b4d6_0
 66 |   - openssl=1.1.1d=h516909a_0
 67 |   - pandas=0.25.3=py37he6710b0_0
 68 |   - pandoc=2.2.3.2=0
 69 |   - pandocfilters=1.4.2=py37_1
 70 |   - parso=0.5.2=py_0
 71 |   - patsy=0.5.1=py37_0
 72 |   - pcre=8.43=he6710b0_0
 73 |   - pexpect=4.7.0=py37_0
 74 |   - pickleshare=0.7.5=py37_0
 75 |   - pip=19.3.1=py37_0
 76 |   - prometheus_client=0.7.1=py_0
 77 |   - prompt_toolkit=3.0.2=py_0
 78 |   - ptyprocess=0.6.0=py37_0
 79 |   - pygments=2.5.2=py_0
 80 |   - pyparsing=2.4.5=py_0
 81 |   - pyqt=5.9.2=py37h05f1152_2
 82 |   - pyrsistent=0.15.6=py37h7b6447c_0
 83 |   - python=3.7.5=h0371630_0
 84 |   - python-dateutil=2.8.1=py_0
 85 |   - pytz=2019.3=py_0
 86 |   - pyzmq=18.1.0=py37he6710b0_0
 87 |   - qt=5.9.7=h5867ecd_1
 88 |   - qtconsole=4.6.0=py_0
 89 |   - readline=7.0=h7b6447c_5
 90 |   - rise=5.6.0=py37_0
 91 |   - scikit-learn=0.22=py37hd81dba3_0
 92 |   - scipy=1.3.2=py37h7c811a0_0
 93 |   - seaborn=0.9.0=pyh91ea838_1
 94 |   - send2trash=1.5.0=py37_0
 95 |   - setuptools=42.0.2=py37_0
 96 |   - sip=4.19.8=py37hf484d3e_0
 97 |   - six=1.13.0=py37_0
 98 |   - sqlite=3.30.1=h7b6447c_0
 99 |   - statsmodels=0.10.1=py37hdd07704_0
100 |   - terminado=0.8.3=py37_0
101 |   - testpath=0.4.4=py_0
102 |   - tk=8.6.8=hbc83047_0
103 |   - tornado=6.0.3=py37h7b6447c_0
104 |   - traitlets=4.3.3=py37_0
105 |   - wcwidth=0.1.7=py37_0
106 |   - webencodings=0.5.1=py37_1
107 |   - wheel=0.33.6=py37_0
108 |   - widgetsnbextension=3.5.1=py37_0
109 |   - xz=5.2.4=h14c3975_4
110 |   - zeromq=4.3.1=he6710b0_3
111 |   - zipp=0.6.0=py_0
112 |   - zlib=1.2.11=h7b6447c_3
113 | prefix: /home/lunayneko/anaconda3/envs/teaching
114 | 


--------------------------------------------------------------------------------