├── .gitignore ├── Final ├── README.md ├── data │ ├── channel_indices.json │ ├── g_tags_adj.csv │ ├── g_text_adj.csv │ └── youtube.csv.gz ├── exam.ipynb └── exam_solutions.ipynb ├── Homework ├── 00 - Optional Homework │ └── README.md ├── 01 - Pandas and Data Wrangling │ ├── README.md │ ├── data │ │ ├── aminer_ai.csv │ │ ├── list_of_ai_conferences.txt │ │ ├── list_of_selected_authors.txt │ │ └── stopwords.txt │ ├── hw1.ipynb │ └── hw1_solutions.ipynb ├── 02 - Applied ML and Scaling Up │ ├── README.md │ ├── data │ │ └── fifa19_players.csv │ ├── hw2.ipynb │ ├── hw2_solutions.ipynb │ └── hw2_utils.py └── README.md └── Tutorials ├── 00 - Intro to Tools ├── Data │ ├── RIRs.wav │ ├── german_speech_44100.wav │ ├── mic_array.jpg │ ├── rlc.jpg │ └── twitter_data.txt ├── Intro to Jupyter Notebooks.ipynb └── README.md ├── 01 - Intro to Pandas ├── Data │ ├── baseball.csv │ ├── cdystonia.csv │ ├── microbiome.csv │ ├── microbiome_MID1.xls │ ├── microbiome_MID2.xls │ ├── microbiome_missing.csv │ ├── transit_segments.csv │ └── vessel_information.csv ├── Intro to Pandas I.ipynb ├── Intro to Pandas II.ipynb └── README.md ├── 02 - Data From The Web ├── Data from the Web.ipynb └── README.md ├── 03 - Data Visualization ├── ADA - Intro to Viz.ipynb ├── Folium │ ├── README.md │ ├── Tutorial - Folium.ipynb │ ├── US_Election_2016.csv │ ├── US_Election_2016.html │ ├── US_Election_2016_binary.csv │ ├── US_Election_2016_binary.html │ └── us-states.json └── README.md ├── 04 - Scaling Up ├── .gitignore ├── PySpark.ipynb ├── frankenstein.txt ├── img │ ├── Hanoi_POL1966.jpg │ ├── USS_Constellation.jpg │ └── banner.jpg └── words_count.py ├── 05 - Good Code Practices ├── README.md └── good_coding_practices.ipynb ├── 06 - Applied ML ├── AppliedML.ipynb ├── README.md ├── data │ ├── Advertising.csv │ └── titanic.xls └── img │ ├── estimating_coefficients.png │ └── slope_intercept.png ├── 07 - Handling Text ├── README.md ├── Tutorial_07_Handling_text.ipynb ├── books │ ├── DRACULA.txt │ ├── Frankenstein.txt │ ├── Moby_Dick.txt │ └── PRIDE_AND_PREJUDICE.txt └── spacy.png ├── 08 - Handling Graphs ├── README.md ├── Tutorial_graphs.ipynb ├── data │ └── quakers │ │ ├── quakers_edgelist.csv │ │ └── quakers_nodelist.csv └── networks.yml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | *.sublime-project 3 | *.DS_Store 4 | -------------------------------------------------------------------------------- /Final/README.md: -------------------------------------------------------------------------------- 1 | # ADA Final Exam 2 | 3 | This repository contains the final exam for Applied Data Analysis (CS-401). 4 | 5 | The questions are contained in the Jupyter Notebook named "exam.ipynb". The "data/" folder contains the data. 6 | 7 | ## Deadline 8 | Tuesday, January 14th, 2020; 11:15 A.M. (Swiss time) 9 | 10 | _For the deadline for extramural exams, see the submission subsection._ 11 | 12 | ## Important notes 13 | * Don't forget to add a textual description of your thought process, the assumptions you made, and your results! 14 | * Please write all your comments in English, and use meaningful variable names in your code. 15 | * As we have seen during the semester, data science is all about multiple iterations on the same dataset. Do not obsess over small details in the beginning, and try to complete as many tasks as possible during the first 2 hours. 
Then, go back to the obtained results, write meaningful comments, and debug your code if you find any glaring mistakes. 16 | * Fully read the instructions for each question before starting to solve it to avoid misunderstandings, and remember to save your notebook often! 17 | * The exam contains **3 tasks split into several subtasks**, and is designed for more than 3 hours. **You do not need to solve everything in order to get a 6**, and you have some freedom in choosing the tasks you wish to solve. All of the three tasks can be solved independently of each other. 18 | * You cannot leave the room in the first and last 15 minutes. 19 | * You can use all the online resources you want except for communication tools (emails, web chats, forums, phone, etc.). We will be monitoring the network for unusual activity. 20 | * Remember, this is not a homework assignment -- no teamwork allowed! 21 | 22 | ## Get 5 extra points by taking a survey about Mattermost 23 | * Please take this brief (1-2 min) anonymous survey about class communication: https://forms.gle/CuNjYMqF9xLNgYhPA 24 | * Your input will help us improve ADA in the future. 25 | * At the end of the survey, you'll get a secret code. Submitting your code with your completed exam will give you **5 extra points** (on top of the max of 100 points). 26 | * We recommend taking the survey before you start the exam (this way, you start from 5 instead of 0 points). 27 | 28 | ## Submission 29 | * You will have until 11:20 (strict deadline) to turn in your submission. **Late submissions will not be accepted.** This deadline applies to students taking the exam at EPFL; for students taking the exam extramurally, the submission deadline is the starting time of the exam plus 3 hours and 5 minutes. 30 | * For students **with** an EPFL email address: 31 | * Your file has to be named "NameSurname_SCIPER.ipynb". 32 | * Upload your Jupyter Notebook (1 file) to this Google form at the end of the exam, with all the cells already evaluated: https://forms.gle/JF7N3zvJ6HRbT7tv5. You need to sign in to Google using your EPFL credentials in order to submit the form. We recommend trying this right now (at the beginning of the exam) to avoid a rush at the end. 33 | * In case of problems with the form, send your Jupyter Notebook via email to ramtin.yazdanian@epfl.ch. This is reserved only for those who encounter problems with the submission - you need to have a reasonable justification for using this backup. 34 | * For students **without** an EPFL email address: 35 | * Your file has to be named "NameSurname.ipynb". 36 | * Send your Jupyter Notebook (1 file) via email to ramtin.yazdanian@epfl.ch. 
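37 | 
38 | ## Loading the data (quick reference)
39 | The snippet below is a minimal sketch of how the provided dataset can be read; it simply mirrors the first code cells of `exam.ipynb` and assumes `pandas` is available in your environment.
40 | 
41 | ```python
42 | import pandas as pd
43 | 
44 | # The dataset ships gzip-compressed inside the data/ folder.
45 | youtube = pd.read_csv('data/youtube.csv.gz', compression='gzip')
46 | 
47 | # Parse the upload date into a proper datetime column, as done in the notebook.
48 | youtube.upload_date = pd.to_datetime(youtube.upload_date)
49 | 
50 | # Quick sanity check of what was loaded.
51 | print(youtube.shape)
52 | print(youtube.columns.tolist())
53 | ```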
-------------------------------------------------------------------------------- /Final/data/channel_indices.json: -------------------------------------------------------------------------------- 1 | {"League of Legends": 0, "Desi Perkins": 1, "SSundee": 2, "FROST": 3, "DashieGames": 4, "Crafty Panda": 5, "dope2111": 6, "Sanjeev Kapoor Khazana": 7, "BRICO SYMPA": 8, "XpertThief": 9, "Fe4RLess": 10, "GameGrumps": 11, "YourHealth": 12, "Vete a la Versh": 13, "Gabbie Hanna": 14, "alpha m.": 15, "Aphmau": 16, "jacksepticeye": 17, "Miniminter": 18, "JunsKitchen": 19, "FitnessBlender": 20, "How To Cake It": 21, "Vy Qwaint": 22, "SixPackAbs.com": 23, "Denis": 24, "Jazza": 25, "Satisfying Slime Videos": 26, "FGTeeV": 27, "Clash of Clans": 28, "Nintendo": 29, "cutepolish": 30, "MoreAliA": 31, "Carli Bybel": 32, "UnspeakableGaming": 33, "Marshmello": 34, "aLexBY11": 35, "Sernandoe": 36, "TBNRFrags": 37, "Xyz Gyan": 38, "Shay Mitchell": 39, "Aspyn Ovard": 40, "PlayStation": 41, "gameranx": 42, "Prestige Clips": 43, "Mini Ladd": 44, "Smosh Games": 45, "SHORT TIME SECRET": 46, "Huda Beauty": 47, "Rockstar Games": 48, "DeGoBooM": 49, "Creative Channel": 50, "ExplodingTNT": 51, "Tasty": 52, "Bodybuilding.com": 53, "Lachlan": 54, "Bajan Canadian": 55, "videogamedunkey": 56, "jaipurthepinkcity": 57, "Pipe Cleaner Crafts B": 58, "Tom Slime": 59, "Niki and Gabi": 60, "First We Feast": 61, "Jaclyn Hill": 62, "SeaNanners Gaming Channel": 63, "Shruti Arjun Anand": 64, "Mr. Kate": 65, "Clash Royale": 66, "Ingrid Nilsen": 67, "5-Minute Crafts PLAY": 68, "TobyGames": 69, "Azerrz": 70, "Mumbo Jumbo": 71, "Teachingmensfashion": 72, "FaZe Clan": 73, "Sky Does Everything": 74, "theRadBrad": 75, "Ali-A": 76, "TheAtlanticCraft": 77, "The Icing Artist": 78, "Laura in the Kitchen": 79, "Tanya Burr": 80, "Christen Dominique": 81, "jeffreestar": 82, "I AM WILDCAT": 83, "ThreadBanger": 84, "Rosanna Pansino": 85, "Coffi Channel": 86, "Ninja": 87, "MiawAug": 88, "POPSUGAR Fitness": 89, "ItsFunneh": 90, "TheGrefg": 91, "Izabela Stress": 92, "Laura Lee": 93, "Antonio Garza": 94, "YOGSCAST Lewis & Simon": 95, "Karina Garcia": 96, "So Yummy": 97, "Fortnite": 98, "NishaMadhulika": 99, "PopularMMOs": 100, "Talking Tom": 101, "RobTopGames": 102, "BCC Trolling": 103, "Michelle Phan": 104, "Daithi De Nogla": 105, "bysTaXx": 106, "Wengie": 107, "Kabita's Kitchen": 108, "MyLifeAsEva": 109, "LaurDIY": 110, "Frost Diamond": 111, "Markiplier": 112, "speedyw03": 113, "JerryRigEverything": 114, "CaptainSparklez": 115, "GamingWithKev": 116, "Vikkstar123": 117, "Anaysa": 118, "MessYourself": 119, "Lauren Curtis": 120, "LispyJimmy": 121, "LDShadowLady": 122, "Tipsy Bartender": 123, "Howcast": 124, "TsMadaan": 125, "Jess No Limit": 126, "Manny Mua": 127, "Dynamo Gaming": 128, "PatrickStarrr": 129, "KathleenLights": 130, "Indian Health": 131, "Muselk": 132, "5-Minute Crafts GIRLY": 133, "blogilates": 134, "LazarBeam": 135, "ATHLEAN-X\u2122": 136, "TheSyndicateProject": 137, "DanTDM": 138, "JonTronShow": 139, "BRIGHT SIDE": 140, "grav3yardgirl": 141, "Homemade solutions": 142, "iHasCupquake": 143, "InquisitorMaster": 144, "HowToBasic": 145, "Tfue": 146, "Zoella": 147, "Hong Giang DIY Slime": 148, "MeLlamanFredy": 149, "Sadhguru": 150, "JeromeASF": 151, "NikkieTutorials": 152, "Wayne Goss": 153, "GamingWithJen": 154, "H2ODelirious": 155, "Health Tips for You": 156, "Blossom": 157, "Lui Calibre": 158, "Pooja Luthra": 159, "FaZe Apex": 160, "Daequan Loco": 161, "Natalies Outlet": 162, "Roxxsaurus": 163, "TeamYouTube [Help]": 164, "DaveHax": 165, 
"Chris Smoove": 166, "Call of Duty": 167, "5-Minute Crafts": 168, "Kandee Johnson": 169, "Krazyrayray": 170, "The Game Theorists": 171, "Shroud": 172, "Thinknoodles": 173, "Bon App\u00e9tit": 174, "stampylonghead": 175, "MayBaby": 176, "Fitz": 177, "El Guzii": 178, "windy31": 179, "FaZe Rain": 180, "IGN": 181, "Typical Gamer": 182, "Grandpa Kitchen": 183, "PewDiePie": 184, "SaraBeautyCorner - DIY, Comedy, Makeup, Nail Art": 185, "Braille Skateboarding": 186, "Vogue": 187, "LOLiTO FDEZ": 188, "VanossGaming": 189, "Yoga With Adriene": 190, "Ceeday": 191, "KSIOlajidebtHD KSIOlajidebtHD": 192, "Jelly": 193, "CoryxKenshin": 194} -------------------------------------------------------------------------------- /Final/data/youtube.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Final/data/youtube.csv.gz -------------------------------------------------------------------------------- /Final/exam.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ADA final exam (winter semester 2019/2020)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "A friend of yours wants to start a YouTube channel and ideally earn some money via ads. However, there are so many channels and videos out there that your friend has no idea where to even start. Fortunately, they know that you have taken ADA and think you might help them out by analyzing the videos that are currently on YouTube." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "The data you are provided with is a subset of YouTube videos, with videos from some of the giant channels in two categories: \"Gaming\" and \"How-to & Style\", which are the categories your friend is choosing between. The dataset contains a lot of videos, with data on those videos including their titles, their total number of views in 2019, their tags and descriptions, etc. The data is, in gzip-compressed format, contained in the `data/` folder, as the file `youtube.csv.gz`." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "The three tasks A, B and C are **independent** of each other, and you can solve any combination of them. The exam is designed for more than 3 hours, so don't worry if you don't manage to solve everything; you can still score a 6." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "You need to run the following two cells to read and prepare the dataset." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import pandas as pd\n", 45 | "import numpy as np" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "youtube = pd.read_csv('data/youtube.csv.gz', compression='gzip')\n", 55 | "youtube.upload_date = pd.to_datetime(youtube.upload_date)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Dataset description" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Each row of the dataset corresponds to one video that was uploaded to YouTube. 
There are 11 columns:\n", 70 | "'channel', 'upload_date', 'title', 'categories', 'tags', 'duration',\n", 71 | " 'view_count', 'average_rating', 'height', 'width', 'channel_cat'.\n", 72 | "- `channel`: The channel (account) on which the video was uploaded.\n", 73 | "- `upload_date`: The date on which the video was uploaded (Pandas Timestamp object).\n", 74 | "- `title`: The title of the video.\n", 75 | "- `tags`: A list of words that describe the video.\n", 76 | "- `duration`: The duration of the video in seconds.\n", 77 | "- `view_count`: The number of times the video was watched.\n", 78 | "- `average_rating`: The average score with which the viewers rated the video (1-5).\n", 79 | "- `height`: The height of the video in pixels.\n", 80 | "- `width`: The width of the video in pixels.\n", 81 | "- `channel_cat`: The category of the channel on which this video was uploaded. This dataset only contains videos from channels from the 'Gaming' and the 'Howto & Style' category." 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Task A: Welcome to the exam!" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "All of Task A refers to the videos that were published between and including 2010 and 2018." 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## A1: A growing platform?" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "You would first like to know whether YouTube in general is the right platform to invest time into." 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "1. Using the appropriate plot type, plot the number of videos published per year between and including 2010 and 2018." 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "2. Now for each year, plot the number of channels that have been created between the beginning of 2010 and the end of that year. A channel is considered to be created at the time at which it uploads its first video." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "3. Normalize the number of videos published each year by the number of channels that have been created between the beginning of 2010 and the end of that year, and plot these quantities. Do separate plots for gaming channels, how-to channels, and both together. Can you conclude from the plot that both gaming and how-to channels have been becoming less and less active recently? Why, or why not?" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## A2: The one thing we all love: cash money" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Your friend is really keen on making money from their YouTube channel through ads and wants you to help them choose the most profitable channel category (Gaming or Howto & Style). 
The ad profit is directly proportional to the number of views of a video." 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "1. Since your friend wants to keep producing videos for several years to come, it might also be worth looking at the growth of the two categories.\n", 173 | " 1. Compute the total number of views in each category per year for the years 2010-2018.\n", 174 | " 2. Divide the yearly view count by the number of channels that posted a video in each category in each year. Plot these normalized counts.\n", 175 | "\n", 176 | "\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "2. Your friend's channel will be brand new, so you decide to look more closely at newer channels. For this question and all the following questions in A2, only consider channels that uploaded their first video in 2016 or later. Compute the total number of views in each category and divide it by the number of channels in that category.\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "3. The number of views might be distributed very unevenly over the different channels, and channels might upload different numbers of videos.\n", 205 | " 1. Compute the mean number of views per video for each channel.\n", 206 | " 2. Compute the mean of these means for each of the two categories. Print these values.\n", 207 | " 3. Using bootstrapping, compute 95% confidence intervals for these two means. From this analysis, can you draw a recommendation for one of the two categories? Why, or why not?" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "# Task B: View forecasting (Machine Learning)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Your friend wants to figure out how they can optimize their videos for getting the maximum number of views (without using shocking thumbnails and clickbait titles). In this task, you will build a machine learning (ML) model for predicting the success of a video." 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## B1: Get those shovels out again" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "1. For the prediction model, use all rows of the dataset, but keep only the following columns: `view_count, channel, upload_date, duration, average_rating, height, width`." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "2. Extract the upload year and upload month from the `upload_date` column into the two columns `upload_year` and `upload_month`, and remove `upload_date`." 
257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "3. The entry in the channel column for a video indicates on which channel the video was uploaded. Encode this column via one-hot encoding." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "4. Split the data into a train (70%) and a test set (30%) with the appropriate function from sklearn, using 42 as the random seed." 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## B2: Who is the most viewed of them all?" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "1. Train a ridge regression model (i.e., an L2-regularized linear regression model) on the train set that predicts the view count from the other features. Find and use the optimal regularization parameter $\\alpha$ from the set {0.001, 0.01, 0.1} via 3-fold cross validation." 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "2. Report the mean absolute error that the model makes on the test set." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "## B3: Checking our ambitions" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "To improve performance, you want to make the task of the ML model easier and turn it into a classification task. Now it only has to predict whether a video has a high view count (defined as being larger than the median of the view counts in the training set) or a low view count (defined as being smaller or equal to the median of the view counts in the training set)." 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "1. Train a logistic regression model for this classification task. Find and use the optimal regularization parameter C (as defined in scikit-learn's documentation) from the set {1, 10, 100} via 3-fold cross validation. Use the random seed 42. _Hint_: If you get a warning about the training algorithm failing to converge, increase the maximum number of training iterations." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "2. Compute the accuracy of the logistic regression model on the test set." 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "## B4: ...something's not right." 
376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "You are satisfied with the model performance. In fact, you are a bit surprised at how good the model is given the relatively little amount of information about the videos. So you take a closer look at the features and realize that the (one-hot-encoded) channel feature does not make sense for the application that your friend has in mind." 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "1. Why does the channel feature not make sense?" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "2. Train another logistic regression model with all the features from B3 except the one-hot-encoded channel. Use again 42 as the seed for the train test split and perform the same hyperparameter optimization as in B3. How does the model performance change?" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "## B5: \"We kinda forgot about categories.\"" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "On second thought, there is actually one feature that you may use about the channel. Namely, the channel category. The reason this one makes sense might also help you answer B4.1." 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "1. Train and evaluate another logistic regression model (in the same way as in B4 regarding train/test split and hyperparameter) that additionally includes the one-hot-encoded channel category." 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "2. The dynamics of the two categories might differ a lot, and the two communities might value different properties of a video differently. For instance, for one community, a long duration might be more important, for the other one, a large picture width. Thus, having only a single weight for, e.g., the duration of a video, might not give the best results. Is there something smarter that you can do than simply including the category as a single one-hot-encoded feature to improve the classification performance? Implement your idea and compare the accuracy on the test set with that of the first model (from task B5.1)." 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "# Task C: A map of the channels (Graphs)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "Your friend wants to map out the channels and represent their similarities. For this purpose, we have created two undirected and unweighted graphs for you, where in each graph, each channel has a node and similar channels have edges connecting them. 
In one graph, the similarity between two channels is based on how similar their video descriptions are, while in the other, the similarity is based on how similar their video tags are. We will call the former $G_{text}$ and the latter $G_{tags}$. You will be analyzing the two graphs loaded by running the cell below." 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "from networkx import from_numpy_array\n", 476 | "import json\n", 477 | "g_text_adj = np.loadtxt(open('data/g_text_adj.csv', 'r'), delimiter=',', skiprows=0)\n", 478 | "g_tags_adj = np.loadtxt(open('data/g_tags_adj.csv', 'r'), delimiter=',', skiprows=0)\n", 479 | "channel_to_index = json.load(open('data/channel_indices.json', 'r'))\n", 480 | "g_text = from_numpy_array(g_text_adj)\n", 481 | "g_tags = from_numpy_array(g_tags_adj)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "## C1: Does YouTube have a content diversity problem?" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "1. For each graph, calculate its diameter (i.e., the largest shortest-path length, where the maximization is done over all node pairs). What difference do you see? _Hint_: Don't worry if you get an error, just read the error message carefully." 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "2. What does the diameter of $G_{text}$ say about the diversity of the channels’ contents? How about the diameter of $G_{tags}$?" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "3. Based on what you have calculated, which one has greater diversity: descriptions used by channels, or tags used by channels? Justify your answer." 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "4. Imagine that you want to **compare** content diversity between two sets of channels (i.e., you want to see which set of channels has more diverse content), and you have calculated a tag-based graph for each set. Do you think the diameter is a good measure for doing the comparison? Justify your answer." 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "5. Back to our own two graphs. Based on $G_{text}$, for each category of channels, which channel is the one most representative of the contents of all channels in that category? In other words, for each category, if you needed to provide a summary of all channels in the category via one channel, which channel would you choose? Show us (us being the exam designers and your friend) the descriptions of this channel’s two most-viewed videos. What metric did you use for this purpose? Explain your choice." 
552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "## C2: Going back to categories again" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "1. We want to use the two graphs to cluster channels from the same category together, and we want to compare their effectiveness at doing so. Use Kernighan-Lin bisection in the networkx package (`networkx.algorithms.community.kernighan_lin_bisection`) to divide each graph into two communities. Use 42 as the random seed. For each graph, show how many members of each category fall into each of the two communities." 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "2. If one of these graphs were ideal for this clustering task, what would the resulting communities look like? If it were the absolute worst possible graph for the task, what would the resulting communities look like?" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "3. Calculate the probability $P(community|category)$ for each community and category within each graph. Design a metric, using the four $P(community|category)$ values in a graph, whose value would be 1 for the ideal graph and 0 for the worst graph. Calculate this metric for both graphs and compare the two. What do the results say about how representative tags and descriptions are regarding the channel categories? Are tags better suited, or descriptions?" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": {}, 613 | "source": [ 614 | "4. The Kernighan-Lin bisection you used above performs a min-edge cut: It attempts to partition the nodes of the graph into two sets of almost-equal size by deleting as few edges as possible. It starts off by creating a random partition of the nodes of the graph into two sets A and B that are almost equal in size, and then iteratively and in a greedy fashion moves nodes between A and B to reduce the number of edges between A and B. Show at least one toy example of a graph where the initialization could also be the final result. 
(Hint: Think of how, as we explained, the bisection algorithm relies on a minimum edge cut with a random initialization; under what circumstances could the original A and B be the best partition given that graph?)" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": {}, 621 | "outputs": [], 622 | "source": [] 623 | } 624 | ], 625 | "metadata": { 626 | "kernelspec": { 627 | "display_name": "Python 3", 628 | "language": "python", 629 | "name": "python3" 630 | }, 631 | "language_info": { 632 | "codemirror_mode": { 633 | "name": "ipython", 634 | "version": 3 635 | }, 636 | "file_extension": ".py", 637 | "mimetype": "text/x-python", 638 | "name": "python", 639 | "nbconvert_exporter": "python", 640 | "pygments_lexer": "ipython3", 641 | "version": "3.7.3" 642 | } 643 | }, 644 | "nbformat": 4, 645 | "nbformat_minor": 4 646 | } 647 | -------------------------------------------------------------------------------- /Homework/00 - Optional Homework/README.md: -------------------------------------------------------------------------------- 1 | # 00 - Optional homework 2 | 3 | For this initial homework we will be working with a [dataset](https://github.com/fivethirtyeight/guns-data/blob/master/interactive_data.csv) available as a part of an interesting analysis of [gunshot deaths in the US](http://fivethirtyeight.com/features/gun-deaths/). The goal of this optional homework is to carefully go through the interactive visualization portrayed at the top of the aforementioned article, and use an IPython Notebook to reproduce the following claims made in the visualization: 4 | - Nearly *two-thirds* of gun deaths are *suicides*. 5 | - More than *85 percent* of suicide victims are *male*. 6 | - Around *a third* of all gun deaths are *homicides*. 7 | - Around *two-thirds* of homicide victims who are *males* in the *age-group of 15--34* are *black*. 8 | - *Women* constitue only *15 percent* of the total *homicide* victims. 9 | 10 | It's not necessary to generate visualizations for the results -- numbers should be more than enough to convince yourself that you 11 | were able to reproduce the results of that article. 12 | 13 | You can use this opportunity first of all to refresh your Python skills. If you are coming from another programming language 14 | (especially a static PL like Java and C++), we recommend you to take a look at this presentation: 15 | [Code Like a Pythonista: Idiomatic Python](http://www.omahapython.org/IdiomaticPython.html) -- it will teach 16 | you how to write nice Python code, while at the same time getting you up to speed with the syntax. 17 | Feel free to explore more advanced libraries (like [Pandas](http://pandas.pydata.org/)) if you really want, but keep in mind that you 18 | should be able to reproduce the results with the Python Standard Library. 19 | One advantage of using only the PSL is that once you will get knowledgeable about Pandas you will appreciate how much more concise 20 | and readable your code will become :) 21 | 22 | Credits to [Michele Catasta](https://github.com/pirroh), on whose material this version is based. 23 | -------------------------------------------------------------------------------- /Homework/01 - Pandas and Data Wrangling/README.md: -------------------------------------------------------------------------------- 1 | # 01 - Pandas and Data Wrangling 2 | 3 | In this homework, you will familiarize yourself with *Pandas*, the most popular Python library for handling tabular data. 
In addition, you will create some basic visualizations and learn to scrape data from the Web. These are among a data scientist's bread and butter -- perfect Homework 1 material! 4 | 5 | The homework consists of three tasks which are described in the `hw1.ipynb` notebook. 6 | 7 | For each task, please provide *both* a written explanation of the steps you followed, and the corresponding code. 8 | Keep in mind that writing the explanation can help you in two ways: 9 | 1. Clarifying the steps in your mind before writing the actual code 10 | 2. Earning you points if the description is correct, regardless of the potential issues in your code 11 | 12 | ### Submission Guidelines 13 | You are expected to solve the homework as a team of four, which you specified in the course registration form. By the homework submission deadline, each team should have a single shared private github repo, containing the Jupyter Notebook with the solution. Please, follow the instructions below to create your team repo and start working on the homework: 14 | 1. **One** team member should follow [this link](https://classroom.github.com/g/OGUlNgFN) and create a team with exactly the **same name** as specified in the course registration form. 15 | 2. Creation of the team will automatically create a dedicated private repo. At this point the remaining three team-members should follow the [same link](https://classroom.github.com/g/OGUlNgFN) and join their team. *Make sure you are joining the correct team by checking your team-members' github accounts: there might be teams with similar or same names.* 16 | 3. There is no simple automated way to transfer the materials for Homework 1 from the public course repository into your private team repository. To get started, we suggest that you manually pull the homework materials from the course repository to your local machine, copy them into your local team repository, and push to the remote. 17 | 4. Afterwards -- keep collaborating on the homework as a team in your shared private repository! 18 | 19 | ### Deliverables 20 | 1. `hw1.ipynb` notebook with disclosed output for each cell. **Do not submit your `data` folder.** 21 | 2. Detailed feedback on the V2 of HW1 via [this](https://forms.gle/q2GVN9TtHqKeBvjCAm) Google form. 22 | 23 | #### Deadline: October 16th, 23:59 24 | -------------------------------------------------------------------------------- /Homework/01 - Pandas and Data Wrangling/data/aminer_ai.csv: -------------------------------------------------------------------------------- 1 | Rank;Conference (Full Name);Short Name;H5-Index 2 | 1;IEEE Conference on Computer Vision and Pattern Recognition;CVPR;112 3 | 2;IEEE Transactions on Pattern Analysis and Machine Intelligence;TPAMI;101 4 | 3;Expert Systems with Applications;Expert Syst. 
Appl.;59 5 | 4;International Journal of Computer Vision;IJCV;58 6 | 5;IEEE International Conference on Robotics and Automation;ICRA;58 7 | 6;International Conference on Computer Vision;ICCV;58 8 | 7;International Conference on Machine Learning;ICML;56 9 | 8;Annual Conference on Neural Information Processing Systems;NIPS;51 10 | 9;Journal of Machine Learning Research;JMLR;49 11 | 10;Annual Meeting of the Association for Computational Linguistics;ACL;48 12 | 11;IEEE Transactions on Audio, Speech, and Language Processing;TASLP;46 13 | 12;Pattern Recognition;;45 14 | 13;Conference on Empirical Methods in Natural Language Processing;EMNLP;45 15 | 14;AAAI Conference on Artificial Intelligence;AAAI;44 16 | 15;IEEE Transactions on Fuzzy Systems;TFS;44 17 | 16;Decision Support Systems;DSS;43 18 | 17;Neurocomputing;;39 19 | 18;IEEE Transactions on Evolutionary Computation;TEVC;38 20 | 19;Pattern Recognition Letters;PRL;37 21 | 20;Autonomous Agents and Multi-Agent Systems;AAMAS;36 22 | 21;International Joint Conference on Artificial Intelligence;IJCAI;35 23 | 22;Image and Vision Computing;;35 24 | 23;Computer Vision and Image Understanding;CVIU;34 25 | 24;Machine Learning;;33 26 | 25;Neural Networks;;32 27 | 26;Fuzzy Sets and Systems;;31 28 | 27;International Conference on Pattern Recognition;ICPR;31 29 | 28;IEEE Transactions on Neural Networks and Learning Systems;TNNLS;31 30 | 29;International Journal of Approximate Reasoning;IJAR;31 31 | 30;Neural Computation;NC;30 32 | 31;British Machine Vision Conference;BMVC;30 33 | 32;Computer Speech & Language;;27 34 | 33;Evolutionary Computation;;27 35 | 34;Genetic and Evolutionary Computation Conference;GECCO;27 36 | 35;Computational Linguistics;CL;26 37 | 36;International Conference on Computational Linguistics;COLING;26 38 | 37;Transactions on Affective Computing;TAC;26 39 | 38;Journal of Artificial Intelligence Research;JAIR;26 40 | 39;International Conference on Principles of Knowledge Representation and Reasoning;KR;24 41 | 40;Data & Knowledge Engineering;Data Knowl. Eng.;24 42 | 41;Autonomous Agents and Multi-Agent Systems;AAMAS;23 43 | 42;Evolutionary Computation;;22 44 | 43;Neural Computing & Applications;NCA;22 45 | 44;International Conference on Document Analysis and Recognition;ICDAR;22 46 | 45;International Joint Conference on Neural Networks;IJCNN;22 47 | 46;Artificial Intelligence in Medicine;AIM;21 48 | 47;International Journal of Neural Systems;Int. J. Neural Syst.;21 49 | 48;International Conference on Automated Planning and Scheduling;ICAPS;20 50 | 49;European Conference on Artificial Intelligence;ECAI;20 51 | 50;Annual Conference on Computational Learning Theory;COLT;20 52 | 51;International Journal of Intelligent Systems;IJIS;20 53 | 52;Journal of Automated Reasoning;JAR;20 54 | 53;International Journal on Document Analysis and Recognition;IJDAR;18 55 | 54;Applied Intelligence;Appl. 
Intell.;18 56 | 55;Computational Intelligence;CI;17 57 | 56;Natural Computing;;17 58 | 57;ACM Transactions on Applied Perception;TAP;16 59 | 58;European Conference on Computer Vision;ECCV;16 60 | 59;Natural Language Engineering;NLE;15 61 | 60;International Conference on Uncertainty in Artificial Intelligence;UAI;15 62 | 61;Neural Processing Letters;NPL;14 63 | 62;Artificial Life;;13 64 | 63;International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems;;13 65 | 64;Machine Translation;MT;13 66 | 65;International Joint Conference on Biometrics;ICB;13 67 | 66;International Conference on Algorithmic Learning Theory;ALT;13 68 | 67;Pattern Analysis and Applications;PAA;13 69 | 68;Artificial Life;;12 70 | 69;Expert Systems;;11 71 | 70;IEEE International Conference on Tools with Artificial Intelligence;ICTAI;11 72 | 71;The Annual Conference of the North American Chapter of the Association for Computational Linguistics;NAACL;10 73 | 72;Neural Computing & Applications;NCA;10 74 | 73;Asian Conference on Computer Vision;ACCV;10 75 | 74;Decision Support Systems;DSS;10 76 | 75;International Conference on Case-Based Reasoning;ICCBR;10 77 | 76;International Journal of Pattern Recognition and Artificial Intelligence;IJPRAI;10 78 | 77;Web Intelligence and Agent Systems;WIAS;9 79 | 78;Intelligent Data Analysis;IDA;9 80 | 79;International Conference on Inductive Logic Programming;ILP;9 81 | 80;Journal of Experimental and Theoretical Artificial Intelligence;JETAI;9 82 | 81;ACM Transactions on Speech and Language Processing;TSLP;9 83 | 82;Pacific Rim International Conference on Artificial Intelligence;PRICAI;8 84 | 83;Machine Vision and Applications;;7 85 | 84;Connection Science;;7 86 | 85;Machine Learning;;6 87 | 86;Soft Computing;;6 88 | 87;International conference on Knowledge Science, Engineering and Management;KSEM;6 89 | 88;IEEE Transactions on Cybernetics;;6 90 | 89;International Journal of Computational Intelligence and Applications;IJCIA;5 91 | 90;International Conference on Artificial Neural Networks;ICANN;4 92 | 91;International Conference on Neural Information Processing;ICONIP;4 93 | 92;Natural Computing;;3 94 | 93;ACM Transactions on Asian Language Information Processing;TALIP;3 95 | 94;Autonomous Agents and Multi-Agent Systems;AAMAS;1 96 | 95;International Conference on Automatic Face and Gesture Recognition;FGR;1 97 | 96;Knowledge-Based Systems;KBS;0 98 | 97;Artificial Intelligence;AI;0 99 | -------------------------------------------------------------------------------- /Homework/01 - Pandas and Data Wrangling/data/list_of_ai_conferences.txt: -------------------------------------------------------------------------------- 1 | cvpr 2 | icra 3 | iccv 4 | icml 5 | nips 6 | acl 7 | emnlp 8 | aaai 9 | aamas 10 | ijcai 11 | naacl 12 | accv 13 | iccbr 14 | ida 15 | ilp 16 | pricai 17 | ksem 18 | icann 19 | iconip 20 | fgr -------------------------------------------------------------------------------- /Homework/01 - Pandas and Data Wrangling/data/list_of_selected_authors.txt: -------------------------------------------------------------------------------- 1 | Parag Havaldar 2 | Steffen Abraham 3 | Yukihiro Nakamura 4 | Iljung S. 
Kwak 5 | Charles Freundlich 6 | Mengjie Zhang 7 | Naokazu Yokoya 8 | Sebastian Otte 9 | Jianwu Dang 10 | Hy Murveit -------------------------------------------------------------------------------- /Homework/01 - Pandas and Data Wrangling/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | a 55 | an 56 | the 57 | and 58 | but 59 | if 60 | or 61 | because 62 | as 63 | until 64 | while 65 | of 66 | at 67 | by 68 | for 69 | with 70 | about 71 | against 72 | between 73 | into 74 | through 75 | during 76 | before 77 | after 78 | above 79 | below 80 | to 81 | from 82 | up 83 | down 84 | in 85 | out 86 | on 87 | off 88 | over 89 | under 90 | again 91 | further 92 | then 93 | once 94 | here 95 | there 96 | when 97 | where 98 | why 99 | how 100 | all 101 | any 102 | both 103 | each 104 | few 105 | more 106 | most 107 | other 108 | some 109 | such 110 | no 111 | nor 112 | not 113 | only 114 | own 115 | same 116 | so 117 | than 118 | too 119 | very 120 | s 121 | t 122 | can 123 | will 124 | just 125 | don 126 | should 127 | now 128 | -------------------------------------------------------------------------------- /Homework/01 - Pandas and Data Wrangling/hw1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "l3ZEgl7t28Ee" 8 | }, 9 | "source": [ 10 | "# Homework 1\n", 11 | "## Introduction\n", 12 | "### Important dates\n", 13 | "* Homework release: Thursday, 3rd October 2019\n", 14 | "* **Homework due**: Wednesday, 16th October 2019 (23:59 hours, 11:59 PM)\n", 15 | "* Peer review due: Wednesday, 23rd October 2019 (23:59 hours, 11:59 PM)\n", 16 | "* Grading release: Wednesday, 30th October 2019\n", 17 | "\n", 18 | "### Description\n", 19 | "\n", 20 | "The data you'll be working with comes from multiple sources. The main data source will be [DBLP](https://dblp.uni-trier.de/), a database of publications from major computer science journals and conferences. A subset of DBLP, which you will use in this assignment, is provided to you via a [google drive folder](https://drive.google.com/file/d/1Kci8joML74tCSzuBbhxtd1ylR4f0dlm6/view). Later on, you will enrich the DBLP data with a dataset on conference rankings and with the proceedings of the [NIPS conference](https://nips.cc/) [1] ('proceedings' is another word for the set of papers published at an academic conference). After loading and cleaning the data, you will answer various questions about its contents.\n", 21 | "\n", 22 | "**Some rules:**\n", 23 | "- You are allowed to use any built-in Python library that comes with Anaconda. If you want to use an external library, you have to justify your choice.\n", 24 | "- Make sure you use the data folder provided in the repository in *read-only* mode.\n", 25 | "- Be sure to provide explanations for your answers. 
A notebook that only has code cells will not suffice.\n", 26 | "- Also, be sure to *hand in a fully-run and evaluated notebook*. We will not run your notebook for you, we will grade it as is, which means that only the results contained in your evaluated code cells will be considered, and we will not see the results in unevaluated code cells. In order to check whether everything looks as intended, you can check the rendered notebook on the GitHub website once you have pushed your solution there.\n", 27 | "\n", 28 | "[1] Note that NIPS was renamed to NeurIPS in 2018, but for simplicity, whenever we say 'NIPS', we really mean 'NIPS and NeurIPS'." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "colab_type": "text", 35 | "id": "A0IIdWOs48BB" 36 | }, 37 | "source": [ 38 | "## Task A. Getting a sense of the dataset\n", 39 | "\n", 40 | "### A1. Loading the data\n", 41 | "Download the DBLP dataset (available on [google drive](https://drive.google.com/file/d/1Kci8joML74tCSzuBbhxtd1ylR4f0dlm6/view)) and load it into a Pandas dataframe. A row of your dataframe should look as follows:\n", 42 | "\n", 43 | "| paper id | author names | publication year | paper title | \n", 44 | "| :----:|:-------------:| :-----:|:-----:|\n", 45 | "| conf/nips/doe1036 | [John Doe, Jane Doe] | 2003 | Some Catchy Title: An Expanded and Boring Title | \n", 46 | "\n", 47 | "\n", 48 | "1. Filter the papers: keep only conference papers. For each of the remaining ones, find the acronym of the conference where it was published. Retain only those papers that have been published in the conferences listed in `data/list_of_ai_conferences.txt`. Additionally, add a column named 'conference' to your dataframe. \n", 49 | "_Hint: The `paper id` tells you whether a paper was published at a conference, and if so, at which one._\n", 50 | "\n", 51 | "2. Report the overall number of papers in the filtered dataset, as well as the number of papers per conference." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "colab": {}, 59 | "colab_type": "code", 60 | "id": "ara6mLe1l_CQ" 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "%matplotlib inline\n", 65 | "import pandas as pd\n", 66 | "import numpy as np\n", 67 | "import re\n", 68 | "from matplotlib.ticker import MaxNLocator\n", 69 | "import matplotlib.pyplot as plt\n", 70 | "from requests import get\n", 71 | "from bs4 import BeautifulSoup" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "''' Add your code here'''" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "colab_type": "text", 87 | "id": "mnE6N9zXl-yz" 88 | }, 89 | "source": [ 90 | "### A2. An author-centric look\n", 91 | "The dataframe you created above was a paper-centric one. Now, we want you to create a new dataframe centered around authors. Do this by expanding the author names in the lists in the 2nd column into separate rows. That is, if a paper has 3 authors, turn that row into 3 rows, each of which only contains one of the author names (along with the rest of the paper information, i.e., title, conference and year). Keep both dataframes, we are going to need both of them. 
\n", 92 | "**Report the number of unique authors.**" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "''' Add your code here'''" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "colab_type": "text", 108 | "id": "8ZiuGiDsl60f" 109 | }, 110 | "source": [ 111 | "### A3. Is your data analysis-friendly?\n", 112 | "\n", 113 | "Using an appropriate view of your data (paper-centric or author-centric), solve the following tasks:\n", 114 | "\n", 115 | "1. Plot the number of papers per author and analyze it. Do you observe any outliers? Can you identify the source of the problem? Please elaborate! \n", 116 | "_Hint: To find out where the problem comes from, try performing an analysis at the conference or year level._ \n", 117 | "Make sure you remove this inconsistency from your dataframe before moving to the next step, and also create a new plot of the number of papers per author after fixing the problem. \n", 118 | "\n", 119 | "2. Plot the number of papers per year. Do you observe any inconsistency in the output? Real-world data is usually messy, with typos, erroneous entries, and sometimes issues that make even loading the data problematic. Fix any errors that you encounter along the way, find ways to clean the attribute `year`, and redo the plot of the number of papers per year. \n", 120 | "\n", 121 | "3. Machine learning (ML) has been one of the hottest topics within the broader area of AI recently, so let’s see if this is reflected in the number of ML publications. In particular, let’s focus on the two major ML conferences, NIPS and ICML: make a new dataframe with only NIPS and ICML papers (let’s call these the “ML papers”), plot the number of ML papers over time, and analyze and discuss the plot. Do you observe anything odd in this plot? What causes these problems? \n", 122 | "_Hint: Try to perform an analysis at the conference or year level._ \n", 123 | "\n", 124 | "4. By now, you may have noticed that some conferences are not fully represented in the DBLP dataset. Complete the paper-centric dataframe by scraping the full NIPS data from the online proceedings at https://papers.nips.cc/ (maintain the same schema used in your previous dataframes, but fill in missing values). After this step, remove any remaining papers that have missing values. Redo the plots of steps A3.2 and A3.3 after fixing the issue. \n", 125 | "\n", 126 | "_Note: In order to avoid re-running the cleaning part of the notebook every time, you could save the results at this point as a pickle file! 
Also, propagating your cleaning to both dataframes might prove useful later on._" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "***A3.1***" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "colab": {}, 141 | "colab_type": "code", 142 | "id": "co_yXyk2TzHo", 143 | "scrolled": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "''' Add your code here '''" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "***A3.2***" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "scrolled": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "''' Add your code here '''" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "***A3.3***" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "mlconf = ['icml', 'nips']\n", 182 | "''' Add your code here '''" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "***A3.4***" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "''' Add your code here '''" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "colab_type": "text", 205 | "id": "0h0Zw8cPTyEp" 206 | }, 207 | "source": [ 208 | "### A4. Author activity\n", 209 | "\n", 210 | "For each author, calculate their normalized paper count, defined as the total number of papers divided by the author’s period of activity. An author’s period of activity is defined as the number of years between the earliest and latest papers of this author. Plot the distribution of the normalized paper count. What is the appropriate scale for the axes? Does the distribution (roughly) follow a particular law, and if yes, which one?" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "colab": {}, 218 | "colab_type": "code", 219 | "id": "dGV5GAndl6LE" 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "''' Add your code here '''" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "colab_type": "text", 230 | "id": "ZM7pH8Y_UF3H" 231 | }, 232 | "source": [ 233 | "## Task B. Ranking authors\n", 234 | "\n", 235 | "As you may know, there exist rankings for universities, which represent their relative quality and are used to compare the performance of different universities. In the same vein, there are rankings for conferences and journals, which represent the importance and impact of each conference or journal, and therefore allow for approximate comparisons. In this part, you will rank authors based on different aspects of their research output.\n", 236 | "\n", 237 | "### B1. A Naïve Score\n", 238 | "\n", 239 | "In the absence of citation counts, it is hard to objectively rank the authors based on the impact of their contributions to the field of AI research. A naïve way would be to rank them based on their number of published papers. Obtain such a ranking and analyze your result. Identify and explain some obvious limitations of this scheme." 
240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "''' Add your code here '''" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### B2. H5-index\n", 256 | "\n", 257 | "Another way to score and rank authors could be based on the quality of the conferences and journals where they publish their papers. For this task, you have to use the H5-index score from AMiner (https://aminer.org/ranks/conf) (another database of scholarly publications), which captures the quality of academic conferences: the higher the H5-index, the better the conference.\n", 258 | "1. Load the AMiner dataset ( *'aminer_ai.csv'* available in the folder ``data/``), which contains H5-index values for AI conferences. Load it into a new Pandas dataframe, and join it with the author-centric DBLP dataframe.\n", 259 | "2. Calculate a *'new'* author ranking (give each author a score, by which the authors are then sorted in order to obtain the ranking), where each author's score is the sum of the H5-indices of all their papers (the H5-index of a paper being the H5-index of the conference it is published in).\n", 260 | "3. Analyze your new, H5-index-based author ranking and explain how and why your results are different from the previous ranking. Do you see any differences in the top-20 authors based on the H5-index-based ranking and the one produced using publication counts? If yes, list the authors that are ranked in the top 20 based on publication counts but absent in the top 20 based on the H5-index-based ranking. Identify the ranks of these authors in the ranking produced by the H5-index-based ranking scheme.\n", 261 | "4. Now, take the authors in the file `data/list_of_selected_authors.txt`, and compute their rankings using the two (naïve and H5-index-based) ranking schemes. What do you observe? Explain the potential dangers of the naïve, paper-count-based score.\n", 262 | "5. On the flip side, do you see any potential dangers of using the H5-index-based score? \n", 263 | "_Hint: Analyze the conferences in which the top ranked authors publish. 
Investigate the effect of the conferences in which these authors publish more frequently on the obtained ranking._" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "***B2.1***" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "''' Add your code here '''" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "***B2.2***" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "''' Add your code here '''" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "***B2.3***" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "''' Add your code here '''" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "***B2.4***" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "''' Add your code here '''" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "***B2.5***" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "''' Add your code here '''" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "### B3. And Justice For All\n", 351 | "\n", 352 | "An ideal ranking scheme should not give undue advantage to authors who have been conducting research for a longer period of time and therefore have naturally published more papers when compared to a junior researcher. Does the ranking scheme designed by you in ``Step 2`` take this factor into account? If not, introduce variations in your ranking scheme to mitigate this effect. Do you observe anything odd with this new ranking? Clearly explain your observations.\n", 353 | "\n", 354 | "_Hint: What you did in part A4 may be useful here._" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "''' Add your code here '''" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "## Task C. Trending topics\n", 371 | "\n", 372 | "Historically, the field of AI has witnessed research in two broad flavors: “symbolic” (logic, planning, control, etc.) vs. “connectionist” (neural networks, deep learning, Bayesian methods, etc.). Let’s see if we can see how the popularity of these two approaches to AI is reflected in the DBLP data.\n", 373 | "\n", 374 | "To this end, construct two dataframes: ``symbolic`` and ``connectionist``. ``symbolic`` is your cleaned paper-centric dataframe from part A (after fixing everything per the task description) filtered down to those papers whose titles contain at least one of the following words (not differentiating between upper and lower case letters): “logic”, “planning”, “control”; ``connectionist`` is a dataframe constructed in a similar manner, but with the words “deep”, “learning”, “feature”, “bayesian”. Plot the number of papers per year for ``symbolic`` and ``connectionist`` separately (i.e., 2 plots).\n", 375 | "1. 
Describe the trends you observe. Based on these plots alone, what might one conclude about the popularity of the two approaches to AI?\n", 376 | "2. Moving beyond these plots, what do you, as a careful data scientist, conclude about the popularity of symbolic vs. connectionist AI? Corroborate your reasoning with further plots.\n", 377 | "\n", 378 | "_Note: You could use the text handling utilities below to clean the text in the paper titles._" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "# Text handling utilities\n", 388 | "from string import punctuation\n", 389 | "stopwords_list = open('data/stopwords.txt', 'r').readlines()\n", 390 | "stopwords_list = [x.strip() for x in stopwords_list]\n", 391 | "def stopword_remover(text):\n", 392 | " text_list = text.split()\n", 393 | " text_list = [x for x in text_list if x not in stopwords_list]\n", 394 | " return ' '.join(text_list)\n", 395 | "def lowercase_all(text):\n", 396 | " return text.lower()\n", 397 | "def remove_punct(text):\n", 398 | " return ''.join([ch for ch in text if ch not in punctuation])" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "''' Add your code here '''" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "words_symbolic = ['logic', 'planning', 'control']\n", 417 | "''' Add your code here '''" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "words_connectionist = ['deep', 'learning', 'feature', 'bayesian']\n", 427 | "''' Add your code here '''" 428 | ] 429 | } 430 | ], 431 | "metadata": { 432 | "colab": { 433 | "collapsed_sections": [], 434 | "name": "Homework 1.ipynb", 435 | "provenance": [] 436 | }, 437 | "kernelspec": { 438 | "display_name": "Python 3", 439 | "language": "python", 440 | "name": "python3" 441 | }, 442 | "language_info": { 443 | "codemirror_mode": { 444 | "name": "ipython", 445 | "version": 3 446 | }, 447 | "file_extension": ".py", 448 | "mimetype": "text/x-python", 449 | "name": "python", 450 | "nbconvert_exporter": "python", 451 | "pygments_lexer": "ipython3", 452 | "version": "3.7.1" 453 | }, 454 | "toc": { 455 | "base_numbering": 1, 456 | "nav_menu": {}, 457 | "number_sections": false, 458 | "sideBar": true, 459 | "skip_h1_title": true, 460 | "title_cell": "Table of Contents", 461 | "title_sidebar": "Contents", 462 | "toc_cell": false, 463 | "toc_position": {}, 464 | "toc_section_display": true, 465 | "toc_window_display": false 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 1 470 | } 471 | -------------------------------------------------------------------------------- /Homework/02 - Applied ML and Scaling Up/README.md: -------------------------------------------------------------------------------- 1 | # 02 - Applied ML and Scaling Up 2 | 3 | In this homework, you will work with two popular frameworks: (1) *[sklearn](https://scikit-learn.org/stable/)* (short for scikit-learn) and (2) *[PySpark](https://spark.apache.org/docs/2.2.0/api/python/pyspark.html)* (the Python API for Apache Spark). In addition, you will create some basic visualizations to explain your findings. 
Applied ML and Scaling up constitute the two quintessential skills for a data scientist, thereby serving as the perfect material to prepare you for the real world. 4 | 5 | The homework consists of two tasks which are described in the `hw2.ipynb` notebook. 6 | 7 | For each task, please provide *both* a written explanation of the steps you followed, and the corresponding code. 8 | Keep in mind that writing the explanation can help you in two ways: 9 | 1. Clarifying the steps in your mind before writing the actual code 10 | 2. Earning you points if the description is correct, regardless of the potential issues in your code 11 | 12 | ### Submission Guidelines 13 | You are expected to solve the homework as a team of four, which you specified in the project registration form. By the homework submission deadline, each team should have a single shared private github repo, containing the Jupyter Notebook with the solution. Please follow the instructions below to create your team repo and start working on the homework: 14 | 1. **One** team member should follow [this legendary link](https://classroom.github.com/g/SxsU2280) and create a team by adding a prefix **final_** to the **exact team name** specified in the project registration form. 15 | 2. Creation of the team will automatically create a dedicated private repo. At this point the remaining three team-members should follow the [same link](https://classroom.github.com/g/SxsU2280) and join their team. *Make sure you are joining the correct team by checking your team-members' github accounts: there might be teams with similar names.* 16 | 3. There is no simple automated way to transfer the materials for Homework 2 from the public course repository into your private team repository. To get started, we suggest that you manually pull the homework materials from the course repository to your local machine, copy them into your local team repository, and push to the remote. 17 | 4. Afterwards -- keep collaborating on the homework as a team in your shared private repository! 18 | 19 | ### Deliverables 20 | `hw2.ipynb` notebook with disclosed output for each cell. **Do not submit your `data` folder.** 21 | 22 | #### Deadline: November 20th, 2019 (23:59 hours / 11:59 PM) 23 | -------------------------------------------------------------------------------- /Homework/02 - Applied ML and Scaling Up/hw2_utils.py: -------------------------------------------------------------------------------- 1 | from IPython.display import display_html 2 | import matplotlib 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | # https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side 7 | def display_side_by_side(*args): 8 | html_str='' 9 | for df in args: 10 | html_str+=df.to_html() 11 | display_html(html_str.replace('table','table style="display:inline"'),raw=True) 12 | 13 | # https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html 14 | def heatmap(data, row_labels, col_labels, ax=None, **kwargs): 15 | 16 | if not ax: 17 | ax = plt.gca() 18 | 19 | # Plot the heatmap 20 | im = ax.imshow(data, **kwargs) 21 | 22 | # We want to show all ticks... 23 | ax.set_xticks(np.arange(data.shape[1])) 24 | ax.set_yticks(np.arange(data.shape[0])) 25 | # ... and label them with the respective list entries. 26 | ax.set_xticklabels(col_labels) 27 | ax.set_yticklabels(row_labels) 28 | 29 | # Let the horizontal axes labeling appear on top. 
30 | ax.tick_params(top=True, bottom=False, 31 | labeltop=True, labelbottom=False) 32 | 33 | # Rotate the tick labels and set their alignment. 34 | plt.setp(ax.get_xticklabels(), rotation=-30, ha="right", 35 | rotation_mode="anchor") 36 | 37 | # Turn spines off and create white grid. 38 | for edge, spine in ax.spines.items(): 39 | spine.set_visible(False) 40 | 41 | ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True) 42 | ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True) 43 | ax.grid(which="minor", color="w", linestyle='-', linewidth=3) 44 | ax.tick_params(which="minor", bottom=False, left=False) 45 | 46 | return im 47 | 48 | def annotate_heatmap(im, data=None, valfmt="{x:.2f}", 49 | textcolors=["black", "white"], 50 | threshold=None, **textkw): 51 | if not isinstance(data, (list, np.ndarray)): 52 | data = im.get_array() 53 | 54 | # Normalize the threshold to the images color range. 55 | if threshold is not None: 56 | threshold = im.norm(threshold) 57 | else: 58 | threshold = im.norm(data.max())/2. 59 | 60 | # Set default alignment to center, but allow it to be 61 | # overwritten by textkw. 62 | kw = dict(horizontalalignment="center", 63 | verticalalignment="center") 64 | kw.update(textkw) 65 | 66 | # Get the formatter in case a string is supplied 67 | if isinstance(valfmt, str): 68 | valfmt = matplotlib.ticker.StrMethodFormatter(valfmt) 69 | 70 | # Loop over the data and create a `Text` for each "pixel". 71 | # Change the text's color depending on the data. 72 | texts = [] 73 | for i in range(data.shape[0]): 74 | for j in range(data.shape[1]): 75 | kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)]) 76 | text = im.axes.text(j, i, valfmt(data[i, j], None), **kw) 77 | texts.append(text) 78 | 79 | return texts 80 | 81 | 82 | def plot_heatmap(df): 83 | fig, ax = plt.subplots(figsize=(10,10)) 84 | 85 | X_names = df.columns 86 | Y_names = df.index 87 | 88 | values = df.to_numpy() 89 | 90 | im = heatmap(values, X_names, Y_names, ax=ax, 91 | cmap="GnBu") 92 | texts = annotate_heatmap(im, valfmt="{x:.3f}") 93 | 94 | fig.tight_layout() 95 | plt.show() -------------------------------------------------------------------------------- /Homework/README.md: -------------------------------------------------------------------------------- 1 | # ADA2019-Homework 2 | Repo for homework assignments 3 | -------------------------------------------------------------------------------- /Tutorials/00 - Intro to Tools/Data/RIRs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/00 - Intro to Tools/Data/RIRs.wav -------------------------------------------------------------------------------- /Tutorials/00 - Intro to Tools/Data/german_speech_44100.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/00 - Intro to Tools/Data/german_speech_44100.wav -------------------------------------------------------------------------------- /Tutorials/00 - Intro to Tools/Data/mic_array.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/00 - Intro to Tools/Data/mic_array.jpg -------------------------------------------------------------------------------- /Tutorials/00 - Intro to Tools/Data/rlc.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/00 - Intro to Tools/Data/rlc.jpg -------------------------------------------------------------------------------- /Tutorials/00 - Intro to Tools/README.md: -------------------------------------------------------------------------------- 1 | # 00 - Intro to Tools 2 | 3 | This tutorial should get you up and running with your Python environment, plus give you the basics of Jupyter notebooks. Please complete part 1 before the exercise session on Friday, September 20. In that exercise session, we will go through the notebook in part 2 together. The OPTIONAL and UNGRADED homework 0 should be completed before the release of the NON-OPTIONAL homework 1 on Thursday, October 3. 4 | 5 | ## Part 1: Setting up your environment 6 | 7 | * Install Anaconda, Python 3.7 version: [Anaconda](https://www.anaconda.com/distribution/#download-section). 8 | 9 | *For Linux*: Open a terminal, go to the Downloads folder and execute: 10 | 11 | ``` 12 | bash ./Anaconda3-2019.07-Linux-x86_64.sh -b -p $HOME/Anaconda3 13 | ``` 14 | 15 | Check that `conda` is in your path. If `which conda` returns something like `/home/YOURUSERNAME/Anaconda3/bin/conda`, you are good to go. Otherwise execute 16 | `echo 'export PATH="$HOME/Anaconda3/bin/:$PATH"' >> $HOME/.bashrc`. Next close the terminal and open a new window. Check `which conda` again. 17 | 18 | * IF conda is already installed, run `conda update conda`. 19 | 20 | * Install Git: 21 | 22 | *For Ubuntu/Debian*: `sudo apt-get install git` 23 | 24 | *For Fedora*: `sudo dnf install git` 25 | 26 | *For Windows*: 27 | Install it following this download link: [Git](https://git-scm.com/downloads). To execute Git commands, use the Git bash: home key - type "git bash" - enter. 28 | 29 | [Git cheatsheet](http://rogerdudler.github.io/git-guide/) 30 | 31 | * Clone the tutorials repo into a local folder: 32 | 33 | ``` 34 | git clone https://github.com/epfl-ada-2019/Tutorials 35 | ``` 36 | 37 | * or pull new changes if you already have it (from the tutorials local folder): 38 | 39 | ``` 40 | git pull 41 | ``` 42 | 43 | * Create a new environment: 44 | 45 | *For Windows*: 46 | To execute Anaconda commands, use the Anaconda prompt: home key - type "anaconda prompt" - enter. 47 | Also note that while it is possible to use Git commands on Anaconda prompt, we advise against it. 48 | 49 | ``` 50 | conda create -y -n ada python=3.7 scipy pandas numpy matplotlib 51 | ``` 52 | 53 | * Activate it: 54 | 55 | ``` 56 | conda activate ada 57 | ``` 58 | 59 | * List your environments: 60 | 61 | ``` 62 | conda env list 63 | ``` 64 | 65 | * Deactivate the current environment and remove an environment. (Do not run the remove command, as it will remove the environment you just installed ...) 
66 | 67 | ``` 68 | conda deactivate 69 | conda remove --name ada --all 70 | ``` 71 | 72 | [more info on managing environments](https://conda.io/docs/user-guide/tasks/manage-environments.html) 73 | 74 | * Install [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) using conda (the ada environment needs to be activated): 75 | 76 | ``` 77 | conda install jupyterlab bokeh seaborn nb_conda_kernels 78 | ``` 79 | 80 | * Install some extensions using the Python package manager: 81 | 82 | ``` 83 | pip install jupyter_nbextensions_configurator 84 | ``` 85 | 86 | * IF you get an error about packages requiring cython and PyHamcrest, try the following: 87 | ``` 88 | pip install cython PyHamcrest jupyter_nbextensions_configurator 89 | ``` 90 | 91 | * IF you have later problems with widgets (conda version might not be the latest one!): 92 | 93 | ``` 94 | pip install --upgrade ipywidgets 95 | 96 | jupyter nbextension enable --py widgetsnbextension 97 | 98 | jupyter nbextension enable --py --sys-prefix widgetsnbextension 99 | ``` 100 | 101 | * Run a Jupyter notebook server (be sure to be at the tutorial's folder (`cd path/to/folder/`)). A browser window should open up for you. 102 | 103 | ``` 104 | jupyter lab 105 | ``` 106 | 107 | * IF you have more problems with widgets: 108 | 109 | ``` 110 | jupyter serverextension enable --sys-prefix jupyter_nbextensions_configurator nb_conda nb_anacondacloud nbpresent 111 | ``` 112 | 113 | * To shut down the server, close the window on your browser, then do CTRL+C in the terminal. 114 | 115 | * IF you later have warnings about missing fonts, in Ubuntu the missing fonts are in `fonts-humor-sans` (to install them: `sudo apt install fonts-humor-sans`). You may also have to clean `matplotlib`'s font cache (~/.cache/matplotlib/) and restart your current Jupyter session, for matplotlib to take notice of the changes. 116 | 117 | ## Part 2: Overview of Jupyter notebooks 118 | 119 | In [this tutorial](Intro%20to%20Jupyter%20Notebooks.ipynb) we explore the functionalities of the Jupyter notebooks. In this repository you can find the notebook we use during the tutorial. 120 | 121 | Credits to [saloot](https://github.com/saloot) and [Michele Catasta](https://github.com/pirroh), on whose material this version is based. 122 | 123 | ## Part 3: Homework 0 124 | 125 | Access Homework 0 (OPTIONAL and UNGRADED) [here](https://github.com/epfl-ada-2019/Homework/tree/master/00%20-%20Optional%20Homework). Clone the repo locally and take the opportunity to freshen up your Python skills, or to acquire them. 126 | 127 | On Thursday, October 3, the first graded homework ("Homework 1") will be released. 
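
Before the first exercise session, you can optionally sanity-check your setup. The snippet below is a minimal sketch that assumes you created the `ada` environment and installed JupyterLab as described in Part 1:

```
conda activate ada
python -c "import scipy, numpy, pandas, matplotlib; print('Environment OK')"
jupyter lab --version
```

If the imports succeed and a JupyterLab version number is printed, your environment is ready for the tutorials and homework.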
128 | -------------------------------------------------------------------------------- /Tutorials/01 - Intro to Pandas/Data/baseball.csv: -------------------------------------------------------------------------------- 1 | id,player,year,stint,team,lg,g,ab,r,h,X2b,X3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp 2 | 88641,womacto01,2006,2,CHN,NL,19,50,6,14,1,0,1,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0 3 | 88643,schilcu01,2006,1,BOS,AL,31,2,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 4 | 88645,myersmi01,2006,1,NYA,AL,62,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 5 | 88649,helliri01,2006,1,MIL,NL,20,3,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0 6 | 88650,johnsra05,2006,1,NYA,AL,33,6,0,1,0,0,0,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0 7 | 88652,finlest01,2006,1,SFN,NL,139,426,66,105,21,12,6,40.0,7.0,0.0,46,55.0,2.0,2.0,3.0,4.0,6.0 8 | 88653,gonzalu01,2006,1,ARI,NL,153,586,93,159,52,2,15,73.0,0.0,1.0,69,58.0,10.0,7.0,0.0,6.0,14.0 9 | 88662,seleaa01,2006,1,LAN,NL,28,26,2,5,1,0,0,0.0,0.0,0.0,1,7.0,0.0,0.0,6.0,0.0,1.0 10 | 89177,francju01,2007,2,ATL,NL,15,40,1,10,3,0,0,8.0,0.0,0.0,4,10.0,1.0,0.0,0.0,1.0,1.0 11 | 89178,francju01,2007,1,NYN,NL,40,50,7,10,0,0,1,8.0,2.0,1.0,10,13.0,0.0,0.0,0.0,1.0,1.0 12 | 89330,zaungr01,2007,1,TOR,AL,110,331,43,80,24,1,10,52.0,0.0,0.0,51,55.0,8.0,2.0,1.0,6.0,9.0 13 | 89333,witasja01,2007,1,TBA,AL,3,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 14 | 89334,williwo02,2007,1,HOU,NL,33,59,3,6,0,0,1,2.0,0.0,0.0,0,25.0,0.0,0.0,5.0,0.0,1.0 15 | 89335,wickmbo01,2007,2,ARI,NL,8,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 16 | 89336,wickmbo01,2007,1,ATL,NL,47,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 17 | 89337,whitero02,2007,1,MIN,AL,38,109,8,19,4,0,4,20.0,0.0,0.0,6,19.0,0.0,3.0,0.0,1.0,2.0 18 | 89338,whiteri01,2007,1,HOU,NL,20,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 19 | 89339,wellsda01,2007,2,LAN,NL,7,15,2,4,1,0,0,1.0,0.0,0.0,0,6.0,0.0,0.0,0.0,0.0,0.0 20 | 89340,wellsda01,2007,1,SDN,NL,22,38,1,4,0,0,0,0.0,0.0,0.0,0,12.0,0.0,0.0,4.0,0.0,0.0 21 | 89341,weathda01,2007,1,CIN,NL,67,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 22 | 89343,walketo04,2007,1,OAK,AL,18,48,5,13,1,0,0,4.0,0.0,0.0,2,4.0,0.0,0.0,0.0,2.0,2.0 23 | 89345,wakefti01,2007,1,BOS,AL,1,2,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0 24 | 89347,vizquom01,2007,1,SFN,NL,145,513,54,126,18,3,4,51.0,14.0,6.0,44,48.0,6.0,1.0,14.0,3.0,14.0 25 | 89348,villoro01,2007,1,NYA,AL,6,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 26 | 89352,valenjo03,2007,1,NYN,NL,51,166,18,40,11,1,3,18.0,2.0,1.0,15,28.0,4.0,0.0,1.0,1.0,5.0 27 | 89354,trachst01,2007,2,CHN,NL,4,7,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 28 | 89355,trachst01,2007,1,BAL,AL,3,5,0,0,0,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0 29 | 89359,timlimi01,2007,1,BOS,AL,4,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 30 | 89360,thomeji01,2007,1,CHA,AL,130,432,79,119,19,0,35,96.0,0.0,1.0,95,134.0,11.0,6.0,0.0,3.0,10.0 31 | 89361,thomafr04,2007,1,TOR,AL,155,531,63,147,30,0,26,95.0,0.0,0.0,81,94.0,3.0,7.0,0.0,5.0,14.0 32 | 89363,tavarju01,2007,1,BOS,AL,2,4,0,1,0,0,0,0.0,0.0,0.0,1,3.0,0.0,0.0,0.0,0.0,0.0 33 | 89365,sweenma01,2007,2,LAN,NL,30,33,2,9,1,0,0,3.0,0.0,0.0,1,11.0,0.0,0.0,0.0,0.0,0.0 34 | 89366,sweenma01,2007,1,SFN,NL,76,90,18,23,8,0,2,10.0,2.0,0.0,13,18.0,0.0,3.0,1.0,0.0,0.0 35 | 89367,suppaje01,2007,1,MIL,NL,33,61,4,8,0,0,0,2.0,0.0,0.0,3,16.0,0.0,0.0,11.0,0.0,2.0 36 | 89368,stinnke01,2007,1,SLN,NL,26,82,7,13,3,0,1,5.0,0.0,0.0,5,22.0,2.0,0.0,0.0,0.0,2.0 37 | 
89370,stantmi02,2007,1,CIN,NL,67,2,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 38 | 89371,stairma01,2007,1,TOR,AL,125,357,58,103,28,1,21,64.0,2.0,1.0,44,66.0,5.0,2.0,0.0,2.0,7.0 39 | 89372,sprinru01,2007,1,SLN,NL,72,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 40 | 89374,sosasa01,2007,1,TEX,AL,114,412,53,104,24,1,21,92.0,0.0,0.0,34,112.0,3.0,3.0,0.0,5.0,11.0 41 | 89375,smoltjo01,2007,1,ATL,NL,30,54,1,5,1,0,0,2.0,0.0,0.0,1,19.0,0.0,0.0,13.0,0.0,0.0 42 | 89378,sheffga01,2007,1,DET,AL,133,494,107,131,20,1,25,75.0,22.0,5.0,84,71.0,2.0,9.0,0.0,6.0,10.0 43 | 89381,seleaa01,2007,1,NYN,NL,31,4,0,0,0,0,0,0.0,0.0,0.0,1,1.0,0.0,0.0,1.0,0.0,0.0 44 | 89382,seaneru01,2007,1,LAN,NL,68,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 45 | 89383,schmija01,2007,1,LAN,NL,6,7,1,1,0,0,1,1.0,0.0,0.0,0,4.0,0.0,0.0,1.0,0.0,0.0 46 | 89384,schilcu01,2007,1,BOS,AL,1,2,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 47 | 89385,sandere02,2007,1,KCA,AL,24,73,12,23,7,0,2,11.0,0.0,1.0,11,15.0,0.0,1.0,0.0,0.0,2.0 48 | 89388,rogerke01,2007,1,DET,AL,1,2,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 49 | 89389,rodriiv01,2007,1,DET,AL,129,502,50,141,31,3,11,63.0,2.0,2.0,9,96.0,1.0,1.0,1.0,2.0,16.0 50 | 89396,ramirma02,2007,1,BOS,AL,133,483,84,143,33,1,20,88.0,0.0,0.0,71,92.0,13.0,7.0,0.0,8.0,21.0 51 | 89398,piazzmi01,2007,1,OAK,AL,83,309,33,85,17,1,8,44.0,0.0,0.0,18,61.0,0.0,0.0,0.0,2.0,9.0 52 | 89400,perezne01,2007,1,DET,AL,33,64,5,11,3,0,1,6.0,0.0,0.0,4,8.0,0.0,0.0,3.0,0.0,2.0 53 | 89402,parkch01,2007,1,NYN,NL,1,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 54 | 89406,oliveda02,2007,1,LAA,AL,5,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 55 | 89410,myersmi01,2007,1,NYA,AL,6,1,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 56 | 89411,mussimi01,2007,1,NYA,AL,2,2,0,0,0,0,0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0 57 | 89412,moyerja01,2007,1,PHI,NL,33,73,4,9,2,0,0,2.0,0.0,0.0,2,26.0,0.0,0.0,8.0,0.0,1.0 58 | 89420,mesajo01,2007,1,PHI,NL,38,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 59 | 89421,martipe02,2007,1,NYN,NL,5,9,1,1,1,0,0,0.0,0.0,0.0,0,6.0,0.0,0.0,2.0,0.0,0.0 60 | 89425,maddugr01,2007,1,SDN,NL,33,62,2,9,2,0,0,0.0,1.0,0.0,1,19.0,0.0,0.0,9.0,0.0,2.0 61 | 89426,mabryjo01,2007,1,COL,NL,28,34,4,4,1,0,1,5.0,0.0,0.0,5,10.0,0.0,0.0,0.0,0.0,1.0 62 | 89429,loftoke01,2007,2,CLE,AL,52,173,24,49,9,3,0,15.0,2.0,3.0,17,23.0,0.0,0.0,4.0,2.0,1.0 63 | 89430,loftoke01,2007,1,TEX,AL,84,317,62,96,16,3,7,23.0,21.0,4.0,39,28.0,1.0,2.0,2.0,3.0,5.0 64 | 89431,loaizes01,2007,1,LAN,NL,5,7,0,1,0,0,0,2.0,0.0,0.0,0,2.0,0.0,0.0,2.0,0.0,1.0 65 | 89438,kleskry01,2007,1,SFN,NL,116,362,51,94,27,3,6,44.0,5.0,1.0,46,68.0,2.0,1.0,1.0,1.0,14.0 66 | 89439,kentje01,2007,1,LAN,NL,136,494,78,149,36,1,20,79.0,1.0,3.0,57,61.0,4.0,5.0,0.0,6.0,17.0 67 | 89442,jonesto02,2007,1,DET,AL,5,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 68 | 89445,johnsra05,2007,1,ARI,NL,10,15,0,1,0,0,0,0.0,0.0,0.0,1,7.0,0.0,0.0,2.0,0.0,0.0 69 | 89450,hoffmtr01,2007,1,SDN,NL,60,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 70 | 89451,hernaro01,2007,2,LAN,NL,22,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 71 | 89452,hernaro01,2007,1,CLE,AL,2,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 72 | 89460,guarded01,2007,1,CIN,NL,15,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 73 | 89462,griffke02,2007,1,CIN,NL,144,528,78,146,24,1,30,93.0,6.0,1.0,85,99.0,14.0,1.0,0.0,9.0,14.0 74 | 89463,greensh01,2007,1,NYN,NL,130,446,62,130,30,1,10,46.0,11.0,1.0,37,62.0,4.0,5.0,1.0,1.0,14.0 75 | 
89464,graffto01,2007,1,MIL,NL,86,231,34,55,8,0,9,30.0,0.0,1.0,24,44.0,6.0,3.0,0.0,2.0,7.0 76 | 89465,gordoto01,2007,1,PHI,NL,44,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 77 | 89466,gonzalu01,2007,1,LAN,NL,139,464,70,129,23,2,15,68.0,6.0,2.0,56,56.0,4.0,4.0,0.0,2.0,11.0 78 | 89467,gomezch02,2007,2,CLE,AL,19,53,4,15,2,0,0,5.0,0.0,0.0,0,6.0,0.0,0.0,1.0,1.0,1.0 79 | 89468,gomezch02,2007,1,BAL,AL,73,169,17,51,10,1,1,16.0,1.0,2.0,10,20.0,1.0,0.0,5.0,1.0,5.0 80 | 89469,glavito02,2007,1,NYN,NL,33,56,3,12,1,0,0,4.0,0.0,0.0,6,5.0,0.0,0.0,12.0,1.0,0.0 81 | 89473,floydcl01,2007,1,CHN,NL,108,282,40,80,10,1,9,45.0,0.0,0.0,35,47.0,5.0,5.0,0.0,0.0,6.0 82 | 89474,finlest01,2007,1,COL,NL,43,94,9,17,3,0,1,2.0,0.0,0.0,8,4.0,1.0,0.0,0.0,0.0,2.0 83 | 89480,embreal01,2007,1,OAK,AL,4,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 84 | 89481,edmonji01,2007,1,SLN,NL,117,365,39,92,15,2,12,53.0,0.0,2.0,41,75.0,2.0,0.0,2.0,3.0,9.0 85 | 89482,easleda01,2007,1,NYN,NL,76,193,24,54,6,0,10,26.0,0.0,1.0,19,35.0,1.0,5.0,0.0,1.0,2.0 86 | 89489,delgaca01,2007,1,NYN,NL,139,538,71,139,30,0,24,87.0,4.0,0.0,52,118.0,8.0,11.0,0.0,6.0,12.0 87 | 89493,cormirh01,2007,1,CIN,NL,6,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 88 | 89494,coninje01,2007,2,NYN,NL,21,41,2,8,2,0,0,5.0,0.0,0.0,7,8.0,2.0,0.0,1.0,1.0,1.0 89 | 89495,coninje01,2007,1,CIN,NL,80,215,23,57,11,1,6,32.0,4.0,0.0,20,28.0,0.0,0.0,1.0,6.0,4.0 90 | 89497,clemero02,2007,1,NYA,AL,2,2,0,1,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 91 | 89498,claytro01,2007,2,BOS,AL,8,6,1,0,0,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,2.0 92 | 89499,claytro01,2007,1,TOR,AL,69,189,23,48,14,0,1,12.0,2.0,1.0,14,50.0,0.0,1.0,3.0,3.0,8.0 93 | 89501,cirilje01,2007,2,ARI,NL,28,40,6,8,4,0,0,6.0,0.0,0.0,4,6.0,0.0,0.0,0.0,0.0,1.0 94 | 89502,cirilje01,2007,1,MIN,AL,50,153,18,40,9,2,2,21.0,2.0,0.0,15,13.0,0.0,1.0,3.0,2.0,9.0 95 | 89521,bondsba01,2007,1,SFN,NL,126,340,75,94,14,0,28,66.0,5.0,0.0,132,54.0,43.0,3.0,0.0,2.0,13.0 96 | 89523,biggicr01,2007,1,HOU,NL,141,517,68,130,31,3,10,50.0,4.0,3.0,23,112.0,0.0,3.0,7.0,5.0,5.0 97 | 89525,benitar01,2007,2,FLO,NL,34,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 98 | 89526,benitar01,2007,1,SFN,NL,19,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 99 | 89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,16,3,3,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0 100 | 89533,aloumo01,2007,1,NYN,NL,87,328,51,112,19,1,13,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0 101 | 89534,alomasa02,2007,1,NYN,NL,8,22,1,3,1,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0 102 | -------------------------------------------------------------------------------- /Tutorials/01 - Intro to Pandas/Data/cdystonia.csv: -------------------------------------------------------------------------------- 1 | patient,obs,week,site,id,treat,age,sex,twstrs 2 | 1,1,0,1,1,5000U,65,F,32 3 | 1,2,2,1,1,5000U,65,F,30 4 | 1,3,4,1,1,5000U,65,F,24 5 | 1,4,8,1,1,5000U,65,F,37 6 | 1,5,12,1,1,5000U,65,F,39 7 | 1,6,16,1,1,5000U,65,F,36 8 | 2,1,0,1,2,10000U,70,F,60 9 | 2,2,2,1,2,10000U,70,F,26 10 | 2,3,4,1,2,10000U,70,F,27 11 | 2,4,8,1,2,10000U,70,F,41 12 | 2,5,12,1,2,10000U,70,F,65 13 | 2,6,16,1,2,10000U,70,F,67 14 | 3,1,0,1,3,5000U,64,F,44 15 | 3,2,2,1,3,5000U,64,F,20 16 | 3,3,4,1,3,5000U,64,F,23 17 | 3,4,8,1,3,5000U,64,F,26 18 | 3,5,12,1,3,5000U,64,F,35 19 | 3,6,16,1,3,5000U,64,F,35 20 | 4,1,0,1,4,Placebo,59,F,53 21 | 4,2,2,1,4,Placebo,59,F,61 22 | 4,3,4,1,4,Placebo,59,F,64 23 | 4,4,8,1,4,Placebo,59,F,62 24 | 5,1,0,1,5,10000U,76,F,53 25 | 5,2,2,1,5,10000U,76,F,35 26 | 5,3,4,1,5,10000U,76,F,48 27 | 5,4,8,1,5,10000U,76,F,49 
28 | 5,5,12,1,5,10000U,76,F,41 29 | 5,6,16,1,5,10000U,76,F,51 30 | 6,1,0,1,6,10000U,59,F,49 31 | 6,2,2,1,6,10000U,59,F,34 32 | 6,3,4,1,6,10000U,59,F,43 33 | 6,4,8,1,6,10000U,59,F,48 34 | 6,5,12,1,6,10000U,59,F,48 35 | 6,6,16,1,6,10000U,59,F,51 36 | 7,1,0,1,7,5000U,72,M,42 37 | 7,2,2,1,7,5000U,72,M,32 38 | 7,3,4,1,7,5000U,72,M,32 39 | 7,4,8,1,7,5000U,72,M,43 40 | 7,5,12,1,7,5000U,72,M,42 41 | 7,6,16,1,7,5000U,72,M,46 42 | 8,1,0,1,8,Placebo,40,M,34 43 | 8,2,2,1,8,Placebo,40,M,33 44 | 8,3,4,1,8,Placebo,40,M,21 45 | 8,4,8,1,8,Placebo,40,M,27 46 | 8,5,12,1,8,Placebo,40,M,32 47 | 8,6,16,1,8,Placebo,40,M,38 48 | 9,1,0,1,9,5000U,52,F,41 49 | 9,2,2,1,9,5000U,52,F,32 50 | 9,3,4,1,9,5000U,52,F,34 51 | 9,4,8,1,9,5000U,52,F,35 52 | 9,5,12,1,9,5000U,52,F,37 53 | 9,6,16,1,9,5000U,52,F,36 54 | 10,1,0,1,10,Placebo,47,M,27 55 | 10,2,2,1,10,Placebo,47,M,10 56 | 10,3,4,1,10,Placebo,47,M,31 57 | 10,4,8,1,10,Placebo,47,M,32 58 | 10,5,12,1,10,Placebo,47,M,6 59 | 10,6,16,1,10,Placebo,47,M,14 60 | 11,1,0,1,11,10000U,57,F,48 61 | 11,2,2,1,11,10000U,57,F,41 62 | 11,3,4,1,11,10000U,57,F,32 63 | 11,4,8,1,11,10000U,57,F,35 64 | 11,5,12,1,11,10000U,57,F,57 65 | 11,6,16,1,11,10000U,57,F,51 66 | 12,1,0,1,12,Placebo,47,F,34 67 | 12,2,2,1,12,Placebo,47,F,19 68 | 12,3,4,1,12,Placebo,47,F,21 69 | 12,4,8,1,12,Placebo,47,F,24 70 | 12,5,12,1,12,Placebo,47,F,28 71 | 12,6,16,1,12,Placebo,47,F,28 72 | 13,1,0,2,1,Placebo,70,F,49 73 | 13,2,2,2,1,Placebo,70,F,47 74 | 13,3,4,2,1,Placebo,70,F,44 75 | 13,4,8,2,1,Placebo,70,F,48 76 | 13,5,12,2,1,Placebo,70,F,44 77 | 13,6,16,2,1,Placebo,70,F,44 78 | 14,1,0,2,2,5000U,49,F,46 79 | 14,2,2,2,2,5000U,49,F,35 80 | 14,3,4,2,2,5000U,49,F,45 81 | 14,4,8,2,2,5000U,49,F,49 82 | 14,5,12,2,2,5000U,49,F,53 83 | 14,6,16,2,2,5000U,49,F,56 84 | 15,1,0,2,3,10000U,59,F,56 85 | 15,2,2,2,3,10000U,59,F,44 86 | 15,3,4,2,3,10000U,59,F,48 87 | 15,4,8,2,3,10000U,59,F,54 88 | 15,5,12,2,3,10000U,59,F,49 89 | 15,6,16,2,3,10000U,59,F,60 90 | 16,1,0,2,4,5000U,64,M,59 91 | 16,2,2,2,4,5000U,64,M,48 92 | 16,3,4,2,4,5000U,64,M,56 93 | 16,4,8,2,4,5000U,64,M,55 94 | 16,5,12,2,4,5000U,64,M,57 95 | 16,6,16,2,4,5000U,64,M,58 96 | 17,1,0,2,5,10000U,45,F,62 97 | 17,2,2,2,5,10000U,45,F,60 98 | 17,3,4,2,5,10000U,45,F,60 99 | 17,4,8,2,5,10000U,45,F,64 100 | 17,5,12,2,5,10000U,45,F,67 101 | 17,6,16,2,5,10000U,45,F,66 102 | 18,1,0,2,6,Placebo,66,F,50 103 | 18,2,2,2,6,Placebo,66,F,53 104 | 18,3,4,2,6,Placebo,66,F,52 105 | 18,4,8,2,6,Placebo,66,F,57 106 | 18,5,12,2,6,Placebo,66,F,61 107 | 18,6,16,2,6,Placebo,66,F,54 108 | 19,1,0,2,7,10000U,49,F,42 109 | 19,2,2,2,7,10000U,49,F,42 110 | 19,3,4,2,7,10000U,49,F,43 111 | 19,4,8,2,7,10000U,49,F,33 112 | 19,5,12,2,7,10000U,49,F,37 113 | 19,6,16,2,7,10000U,49,F,43 114 | 20,1,0,2,8,Placebo,54,F,53 115 | 20,2,2,2,8,Placebo,54,F,56 116 | 20,3,4,2,8,Placebo,54,F,52 117 | 20,4,8,2,8,Placebo,54,F,54 118 | 20,5,12,2,8,Placebo,54,F,55 119 | 20,6,16,2,8,Placebo,54,F,51 120 | 21,1,0,2,9,5000U,47,F,67 121 | 21,2,2,2,9,5000U,47,F,64 122 | 21,3,4,2,9,5000U,47,F,65 123 | 21,4,8,2,9,5000U,47,F,64 124 | 21,5,12,2,9,5000U,47,F,62 125 | 21,6,16,2,9,5000U,47,F,64 126 | 22,1,0,2,10,Placebo,31,M,44 127 | 22,2,2,2,10,Placebo,31,M,40 128 | 22,3,4,2,10,Placebo,31,M,32 129 | 22,4,8,2,10,Placebo,31,M,36 130 | 22,5,12,2,10,Placebo,31,M,42 131 | 22,6,16,2,10,Placebo,31,M,43 132 | 23,1,0,2,11,10000U,53,F,65 133 | 23,2,2,2,11,10000U,53,F,58 134 | 23,3,4,2,11,10000U,53,F,55 135 | 23,5,12,2,11,10000U,53,F,56 136 | 23,6,16,2,11,10000U,53,F,60 137 | 24,1,0,2,12,5000U,61,M,56 138 | 24,2,2,2,12,5000U,61,M,54 139 | 
24,3,4,2,12,5000U,61,M,52 140 | 24,4,8,2,12,5000U,61,M,48 141 | 24,5,12,2,12,5000U,61,M,52 142 | 24,6,16,2,12,5000U,61,M,53 143 | 25,1,0,2,13,Placebo,40,M,30 144 | 25,2,2,2,13,Placebo,40,M,33 145 | 25,3,4,2,13,Placebo,40,M,25 146 | 25,4,8,2,13,Placebo,40,M,29 147 | 25,5,12,2,13,Placebo,40,M,32 148 | 25,6,16,2,13,Placebo,40,M,32 149 | 26,1,0,2,14,5000U,67,M,47 150 | 26,3,4,2,14,5000U,67,M,54 151 | 26,4,8,2,14,5000U,67,M,43 152 | 26,5,12,2,14,5000U,67,M,46 153 | 26,6,16,2,14,5000U,67,M,50 154 | 27,1,0,3,1,10000U,54,F,50 155 | 27,2,2,3,1,10000U,54,F,43 156 | 27,3,4,3,1,10000U,54,F,51 157 | 27,4,8,3,1,10000U,54,F,46 158 | 27,5,12,3,1,10000U,54,F,49 159 | 27,6,16,3,1,10000U,54,F,53 160 | 28,1,0,3,2,Placebo,41,F,34 161 | 28,2,2,3,2,Placebo,41,F,29 162 | 28,3,4,3,2,Placebo,41,F,27 163 | 28,4,8,3,2,Placebo,41,F,21 164 | 28,5,12,3,2,Placebo,41,F,22 165 | 28,6,16,3,2,Placebo,41,F,22 166 | 29,1,0,3,3,5000U,66,M,39 167 | 29,2,2,3,3,5000U,66,M,41 168 | 29,3,4,3,3,5000U,66,M,33 169 | 29,4,8,3,3,5000U,66,M,39 170 | 29,5,12,3,3,5000U,66,M,37 171 | 29,6,16,3,3,5000U,66,M,37 172 | 30,1,0,3,4,Placebo,68,F,43 173 | 30,2,2,3,4,Placebo,68,F,31 174 | 30,3,4,3,4,Placebo,68,F,29 175 | 30,4,8,3,4,Placebo,68,F,28 176 | 30,5,12,3,4,Placebo,68,F,33 177 | 30,6,16,3,4,Placebo,68,F,38 178 | 31,1,0,3,5,10000U,41,F,46 179 | 31,2,2,3,5,10000U,41,F,26 180 | 31,3,4,3,5,10000U,41,F,29 181 | 31,4,8,3,5,10000U,41,F,33 182 | 31,5,12,3,5,10000U,41,F,45 183 | 31,6,16,3,5,10000U,41,F,56 184 | 32,1,0,3,6,5000U,77,M,52 185 | 32,2,2,3,6,5000U,77,M,44 186 | 32,3,4,3,6,5000U,77,M,47 187 | 32,4,8,3,6,5000U,77,M,50 188 | 32,5,12,3,6,5000U,77,M,50 189 | 32,6,16,3,6,5000U,77,M,49 190 | 33,1,0,3,7,10000U,41,M,38 191 | 33,2,2,3,7,10000U,41,M,19 192 | 33,3,4,3,7,10000U,41,M,20 193 | 33,4,8,3,7,10000U,41,M,27 194 | 33,5,12,3,7,10000U,41,M,29 195 | 33,6,16,3,7,10000U,41,M,32 196 | 34,1,0,3,8,Placebo,56,M,33 197 | 34,2,2,3,8,Placebo,56,M,38 198 | 34,3,4,3,8,Placebo,56,M,40 199 | 34,4,8,3,8,Placebo,56,M,48 200 | 34,5,12,3,8,Placebo,56,M,49 201 | 34,6,16,3,8,Placebo,56,M,44 202 | 35,1,0,3,9,5000U,46,F,28 203 | 35,2,2,3,9,5000U,46,F,16 204 | 35,3,4,3,9,5000U,46,F,11 205 | 35,4,8,3,9,5000U,46,F,7 206 | 35,5,12,3,9,5000U,46,F,13 207 | 35,6,16,3,9,5000U,46,F,21 208 | 36,1,0,3,10,10000U,46,F,34 209 | 36,2,2,3,10,10000U,46,F,23 210 | 36,3,4,3,10,10000U,46,F,16 211 | 36,4,8,3,10,10000U,46,F,15 212 | 36,5,12,3,10,10000U,46,F,17 213 | 36,6,16,3,10,10000U,46,F,29 214 | 37,1,0,3,11,Placebo,47,F,39 215 | 37,2,2,3,11,Placebo,47,F,37 216 | 37,3,4,3,11,Placebo,47,F,39 217 | 37,4,8,3,11,Placebo,47,F,39 218 | 37,5,12,3,11,Placebo,47,F,45 219 | 37,6,16,3,11,Placebo,47,F,43 220 | 38,1,0,3,12,5000U,35,M,29 221 | 38,2,2,3,12,5000U,35,M,42 222 | 38,3,4,3,12,5000U,35,M,35 223 | 38,4,8,3,12,5000U,35,M,24 224 | 38,5,12,3,12,5000U,35,M,29 225 | 38,6,16,3,12,5000U,35,M,42 226 | 39,1,0,4,1,Placebo,58,M,52 227 | 39,2,2,4,1,Placebo,58,M,55 228 | 39,3,4,4,1,Placebo,58,M,51 229 | 39,4,8,4,1,Placebo,58,M,52 230 | 39,5,12,4,1,Placebo,58,M,54 231 | 39,6,16,4,1,Placebo,58,M,57 232 | 40,1,0,4,2,5000U,62,F,52 233 | 40,2,2,4,2,5000U,62,F,30 234 | 40,3,4,4,2,5000U,62,F,43 235 | 40,4,8,4,2,5000U,62,F,45 236 | 40,5,12,4,2,5000U,62,F,47 237 | 40,6,16,4,2,5000U,62,F,46 238 | 41,1,0,4,3,10000U,73,F,54 239 | 41,2,2,4,3,10000U,73,F,52 240 | 41,3,4,4,3,10000U,73,F,52 241 | 41,4,8,4,3,10000U,73,F,54 242 | 41,5,12,4,3,10000U,73,F,51 243 | 41,6,16,4,3,10000U,73,F,57 244 | 42,1,0,4,4,10000U,52,F,52 245 | 42,2,2,4,4,10000U,52,F,44 246 | 42,3,4,4,4,10000U,52,F,33 247 | 42,4,8,4,4,10000U,52,F,54 248 | 
42,5,12,4,4,10000U,52,F,46 249 | 42,6,16,4,4,10000U,52,F,47 250 | 43,1,0,4,5,Placebo,53,F,47 251 | 43,2,2,4,5,Placebo,53,F,45 252 | 43,3,4,4,5,Placebo,53,F,41 253 | 43,4,8,4,5,Placebo,53,F,45 254 | 43,5,12,4,5,Placebo,53,F,43 255 | 43,6,16,4,5,Placebo,53,F,41 256 | 44,1,0,4,6,5000U,69,M,44 257 | 44,2,2,4,6,5000U,69,M,34 258 | 44,3,4,4,6,5000U,69,M,29 259 | 44,4,8,4,6,5000U,69,M,28 260 | 44,5,12,4,6,5000U,69,M,35 261 | 44,6,16,4,6,5000U,69,M,41 262 | 45,1,0,4,7,Placebo,55,M,42 263 | 45,2,2,4,7,Placebo,55,M,39 264 | 45,3,4,4,7,Placebo,55,M,38 265 | 45,4,8,4,7,Placebo,55,M,47 266 | 45,5,12,4,7,Placebo,55,M,39 267 | 45,6,16,4,7,Placebo,55,M,39 268 | 46,1,0,4,8,10000U,52,F,42 269 | 46,2,2,4,8,10000U,52,F,14 270 | 46,3,4,4,8,10000U,52,F,9 271 | 46,4,8,4,8,10000U,52,F,9 272 | 46,5,12,4,8,10000U,52,F,16 273 | 46,6,16,4,8,10000U,52,F,33 274 | 47,1,0,5,1,10000U,51,F,44 275 | 47,2,2,5,1,10000U,51,F,34 276 | 47,3,4,5,1,10000U,51,F,32 277 | 47,4,8,5,1,10000U,51,F,35 278 | 47,5,12,5,1,10000U,51,F,54 279 | 47,6,16,5,1,10000U,51,F,53 280 | 48,1,0,5,2,Placebo,56,F,60 281 | 48,2,2,5,2,Placebo,56,F,57 282 | 48,3,4,5,2,Placebo,56,F,53 283 | 48,4,8,5,2,Placebo,56,F,52 284 | 48,5,12,5,2,Placebo,56,F,53 285 | 48,6,16,5,2,Placebo,56,F,58 286 | 49,1,0,5,3,5000U,65,F,60 287 | 49,2,2,5,3,5000U,65,F,53 288 | 49,3,4,5,3,5000U,65,F,55 289 | 49,4,8,5,3,5000U,65,F,62 290 | 49,5,12,5,3,5000U,65,F,67 291 | 50,1,0,5,4,10000U,35,F,50 292 | 50,2,2,5,4,10000U,35,F,50 293 | 50,4,8,5,4,10000U,35,F,46 294 | 50,5,12,5,4,10000U,35,F,50 295 | 50,6,16,5,4,10000U,35,F,57 296 | 51,1,0,5,5,5000U,43,M,38 297 | 51,2,2,5,5,5000U,43,M,27 298 | 51,3,4,5,5,5000U,43,M,16 299 | 51,4,8,5,5,5000U,43,M,19 300 | 51,5,12,5,5,5000U,43,M,23 301 | 51,6,16,5,5,5000U,43,M,26 302 | 52,1,0,5,6,Placebo,61,M,44 303 | 52,3,4,5,6,Placebo,61,M,46 304 | 52,4,8,5,6,Placebo,61,M,26 305 | 52,5,12,5,6,Placebo,61,M,30 306 | 52,6,16,5,6,Placebo,61,M,34 307 | 53,1,0,6,1,Placebo,43,M,54 308 | 53,2,2,6,1,Placebo,43,M,53 309 | 53,3,4,6,1,Placebo,43,M,51 310 | 53,4,8,6,1,Placebo,43,M,56 311 | 53,5,12,6,1,Placebo,43,M,39 312 | 53,6,16,6,1,Placebo,43,M,9 313 | 54,1,0,6,2,10000U,64,F,54 314 | 54,2,2,6,2,10000U,64,F,32 315 | 54,3,4,6,2,10000U,64,F,40 316 | 54,4,8,6,2,10000U,64,F,52 317 | 54,5,12,6,2,10000U,64,F,42 318 | 54,6,16,6,2,10000U,64,F,47 319 | 55,1,0,6,3,5000U,57,M,56 320 | 55,2,2,6,3,5000U,57,M,55 321 | 55,3,4,6,3,5000U,57,M,44 322 | 55,4,8,6,3,5000U,57,M,50 323 | 55,5,12,6,3,5000U,57,M,53 324 | 55,6,16,6,3,5000U,57,M,52 325 | 56,1,0,6,4,5000U,60,F,51 326 | 56,2,2,6,4,5000U,60,F,50 327 | 56,3,4,6,4,5000U,60,F,50 328 | 56,4,8,6,4,5000U,60,F,56 329 | 56,5,12,6,4,5000U,60,F,59 330 | 56,6,16,6,4,5000U,60,F,53 331 | 57,1,0,6,5,10000U,44,F,53 332 | 57,2,2,6,5,10000U,44,F,56 333 | 57,3,4,6,5,10000U,44,F,47 334 | 57,4,8,6,5,10000U,44,F,53 335 | 57,5,12,6,5,10000U,44,F,51 336 | 57,6,16,6,5,10000U,44,F,51 337 | 58,1,0,6,6,Placebo,41,F,36 338 | 58,2,2,6,6,Placebo,41,F,29 339 | 58,3,4,6,6,Placebo,41,F,24 340 | 58,4,8,6,6,Placebo,41,F,32 341 | 58,5,12,6,6,Placebo,41,F,45 342 | 58,6,16,6,6,Placebo,41,F,36 343 | 59,1,0,6,7,5000U,51,F,59 344 | 59,2,2,6,7,5000U,51,F,53 345 | 59,3,4,6,7,5000U,51,F,45 346 | 59,4,8,6,7,5000U,51,F,44 347 | 59,5,12,6,7,5000U,51,F,50 348 | 59,6,16,6,7,5000U,51,F,48 349 | 60,1,0,6,8,Placebo,57,F,49 350 | 60,2,2,6,8,Placebo,57,F,50 351 | 60,3,4,6,8,Placebo,57,F,48 352 | 60,4,8,6,8,Placebo,57,F,56 353 | 60,5,12,6,8,Placebo,57,F,49 354 | 60,6,16,6,8,Placebo,57,F,57 355 | 61,1,0,6,9,10000U,42,F,50 356 | 61,2,2,6,9,10000U,42,F,38 357 | 61,3,4,6,9,10000U,42,F,42 
358 | 61,4,8,6,9,10000U,42,F,43 359 | 61,5,12,6,9,10000U,42,F,42 360 | 61,6,16,6,9,10000U,42,F,46 361 | 62,1,0,6,10,Placebo,48,F,46 362 | 62,2,2,6,10,Placebo,48,F,48 363 | 62,3,4,6,10,Placebo,48,F,46 364 | 62,4,8,6,10,Placebo,48,F,57 365 | 62,5,12,6,10,Placebo,48,F,57 366 | 62,6,16,6,10,Placebo,48,F,49 367 | 63,1,0,6,11,10000U,57,M,55 368 | 63,2,2,6,11,10000U,57,M,34 369 | 63,3,4,6,11,10000U,57,M,26 370 | 63,4,8,6,11,10000U,57,M,40 371 | 63,5,12,6,11,10000U,57,M,49 372 | 63,6,16,6,11,10000U,57,M,47 373 | 64,1,0,6,12,5000U,39,M,46 374 | 64,2,2,6,12,5000U,39,M,44 375 | 64,3,4,6,12,5000U,39,M,47 376 | 64,4,8,6,12,5000U,39,M,50 377 | 64,5,12,6,12,5000U,39,M,46 378 | 64,6,16,6,12,5000U,39,M,51 379 | 65,1,0,6,13,10000U,67,M,34 380 | 65,2,2,6,13,10000U,67,M,31 381 | 65,3,4,6,13,10000U,67,M,25 382 | 66,1,0,6,14,5000U,39,F,57 383 | 66,2,2,6,14,5000U,39,F,48 384 | 66,3,4,6,14,5000U,39,F,50 385 | 66,4,8,6,14,5000U,39,F,50 386 | 66,5,12,6,14,5000U,39,F,50 387 | 66,6,16,6,14,5000U,39,F,49 388 | 67,1,0,6,15,Placebo,69,M,41 389 | 67,2,2,6,15,Placebo,69,M,40 390 | 67,3,4,6,15,Placebo,69,M,42 391 | 67,4,8,6,15,Placebo,69,M,38 392 | 67,5,12,6,15,Placebo,69,M,50 393 | 67,6,16,6,15,Placebo,69,M,56 394 | 68,1,0,7,1,5000U,54,F,49 395 | 68,2,2,7,1,5000U,54,F,25 396 | 68,3,4,7,1,5000U,54,F,30 397 | 68,4,8,7,1,5000U,54,F,41 398 | 68,5,12,7,1,5000U,54,F,41 399 | 68,6,16,7,1,5000U,54,F,31 400 | 69,1,0,7,2,Placebo,67,F,42 401 | 69,2,2,7,2,Placebo,67,F,30 402 | 69,3,4,7,2,Placebo,67,F,40 403 | 69,4,8,7,2,Placebo,67,F,43 404 | 69,5,12,7,2,Placebo,67,F,36 405 | 69,6,16,7,2,Placebo,67,F,45 406 | 70,1,0,7,3,10000U,58,F,31 407 | 70,2,2,7,3,10000U,58,F,18 408 | 70,3,4,7,3,10000U,58,F,23 409 | 70,4,8,7,3,10000U,58,F,26 410 | 70,5,12,7,3,10000U,58,F,33 411 | 70,6,16,7,3,10000U,58,F,41 412 | 71,1,0,7,4,Placebo,72,F,50 413 | 71,2,2,7,4,Placebo,72,F,27 414 | 71,3,4,7,4,Placebo,72,F,43 415 | 71,4,8,7,4,Placebo,72,F,32 416 | 71,5,12,7,4,Placebo,72,F,40 417 | 71,6,16,7,4,Placebo,72,F,47 418 | 72,1,0,7,5,10000U,65,F,35 419 | 72,2,2,7,5,10000U,65,F,24 420 | 72,3,4,7,5,10000U,65,F,34 421 | 72,4,8,7,5,10000U,65,F,28 422 | 72,5,12,7,5,10000U,65,F,34 423 | 72,6,16,7,5,10000U,65,F,28 424 | 73,1,0,7,6,5000U,68,F,38 425 | 73,2,2,7,6,5000U,68,F,25 426 | 73,3,4,7,6,5000U,68,F,21 427 | 73,4,8,7,6,5000U,68,F,33 428 | 73,5,12,7,6,5000U,68,F,42 429 | 73,6,16,7,6,5000U,68,F,53 430 | 74,1,0,7,7,10000U,75,F,53 431 | 74,2,2,7,7,10000U,75,F,40 432 | 74,3,4,7,7,10000U,75,F,38 433 | 74,4,8,7,7,10000U,75,F,44 434 | 74,5,12,7,7,10000U,75,F,47 435 | 74,6,16,7,7,10000U,75,F,53 436 | 75,1,0,7,8,Placebo,26,F,42 437 | 75,2,2,7,8,Placebo,26,F,48 438 | 75,3,4,7,8,Placebo,26,F,26 439 | 75,4,8,7,8,Placebo,26,F,37 440 | 75,5,12,7,8,Placebo,26,F,37 441 | 75,6,16,7,8,Placebo,26,F,43 442 | 76,1,0,7,9,5000U,36,F,53 443 | 76,2,2,7,9,5000U,36,F,45 444 | 76,3,4,7,9,5000U,36,F,52 445 | 76,4,8,7,9,5000U,36,F,51 446 | 76,5,12,7,9,5000U,36,F,52 447 | 76,6,16,7,9,5000U,36,F,53 448 | 77,1,0,7,10,10000U,72,M,46 449 | 77,2,2,7,10,10000U,72,M,47 450 | 77,3,4,7,10,10000U,72,M,45 451 | 77,4,8,7,10,10000U,72,M,45 452 | 77,5,12,7,10,10000U,72,M,50 453 | 77,6,16,7,10,10000U,72,M,52 454 | 78,1,0,7,11,Placebo,54,F,50 455 | 78,2,2,7,11,Placebo,54,F,42 456 | 78,3,4,7,11,Placebo,54,F,52 457 | 78,4,8,7,11,Placebo,54,F,60 458 | 78,5,12,7,11,Placebo,54,F,54 459 | 78,6,16,7,11,Placebo,54,F,59 460 | 79,1,0,7,12,5000U,64,F,43 461 | 79,2,2,7,12,5000U,64,F,24 462 | 79,3,4,7,12,5000U,64,F,17 463 | 79,4,8,7,12,5000U,64,F,37 464 | 79,5,12,7,12,5000U,64,F,36 465 | 79,6,16,7,12,5000U,64,F,38 466 | 
80,1,0,8,1,Placebo,39,F,46 467 | 80,2,2,8,1,Placebo,39,F,39 468 | 80,3,4,8,1,Placebo,39,F,25 469 | 80,4,8,8,1,Placebo,39,F,15 470 | 80,5,12,8,1,Placebo,39,F,21 471 | 80,6,16,8,1,Placebo,39,F,25 472 | 81,1,0,8,2,10000U,54,M,41 473 | 81,2,2,8,2,10000U,54,M,30 474 | 81,3,4,8,2,10000U,54,M,44 475 | 81,4,8,8,2,10000U,54,M,46 476 | 81,5,12,8,2,10000U,54,M,46 477 | 81,6,16,8,2,10000U,54,M,44 478 | 82,1,0,8,3,5000U,48,M,33 479 | 82,2,2,8,3,5000U,48,M,27 480 | 82,3,4,8,3,5000U,48,M,25 481 | 82,4,8,8,3,5000U,48,M,30 482 | 82,5,12,8,3,5000U,48,M,28 483 | 82,6,16,8,3,5000U,48,M,30 484 | 83,1,0,8,4,5000U,83,F,36 485 | 83,2,2,8,4,5000U,83,F,15 486 | 83,3,4,8,4,5000U,83,F,16 487 | 83,4,8,8,4,5000U,83,F,17 488 | 83,5,12,8,4,5000U,83,F,22 489 | 83,6,16,8,4,5000U,83,F,41 490 | 84,1,0,8,5,10000U,74,M,33 491 | 84,2,2,8,5,10000U,74,M,32 492 | 84,3,4,8,5,10000U,74,M,31 493 | 84,4,8,8,5,10000U,74,M,27 494 | 84,5,12,8,5,10000U,74,M,49 495 | 84,6,16,8,5,10000U,74,M,60 496 | 85,1,0,8,6,Placebo,41,M,37 497 | 86,1,0,8,7,10000U,65,F,24 498 | 86,2,2,8,7,10000U,65,F,29 499 | 86,3,4,8,7,10000U,65,F,18 500 | 86,4,8,8,7,10000U,65,F,20 501 | 86,5,12,8,7,10000U,65,F,25 502 | 86,6,16,8,7,10000U,65,F,41 503 | 87,1,0,8,8,5000U,79,M,42 504 | 87,2,2,8,8,5000U,79,M,23 505 | 87,3,4,8,8,5000U,79,M,30 506 | 87,4,8,8,8,5000U,79,M,36 507 | 87,5,12,8,8,5000U,79,M,41 508 | 87,6,16,8,8,5000U,79,M,43 509 | 88,1,0,8,9,Placebo,63,M,30 510 | 88,2,2,8,9,Placebo,63,M,22 511 | 88,3,4,8,9,Placebo,63,M,21 512 | 88,4,8,8,9,Placebo,63,M,25 513 | 88,5,12,8,9,Placebo,63,M,26 514 | 88,6,16,8,9,Placebo,63,M,33 515 | 89,1,0,8,10,Placebo,63,F,42 516 | 89,2,2,8,10,Placebo,63,F,46 517 | 89,3,4,8,10,Placebo,63,F,41 518 | 89,4,8,8,10,Placebo,63,F,43 519 | 89,5,12,8,10,Placebo,63,F,49 520 | 89,6,16,8,10,Placebo,63,F,54 521 | 90,1,0,8,11,10000U,34,F,49 522 | 90,2,2,8,11,10000U,34,F,25 523 | 90,3,4,8,11,10000U,34,F,30 524 | 90,4,8,8,11,10000U,34,F,49 525 | 90,5,12,8,11,10000U,34,F,55 526 | 90,6,16,8,11,10000U,34,F,58 527 | 91,1,0,8,12,5000U,42,M,58 528 | 91,2,2,8,12,5000U,42,M,46 529 | 91,3,4,8,12,5000U,42,M,46 530 | 91,4,8,8,12,5000U,42,M,50 531 | 91,5,12,8,12,5000U,42,M,56 532 | 91,6,16,8,12,5000U,42,M,60 533 | 92,1,0,8,13,Placebo,57,M,26 534 | 92,2,2,8,13,Placebo,57,M,26 535 | 92,3,4,8,13,Placebo,57,M,27 536 | 92,4,8,8,13,Placebo,57,M,22 537 | 92,5,12,8,13,Placebo,57,M,38 538 | 92,6,16,8,13,Placebo,57,M,35 539 | 93,1,0,8,14,5000U,68,M,37 540 | 93,3,4,8,14,5000U,68,M,23 541 | 93,4,8,8,14,5000U,68,M,18 542 | 93,5,12,8,14,5000U,68,M,34 543 | 93,6,16,8,14,5000U,68,M,36 544 | 94,1,0,8,15,10000U,51,M,40 545 | 94,2,2,8,15,10000U,51,M,24 546 | 94,3,4,8,15,10000U,51,M,25 547 | 94,4,8,8,15,10000U,51,M,37 548 | 94,6,16,8,15,10000U,51,M,38 549 | 95,1,0,8,16,5000U,51,F,33 550 | 95,2,2,8,16,5000U,51,F,10 551 | 95,3,4,8,16,5000U,51,F,13 552 | 95,4,8,8,16,5000U,51,F,16 553 | 95,5,12,8,16,5000U,51,F,32 554 | 95,6,16,8,16,5000U,51,F,16 555 | 96,1,0,8,17,10000U,61,F,41 556 | 96,2,2,8,17,10000U,61,F,50 557 | 96,3,4,8,17,10000U,61,F,22 558 | 96,4,8,8,17,10000U,61,F,28 559 | 96,5,12,8,17,10000U,61,F,34 560 | 96,6,16,8,17,10000U,61,F,36 561 | 97,1,0,8,18,Placebo,42,M,46 562 | 97,3,4,8,18,Placebo,42,M,41 563 | 97,4,8,8,18,Placebo,42,M,41 564 | 97,5,12,8,18,Placebo,42,M,58 565 | 97,6,16,8,18,Placebo,42,M,53 566 | 98,1,0,8,19,10000U,73,F,40 567 | 98,2,2,8,19,10000U,73,F,28 568 | 98,3,4,8,19,10000U,73,F,29 569 | 98,4,8,8,19,10000U,73,F,30 570 | 98,5,12,8,19,10000U,73,F,37 571 | 98,6,16,8,19,10000U,73,F,44 572 | 99,1,0,9,1,10000U,57,M,40 573 | 99,2,2,9,1,10000U,57,M,16 574 | 
99,3,4,9,1,10000U,57,M,18 575 | 99,4,8,9,1,10000U,57,M,25 576 | 99,5,12,9,1,10000U,57,M,33 577 | 99,6,16,9,1,10000U,57,M,48 578 | 100,1,0,9,2,Placebo,59,M,61 579 | 100,2,2,9,2,Placebo,59,M,52 580 | 100,3,4,9,2,Placebo,59,M,61 581 | 100,4,8,9,2,Placebo,59,M,68 582 | 100,5,12,9,2,Placebo,59,M,59 583 | 100,6,16,9,2,Placebo,59,M,71 584 | 101,1,0,9,3,5000U,57,M,35 585 | 101,2,2,9,3,5000U,57,M,21 586 | 101,3,4,9,3,5000U,57,M,29 587 | 101,4,8,9,3,5000U,57,M,30 588 | 101,5,12,9,3,5000U,57,M,35 589 | 101,6,16,9,3,5000U,57,M,48 590 | 102,1,0,9,4,Placebo,68,F,58 591 | 102,2,2,9,4,Placebo,68,F,38 592 | 102,3,4,9,4,Placebo,68,F,50 593 | 102,4,8,9,4,Placebo,68,F,53 594 | 102,5,12,9,4,Placebo,68,F,47 595 | 102,6,16,9,4,Placebo,68,F,59 596 | 103,1,0,9,5,5000U,55,F,49 597 | 103,2,2,9,5,5000U,55,F,45 598 | 103,3,4,9,5,5000U,55,F,36 599 | 103,5,12,9,5,5000U,55,F,40 600 | 103,6,16,9,5,5000U,55,F,52 601 | 104,1,0,9,6,10000U,46,F,52 602 | 104,2,2,9,6,10000U,46,F,46 603 | 104,3,4,9,6,10000U,46,F,36 604 | 104,5,12,9,6,10000U,46,F,45 605 | 104,6,16,9,6,10000U,46,F,54 606 | 105,1,0,9,7,Placebo,79,F,45 607 | 105,2,2,9,7,Placebo,79,F,46 608 | 105,3,4,9,7,Placebo,79,F,33 609 | 105,4,8,9,7,Placebo,79,F,44 610 | 105,5,12,9,7,Placebo,79,F,46 611 | 105,6,16,9,7,Placebo,79,F,48 612 | 106,1,0,9,8,5000U,43,M,67 613 | 106,2,2,9,8,5000U,43,M,63 614 | 106,3,4,9,8,5000U,43,M,71 615 | 106,4,8,9,8,5000U,43,M,66 616 | 106,5,12,9,8,5000U,43,M,68 617 | 106,6,16,9,8,5000U,43,M,71 618 | 107,1,0,9,9,10000U,50,M,57 619 | 107,3,4,9,9,10000U,50,M,36 620 | 107,4,8,9,9,10000U,50,M,23 621 | 107,6,16,9,9,10000U,50,M,52 622 | 108,1,0,9,10,10000U,39,F,63 623 | 108,2,2,9,10,10000U,39,F,51 624 | 108,3,4,9,10,10000U,39,F,46 625 | 108,4,8,9,10,10000U,39,F,50 626 | 108,5,12,9,10,10000U,39,F,50 627 | 108,6,16,9,10,10000U,39,F,54 628 | 109,1,0,9,11,5000U,57,M,53 629 | 109,2,2,9,11,5000U,57,M,38 630 | 109,4,8,9,11,5000U,57,M,33 631 | 109,5,12,9,11,5000U,57,M,36 632 | 109,6,16,9,11,5000U,57,M,51 633 | -------------------------------------------------------------------------------- /Tutorials/01 - Intro to Pandas/Data/microbiome.csv: -------------------------------------------------------------------------------- 1 | Taxon,Patient,Group,Tissue,Stool 2 | Firmicutes,1,0,136,4182 3 | Firmicutes,2,1,1174,703 4 | Firmicutes,3,0,408,3946 5 | Firmicutes,4,1,831,8605 6 | Firmicutes,5,0,693,50 7 | Firmicutes,6,1,718,717 8 | Firmicutes,7,0,173,33 9 | Firmicutes,8,1,228,80 10 | Firmicutes,9,0,162,3196 11 | Firmicutes,10,1,372,32 12 | Firmicutes,11,0,4255,4361 13 | Firmicutes,12,1,107,1667 14 | Firmicutes,13,0,96,223 15 | Firmicutes,14,1,281,2377 16 | Proteobacteria,1,0,2469,1821 17 | Proteobacteria,2,1,839,661 18 | Proteobacteria,3,0,4414,18 19 | Proteobacteria,4,1,12044,83 20 | Proteobacteria,5,0,2310,12 21 | Proteobacteria,6,1,3053,547 22 | Proteobacteria,7,0,395,2174 23 | Proteobacteria,8,1,2651,767 24 | Proteobacteria,9,0,1195,76 25 | Proteobacteria,10,1,6857,795 26 | Proteobacteria,11,0,483,666 27 | Proteobacteria,12,1,2950,3994 28 | Proteobacteria,13,0,1541,816 29 | Proteobacteria,14,1,1307,53 30 | Actinobacteria,1,0,1590,4 31 | Actinobacteria,2,1,25,2 32 | Actinobacteria,3,0,259,300 33 | Actinobacteria,4,1,568,7 34 | Actinobacteria,5,0,1102,9 35 | Actinobacteria,6,1,678,377 36 | Actinobacteria,7,0,260,58 37 | Actinobacteria,8,1,424,233 38 | Actinobacteria,9,0,548,21 39 | Actinobacteria,10,1,201,83 40 | Actinobacteria,11,0,42,75 41 | Actinobacteria,12,1,109,59 42 | Actinobacteria,13,0,51,183 43 | Actinobacteria,14,1,310,204 44 | Bacteroidetes,1,0,67,0 45 | 
Bacteroidetes,2,1,0,0 46 | Bacteroidetes,3,0,85,5 47 | Bacteroidetes,4,1,143,7 48 | Bacteroidetes,5,0,678,2 49 | Bacteroidetes,6,1,4829,209 50 | Bacteroidetes,7,0,74,651 51 | Bacteroidetes,8,1,169,254 52 | Bacteroidetes,9,0,106,10 53 | Bacteroidetes,10,1,73,381 54 | Bacteroidetes,11,0,30,359 55 | Bacteroidetes,12,1,51,51 56 | Bacteroidetes,13,0,2473,2314 57 | Bacteroidetes,14,1,102,33 58 | Other,1,0,195,18 59 | Other,2,1,42,2 60 | Other,3,0,316,43 61 | Other,4,1,202,40 62 | Other,5,0,116,0 63 | Other,6,1,527,12 64 | Other,7,0,357,11 65 | Other,8,1,106,11 66 | Other,9,0,67,14 67 | Other,10,1,203,6 68 | Other,11,0,392,6 69 | Other,12,1,28,25 70 | Other,13,0,12,22 71 | Other,14,1,305,32 -------------------------------------------------------------------------------- /Tutorials/01 - Intro to Pandas/Data/microbiome_MID1.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/01 - Intro to Pandas/Data/microbiome_MID1.xls -------------------------------------------------------------------------------- /Tutorials/01 - Intro to Pandas/Data/microbiome_MID2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/01 - Intro to Pandas/Data/microbiome_MID2.xls -------------------------------------------------------------------------------- /Tutorials/01 - Intro to Pandas/Data/microbiome_missing.csv: -------------------------------------------------------------------------------- 1 | Taxon,Patient,Tissue,Stool 2 | Firmicutes,1,632,305 3 | Firmicutes,2,136,4182 4 | Firmicutes,3,,703 5 | Firmicutes,4,408,3946 6 | Firmicutes,5,831,8605 7 | Firmicutes,6,693,50 8 | Firmicutes,7,718,717 9 | Firmicutes,8,173,33 10 | Firmicutes,9,228,NA 11 | Firmicutes,10,162,3196 12 | Firmicutes,11,372,-99999 13 | Firmicutes,12,4255,4361 14 | Firmicutes,13,107,1667 15 | Firmicutes,14,?,223 16 | Firmicutes,15,281,2377 17 | Proteobacteria,1,1638,3886 18 | Proteobacteria,2,2469,1821 19 | Proteobacteria,3,839,661 20 | Proteobacteria,4,4414,18 21 | Proteobacteria,5,12044,83 22 | Proteobacteria,6,2310,12 23 | Proteobacteria,7,3053,547 24 | Proteobacteria,8,395,2174 25 | Proteobacteria,9,2651,767 26 | Proteobacteria,10,1195,76 27 | Proteobacteria,11,6857,795 28 | Proteobacteria,12,483,666 29 | Proteobacteria,13,2950,3994 30 | Proteobacteria,14,1541,816 31 | Proteobacteria,15,1307,53 32 | Actinobacteria,1,569,648 33 | Actinobacteria,2,1590,4 34 | Actinobacteria,3,25,2 35 | Actinobacteria,4,259,300 36 | Actinobacteria,5,568,7 37 | Actinobacteria,6,1102,9 38 | Actinobacteria,7,678,377 39 | Actinobacteria,8,260,58 40 | Actinobacteria,9,424,233 41 | Actinobacteria,10,548,21 42 | Actinobacteria,11,201,83 43 | Actinobacteria,12,42,75 44 | Actinobacteria,13,109,59 45 | Actinobacteria,14,51,183 46 | Actinobacteria,15,310,204 47 | Bacteroidetes,1,115,380 48 | Bacteroidetes,2,67,0 49 | Bacteroidetes,3,0,0 50 | Bacteroidetes,4,85,5 51 | Bacteroidetes,5,143,7 52 | Bacteroidetes,6,678,2 53 | Bacteroidetes,7,4829,209 54 | Bacteroidetes,8,74,651 55 | Bacteroidetes,9,169,254 56 | Bacteroidetes,10,106,10 57 | Bacteroidetes,11,73,381 58 | Bacteroidetes,12,30,359 59 | Bacteroidetes,13,51,51 60 | Bacteroidetes,14,2473,2314 61 | Bacteroidetes,15,102,33 62 | Other,1,114,277 63 | Other,2,195,18 64 | Other,3,42,2 65 | Other,4,316,43 66 | Other,5,202,40 67 | Other,6,116,0 68 | Other,7,527,12 69 | 
Other,8,357,11 70 | Other,9,106,11 71 | Other,10,67,14 72 | Other,11,203,6 73 | Other,12,392,6 74 | Other,13,28,25 75 | Other,14,12,22 76 | Other,15,305,32 -------------------------------------------------------------------------------- /Tutorials/01 - Intro to Pandas/README.md: -------------------------------------------------------------------------------- 1 | # 01 - Intro to Pandas 2 | 3 | In this tutorial session we explore the fundamental features of [Pandas](http://pandas.pydata.org/), and then we practice the basics of Data Wrangling with some real-world datasets. 4 | 5 | [Give us your feedback on this tutorial!](https://docs.google.com/forms/d/e/1FAIpQLSdkE0HSMVsWD03h3Lr9pYhh5i6U5tXBDfdATwcCgaYYs2spBA/viewform) 6 | 7 | --- 8 | 9 | *Credits to: [Chris Fonnesbeck](https://github.com/fonnesbeck) and [Michele Catasta](https://github.com/pirroh)* 10 | 11 | *Updated by: [Panayiotis Smeros](https://github.com/psmeros)* 12 | -------------------------------------------------------------------------------- /Tutorials/02 - Data From The Web/Data from the Web.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data from the Web - ADA 2019 Tutorial\n", 8 | "\n", 9 | "#### What do you find in this Notebook?\n", 10 | "\n", 11 | "The purpose of the Notebook is to offer a **quick** overview on how to scrape a Web page. In details, we illustrate the two main libraries used for this purpose. Afterwords, we show how to retrieve data from the Web." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Web scraping libraries" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "To fetch data from the Web with Python, you need to get use to two essential libraries:\n", 26 | "\n", 27 | " * [`Requests (HTTP)`](https://requests.kennethreitz.org/en/master/): get the `html` page to parse.\n", 28 | "\n", 29 | " * [`Beautiful Soup (HTML Parsing)`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/): parse the `html` and extract data." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# Import libraries\n", 39 | "import requests\n", 40 | "from bs4 import BeautifulSoup" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "If you have an error about missing modules, try this:\n", 48 | "```\n", 49 | "conda install requests\n", 50 | "conda install beautifulsoup4\n", 51 | "```" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### Make a `get` request\n", 59 | "\n", 60 | "The [GET method](https://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol#Request_methods) retrieves information from the server.\n", 61 | "\n", 62 | "We start scraping this website: https://httpbin.org/ - HTTP Request & Response Service. The website offers some useful endpoints [1] to check the content of our request. Some of them provide an 'echo service' that reply with the request received.\n", 63 | "\n", 64 | "[1] Endpoint is a web address (URL) at which clients of a specific service can gain access to it. By referencing that URL, clients can get to operations provided by that service." 
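As a quick illustration of the echo idea (a minimal sketch; it assumes the `https://httpbin.org/anything` endpoint, which is not used later in this notebook and simply reflects back whatever it receives), we can verify exactly what our client sends:

```python
import requests

# /anything is an echo endpoint: it reflects back the method, query
# parameters, and headers of the request it received
r = requests.get('https://httpbin.org/anything', params={'course': 'ADA'})
echo = r.json()

print(echo['method'])   # 'GET'
print(echo['args'])     # {'course': 'ADA'}
print(echo['headers'])  # the headers the requests library sent on our behalf
```

Echo endpoints like this are handy for debugging, because the response shows exactly what the server received.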
65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "#### Example 1: Get request basics\n", 72 | "Here we show an example on how use a get request. In particular, you see that we can get different information about the response:\n", 73 | "\n", 74 | "* The status code [2] which tells us whether everything is fine and if the request worked\n", 75 | "* The headers\n", 76 | "* Body of the response (typically HTML for webpages or JSON/XML for web services)\n", 77 | "\n", 78 | "[2] Find the reminder of HTTP status codes [here](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes). Some typical codes are: **200 OK** (standard response for successful HTTP requests) and **404 Not Found** (the requested resource could not be found but may be available in the future)." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "**NOTE:** this is an echo service, what you see is what we sent to the server" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# Make the request\n", 95 | "r = requests.get('https://httpbin.org/ip') # /ip: Returns the requester's IP Address.\n", 96 | "\n", 97 | "print('Response status code: {0}\\n'.format(r.status_code))\n", 98 | "print('Response headers: {0}\\n'.format(r.headers))\n", 99 | "print('Response body: {0}'.format(r.text))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "#### Example 2: Parsing JSON string from the response\n", 107 | "\n", 108 | "If the body of the response is a JSON string, Requests offers a convenient way to parse the text and get a Python dictionary.\n", 109 | "\n", 110 | "Let's try to get the current time from here: http://worldtimeapi.org/api/timezone/Europe/Zurich – a simple web service that returns the local-time for a given timezone as either JSON (by default) or plain-text." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "r = requests.get('http://worldtimeapi.org/api/timezone/Europe/Zurich')\n", 120 | "\n", 121 | "print('Response body (parsed json):')\n", 122 | "r.json()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "#### Example 3: Including parameters into get request\n", 130 | "\n", 131 | "This time, the `url` has been slightly changed to include a parameter (key1).\n", 132 | "\n", 133 | "Remember that the with the GET method the parameters are part of the URL." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "r = requests.get('https://httpbin.org/get?key1=value1')\n", 143 | "r.json()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### Make a `post` request\n", 151 | "The [POST method](https://en.wikipedia.org/wiki/POST_(HTTP)) requests that a web server accepts the data enclosed in the body of the request message, most likely for storing it.\n", 152 | "\n", 153 | "A POST request can have the paramenters in the body. 
Let's how to do this with Requests library:" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "payload = {'key1': 'value1', 'key2': 'value2'}\n", 163 | "r = requests.post('https://httpbin.org/post', data=payload)\n", 164 | "r.json()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Make a request and extract the Page Title!" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "* Send the request and get the `html`" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# Send the request\n", 188 | "r = requests.get('https://httpbin.org/html')\n", 189 | "r.text[:300]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "* Thus, we start to use our beloved `BeautifulSoup` to parse the HTML and we get the header" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Extract the header\n", 206 | "soup = BeautifulSoup(r.text, 'html.parser')\n", 207 | "soup.h1" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Let's get interesting data - DBLP\n", 215 | "\n", 216 | "*DBLP is a computer science bibliography website. Starting in 1993 at the University of Trier, Germany, it grew from a small collection of HTML files and became an organization hosting a database and logic programming bibliography site. DBLP listed more than 3.66 million journal articles, conference papers, and other publications on computer science in July 2016, up from about 14,000 in 1995.*\n", 217 | "\n", 218 | "
https://en.wikipedia.org/wiki/DBLP
\n", 219 | "\n", 220 | "We want to check the distribution of the publications by year of the president of EPFL - Martin Vetterli.\n", 221 | "\n", 222 | "First of all, let's check the page with the data we need:" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "URL = 'http://dblp.uni-trier.de/pers/hd/v/Vetterli:Martin'" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "The page is public and accessible with a browser using a simple GET:" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "r = requests.get(URL)\n", 248 | "page_body = r.text" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Now the page content is downloaded and we can inspect the body of the response:" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "page_body[:300]" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "It is pure HTML, and we need BeautifulSoup to parse the content. We can specify the parser we want to use html.parser, lxml, lxml-xml, xml, html5lib. Each of them has advantages and disadvantages - see [documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser)." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "soup = BeautifulSoup(page_body, 'html.parser')" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "Now the page is parsed and we can read the data we need!\n", 288 | "\n", 289 | "For example, let's get the title! Are we in the right page?" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "soup.title" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Yes! And we can get the clean text without HTML tags:" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "soup.title.string" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "A more complex query now! Let's find all the links in the page. \n", 322 | "\n", 323 | "In HTML a link is defined using the tag <A>, and BeautifulSoup offers an easy way to find them:" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "all_links = soup.find_all('a')\n", 333 | "print('The webpage cointains {0} links...'.format(len(all_links)))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "external_links = 0\n", 343 | "for link in all_links:\n", 344 | " if(not link.get('href').startswith('http://dblp.uni-trier.de/')\n", 345 | " and link.get('href').startswith('http')): # just an example, you need more checks\n", 346 | " external_links += 1\n", 347 | "\n", 348 | "print('... 
and {0} of them point to external websites.'.format(external_links))" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "Let's move on. Now we want to extract the sections that contain the publication details.\n", 356 | "\n", 357 | "**The easiest way is to inspect the DOM of the web page with a browser.** Check with your browser how to isolate the portions of the page that represent publications. --- Task not in this Notebook ---\n", 358 | "\n", 359 | "Ok, each row is composed by a <li> tag and has a class called 'entry':" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "publications_wrappers = soup.find_all('li', class_='entry')" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "Let's check the number of rows:" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "print('Total number of items: {0}'.format(len(publications_wrappers)))" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "for p in publications_wrappers:\n", 394 | " print(p.find('span', class_='title').text)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "import pandas as pd\n", 404 | "%matplotlib inline" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "publications_list = []\n", 414 | "for p in publications_wrappers:\n", 415 | " title = p.find('span', class_='title').text # get the title\n", 416 | " authos_list = p.find_all('span', {'itemprop': 'author'}) # get the authors list\n", 417 | " authors = [author.text for author in authos_list] \n", 418 | " year = p.find('span', {'itemprop': 'datePublished'}).text\n", 419 | " publications_list.append({'title': title, \n", 420 | " 'authors': authors, \n", 421 | " 'year': int(year)}) # here you should validate the data\n", 422 | "\n", 423 | "publications = pd.DataFrame.from_dict(publications_list)\n", 424 | "publications.head()" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "publications.groupby('year')\\\n", 434 | " .count()\\\n", 435 | " .rename(columns = {'title':'count'})\\\n", 436 | " .plot(y='count', kind='bar', grid=True, figsize=(10, 6), title='Data from: ' + URL)" 437 | ] 438 | } 439 | ], 440 | "metadata": { 441 | "anaconda-cloud": {}, 442 | "kernelspec": { 443 | "display_name": "Python [conda env:ada] *", 444 | "language": "python", 445 | "name": "conda-env-ada-py" 446 | }, 447 | "language_info": { 448 | "codemirror_mode": { 449 | "name": "ipython", 450 | "version": 3 451 | }, 452 | "file_extension": ".py", 453 | "mimetype": "text/x-python", 454 | "name": "python", 455 | "nbconvert_exporter": "python", 456 | "pygments_lexer": "ipython3", 457 | "version": "3.7.4" 458 | } 459 | }, 460 | "nbformat": 4, 461 | "nbformat_minor": 4 462 | } 463 | -------------------------------------------------------------------------------- /Tutorials/02 - Data From The Web/README.md: -------------------------------------------------------------------------------- 1 | # 02 - Data from the Web 2 | 
3 | The purpose of this tutorial session is to offer a **quick** overview on how to scrape a Web page. In details, we illustrate the two main libraries used for this purpose: [Requests](https://requests.kennethreitz.org/en/master/) and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/). Afterwords, we show how to retrieve data from the Web. 4 | 5 | [Give us your feedback on this tutorial!](https://forms.gle/GdEwU2uvc6hPZKMW9) 6 | 7 | --- 8 | 9 | *Credits to: [Tiziano Piccardi](https://github.com/tizianopiccardi)* 10 | 11 | *Updated by: [Ekaterina Svikhnushina](https://github.com/Sea94)* 12 | 13 | -------------------------------------------------------------------------------- /Tutorials/03 - Data Visualization/Folium/README.md: -------------------------------------------------------------------------------- 1 | # Data Visualization - ADA 2018 Tutorial 2 | 3 | --- 4 | 5 | #### What do you find in this notebook ? 6 | 7 | The purpose of the Notebook is to offer a **quick** overview on how to build maps with [`folium`](https://folium.readthedocs.io/en/latest/). In details, we go through the basics of the `folium` API, then apply these methods to a simple visualization of the results of the 2016 US presidential election. 8 | 9 | This is not meant to be an exhaustive documentation of `folium`, its features or its API, but can be useful in the context of Homework 2. 10 | 11 | --- 12 | 13 | Original credits : Dylan Bourgeois 14 | -------------------------------------------------------------------------------- /Tutorials/03 - Data Visualization/Folium/US_Election_2016.csv: -------------------------------------------------------------------------------- 1 | State,Percentage 2 | AL,62.9 3 | AK,52.9 4 | AZ,49.5 5 | AR,60.4 6 | CA,32.8 7 | CO,44.4 8 | CT,41.2 9 | DE,41.9 10 | FL,49.1 11 | GA,51.3 12 | HI,30.1 13 | ID,59.2 14 | IL,39.4 15 | IN,57.2 16 | IA,51.8 17 | KS,57.2 18 | KY,62.5 19 | LA,58.1 20 | ME,45.2 21 | MD,35.3 22 | MA,33.5 23 | MI,47.6 24 | MN,45.4 25 | MS,58.3 26 | MO,57.1 27 | MT,56.5 28 | NE,60.3 29 | NV,45.5 30 | NH,47.2 31 | NJ,41.8 32 | NM,40.0 33 | NY,58.8 34 | NC,50.5 35 | ND,64.1 36 | OH,52.1 37 | OK,65.3 38 | OR,41.1 39 | PA,48.8 40 | RI,55.4 41 | SC,54.9 42 | SD,61.5 43 | TN,61.1 44 | TX,52.6 45 | UT,45.9 46 | VT,61.1 47 | VA,45 48 | WA,38.2 49 | WV,68.7 50 | WI,47.9 51 | WY,70.1 52 | -------------------------------------------------------------------------------- /Tutorials/03 - Data Visualization/Folium/US_Election_2016_binary.csv: -------------------------------------------------------------------------------- 1 | State,Winner 2 | AL,1 3 | AK,1 4 | AZ,1 5 | AR,1 6 | CA,2 7 | CO,2 8 | CT,2 9 | DE,2 10 | FL,1 11 | GA,1 12 | HI,2 13 | ID,1 14 | IL,2 15 | IN,1 16 | IA,1 17 | KS,1 18 | KY,1 19 | LA,1 20 | ME,2 21 | MD,2 22 | MA,2 23 | MI,1 24 | MN,2 25 | MS,1 26 | MO,1 27 | MT,1 28 | NE,1 29 | NV,2 30 | NH,2 31 | NJ,2 32 | NM,2 33 | NY,2 34 | NC,1 35 | ND,1 36 | OH,1 37 | OK,1 38 | OR,2 39 | PA,1 40 | RI,2 41 | SC,1 42 | SD,1 43 | TN,1 44 | TX,1 45 | UT,1 46 | VT,2 47 | VA,2 48 | WA,2 49 | WV,1 50 | WI,1 51 | WY,1 52 | -------------------------------------------------------------------------------- /Tutorials/03 - Data Visualization/README.md: -------------------------------------------------------------------------------- 1 | # Data Visualization - ADA 2019 Tutorial 2 | 3 | --- 4 | 5 | #### What do you find in this notebook ? 
6 | 7 | The purpose of the Notebook is to offer a **quick** overview on how to use [`Matplotlib`](https://matplotlib.org/) / [`Seaborn`](https://seaborn.pydata.org/) and how to build maps with [`Folium`](https://folium.readthedocs.io/en/latest/). In details, we go through the basics of the `folium` API, then apply these methods to a simple visualization of the results of the 2016 US presidential election. 8 | 9 | This is not meant to be an exhaustive documentation of the libraries, their features or their API, but can be useful to get started with the Python visualization ecosystem. 10 | 11 | --- 12 | 13 | Credits (intro) : Jérémie Rappaz 14 | Credits (Folium): Dylan Bourgeois 15 | -------------------------------------------------------------------------------- /Tutorials/04 - Scaling Up/.gitignore: -------------------------------------------------------------------------------- 1 | *.gz 2 | -------------------------------------------------------------------------------- /Tutorials/04 - Scaling Up/PySpark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EPFL ADA 2019 - PySpark Tutorial" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Let's import the libraries we need\n", 17 | "import pandas as pd\n", 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pyspark.sql import *\n", 23 | "from pyspark.sql.functions import *\n", 24 | "from pyspark import SparkContext" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "Let's initialize the Spark context:" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# create the session\n", 41 | "spark = SparkSession.builder.getOrCreate()\n", 42 | "\n", 43 | "# create the context\n", 44 | "sc = spark.sparkContext" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "You can easily check the current version and get the link of the web interface. In the Spark UI, you can monitor the progress of your job and debug the performance bottlenecks:" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "spark" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Vietnam War (Nov 1, 1955 – Apr 30, 1975)\n", 68 | "\n", 69 | "**Pres. Johnson**: _What do you think about this Vietnam thing? I’d like to hear you talk a little bit._\n", 70 | "\n", 71 | "**Sen. Russell**: _Well, frankly, Mr. President, it’s the damn worse mess that I ever saw, and I don’t like to brag and I never have been right many times in my life, but I knew that we were going to get into this sort of mess when we went in there._\n", 72 | "\n", 73 | "May 27, 1964\n", 74 | "\n", 75 | "![banner](img/banner.jpg)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "----\n", 83 | "\n", 84 | "The Vietnam War, also known as the Second Indochina War, and in Vietnam as the Resistance War Against America or simply the American War, was a conflict that occurred in Vietnam, Laos, and Cambodia from 1 November 1955 to the fall of Saigon on 30 April 1975. 
It was the second of the Indochina Wars and was officially fought between North Vietnam and the government of South Vietnam.\n", 85 | "\n", 86 | "**The dataset describes all the air force operation in during the Vietnam War.**\n", 87 | "\n", 88 | "**Bombing_Operations** [Get the dataset here](https://drive.google.com/a/epfl.ch/file/d/11cUKbpt-CZY-IVsAkaGXqwZlEzPvf0j5/view?usp=sharing)\n", 89 | "\n", 90 | "- AirCraft: _Aircraft model (example: EC-47)_\n", 91 | "- ContryFlyingMission: _Country_\n", 92 | "- MissionDate: _Date of the mission_\n", 93 | "- OperationSupported: _Supported War operation_ (example: [Operation Rolling Thunder](https://en.wikipedia.org/wiki/Operation_Rolling_Thunder))\n", 94 | "- PeriodOfDay: _Day or night_\n", 95 | "- TakeoffLocation: _Take off airport_\n", 96 | "- TimeOnTarget\n", 97 | "- WeaponType\n", 98 | "- WeaponsLoadedWeight\n", 99 | "\n", 100 | "**Aircraft_Glossary** [Get the dataset here](https://drive.google.com/a/epfl.ch/file/d/1lrG1gt6Zz3T0Oe_MTpzNoU-S6gaTfonC/view?usp=sharing)\n", 101 | "\n", 102 | "- AirCraft: _Aircraft model (example: EC-47)_\n", 103 | "- AirCraftName\n", 104 | "- AirCraftType\n", 105 | "\n", 106 | "**Dataset Information:**\n", 107 | "\n", 108 | "THOR is a painstakingly cultivated database of historic aerial bombings from World War I through Vietnam. THOR has already proven useful in finding unexploded ordnance in Southeast Asia and improving Air Force combat tactics:\n", 109 | "https://www.kaggle.com/usaf/vietnam-war-bombing-operations" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Load the datasets:" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "Bombing_Operations = spark.read.json(\"Bombing_Operations.json.gz\")\n", 126 | "Aircraft_Glossary = spark.read.json(\"Aircraft_Glossary.json.gz\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "Check the schema:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "Bombing_Operations.printSchema()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "Aircraft_Glossary.printSchema()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Get a sample with `take()`:" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "Bombing_Operations.take(3)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "Get a formatted sample with `show()`:" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "Aircraft_Glossary.show()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "print(\"In total there are {0} operations\".format(Bombing_Operations.count()))" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## Question 1: Which countries are involved and in how many missions? 
\n", 200 | "\n", 201 | "Keywords: `Dataframe API`, `SQL`, `group by`, `sort`" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "Let's group the missions by `ContryFlyingMission` and count how many records exist:" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "scrolled": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "missions_counts = Bombing_Operations.groupBy(\"ContryFlyingMission\")\\\n", 220 | " .agg(count(\"*\").alias(\"MissionsCount\"))\\\n", 221 | " .sort(desc(\"MissionsCount\"))\n", 222 | "missions_counts.show()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "In this case we used the DataFrame API, but we could rewite the `groupBy` using pure SQL:" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "Bombing_Operations.registerTempTable(\"Bombing_Operations\")\n", 239 | "\n", 240 | "query = \"\"\"\n", 241 | "SELECT ContryFlyingMission, count(*) as MissionsCount\n", 242 | "FROM Bombing_Operations\n", 243 | "GROUP BY ContryFlyingMission\n", 244 | "ORDER BY MissionsCount DESC\n", 245 | "\"\"\"\n", 246 | "\n", 247 | "missions_counts = spark.sql(query)\n", 248 | "missions_counts.show()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "The Dataframe is small enough to be moved to Pandas:" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "missions_count_pd = missions_counts.toPandas()\n", 265 | "missions_count_pd.head()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "Let's plot a barchart with the number of missions by country:" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "pl = missions_count_pd.plot(kind=\"bar\", \n", 282 | " x=\"ContryFlyingMission\", y=\"MissionsCount\", \n", 283 | " figsize=(10, 7), log=True, alpha=0.5, color=\"olive\")\n", 284 | "pl.set_xlabel(\"Country\")\n", 285 | "pl.set_ylabel(\"Number of Missions (Log scale)\")\n", 286 | "pl.set_title(\"Number of missions by contry\")" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "----" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## Questions 2: Show the number of missions in time for each of the countries involved.\n", 301 | "\n", 302 | "Keywords: `group by`, `parse date`, `plot`\n", 303 | "\n", 304 | "Let's select the relevant columns:" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "missions_countries = Bombing_Operations.selectExpr([\"to_date(MissionDate) as MissionDate\", \"ContryFlyingMission\"])\n", 314 | "missions_countries" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "The filed MissionDate is converted to a Python `date` object.\n", 322 | "\n", 323 | "Now we can group by `MissionDate` and `ContryFlyingMission` to get the count:" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | 
"source": [ 332 | "missions_by_date = missions_countries\\\n", 333 | " .groupBy([\"MissionDate\", \"ContryFlyingMission\"])\\\n", 334 | " .agg(count(\"*\").alias(\"MissionsCount\"))\\\n", 335 | " .sort(asc(\"MissionDate\")).toPandas()\n", 336 | "missions_by_date.head()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "Now we can plot the content with a different series for each country:" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "fig = plt.figure(figsize=(10, 6))\n", 353 | "\n", 354 | "# iterate the different groups to create a different series\n", 355 | "for country, missions in missions_by_date.groupby(\"ContryFlyingMission\"): \n", 356 | " plt.plot(missions[\"MissionDate\"], missions[\"MissionsCount\"], label=country)\n", 357 | "\n", 358 | "plt.legend(loc='best')" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "We can observe how South Vietnam increased its missions starting from 1970. The drop in 1973 is motivated by the [Paris Peace Accords](https://en.wikipedia.org/wiki/Paris_Peace_Accords) that took place on January 27th, 1973, to establish peace in Vietnam and end the war." 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "----" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "## Question 3: Who bombed this location?\n", 380 | "\n", 381 | "Keywords: `RDD map reduce` `cache` `save results`\n", 382 | "\n", 383 | "\n", 384 | "\n", 385 | "This picture is the Hanoi POL facility (North Vietnam) burning after it was attacked by the U.S. Air Force on 29 June 1966 in the context of the Rolling Thunder operation. \n", 386 | "\n", 387 | "We are interested in discovering what was the most common take-off location during that day." 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "jun_29_operations = Bombing_Operations.where(\"MissionDate = '1966-06-29' AND TargetCountry='NORTH VIETNAM'\")" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "Which coutries scheduled missions that day?" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "jun_29_operations.groupBy(\"ContryFlyingMission\").agg(count(\"*\").alias(\"MissionsCount\")).toPandas()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "Most of the operation that day were performed by USA airplanes." 
420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "jun_29_operations.take(1)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "You can specify to cache the content in memory:" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "jun_29_operations.cache()" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "Now you can count the number of rows and move the content to the cache:" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "%time jun_29_operations.count()" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "The second time the content is cached and it is faster:" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "%time jun_29_operations.count()" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "I can also save the results on a file..." 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "jun_29_operations.write.mode('overwrite').json(\"jun_29_operations.json\")" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "... and read from the file:" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "jun_29_operations = spark.read.json(\"jun_29_operations.json\")" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "We can use the simple DataFrame API..." 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "TakeoffLocationCounts = jun_29_operations\\\n", 525 | " .groupBy(\"TakeoffLocation\").agg(count(\"*\").alias(\"MissionsCount\"))\\\n", 526 | " .sort(desc(\"MissionsCount\"))\n", 527 | "TakeoffLocationCounts.show()" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "... 
or the explicit Map/Reduce format with RDDs.\n", 535 | "\n", 536 | "First we emit a pair in the format (Location, 1):" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "all_locations = jun_29_operations.rdd.map(lambda row: (row.TakeoffLocation, 1))\n", 546 | "all_locations.take(3)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "Then, we sum counters in the reduce step, and we sort by count:" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "locations_counts_rdd = all_locations.reduceByKey(lambda a, b: a+b).sortBy(lambda r: -r[1])\n", 563 | "locations_counts_rdd.take(3)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "Now we can convent the RDD in dataframe by mapping the pairs to objects of type `Row`" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "locations_counts_with_schema = locations_counts_rdd.map(lambda r: Row(TakeoffLocation=r[0], MissionsCount=r[1]))\n", 580 | "locations_counts = spark.createDataFrame(locations_counts_with_schema)\n", 581 | "locations_counts.show()" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "\n", 589 | "\n", 590 | "\n", 591 | "That day the most common take-off location was the ship USS Constellation (CV-64). We cannot univocally identify one take off location, but we can reduce the possible candidates. Next step could be: explore TimeOnTarget feature.\n", 592 | "\n", 593 | "_USS Constellation (CV-64), a Kitty Hawk-class supercarrier, was the third ship of the United States Navy to be named in honor of the \"new constellation of stars\" on the flag of the United States. One of the fastest ships in the Navy, as proven by her victory during a battlegroup race held in 1985, she was nicknamed \"Connie\" by her crew and officially as \"America's Flagship\"._" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "----" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "## Questions 4: What is the most used aircraft type during the Vietnam war (number of missions)?\n", 608 | "\n", 609 | "Keywords: `join` `group by`" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "Let's check the content of `Aircraft_Glossary`:" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "Aircraft_Glossary.show(5)" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "We are interested in the filed `AirCraftType`." 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "Bombing_Operations.select(\"AirCraft\").show(5)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "We can join on the column `AirCraft` of both dataframes." 
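One detail worth noting before joining: when the join key has the same name in both dataframes, passing just the column name keeps a single `AirCraft` column in the result (a minimal sketch using the same two dataframes):

```python
# Passing the column name (instead of an equality condition) deduplicates the key column
missions_joined = Bombing_Operations.join(Aircraft_Glossary, on="AirCraft")
missions_joined.select("AirCraft", "AirCraftType").show(5)
```

Joining on the name also avoids the ambiguous-column errors that can appear when the duplicated key is referenced later.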
649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "With Dataframe API:" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "missions_joined = Bombing_Operations.join(Aircraft_Glossary, \n", 665 | " Bombing_Operations.AirCraft == Aircraft_Glossary.AirCraft)\n", 666 | "missions_joined" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": {}, 672 | "source": [ 673 | "We can select only the field we are interested in:" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "missions_aircrafts = missions_joined.select(\"AirCraftType\")\n", 683 | "missions_aircrafts.show(5)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "And finally we can group by `AirCraftType` and count:" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "missions_aircrafts.groupBy(\"AirCraftType\").agg(count(\"*\").alias(\"MissionsCount\"))\\\n", 700 | " .sort(desc(\"MissionsCount\"))\\\n", 701 | " .show()" 702 | ] 703 | }, 704 | { 705 | "cell_type": "markdown", 706 | "metadata": {}, 707 | "source": [ 708 | "In alternative we can rewrite this in pure SQL:" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "Bombing_Operations.registerTempTable(\"Bombing_Operations\")\n", 718 | "Aircraft_Glossary.registerTempTable(\"Aircraft_Glossary\")\n", 719 | "\n", 720 | "query = \"\"\"\n", 721 | "SELECT AirCraftType, count(*) MissionsCount\n", 722 | "FROM Bombing_Operations bo\n", 723 | "JOIN Aircraft_Glossary ag\n", 724 | "ON bo.AirCraft = ag.AirCraft\n", 725 | "GROUP BY AirCraftType\n", 726 | "ORDER BY MissionsCount DESC\n", 727 | "\"\"\"\n", 728 | "\n", 729 | "spark.sql(query).show()" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "The aircrafts of type `Fighter Jet Bomber` participated in most of the missions in the Vietnam war.\n", 737 | "\n", 738 | "Note: This data need cleaning and normalization. 
See `Fighter Jet Bomber`, `Jet Fighter Bomber`, `Fighter bomber jet`" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [] 747 | } 748 | ], 749 | "metadata": { 750 | "kernelspec": { 751 | "display_name": "Python 3", 752 | "language": "python", 753 | "name": "python3" 754 | }, 755 | "language_info": { 756 | "codemirror_mode": { 757 | "name": "ipython", 758 | "version": 3 759 | }, 760 | "file_extension": ".py", 761 | "mimetype": "text/x-python", 762 | "name": "python", 763 | "nbconvert_exporter": "python", 764 | "pygments_lexer": "ipython3", 765 | "version": "3.7.3" 766 | } 767 | }, 768 | "nbformat": 4, 769 | "nbformat_minor": 2 770 | } 771 | -------------------------------------------------------------------------------- /Tutorials/04 - Scaling Up/img/Hanoi_POL1966.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/04 - Scaling Up/img/Hanoi_POL1966.jpg -------------------------------------------------------------------------------- /Tutorials/04 - Scaling Up/img/USS_Constellation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/04 - Scaling Up/img/USS_Constellation.jpg -------------------------------------------------------------------------------- /Tutorials/04 - Scaling Up/img/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/04 - Scaling Up/img/banner.jpg -------------------------------------------------------------------------------- /Tutorials/04 - Scaling Up/words_count.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from pyspark.sql import * 4 | from pyspark import SparkContext, SQLContext 5 | 6 | # context initialization 7 | sc = SparkContext() 8 | sqlContext = SQLContext(sc) 9 | 10 | # regex to get one word 11 | word_regex = re.compile(r'\w+') 12 | 13 | # read the input file line by line 14 | text_file = sc.textFile("frankenstein.txt") 15 | 16 | 17 | # convert a text line to words vector 18 | def get_line_words(line): 19 | return word_regex.findall(line.lower()) 20 | 21 | # count the words and sort in descending order 22 | counts_rdd = text_file.flatMap(get_line_words) \ 23 | .map(lambda word: (word, 1)) \ 24 | .reduceByKey(lambda a, b: a + b) \ 25 | .sortBy(lambda wc: -wc[1]) 26 | 27 | # convert to dataframe 28 | counts = sqlContext.createDataFrame(counts_rdd.map(lambda wc: Row(word=wc[0], count=wc[1]))) 29 | 30 | # save to json 31 | counts.write.json("frankenstein_words_count.txt") -------------------------------------------------------------------------------- /Tutorials/05 - Good Code Practices/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial 5: Good coding practices 2 | 3 | This tutorial serves as a collection of both general programming tips and tricks, and specific Pythonic (or ADAic) tips to help improve code quality, readability and extensibility. It is not a thorough lecture on programming, but only serves as an introduction to what good coding practices are and where more good practices can be found. 
The tutorial is most useful for beginners to Python (more so beginners to programming), but nevertheless could also prove useful to more experienced programmers. 4 | -------------------------------------------------------------------------------- /Tutorials/05 - Good Code Practices/good_coding_practices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Good Coding Practices\n", 8 | "\n", 9 | "This is a small guide on *how-to-code*. Although it's quite succint, there are a few things you should know:\n", 10 | " * Remember that the *you* from two months ago does not reply to emails about what this function does and what that variable is.\n", 11 | " * Don't try to be a perfectionist when first learning a new language (there is no need to spend weeks going through documentation).\n", 12 | " * These are *good enough* practices to ease your code maintenance and collaboration. You'll have your whole career to find out the *best* practices (if they even exist...).\n", 13 | " * Follow the KISS principle (Keep it simple, *s’ilvousplait*).\n", 14 | " * The goal of coding is not to code faster but to write maintainable code that can be understood and modified more easily in the future, potentially by developers other than you.\n", 15 | " * Consistency is king. Even if your approach is not the best, by keeping it consistent people will be able to understand and deal with your code much more easily." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 1. Coding Style\n", 23 | "\n", 24 | "Especially if you are a beginner, using text editors (or IDEs) may help you a lot in the process. PyCharm, for example, gives you easy acces to code predictions and methods' documentation. Aside from that, the way you structure your code should follow certain guidelines. PEP8 is the latest official standard for Python. 
In a nutshell, this standard suggests:\n", 25 | "\n", 26 | " * Use either spaces or tabs to indent your code (but be consistent with your choice):\n", 27 | " \n", 28 | " ```python\n", 29 | " def space_indented():\n", 30 | " pass\n", 31 | " ```\n", 32 | " **Note:** Although using tabs is not the end of the world, mixing tabs and spaces might very well be.\n", 33 | " \n", 34 | " * Surround top-level functions and class definitions with two blank lines, while only leaving one line for other methods/functions:\n", 35 | " \n", 36 | " ```python\n", 37 | " # CODE ABOVE\n", 38 | " \n", 39 | " class MyClass:\n", 40 | " \n", 41 | " def __init__(self):\n", 42 | " pass\n", 43 | " \n", 44 | " # CODE BELOW\n", 45 | " ```\n", 46 | " \n", 47 | " * Imports should be on top, one per line and in increasing order of specificity:\n", 48 | " \n", 49 | " ```python\n", 50 | " import library\n", 51 | " import library as lb\n", 52 | " from library import module\n", 53 | " from library import module_one, module_two\n", 54 | " ```\n", 55 | " \n", 56 | " * String delimiters should not matter, unless you want to avoid manually escaping characters with '\\':\n", 57 | " \n", 58 | " ```python\n", 59 | " print('Single quotes look nice.')\n", 60 | " print(\"So do double quotes.\")\n", 61 | " print('But backslashes in the middle of code don\\'t.')\n", 62 | " ```\n", 63 | " \n", 64 | " * White spaces are good when used properly (like after a punctuation mark) but should be avoided when unnecessary:\n", 65 | " \n", 66 | " ```python\n", 67 | " # Good practice\n", 68 | " spam(ham[1], {eggs: 2})\n", 69 | " \n", 70 | " # Bad practice\n", 71 | " spam( ham[ 1 ], { eggs: 2 } )\n", 72 | " ```\n", 73 | " \n", 74 | " * Do not inline *if*, *for* and *while*:\n", 75 | " \n", 76 | " ```python\n", 77 | " # Good practice\n", 78 | " if foo == 'blah':\n", 79 | " do_blah_thing()\n", 80 | " else:\n", 81 | " do_non_blah_thing()\n", 82 | " \n", 83 | " # Bad practice\n", 84 | " if foo == 'blah': do_blah_thing()\n", 85 | " else: do_non_blah_thing()\n", 86 | " ```\n", 87 | " \n", 88 | " * Commenting should be used whenever necessary (but it's not always necessary):\n", 89 | " \n", 90 | " ```python\n", 91 | " # Good practice\n", 92 | " x = x + 1\n", 93 | " \n", 94 | " # Bad practice\n", 95 | " x = x + 1 # Increment by one\n", 96 | " ```\n", 97 | " \n", 98 | " * Naming should obey these simple patterns:\n", 99 | " \n", 100 | " ```python\n", 101 | " # Variables should be named in lower case\n", 102 | " variable_name = None\n", 103 | " \n", 104 | " # Functions should be named like variables\n", 105 | " def function_name(): pass\n", 106 | " \n", 107 | " # Classes should be named in CamelCase\n", 108 | " class ClassName: pass\n", 109 | " \n", 110 | " # Constants should be name in all caps\n", 111 | " CONSTANT_NAME = None\n", 112 | " ```\n", 113 | " \n", 114 | " * You should stay away from global variables (and assign default parameter values instead):\n", 115 | " \n", 116 | " ```python\n", 117 | " # Good practice\n", 118 | " def inc(x, step = 1):\n", 119 | " return x + step\n", 120 | " \n", 121 | " # Bad practice\n", 122 | " step = 1\n", 123 | " def inc(x):\n", 124 | " return x + step\n", 125 | " ```\n", 126 | " \n", 127 | " * You should avoid using absolute file paths and use relative paths instead:\n", 128 | " \n", 129 | " ```python\n", 130 | " # Good practice\n", 131 | " with open('./files/info.txt', 'r') as file:\n", 132 | " print(file.read())\n", 133 | " \n", 134 | " # Bad practice\n", 135 | " with open('/home/user/Documents/files/info.txt') as 
file:\n", " print(file.read())\n", " ```\n", " * Note that \"./\" refers to the current folder/directory; use \"../\" to select the parent directory. \n", "

\n", 140 | " " 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## 2. Python Tips\n", 148 | " \n", 149 | " * The Collections library is your friend when using dictionaries:\n", 150 | " \n", 151 | " ```python\n", 152 | " # Without Collections\n", 153 | " todo_list = {}\n", 154 | " if 'ADA' not in todo_list:\n", 155 | " todo_list['ADA'] = list([])\n", 156 | " todo_list['ADA'].append('Homework 2')\n", 157 | " \n", 158 | " # With Collections\n", 159 | " todo_list = collections.defaultdict(list)\n", 160 | " todo_list['ADA'].append('Homework 2')\n", 161 | " ```\n", 162 | " \n", 163 | " * Collections library is your friend, for counting multiple objects:\n", 164 | " \n", 165 | " ```python \n", 166 | " # Without Collections\n", 167 | " counter = {}\n", 168 | " counter['apples'] = 0\n", 169 | " counter['oranges'] = 0\n", 170 | " counter['apples'] += 1\n", 171 | " counter['oranges'] += 1\n", 172 | " \n", 173 | " # With Collections\n", 174 | " counter = collections.Counter()\n", 175 | " counter['apples'] += 1\n", 176 | " counter['oranges'] += 1\n", 177 | " ```\n", 178 | " \n", 179 | " * Reading files should be done with the following construct:\n", 180 | " \n", 181 | " ```python\n", 182 | " # Good practice\n", 183 | " with open('file', 'r') as file:\n", 184 | " print(file.read())\n", 185 | " \n", 186 | " # Bad practice\n", 187 | " file = open('file', 'r')\n", 188 | " try:\n", 189 | " print(file.read())\n", 190 | " finally:\n", 191 | " file.close()\n", 192 | " ```\n", 193 | " \n", 194 | " **Note:** Using `open(...) as ...` automatically closes the file after the block finishes running.\n", 195 | " \n", 196 | " * You can use pickle to store complex objects in memory (saving time when repeating the same operations):\n", 197 | " \n", 198 | " ```python\n", 199 | " import pickle\n", 200 | " \n", 201 | " def load_pickle(file_path):\n", 202 | " with open(file_path, 'rb') as file:\n", 203 | " return pickle.load(file)\n", 204 | " \n", 205 | " def save_pickle(result, file_path = 'pickle'):\n", 206 | " with open(file_path, 'wb') as file:\n", 207 | " pickle.dump(result, file_path)\n", 208 | " \n", 209 | " def very_complex_operation():\n", 210 | " try:\n", 211 | " return load_pickle('pickle')\n", 212 | " except (FileNotFoundError, EOFError) as e:\n", 213 | " # 30 minute long computation\n", 214 | " save_pickle(result)\n", 215 | " return result\n", 216 | " \n", 217 | " print(very_complex_operation()) # Takes ~30 minutes\n", 218 | " print(very_complex_operation()) # Takes a couple of seconds\n", 219 | " ```\n", 220 | " \n", 221 | " * String formatting is easy if you know how to do it:\n", 222 | " \n", 223 | " ```python\n", 224 | " # This will convert the integer to string automatically before printing.\n", 225 | " print(num_of_apples)\n", 226 | "\n", 227 | " # This however, will not. 
\n", 228 | " print(\"There are \" + num_of_apples + \" apples on the table.\")\n", 229 | "\n", 230 | " # You can convert the integers to strings manually, but it's a bad practice.\n", 231 | " print(\"There are \" + str(num_of apples) + \" apples on the table.\")\n", 232 | "\n", 233 | " # Instead, we recommend you to format strings using %\n", 234 | " print(\"There are %d apples on the table.\" % (num_off_apples))\n", 235 | " ```\n", 236 | "\n", 237 | " **Note:** Use %d for decimals/integers, %f for floats (alternatively %.**x**f to specify a precision of **x**), and %s for strings (or objects with string representations).\n", 238 | "\n", 239 | "* Define configuration-related variables at the top of your modules or scripts. This is a data science class, so you are going to process lots of data by reusing the same code - and this will allow you to re-configure it easily:\n", 240 | "\n", 241 | " ```python\n", 242 | " import pandas as pd\n", 243 | " \n", 244 | " DATA_PATH = './data/'\n", 245 | " DEFAULT_ENCODING = 'UTF8'\n", 246 | " DEFAULT_COMPRESSION = 'gzip'\n", 247 | " \n", 248 | " # CODE BELOW\n", 249 | " ```\n", 250 | " \n", 251 | " * Also, you should avoid manually loading every single file you need to process: \n", 252 | " \n", 253 | " ```python\n", 254 | " # Looping through every file in a directory\n", 255 | " from os import listdir\n", 256 | " \n", 257 | " DATA_PATH = './data/'\n", 258 | " \n", 259 | " def process_data(path = DATA_PATH):\n", 260 | " for file in listdir(path):\n", 261 | " do_something(path + file)\n", 262 | " \n", 263 | " # Providing the target files through the command line\n", 264 | " from argparse import ArgumentParser\n", 265 | " \n", 266 | " parser = ArgumentParser()\n", 267 | " parser.add_argument(\"--filename\", help=\"Name of the file to process\", type = str)\n", 268 | " args = parser.parse_args()\n", 269 | " \n", 270 | " print(args.filename)\n", 271 | " ```\n", 272 | " \n", 273 | " **Note:** An indepth guide can on command line parsing can be found [here](https://docs.python.org/3/howto/argparse.html).\n", 274 | " " 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | " ## 3. Code Organization\n", 282 | " \n", 283 | " This part definitely comes naturally with practice so here we simply give you simple pointers to get you started:\n", 284 | " \n", 285 | "* Every *.py* file should be solely comprised of functions and/or classes at its top level (and their necessary imports). Any executable code should go into the *main* function:\n", 286 | " \n", 287 | " ```python\n", 288 | " import math\n", 289 | " \n", 290 | " def factorial(n):\n", 291 | " return math.factorial(n)\n", 292 | " \n", 293 | " if __name__ == '__main__':\n", 294 | " print(factorial(3))\n", 295 | " ```\n", 296 | " \n", 297 | "* Use docstring for documenting your code. 
More than allowing text editors and IDEs to show you this information afterwards, it will make your code more understandeable and maintainable:\n", 298 | " \n", 299 | " ```python\n", 300 | " '''\n", 301 | " File name: test.py\n", 302 | " Author: ADA\n", 303 | " Date created: 03/10/2018\n", 304 | " Date last modified: 01/11/2019\n", 305 | " Python Version: 3.6\n", 306 | " '''\n", 307 | " import math\n", 308 | "\n", 309 | " def factorial(n):\n", 310 | " '''\n", 311 | " Calculate the factorial of a number.\n", 312 | " :param n: int\n", 313 | " :return: result\n", 314 | " '''\n", 315 | " return math.factorial(n)\n", 316 | " \n", 317 | " if __name__ == '__main__':\n", 318 | " print(factorial(3))\n", 319 | " ```\n", 320 | " **Note:** Docstring help your IDE recognizing the type of an argument (in this example, PyCharm will recognize n is an int).\n", 321 | " \n", 322 | "* Keep differently purposed sections of your code in different *.py* files (and import them as you'd like):\n", 323 | " \n", 324 | " ```python\n", 325 | " # simple_plot.py\n", 326 | " def plot(x):\n", 327 | " raise NotImplementedError\n", 328 | " \n", 329 | " # complex_math.py\n", 330 | " def factorial(n):\n", 331 | " raise NotImplementedError\n", 332 | " \n", 333 | " # main.py\n", 334 | " from simple_plot import plot\n", 335 | " from complex_math import factorial\n", 336 | " \n", 337 | " if __name__ == '__main__':\n", 338 | " results = [factorial(i) for i in range(3)]\n", 339 | " plot(results)\n", 340 | " ```\n", 341 | " \n", 342 | "* Use top down design to decide on which modules to use (or parts of a single module):\n", 343 | " 1. Define your problem.\n", 344 | " 2. Define the necessary tasks (fetch data, preprocess, etc...) and enumerate them (1, 2, 3, ...). \n", 345 | " 3. Decompose each task into smaller subtasks and enumerate them again (1.1, 1.2, ...).\n", 346 | " \n", 347 | " **Note:** You can also use this hierarchy in either file comments or file names (naming your files *1-fetch_data.py* or *2-preprocess_data.py* to keep them organized). \n", 348 | "\n", 349 | "* Folder organization:\n", 350 | "\n", 351 | "| **Directory** | **Purpose** |\n", 352 | "| :-- | :-- |\n", 353 | "| /doc/ | For text documents |\n", 354 | "| /src/ | For source code |\n", 355 | "| /data/ | For raw data |\n", 356 | "| /generated/ | For manipulated data |\n", 357 | "| /temp/ | For temporary files |\n", 358 | "| /results/ | For results |\n", 359 | "| requirements.txt | For which 3rd party libraries to install |\n", 360 | "| readme.md | For *how to run* and examples |\n", 361 | "\n", 362 | " **Note:** Although you can experiment with different folders, subfolders or folder names, it's strongly advised to keep *Data Manipulation* (i.e. generating schemas from raw data) and *Data Analysis* (everything else that do not work on raw data) in SEPARATE files." 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | " ## 4. 
ADA Specific Tips\n", 370 | " \n", 371 | " * When working with larger amounts of data (especially for the project), test your code with small chunks of it:\n", 372 | " \n", 373 | "     ```python\n", 374 | "     def dubious_analysis(df):\n", 375 | "         raise NotImplementedError\n", 376 | "     \n", 377 | "     dubious_analysis(df.head(100))\n", 378 | "     ```\n", 379 | " \n", 380 | " * You can easily pickle DataFrames with one-liners:\n", 381 | " \n", 382 | "     ```python\n", 383 | "     df.to_pickle('path')\n", 384 | "     ```\n", 385 | " \n", 386 | " * Use databases to keep persistent data (with a three-liner):\n", 387 | " \n", 388 | "     ```python\n", 389 | "     from sqlalchemy import create_engine\n", 390 | "     engine = create_engine('connection string')\n", 391 | "     df = pd.read_sql('events', con = engine)\n", 392 | "     ```\n", 393 | " \n", 394 | " * When you scale up and start considering more scalable tools (which you will hear about in a couple of weeks), always ask whether the volume of your data justifies your choice. Everything comes at a cost." 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "## 5. Modularity\n", 402 | " \n", 403 | " Last but definitely not least, examples! For those of you who read between the lines, this might have been obvious, but we still want to make sure everyone remembers these. To simplify your code - and your teammates' life when reading it - we will go over a simple code example that you can draw from in the near future.\n", 404 | " \n", 405 | " Imagine a situation where you have several airports' flight departure data for the month of January. In this instance, you want to create an application that shows a single airport's data for a given day, sorted by departure time (much like the departure board you see at an actual airport). 
The data's columns are very simple: \n", 406 | " \n", 407 | "     airport_id, flight_id, dest_id, day, hour\n", 408 | " \n", 409 | " Your first approach might be something like this:\n", 410 | " \n", 411 | " ```python\n", 412 | "import pandas as pd\n", 413 | " \n", 414 | "# Read the data from disk and clean it\n", 415 | "df = pd.read_csv('flight_data.csv.zip', compression='zip').dropna()\n", 416 | "df['day'] = df['day'].astype(int)\n", 417 | "df['hour'] = df['hour'].astype(int)\n", 418 | "df['dest_id'] = df['dest_id'].astype(str)\n", 419 | "df['flight_id'] = df['flight_id'].astype(int)\n", 420 | "df['airport_id'] = df['airport_id'].astype(int)\n", 421 | "\n", 422 | "day = 12\n", 423 | "airport_id = 13\n", 424 | "\n", 425 | "# Getting sorted flights for that day\n", 426 | "flights = df[(df['day'] == day) & (df['airport_id'] == airport_id)]\n", 427 | "flights = flights.sort_values('hour')[['flight_id', 'dest_id']]\n", 428 | " ```\n", 429 | " \n", 430 | " But since we know you've been paying attention to this tutorial, you'll quickly come up with something better:\n", 431 | " \n", 432 | " ```python\n", 433 | " '''\n", 434 | " File name: test.py\n", 435 | " Author: ADA\n", 436 | " Date created: 03/10/2018\n", 437 | " Date last modified: 01/11/2019\n", 438 | " Python Version: 3.6\n", 439 | " '''\n", 440 | " import pandas as pd\n", 441 | " \n", 442 | " FLIGHT_DATA = 'flight_data.csv.zip'\n", 443 | " COMPRESSION = 'zip'\n", 444 | " \n", 445 | " def format_attr(dataframe):\n", 446 | "     '''\n", 447 | "     Create a new dataframe with all attributes\n", 448 | "     formatted according to the flight datasets'\n", 449 | "     documentation (at https://....).\n", 450 | "     :param dataframe: pandas.DataFrame\n", 451 | "     :return: new dataframe\n", 452 | "     '''\n", 453 | "     formatted_df = pd.DataFrame()\n", 454 | "     formatted_df['day'] = dataframe['day'].astype(int)\n", 455 | "     formatted_df['hour'] = dataframe['hour'].astype(int)\n", 456 | "     formatted_df['dest_id'] = dataframe['dest_id'].astype(str)\n", 457 | "     formatted_df['flight_id'] = dataframe['flight_id'].astype(int)\n", 458 | "     formatted_df['airport_id'] = dataframe['airport_id'].astype(int)\n", 459 | "     return formatted_df\n", 460 | " \n", 461 | " if __name__ == '__main__':\n", 462 | "     day = 12\n", 463 | "     airport_id = 13\n", 464 | " \n", 465 | "     # Read the data from disk and clean it\n", 466 | "     df = pd.read_csv(FLIGHT_DATA, compression=COMPRESSION)\n", 467 | "     df = df.dropna()\n", 468 | "     df = format_attr(df)\n", 469 | " \n", 470 | "     # Getting flights for that day\n", 471 | "     flights = df[(df['day'] == day) & (df['airport_id'] == airport_id)]\n", 472 | "     flights = flights.sort_values('hour')[['flight_id', 'dest_id']]\n", 473 | " ```\n", 474 | "\n", 475 | "This may still not be enough. There are several things you can do at this point:\n", 476 | "* Do a for loop over the columns for casting their types, avoiding multiple lines of essentially the same code (see the sketch below).\n", 477 | "* Put the helper functions in a different file. This will increase modularity and code reusability, and help you structure your project better.\n", 478 | "* If you're really feeling like diving into software engineering, create a class that contains both the data and the logic around the data. An object-oriented design might be overkill for this case, but for a large project, it could be very useful in logically organising your code and orienting your thinking (but make sure you are familiar with object-oriented development before putting everything into classes!)." 
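A minimal sketch of the first suggestion above (looping over a column-to-type mapping instead of repeating the `astype` calls). The `COLUMN_TYPES` dictionary is illustrative (it simply restates the columns from the example) and is not part of any official dataset documentation:

```python
import pandas as pd

# Hypothetical column-to-type mapping, taken from the example above.
COLUMN_TYPES = {'day': int, 'hour': int, 'dest_id': str,
                'flight_id': int, 'airport_id': int}

def format_attr(dataframe):
    '''
    Cast every column listed in COLUMN_TYPES to its target type.
    :param dataframe: pandas.DataFrame
    :return: new dataframe with formatted attributes
    '''
    formatted_df = pd.DataFrame()
    for column, dtype in COLUMN_TYPES.items():
        formatted_df[column] = dataframe[column].astype(dtype)
    return formatted_df
```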
479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "## 6. Summary\n", 486 | "* Write modular, reusable code. Makes use of classes, different files, and packages to organise your code.\n", 487 | "* Be aware of libraries that could make your life easier, and also of Pythonic ways of doing things.\n", 488 | "* Test your code (we didn't go into unit tests because this isn't a Python course) and use small data samples to verify its correctness.\n", 489 | "* Save intermediate results to make new analyses easier and faster." 490 | ] 491 | } 492 | ], 493 | "metadata": { 494 | "kernelspec": { 495 | "display_name": "Python 3", 496 | "language": "python", 497 | "name": "python3" 498 | }, 499 | "language_info": { 500 | "codemirror_mode": { 501 | "name": "ipython", 502 | "version": 3 503 | }, 504 | "file_extension": ".py", 505 | "mimetype": "text/x-python", 506 | "name": "python", 507 | "nbconvert_exporter": "python", 508 | "pygments_lexer": "ipython3", 509 | "version": "3.7.1" 510 | } 511 | }, 512 | "nbformat": 4, 513 | "nbformat_minor": 2 514 | } 515 | -------------------------------------------------------------------------------- /Tutorials/06 - Applied ML/AppliedML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Applied Machine Learning with Scikit Learn\n", 8 | "\n", 9 | "*Adapted from https://github.com/justmarkham*\n", 10 | "\n", 11 | "### Libraries\n", 12 | "\n", 13 | "- [scikit-learn](http://scikit-learn.org/stable/)\n", 14 | "- pandas\n", 15 | "- matplotlib\n", 16 | "\n", 17 | "In this tutorial we will see some basic examples of Linear Regression for regression and Logistic Regression for classification." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import numpy as np\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge\n", 30 | "from sklearn.preprocessing import OneHotEncoder\n", 31 | "from pandas.plotting import scatter_matrix\n", 32 | "from sklearn.model_selection import cross_val_predict\n", 33 | "from sklearn.model_selection import cross_val_score\n", 34 | "from sklearn.metrics import mean_squared_error\n", 35 | "import seaborn as sns\n", 36 | "%matplotlib inline" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# Regression with Linear Regression\n", 44 | "\n", 45 | "| * | continuous | categorical |\n", 46 | "| ---------------- | -------------- | -------------- |\n", 47 | "| **supervised** | **regression** | classification |\n", 48 | "| **unsupervised** | dim. 
reduction | clustering |\n", 49 | "\n", 50 | "### Motivation\n", 51 | "\n", 52 | "Why are we learning Linear Regression?\n", 53 | "- widely used\n", 54 | "- runs fast\n", 55 | "- easy to use (not a lot of tuning required)\n", 56 | "- (in some cases) highly interpretable\n", 57 | "- basis for many other methods\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Let's import the dataset:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "data = pd.read_csv('data/Advertising.csv', index_col=0)\n", 74 | "data.head()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "What are the **features**?\n", 82 | "- TV: advertising dollars spent on TV for a single product in a given market (in thousands of dollars)\n", 83 | "- Radio: advertising dollars spent on Radio\n", 84 | "- Newspaper: advertising dollars spent on Newspaper\n", 85 | "\n", 86 | "What is the **response**?\n", 87 | "- Sales: sales of a single product in a given market (in thousands of units)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "data.shape" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# visualize the relationship between the features and the response using scatterplots\n", 106 | "fig, axs = plt.subplots(1, 3, sharey=True)\n", 107 | "data.plot(kind='scatter', x='TV', y='sales', ax=axs[0], figsize=(16, 8), grid=True)\n", 108 | "data.plot(kind='scatter', x='radio', y='sales', ax=axs[1], grid=True)\n", 109 | "data.plot(kind='scatter', x='newspaper', y='sales', ax=axs[2], grid=True)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Estimating (\"Learning\") Model Coefficients\n", 117 | "\n", 118 | "Generally speaking, coefficients are estimated using the **least squares criterion**, which means we find the line (mathematically) which minimizes the **sum of squared residuals** (or \"sum of squared errors\"):" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "What elements are present in the diagram?\n", 133 | "- The black dots are the **observed values** of x and y.\n", 134 | "- The blue line is our **least squares line**.\n", 135 | "- The red lines are the **residuals**, which are the distances between the observed values and the least squares line.\n", 136 | "\n", 137 | "How do the model coefficients relate to the least squares line?\n", 138 | "- $\\beta_0$ is the **intercept** (the value of $y$ when $x$=0)\n", 139 | "- $\\beta_1$ is the **slope** (the change in $y$ divided by change in $x$)\n", 140 | "\n", 141 | "Here is a graphical depiction:" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## Hands on!\n", 156 | "Let's create the feature matrix and the class vector (X and y):" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "feature_cols = ['TV', 'radio', 'newspaper']\n", 166 | "X = 
data[feature_cols]\n", 167 | "y = data.sales\n", 168 | "\n", 169 | "X.describe()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "**Scikit-learn** provides an easy way to train the model:" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "lin_reg = LinearRegression() # create the model\n", 186 | "lin_reg.fit(X, y) # train it" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "Back to the theory! Let's see what the formula looks like:" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "for f in range(len(feature_cols)):\n", 203 | " print(\"{0} * {1} + \".format(lin_reg.coef_[f], feature_cols[f]))\n", 204 | "print(lin_reg.intercept_)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "\n", 212 | "\n", 213 | "$$y = \\beta_0 + \\beta_1 \\times TV + \\beta_2 \\times radio + \\beta_3 \\times newspaper$$\n", 214 | "$$y = 2.938 + 0.045 \\times TV + 0.18 \\times radio + -0.001 \\times newspaper$$" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Let's plot the predictions and the original values:" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "lr = LinearRegression()\n", 231 | "\n", 232 | "# cross_val_predict returns an array of the same size as `y` where each entry\n", 233 | "# is a prediction obtained by cross validation:\n", 234 | "predicted = cross_val_predict(lr, X, y, cv=5)\n", 235 | "\n", 236 | "# Plot the results\n", 237 | "fig, ax = plt.subplots(figsize=(12, 8))\n", 238 | "ax.scatter(y, predicted, edgecolors=(0, 0, 0))\n", 239 | "ax.plot([min(y), max(y)], [min(y), max(y)], 'r--', lw=4)\n", 240 | "ax.set_xlabel('Original')\n", 241 | "ax.set_ylabel('Predicted')\n", 242 | "plt.show()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "mean_squared_error(y, predicted)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Regularization" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "In our dataset, we have only 3 features, but 200 records, which is enough for learning a good linear model. But what if we had much fewer records, say, 5?" 
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "X = X[:5]\n", 275 | "y = y[:5]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "lr = LinearRegression()\n", 285 | "\n", 286 | "predicted = cross_val_predict(lr, X, y, cv=5)\n", 287 | "\n", 288 | "# Plot the results\n", 289 | "fig, ax = plt.subplots(figsize=(12, 8))\n", 290 | "ax.scatter(y, predicted, edgecolors=(0, 0, 0))\n", 291 | "ax.plot([min(y), max(y)], [min(y), max(y)], 'r--', lw=4)\n", 292 | "ax.set_xlabel('Original')\n", 293 | "ax.set_ylabel('Predicted')\n", 294 | "plt.show()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "mean_squared_error(y, predicted)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "logistic = LinearRegression()\n", 313 | "logistic.fit(X, y)\n", 314 | "predicted_train = logistic.predict(X)\n", 315 | "mean_squared_error(y, predicted_train)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "**Problem**: The model remembers the training records (overfitting).\n", 323 | "\n", 324 | "**Solution**: Regularization\n", 325 | "\n", 326 | "Regularization refers to methods that help to reduce overfitting. Let's try Ridge Regression, which puts a penalty on large weights $\\beta_i$ and forces them to be smaller in magnitude. This reduces the complexity of the model. More on this in the homework." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "ridge = Ridge(alpha=6)\n", 336 | "\n", 337 | "# cross_val_predict returns an array of the same size as `y` where each entry\n", 338 | "# is a prediction obtained by cross validation:\n", 339 | "predicted_r = cross_val_predict(ridge, X, y, cv=5)\n", 340 | "\n", 341 | "fig, ax = plt.subplots(figsize=(12, 8))\n", 342 | "ax.scatter(y, predicted_r, edgecolors=(0, 0, 0))\n", 343 | "ax.plot([min(y), max(y)], [min(y), max(y)], 'r--', lw=4)\n", 344 | "ax.set_xlabel('Original')\n", 345 | "ax.set_ylabel('Predicted')\n", 346 | "plt.show()" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "mean_squared_error(y, predicted_r)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "# Classification with Logistic Regression\n", 363 | "\n", 364 | "|*|continuous|categorical|\n", 365 | "|---|---|---|\n", 366 | "|**supervised**|regression|**classification**|\n", 367 | "|**unsupervised**|dim. 
reduction|clustering|" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "Let's import the dataset:" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "titanic_raw = pd.read_excel('data/titanic.xls')\n", 384 | "titanic = titanic_raw.dropna(axis=0, how='any')\n", 385 | "titanic.head()" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "What are the **features**?\n", 393 | "- sex\n", 394 | "- age\n", 395 | "- fare: ticket price\n", 396 | "\n", 397 | "What is the **response**?\n", 398 | "- survived: whether the passenger survived the disaster or not" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "dead = titanic[titanic['survived']==0]\n", 408 | "survived = titanic[titanic['survived']==1]\n", 409 | "\n", 410 | "print(\"Survived {0}, Dead {1}\".format(len(survived), len(dead)))" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "Specify the columns to use as features:" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "titanic_features = ['sex', 'age', 'fare']" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "### Let's prepare the feature vector for the training\n", 434 | "\n", 435 | "The dataset contains one categorical variable: sex (male|female)\n", 436 | "\n", 437 | "We need to convert it to a numerical variable. Pandas offers the method *get_dummies* that takes care of this:" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "# The features vector\n", 447 | "X = pd.get_dummies(titanic[titanic_features])\n", 448 | "X.head()\n", 449 | "# titanic['pclass'] = titanic['pclass'].astype('category')" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "The label used for the training:" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "y = titanic['survived']" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "Let's train the model ..." 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "logistic = LogisticRegression(solver='lbfgs')\n", 482 | "\n", 483 | "# for f in range(len(feature_cols)):\n", 484 | "#     print(\"{0} * {1} + \".format(logistic.coef_[f], feature_cols[f]))\n", 485 | "print(logistic)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "... and evaluate the precision/recall with a cross validation (10 splits)." 
493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "precision = cross_val_score(logistic, X, y, cv=10, scoring=\"precision\")\n", 502 | "recall = cross_val_score(logistic, X, y, cv=10, scoring=\"recall\")\n", 503 | "\n", 504 | "# Precision: avoid false positives\n", 505 | "print(\"Precision: %0.2f (+/- %0.2f)\" % (precision.mean(), precision.std() * 2))\n", 506 | "# Recall: avoid false negatives\n", 507 | "print(\"Recall: %0.2f (+/- %0.2f)\" % (recall.mean(), recall.std() * 2))" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "### Explore the model output\n", 515 | "\n", 516 | "Let's train on the full dataset:" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "logistic = LogisticRegression(solver='lbfgs')\n", 526 | "logistic.fit(X, y)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "Of course, since we trained the whole dataset, we don't have new samples to predict, but we can predict the outcome and the relative probability for some artificial samples. Would you have survived?" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "X.columns" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "logistic.predict([[25, 100, 0, 1]])" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "logistic.predict_proba([[25, 100, 0, 1]])" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "logistic.predict([[25, 100, 1, 0]])" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "logistic.predict_proba([[25, 100, 1, 0]])" 579 | ] 580 | } 581 | ], 582 | "metadata": { 583 | "kernelspec": { 584 | "display_name": "Python 3", 585 | "language": "python", 586 | "name": "python3" 587 | }, 588 | "language_info": { 589 | "codemirror_mode": { 590 | "name": "ipython", 591 | "version": 3 592 | }, 593 | "file_extension": ".py", 594 | "mimetype": "text/x-python", 595 | "name": "python", 596 | "nbconvert_exporter": "python", 597 | "pygments_lexer": "ipython3", 598 | "version": "3.7.4" 599 | } 600 | }, 601 | "nbformat": 4, 602 | "nbformat_minor": 4 603 | } 604 | -------------------------------------------------------------------------------- /Tutorials/06 - Applied ML/README.md: -------------------------------------------------------------------------------- 1 | # 06 - Applied ML 2 | 3 | The goal of this tutorial is to get you familiar with scikit-learn, a popular ML library for Python. It runs you through training and using your own regression and classification models through the examples of linear regression and logistic regression. 
4 | 5 | --- 6 | 7 | Adapted from https://github.com/justmarkham 8 | 9 | Credits: Tiziano Piccardi, Valentin Hartmann (regularization part) -------------------------------------------------------------------------------- /Tutorials/06 - Applied ML/data/Advertising.csv: -------------------------------------------------------------------------------- 1 | ,TV,radio,newspaper,sales 2 | 1,230.1,37.8,69.2,22.1 3 | 2,44.5,39.3,45.1,10.4 4 | 3,17.2,45.9,69.3,9.3 5 | 4,151.5,41.3,58.5,18.5 6 | 5,180.8,10.8,58.4,12.9 7 | 6,8.7,48.9,75,7.2 8 | 7,57.5,32.8,23.5,11.8 9 | 8,120.2,19.6,11.6,13.2 10 | 9,8.6,2.1,1,4.8 11 | 10,199.8,2.6,21.2,10.6 12 | 11,66.1,5.8,24.2,8.6 13 | 12,214.7,24,4,17.4 14 | 13,23.8,35.1,65.9,9.2 15 | 14,97.5,7.6,7.2,9.7 16 | 15,204.1,32.9,46,19 17 | 16,195.4,47.7,52.9,22.4 18 | 17,67.8,36.6,114,12.5 19 | 18,281.4,39.6,55.8,24.4 20 | 19,69.2,20.5,18.3,11.3 21 | 20,147.3,23.9,19.1,14.6 22 | 21,218.4,27.7,53.4,18 23 | 22,237.4,5.1,23.5,12.5 24 | 23,13.2,15.9,49.6,5.6 25 | 24,228.3,16.9,26.2,15.5 26 | 25,62.3,12.6,18.3,9.7 27 | 26,262.9,3.5,19.5,12 28 | 27,142.9,29.3,12.6,15 29 | 28,240.1,16.7,22.9,15.9 30 | 29,248.8,27.1,22.9,18.9 31 | 30,70.6,16,40.8,10.5 32 | 31,292.9,28.3,43.2,21.4 33 | 32,112.9,17.4,38.6,11.9 34 | 33,97.2,1.5,30,9.6 35 | 34,265.6,20,0.3,17.4 36 | 35,95.7,1.4,7.4,9.5 37 | 36,290.7,4.1,8.5,12.8 38 | 37,266.9,43.8,5,25.4 39 | 38,74.7,49.4,45.7,14.7 40 | 39,43.1,26.7,35.1,10.1 41 | 40,228,37.7,32,21.5 42 | 41,202.5,22.3,31.6,16.6 43 | 42,177,33.4,38.7,17.1 44 | 43,293.6,27.7,1.8,20.7 45 | 44,206.9,8.4,26.4,12.9 46 | 45,25.1,25.7,43.3,8.5 47 | 46,175.1,22.5,31.5,14.9 48 | 47,89.7,9.9,35.7,10.6 49 | 48,239.9,41.5,18.5,23.2 50 | 49,227.2,15.8,49.9,14.8 51 | 50,66.9,11.7,36.8,9.7 52 | 51,199.8,3.1,34.6,11.4 53 | 52,100.4,9.6,3.6,10.7 54 | 53,216.4,41.7,39.6,22.6 55 | 54,182.6,46.2,58.7,21.2 56 | 55,262.7,28.8,15.9,20.2 57 | 56,198.9,49.4,60,23.7 58 | 57,7.3,28.1,41.4,5.5 59 | 58,136.2,19.2,16.6,13.2 60 | 59,210.8,49.6,37.7,23.8 61 | 60,210.7,29.5,9.3,18.4 62 | 61,53.5,2,21.4,8.1 63 | 62,261.3,42.7,54.7,24.2 64 | 63,239.3,15.5,27.3,15.7 65 | 64,102.7,29.6,8.4,14 66 | 65,131.1,42.8,28.9,18 67 | 66,69,9.3,0.9,9.3 68 | 67,31.5,24.6,2.2,9.5 69 | 68,139.3,14.5,10.2,13.4 70 | 69,237.4,27.5,11,18.9 71 | 70,216.8,43.9,27.2,22.3 72 | 71,199.1,30.6,38.7,18.3 73 | 72,109.8,14.3,31.7,12.4 74 | 73,26.8,33,19.3,8.8 75 | 74,129.4,5.7,31.3,11 76 | 75,213.4,24.6,13.1,17 77 | 76,16.9,43.7,89.4,8.7 78 | 77,27.5,1.6,20.7,6.9 79 | 78,120.5,28.5,14.2,14.2 80 | 79,5.4,29.9,9.4,5.3 81 | 80,116,7.7,23.1,11 82 | 81,76.4,26.7,22.3,11.8 83 | 82,239.8,4.1,36.9,12.3 84 | 83,75.3,20.3,32.5,11.3 85 | 84,68.4,44.5,35.6,13.6 86 | 85,213.5,43,33.8,21.7 87 | 86,193.2,18.4,65.7,15.2 88 | 87,76.3,27.5,16,12 89 | 88,110.7,40.6,63.2,16 90 | 89,88.3,25.5,73.4,12.9 91 | 90,109.8,47.8,51.4,16.7 92 | 91,134.3,4.9,9.3,11.2 93 | 92,28.6,1.5,33,7.3 94 | 93,217.7,33.5,59,19.4 95 | 94,250.9,36.5,72.3,22.2 96 | 95,107.4,14,10.9,11.5 97 | 96,163.3,31.6,52.9,16.9 98 | 97,197.6,3.5,5.9,11.7 99 | 98,184.9,21,22,15.5 100 | 99,289.7,42.3,51.2,25.4 101 | 100,135.2,41.7,45.9,17.2 102 | 101,222.4,4.3,49.8,11.7 103 | 102,296.4,36.3,100.9,23.8 104 | 103,280.2,10.1,21.4,14.8 105 | 104,187.9,17.2,17.9,14.7 106 | 105,238.2,34.3,5.3,20.7 107 | 106,137.9,46.4,59,19.2 108 | 107,25,11,29.7,7.2 109 | 108,90.4,0.3,23.2,8.7 110 | 109,13.1,0.4,25.6,5.3 111 | 110,255.4,26.9,5.5,19.8 112 | 111,225.8,8.2,56.5,13.4 113 | 112,241.7,38,23.2,21.8 114 | 113,175.7,15.4,2.4,14.1 115 | 114,209.6,20.6,10.7,15.9 116 | 115,78.2,46.8,34.5,14.6 117 | 
116,75.1,35,52.7,12.6 118 | 117,139.2,14.3,25.6,12.2 119 | 118,76.4,0.8,14.8,9.4 120 | 119,125.7,36.9,79.2,15.9 121 | 120,19.4,16,22.3,6.6 122 | 121,141.3,26.8,46.2,15.5 123 | 122,18.8,21.7,50.4,7 124 | 123,224,2.4,15.6,11.6 125 | 124,123.1,34.6,12.4,15.2 126 | 125,229.5,32.3,74.2,19.7 127 | 126,87.2,11.8,25.9,10.6 128 | 127,7.8,38.9,50.6,6.6 129 | 128,80.2,0,9.2,8.8 130 | 129,220.3,49,3.2,24.7 131 | 130,59.6,12,43.1,9.7 132 | 131,0.7,39.6,8.7,1.6 133 | 132,265.2,2.9,43,12.7 134 | 133,8.4,27.2,2.1,5.7 135 | 134,219.8,33.5,45.1,19.6 136 | 135,36.9,38.6,65.6,10.8 137 | 136,48.3,47,8.5,11.6 138 | 137,25.6,39,9.3,9.5 139 | 138,273.7,28.9,59.7,20.8 140 | 139,43,25.9,20.5,9.6 141 | 140,184.9,43.9,1.7,20.7 142 | 141,73.4,17,12.9,10.9 143 | 142,193.7,35.4,75.6,19.2 144 | 143,220.5,33.2,37.9,20.1 145 | 144,104.6,5.7,34.4,10.4 146 | 145,96.2,14.8,38.9,11.4 147 | 146,140.3,1.9,9,10.3 148 | 147,240.1,7.3,8.7,13.2 149 | 148,243.2,49,44.3,25.4 150 | 149,38,40.3,11.9,10.9 151 | 150,44.7,25.8,20.6,10.1 152 | 151,280.7,13.9,37,16.1 153 | 152,121,8.4,48.7,11.6 154 | 153,197.6,23.3,14.2,16.6 155 | 154,171.3,39.7,37.7,19 156 | 155,187.8,21.1,9.5,15.6 157 | 156,4.1,11.6,5.7,3.2 158 | 157,93.9,43.5,50.5,15.3 159 | 158,149.8,1.3,24.3,10.1 160 | 159,11.7,36.9,45.2,7.3 161 | 160,131.7,18.4,34.6,12.9 162 | 161,172.5,18.1,30.7,14.4 163 | 162,85.7,35.8,49.3,13.3 164 | 163,188.4,18.1,25.6,14.9 165 | 164,163.5,36.8,7.4,18 166 | 165,117.2,14.7,5.4,11.9 167 | 166,234.5,3.4,84.8,11.9 168 | 167,17.9,37.6,21.6,8 169 | 168,206.8,5.2,19.4,12.2 170 | 169,215.4,23.6,57.6,17.1 171 | 170,284.3,10.6,6.4,15 172 | 171,50,11.6,18.4,8.4 173 | 172,164.5,20.9,47.4,14.5 174 | 173,19.6,20.1,17,7.6 175 | 174,168.4,7.1,12.8,11.7 176 | 175,222.4,3.4,13.1,11.5 177 | 176,276.9,48.9,41.8,27 178 | 177,248.4,30.2,20.3,20.2 179 | 178,170.2,7.8,35.2,11.7 180 | 179,276.7,2.3,23.7,11.8 181 | 180,165.6,10,17.6,12.6 182 | 181,156.6,2.6,8.3,10.5 183 | 182,218.5,5.4,27.4,12.2 184 | 183,56.2,5.7,29.7,8.7 185 | 184,287.6,43,71.8,26.2 186 | 185,253.8,21.3,30,17.6 187 | 186,205,45.1,19.6,22.6 188 | 187,139.5,2.1,26.6,10.3 189 | 188,191.1,28.7,18.2,17.3 190 | 189,286,13.9,3.7,15.9 191 | 190,18.7,12.1,23.4,6.7 192 | 191,39.5,41.1,5.8,10.8 193 | 192,75.5,10.8,6,9.9 194 | 193,17.2,4.1,31.6,5.9 195 | 194,166.8,42,3.6,19.6 196 | 195,149.7,35.6,6,17.3 197 | 196,38.2,3.7,13.8,7.6 198 | 197,94.2,4.9,8.1,9.7 199 | 198,177,9.3,6.4,12.8 200 | 199,283.6,42,66.2,25.5 201 | 200,232.1,8.6,8.7,13.4 202 | -------------------------------------------------------------------------------- /Tutorials/06 - Applied ML/data/titanic.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/06 - Applied ML/data/titanic.xls -------------------------------------------------------------------------------- /Tutorials/06 - Applied ML/img/estimating_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/06 - Applied ML/img/estimating_coefficients.png -------------------------------------------------------------------------------- /Tutorials/06 - Applied ML/img/slope_intercept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/06 - Applied ML/img/slope_intercept.png 
-------------------------------------------------------------------------------- /Tutorials/07 - Handling Text/README.md: -------------------------------------------------------------------------------- 1 | # Handling Text tutorial 2 | 3 | ## In this tutorial, we will go through: 4 | 1. Implementing the natural language processing pipeline 5 | 2. Solving four typical language processing tasks: 6 | * Sentiment analysis 7 | * Document classification 8 | * Topic detection 9 | * Semantic analysis 10 | 11 | 12 | ### We will be working with four books in plain text: 13 | 1. Moby-Dick, by Herman Melville 14 | 2. Frankenstein; or, The Modern Prometheus, by Mary Shelley 15 | 3. Pride and Prejudice, by Jane Austen 16 | 4. Dracula, by Bram Stoker 17 | 18 | 19 | #### To run the notebook, you need to install the following libraries: 20 | conda install nltk gensim spacy
21 | pip install pyLDAvis
22 | pip install vaderSentiment
23 | pip install empath
24 | python -m spacy download en
25 | python -m nltk.downloader punkt
26 | python -m nltk.downloader all-corpora
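A quick, optional sanity check that the installation steps above worked (a hedged sketch; the 'en' shortcut is the one created by `python -m spacy download en`, and the example sentence is arbitrary):

```python
# Verify that the main libraries import and that the spaCy English model loads.
import nltk
import gensim
import spacy

nlp = spacy.load('en')  # shortcut installed via `python -m spacy download en`
doc = nlp("Call me Ishmael.")
print([(token.text, token.pos_) for token in doc])
```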
27 | -------------------------------------------------------------------------------- /Tutorials/07 - Handling Text/spacy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epfl-ada/2019/ab917eb571793ee9bddc673d1e2c465ad4b25888/Tutorials/07 - Handling Text/spacy.png -------------------------------------------------------------------------------- /Tutorials/08 - Handling Graphs/README.md: -------------------------------------------------------------------------------- 1 | # Handling Graphs - ADA 2019 Tutorial 2 | 3 | --- 4 | 5 | #### What do you find in this notebook ? 6 | 7 | This notebook gives an introduction for handling graphs, using [`NetworkX`](https://networkx.github.io/documentation/stable/index.html). It covers a few popular techniques used in the field of **social network analysis**, by applying them on a real world dataset and on multiple synthetic/toy examples in order to get an intuition of the algorithms used. 8 | 9 | 10 | --- 11 | 12 | #### Credits 13 | 14 | Credits to: [Isabela Constantin](https://github.com/isabelaconstantin) and [Jérémie Rappaz](https://github.com/JRappaz) 15 | 16 | Updated by: [Akhil Arora](https://dlab.epfl.ch/people/aarora/) 17 | 18 | Inspired from: [The Programming Historian](https://programminghistorian.org/en/lessons/exploring-and-analyzing-network-data-with-python) 19 | -------------------------------------------------------------------------------- /Tutorials/08 - Handling Graphs/data/quakers/quakers_edgelist.csv: -------------------------------------------------------------------------------- 1 | Source,Target 2 | George Keith,Robert Barclay 3 | George Keith,Benjamin Furly 4 | George Keith,Anne Conway Viscountess Conway and Killultagh 5 | George Keith,Franciscus Mercurius van Helmont 6 | George Keith,William Penn 7 | George Keith,George Fox 8 | George Keith,George Whitehead 9 | George Keith,William Bradford 10 | James Parnel,Benjamin Furly 11 | James Parnel,Stephen Crisp 12 | Peter Collinson,John Bartram 13 | Peter Collinson,James Logan 14 | Joseph Wyeth,Thomas Ellwood 15 | Alexander Skene of Newtyle,Lilias Skene 16 | Dorcas Erbery,James Nayler 17 | William Mucklow,George Fox 18 | Franciscus Mercurius van Helmont,Benjamin Furly 19 | William Dewsbury,Edward Burrough 20 | William Dewsbury,George Fox 21 | William Dewsbury,John Crook 22 | John Audland,John Camm 23 | John Audland,Francis Howgill 24 | John Audland,Edward Pyott 25 | John Audland,Charles Marshall 26 | John Audland,George Fox 27 | John Audland,Anne Camm 28 | Francis Howgill,Martha Simmonds 29 | Francis Howgill,James Nayler 30 | Francis Howgill,Edward Burrough 31 | Francis Howgill,George Fox 32 | Francis Howgill,Richard Farnworth 33 | Francis Howgill,William Crouch 34 | William Bradford,William Penn 35 | William Bradford,Tace Sowle 36 | John Bellers,Fettiplace Bellers 37 | William Rogers,Thomas Ellwood 38 | William Rogers,George Whitehead 39 | Martha Simmonds,Hannah Stranger 40 | Martha Simmonds,James Nayler 41 | Isabel Yeamans,William Penn 42 | George Fox the younger,Margaret Fell 43 | George Fox,Ellis Hookes 44 | George Fox,William Mead 45 | George Fox,Elizabeth Hooten 46 | George Fox,Margaret Fell 47 | George Fox,John Crook 48 | George Fox,George Whitehead 49 | George Fox,Benjamin Furly 50 | George Fox,Thomas Salthouse 51 | George Fox,James Nayler 52 | George Fox,Edward Burrough 53 | George Fox,John Wilkinson 54 | George Fox,Thomas Ellwood 55 | George Fox,William Coddington 56 | George Fox,John Stubbs 57 | George 
Fox,John Perrot 58 | George Fox,Leonard Fell 59 | George Fox,William Penn 60 | John Stubbs,Benjamin Furly 61 | John Stubbs,William Caton 62 | John Stubbs,Samuel Fisher 63 | John Stubbs,John Burnyeat 64 | John Camm,Charles Marshall 65 | John Camm,Thomas Camm 66 | John Camm,Edward Pyott 67 | Thomas Camm,John Story 68 | Thomas Camm,Anne Camm 69 | John Wilkinson,Charles Marshall 70 | John Wilkinson,Solomon Eccles 71 | John Wilkinson,John Story 72 | John Wilkinson,Alexander Parker 73 | Isaac Norris,William Penn 74 | John Swinton,David Barclay of Ury 75 | James Nayler,Hannah Stranger 76 | James Nayler,Gervase Benson 77 | James Nayler,Margaret Fell 78 | James Nayler,Richard Farnworth 79 | James Nayler,George Whitehead 80 | James Nayler,John Perrot 81 | James Nayler,Robert Rich 82 | James Nayler,Anthony Pearson 83 | James Nayler,Thomas Ellwood 84 | James Nayler,Edward Burrough 85 | James Nayler,Rebecca Travers 86 | James Nayler,William Tomlinson 87 | Anthony Sharp,Samuel Clarridge 88 | Anthony Sharp,Thomas Curtis 89 | Anthony Sharp,William Edmundson 90 | Thomas Salthouse,Margaret Fell 91 | William Ames,William Caton 92 | Mary Fisher,John Perrot 93 | Mary Fisher,Mary Prince 94 | Lydia Lancaster,Grace Chamber 95 | Henry Pickworth,Francis Bugg 96 | Samuel Clarridge,James Claypoole 97 | Francis Bugg,George Whitehead 98 | Thomas Lower,Margaret Fell 99 | Sarah Gibbons,Dorothy Waugh 100 | Gervase Benson,Anthony Pearson 101 | Stephen Crisp,William Caton 102 | Stephen Crisp,Benjamin Furly 103 | John Freame,David Barclay 104 | Humphrey Norton,John Rous 105 | William Gibson,Thomas Holme 106 | Gideon Wanton,John Wanton 107 | William Mead,Margaret Fell 108 | Benjamin Furly,Robert Barclay 109 | Benjamin Furly,Alexander Parker 110 | Benjamin Furly,William Caton 111 | Benjamin Furly,William Penn 112 | James Logan,John Bartram 113 | James Logan,William Penn 114 | James Logan,David Lloyd 115 | Mary Prince,John Perrot 116 | Edward Haistwell,William Penn 117 | John ap John,John Burnyeat 118 | John Crook,John Perrot 119 | John Crook,Edward Burrough 120 | Gilbert Latey,Edward Burrough 121 | Gilbert Latey,George Whitehead 122 | Richard Hubberthorne,George Whitehead 123 | Richard Hubberthorne,Richard Farnworth 124 | Joseph Besse,Richard Claridge 125 | Joseph Besse,Samuel Bownas 126 | George Whitehead,Alexander Parker 127 | George Whitehead,John Whitehead 128 | George Whitehead,Daniel Quare 129 | George Whitehead,William Penn 130 | George Whitehead,John Story 131 | George Whitehead,Rebecca Travers 132 | Silvanus Bevan,Daniel Quare 133 | Solomon Eccles,John Story 134 | Robert Rich,William Tomlinson 135 | John Burnyeat,William Edmundson 136 | John Burnyeat,William Penn 137 | Katharine Evans,Sarah Cheevers 138 | Edward Burrough,John Perrot 139 | Edward Burrough,Thomas Ellwood 140 | Edward Burrough,William Crouch 141 | John Whiting,Christopher Taylor 142 | Rebecca Travers,Alice Curwen 143 | Rebecca Travers,William Penn 144 | William Caton,Margaret Fell 145 | Thomas Lawson,Margaret Fell 146 | Thomas Lawson,Alexander Parker 147 | Richard Farnworth,John Perrot 148 | Richard Farnworth,Margaret Fell 149 | Richard Farnworth,Anthony Pearson 150 | Thomas Taylor,Christopher Taylor 151 | John Penington,Mary Penington 152 | Humphrey Woolrich,Mary Pennyman 153 | William Penn,Tace Sowle 154 | William Penn,James Claypoole 155 | William Penn,Thomas Story 156 | William Penn,Mary Penington 157 | William Penn,David Lloyd 158 | William Penn,Margaret Fell 159 | William Penn,Richard Claridge 160 | Richard Vickris,George Bishop 161 | Robert 
Barclay,David Barclay of Ury 162 | Jane Sowle,Tace Sowle 163 | Margaret Fell,Alexander Parker 164 | Margaret Fell,Elizabeth Leavens 165 | Margaret Fell,Anthony Pearson 166 | Elizabeth Leavens,Thomas Holme 167 | Lewis Morris,Sir Charles Wager 168 | Mary Penington,Thomas Curtis 169 | Mary Penington,Thomas Ellwood 170 | Thomas Curtis,Thomas Ellwood 171 | Thomas Curtis,William Simpson 172 | Thomas Curtis,John Story 173 | Alexander Parker,Sir Charles Wager 174 | John Story,Thomas Ellwood 175 | Thomas Aldam,Anthony Pearson 176 | -------------------------------------------------------------------------------- /Tutorials/08 - Handling Graphs/data/quakers/quakers_nodelist.csv: -------------------------------------------------------------------------------- 1 | Name,Historical Significance,Gender,Birthdate,Deathdate,ID 2 | Joseph Wyeth,religious writer,male,1663,1731,10013191 3 | Alexander Skene of Newtyle,local politician and author,male,1621,1694,10011149 4 | James Logan,colonial official and scholar,male,1674,1751,10007567 5 | Dorcas Erbery,Quaker preacher,female,1656,1659,10003983 6 | Lilias Skene,Quaker preacher and poet,male,1626,1697,10011152 7 | William Mucklow,religious writer,male,1630,1713,10008595 8 | Thomas Salthouse,Quaker preacher and writer,male,1630,1691,10010643 9 | William Dewsbury,Quaker activist,male,1621,1688,10003478 10 | John Audland,Quaker preacher,male,1630,1664,10000411 11 | Richard Claridge,Quaker minister and schoolmaster,male,1649,1723,10002469 12 | William Bradford,printer,male,1663,1752,10001445 13 | Fettiplace Bellers,philosophical writer and playwright,male,1687,1750,10000933 14 | John Bellers,political economist and cloth merchant,male,1654,1725,10000934 15 | Isabel Yeamans,Quaker preacher,female,1637,1704,10013226 16 | George Fox the younger,religious writer,male,1551,1661,10004523 17 | George Fox, a founder of the Religious Society of Friends (Quakers),male,1624,1691,10004524 18 | John Stubbs,Quaker minister,male,1618,1675,10011695 19 | Anne Camm,Quaker preacher,female,1627,1705,10001967 20 | John Camm,Quaker preacher,male,1605,1657,10001968 21 | Thomas Camm,Quaker preacher and writer,male,1640,1708,10001969 22 | Katharine Evans,Quaker missionary,female,1618,1692,10004036 23 | Lydia Lancaster,Quaker minister,female,1683,1761,10007110 24 | Samuel Clarridge,Quaker activist,male,1631,1704,10002504 25 | Thomas Lower,Quaker activist and physician,male,1633,1720,10007626 26 | Gervase Benson,Quaker leader,male,1569,1679,10000972 27 | Stephen Crisp,Quaker activist and writer,male,1628,1692,10003022 28 | James Claypoole,merchant and pioneer settler in America,male,1634,1687,10002513 29 | Thomas Holme,Quaker missionary,male,1626,1666,10006100 30 | John Freame,banker and lobbyist,male,1665,1745,10004564 31 | John Swinton,politician,male,1620,1679,10011742 32 | William Mead,Quaker patron and merchant,male,1627,1713,10008161 33 | Henry Pickworth,religious controversialist,male,1673,1738,10009697 34 | John Crook,Quaker leader and writer,male,1616,1699,10003063 35 | Gilbert Latey,Quaker activist,male,1626,1705,10007166 36 | Ellis Hookes,Quaker administrator,male,1635,1681,10006146 37 | Joseph Besse,historian of Quakerism,male,1683,1757,10001027 38 | James Nayler,Quaker preacher and writer,male,1618,1660,10008713 39 | Elizabeth Hooten,Quaker preacher,female,1562,1672,10006153 40 | George Whitehead,Quaker leader and writer,male,1637,1724,10012813 41 | John Whitehead,Quaker minister and preacher,male,1630,1696,10012815 42 | William Crouch,Quaker leader and 
writer,male,1628,1711,10003087 43 | Benjamin Furly,merchant and religious writer,male,1636,1714,10004625 44 | Silvanus Bevan,apothecary,male,1691,1765,10001041 45 | Robert Rich,Quaker adherent and sectary,male,1607,1679,10010260 46 | John Whiting,Quaker bibliographer and writer,male,1656,1722,10012829 47 | Christopher Taylor,religious writer and schoolmaster,male,1614,1686,10011811 48 | Thomas Lawson,Quaker minister and botanist,male,1630,1691,10007210 49 | Richard Farnworth,Quaker preacher and writer,male,1630,1666,10004141 50 | William Coddington,merchant and official in America,male,1601,1678,10002606 51 | Thomas Taylor,Quaker minister and writer,male,1617,1682,10011824 52 | Richard Vickris,religious writer,male,1590,1700,10012350 53 | Robert Barclay,religious writer and colonial governor,male,1648,1690,10054848 54 | Jane Sowle,,female,1631,1711,10011331 55 | Tace Sowle,printer and bookseller,male,1666,1749,10011332 56 | Leonard Fell,Quaker missionary and writer,male,1624,1701,10004169 57 | Margaret Fell,Quaker leader,female,1614,1702,10004170 58 | George Bishop,government official and religious writer,male,1558,1668,10001097 59 | Elizabeth Leavens,Quaker missionary,female,1555,1665,10007246 60 | Thomas Curtis,Quaker schismatic,male,1602,1712,10003161 61 | Alice Curwen,Quaker missionary,female,1619,1679,10003162 62 | Alexander Parker,Quaker preacher and author,male,1628,1689,10009307 63 | John Wilkinson,Quaker schismatic,male,1652,1683,10012893 64 | Thomas Aldam,Quaker preacher and writer,male,1616,1660,10000099 65 | David Barclay of Ury,soldier and politician,male,1610,1686,10000621 66 | David Barclay,merchant,male,1682,1769,10000622 67 | Sir Charles Wager,naval officer and politician,male,1666,1743,10012403 68 | George Keith,Quaker schismatic and Church of England clergyman,male,1638,1716,10006784 69 | James Parnel,Quaker martyr,male,1636,1656,10009347 70 | Peter Collinson,botanist,male,1694,1768,10002694 71 | Franciscus Mercurius van Helmont,physician and cabbalist,male,1614,1698,10005781 72 | William Caton,Quaker preacher,male,1636,1665,10002203 73 | Francis Howgill,Quaker activist,male,1618,1669,10006305 74 | Richard Hubberthorne,Quaker activist,male,1628,1662,10006314 75 | William Ames,Quaker preacher,male,1552,1662,10000175 76 | William Rogers,Quaker schismatic,male,1601,1711,10010417 77 | Isaac Norris,colonial official and merchant,male,1671,1735,10008884 78 | Anthony Sharp,Quaker leader,male,1643,1707,10010941 79 | Mary Fisher,Quaker missionary,female,1623,1698,10004290 80 | Anne Conway Viscountess Conway and Killultagh,philosopher,female,1631,1679,10002755 81 | Samuel Fisher,Quaker preacher and writer,male,1604,1665,10004292 82 | Francis Bugg,Quaker apostate,male,1640,1727,10001737 83 | Sarah Gibbons,Quaker preacher in America,female,1634,1659,10004811 84 | William Tomlinson,religious writer,male,1650,1696,10011989 85 | Humphrey Norton,Quaker missionary and author,male,1655,1660,10008917 86 | William Gibson,Quaker leader,male,1628,1684,10004827 87 | Gideon Wanton,merchant and colonial governor,male,1693,1767,10012509 88 | John Wanton,merchant and colonial governor,male,1672,1740,10012510 89 | Grace Chamber,Quaker minister,female,1676,1762,10002274 90 | Mary Prince,Quaker preacher,female,1569,1679,10009959 91 | John Bartram,botanist and explorer in America,male,1699,1777,10000745 92 | Edward Haistwell,merchant,male,1658,1709,10005359 93 | John ap John,Quaker leader,male,1625,1697,10000243 94 | John Rous,Quaker missionary,male,1585,1695,10010488 95 | Anthony Pearson,Quaker 
administrator,male,1627,1666,10009470 96 | Solomon Eccles,musician and Quaker missionary,male,1617,1682,10003859 97 | John Burnyeat,Quaker minister,male,1631,1690,10001815 98 | Edward Burrough,Quaker activist and writer,male,1633,1663,10001818 99 | Rebecca Travers,Quaker preacher and writer,female,1609,1688,10012062 100 | William Edmundson,Quaker leader,male,1627,1712,10003882 101 | Sarah Cheevers,Quaker missionary,female,1608,1664,10002354 102 | Edward Pyott,parliamentarian army officer,male,1560,1670,10010036 103 | Daniel Quare,"maker of clocks, watches, and barometers",male,1648,1724,10010037 104 | John Penington,Quaker apologist and controversialist,male,1655,1710,10009526 105 | Mary Penington,Quaker and writer,female,1623,1682,10009527 106 | Charles Marshall,Quaker preacher and apothecary,male,1637,1698,10007992 107 | Humphrey Woolrich,religious writer,male,1633,1707,10013112 108 | William Penn,Quaker leader and founder of Pennsylvania,male,1644,1718,10009531 109 | Mary Pennyman,,female,1630,1701,10009535 110 | Dorothy Waugh,Quaker preacher,female,1636,1666,10012614 111 | David Lloyd,lawyer and politician in America,male,1656,1731,10007509 112 | Lewis Morris,politician in America,male,1671,1746,10008534 113 | Martha Simmonds,Quaker and author,female,1624,1665,10011100 114 | John Story,Quaker schismatic,male,1571,1681,10011613 115 | Thomas Story,Quaker minister and journal writer,male,1670,1742,10011614 116 | Thomas Ellwood,religious controversialist,male,1639,1713,10003945 117 | William Simpson,Quaker preacher,male,1627,1671,10011114 118 | Samuel Bownas,Quaker minister and writer,male,1677,1753,10001390 119 | John Perrot,Quaker schismatic,male,1555,1665,10009584 120 | Hannah Stranger,Quaker missionary,female,1656,1671,10011632 121 | -------------------------------------------------------------------------------- /Tutorials/08 - Handling Graphs/networks.yml: -------------------------------------------------------------------------------- 1 | name: nada 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - ca-certificates=2019.11.28=hecc5488_0 7 | - certifi=2019.11.28=py38_0 8 | - cycler=0.10.0=py_2 9 | - freetype=2.10.0=h24853df_1 10 | - kiwisolver=1.1.0=py38ha1b3eb9_0 11 | - libblas=3.8.0=14_openblas 12 | - libcblas=3.8.0=14_openblas 13 | - libcxx=9.0.0=h89e68fa_1 14 | - libffi=3.2.1=h6de7cb9_1006 15 | - libgfortran=4.0.0=2 16 | - liblapack=3.8.0=14_openblas 17 | - libopenblas=0.3.7=h3d69b6c_4 18 | - libpng=1.6.37=h2573ce8_0 19 | - llvm-openmp=9.0.0=h40edb58_0 20 | - matplotlib=3.1.2=py38_1 21 | - matplotlib-base=3.1.2=py38h11da6c2_1 22 | - ncurses=6.1=h0a44026_1002 23 | - numpy=1.17.3=py38hde6bac1_0 24 | - openssl=1.1.1d=h0b31af3_0 25 | - pip=19.3.1=py38_0 26 | - pyparsing=2.4.5=py_0 27 | - python=3.8.0=hd366da7_5 28 | - python-dateutil=2.8.1=py_0 29 | - readline=8.0=hcfe32e1_0 30 | - setuptools=42.0.2=py38_0 31 | - six=1.13.0=py38_0 32 | - sqlite=3.30.1=h93121df_0 33 | - tk=8.6.10=hbbe82c9_0 34 | - tornado=6.0.3=py38h0b31af3_0 35 | - wheel=0.33.6=py38_0 36 | - xz=5.2.4=h1de35cc_1001 37 | - zlib=1.2.11=h0b31af3_1006 38 | - pip: 39 | - decorator==4.4.1 40 | - networkx==2.2 41 | - python-louvain==0.13 42 | prefix: /anaconda3/envs/nada 43 | 44 | -------------------------------------------------------------------------------- /Tutorials/README.md: -------------------------------------------------------------------------------- 1 | # ADA2019-Tutorials 2 | Repo for tutorials 3 | 4 | 5 | --------------------------------------------------------------------------------