├── .gitignore
├── InProgress
│   ├── Autism-Screening-Child-Data Plus Description
│   │   ├── Autism-Child-Data.arff
│   │   └── Autism-Screening-Child-Data Description.docx
│   └── BIG-ML.pptx
├── Module_1
│   ├── Module 1.pptx
│   └── Module_1.ipynb
├── Module_2
│   ├── Module_2.ipynb
│   ├── Module_2_challenge_problems.ipynb
│   └── pride_and_prejudicetxt.txt
├── Module_3
│   ├── Module_3.ipynb
│   ├── Module_3_homework.ipynb
│   ├── Python_Pandas_Cheat_Sheet.pdf
│   ├── Python_Pandas_Cheat_Sheet_2.pdf
│   ├── TV_project
│   │   ├── datasets
│   │   │   ├── halftime_musicians.csv
│   │   │   ├── super_bowls.csv
│   │   │   └── tv.csv
│   │   └── tv_project.ipynb
│   └── gapminder_clean.csv
├── Module_5
│   ├── Module_5.ipynb
│   ├── Module_5.pptx
│   └── data
│       ├── Autism-Child-Data.arff
│       └── Autism-Screening-Child-Data Description.docx
├── Module_6
│   ├── Module_6.ipynb
│   ├── Module_6.pptx
│   ├── Module_6_Homework.ipynb
│   └── data
│       ├── pima.csv
│       ├── pima_description.txt
│       ├── test.csv
│       └── train.csv
├── Module_7
│   ├── Module_7.ipynb
│   ├── Module_7.pptx
│   └── data
│       ├── pima.csv
│       └── pima_description.txt
├── Module_8
│   ├── Module_8.ipynb
│   ├── Module_8.pptx
│   └── data
│       └── KAG_energydata_complete.csv.zip
├── Modules_11&12
│   ├── Module_11.ipynb
│   ├── Module_11.pptx
│   ├── Module_12.ipynb
│   ├── Module_12.pptx
│   └── data
│       └── stage1_train.zip
├── Modules_9&10
│   ├── Module_10.ipynb
│   ├── Module_9-InstructorVersion.ipynb
│   └── Module_9.ipynb
├── README.md
└── Syllabus.docx
/.gitignore:
--------------------------------------------------------------------------------
1 | **/*.ipynb_checkpoints/*
2 | *.mp4
3 | .DS_Store
4 | /Module_2/Module_2_challenge_problems_SL.ipynb
5 | *.icloud
6 |
--------------------------------------------------------------------------------
/InProgress/Autism-Screening-Child-Data Plus Description/Autism-Screening-Child-Data Description.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/InProgress/Autism-Screening-Child-Data Plus Description/Autism-Screening-Child-Data Description.docx
--------------------------------------------------------------------------------
/InProgress/BIG-ML.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/InProgress/BIG-ML.pptx
--------------------------------------------------------------------------------
/Module_1/Module 1.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_1/Module 1.pptx
--------------------------------------------------------------------------------
/Module_2/Module_2_challenge_problems.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "97adc6ed",
6 | "metadata": {},
7 | "source": [
8 | "# Module \\#2 Challenge Problems"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "9588e7d7",
14 | "metadata": {},
15 | "source": [
16 | "These problems will test the depth of your knowledge on the concepts covered in the course so far, and give you an opportunity to troubleshoot semi-realistic coding scenarios. They are designed to be difficult. It is not expected that you will be able to solve them all, but you will learn a lot by trying. \n",
17 | "\n",
18 | "*Hint*: When you get stuck, Google is your friend!"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "4c64d23d",
24 | "metadata": {},
25 | "source": [
26 | "1. Fix the following code so that it no longer produces a `TypeError`:"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "id": "f464bf5d",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "def print_age(age):\n",
37 | " print(\"You are \" + age + \" years young!\")\n",
38 | " \n",
39 | "print_age(29)"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "id": "0238aee3",
45 | "metadata": {},
46 | "source": [
47 | "2. Write a function that returns the 3 largest and 2 smallest values in a list. **Hint:** Remember `.sort()` "
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "id": "16142cd0",
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# Complete the function\n",
58 | "def large3_small2(lst):\n",
59 | " ____________________\n",
60 | " ____________________\n",
61 | "\n",
62 | "# Test the function on some data\n",
63 | "my_lst = [5**3, 7**2**3, 2**-1, 7**2**-1**3, 8**2, 3**4, 9*2, 1888*.5, 8383*.25, 28*1.15, 29*-2**4]\n",
64 | "result = large3_small2(my_lst) # Should return three largest and two smallest values\n",
65 | "print(result)"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "id": "e0ce07cb",
71 | "metadata": {},
72 | "source": [
73 | "3. What value for `a` will cause `if_function()` to return `\"Hello world!\"`? *Hint*: Experiment with the code and see if you can figure it out!"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "id": "003500a8",
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "def if_function(a):\n",
84 | " \n",
85 | " # This part is checking whether 'a' is an int and returning a TypeError if not\n",
86 | " if type(a) != int:\n",
87 | " raise TypeError(\"Oh No! 'a' is not an 'int'!! What will you do?\")\n",
88 | " \n",
89 | " # If `a` is `int`, continue\n",
90 | " if a > 20:\n",
91 | " result = \"'a' greater than 20\"\n",
92 | " elif a in range(0, 20): \n",
93 | " result = \"'a' between 0 and 20!\"\n",
94 | " elif a < 0:\n",
95 | " result = \"'a' is less than 0!\"\n",
96 | " else:\n",
97 | " result = \"Hello world!\"\n",
98 | " \n",
99 | " return result\n"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "id": "24d85bcb",
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "a = ____ # Choose a value of 'a' that will return 'Hello World!' from if_function()\n",
110 | "if_function(a) "
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "id": "1a594172",
116 | "metadata": {},
117 | "source": [
118 | "4. Given the radius of a circle, write a function, `analyze_circle()`, which returns the circumference and area in a dictionary."
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "id": "5c34c6e8",
124 | "metadata": {},
125 | "source": [
126 | "5. Given the text of Jane Austen's \"Pride and Prejudice\" novel (`pride_and_prejudicetxt.txt`), determine the number of times the word `the` appears."
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "id": "c1735686",
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "file = open(\"pride_and_prejudicetxt.txt\", \"r\", encoding=\"utf8\")\n",
137 | "text = file.read()\n",
138 | "words = text.split()\n",
139 | "\n",
140 | "# A list containing all the words in the novel\n",
141 | "words\n",
142 | "\n",
143 | "# How many times does \"the\" appear?"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "id": "486bb903",
149 | "metadata": {},
150 | "source": [
151 | "6. (Continuing from \\#5) Construct a word-count dictionary for all words in the novel following the format:\n",
152 | "\n",
153 | "```python\n",
154 | "result = {\n",
155 | " 'term1': term1_count,\n",
156 | " 'term2': term2_count,\n",
157 | " ...\n",
158 | "}\n",
159 | "```"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "id": "72875f7f",
165 | "metadata": {},
166 | "source": [
167 | "7. Using list comprehension and `numpy`, make a list with 25 random integers between 0 and 1000."
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "ac86d9cc",
173 | "metadata": {},
174 | "source": [
175 | "8. Using the list from \\#7, create a 2D `array` with 5 rows and 5 columns. "
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "id": "060c3f54",
181 | "metadata": {},
182 | "source": [
183 | "9. With the `array` from \\#8, find the Pearson correlation between the standard deviation of the rows and the mean of the rows."
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "id": "b0fe0184",
189 | "metadata": {},
190 | "source": [
191 | "10. Generate two random numbers `(a, b)` between -10 and 10; Make an array containing all the points between `a` and `b` in increments of 0.1. Write a function which takes this array, `a`, and `b`, and returns an array containing all the points whose distance from the midpoint of `a` and `b` is less than 1."
192 | ]
193 | }
194 | ],
195 | "metadata": {
196 | "kernelspec": {
197 | "display_name": "Python 3",
198 | "language": "python",
199 | "name": "python3"
200 | },
201 | "language_info": {
202 | "codemirror_mode": {
203 | "name": "ipython",
204 | "version": 3
205 | },
206 | "file_extension": ".py",
207 | "mimetype": "text/x-python",
208 | "name": "python",
209 | "nbconvert_exporter": "python",
210 | "pygments_lexer": "ipython3",
211 | "version": "3.8.8"
212 | }
213 | },
214 | "nbformat": 4,
215 | "nbformat_minor": 5
216 | }
217 |
--------------------------------------------------------------------------------
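A minimal sketch of the counting approach that challenge problems 5 and 6 point toward, run on a toy string rather than the novel (the variable names and the toy text below are illustrative only, not part of the course materials):

```python
from collections import Counter

# Toy text standing in for the novel; the actual problems read pride_and_prejudicetxt.txt instead
text = "the cat sat on the mat because the mat was warm"
words = text.split()

# Problem 5 style: count occurrences of a single word
print(words.count("the"))        # -> 3

# Problem 6 style: build a word-count dictionary covering every word
word_counts = dict(Counter(words))
print(word_counts["mat"])        # -> 2
```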
/Module_3/Module_3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Module #3 - Data Science with Python"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Remember that `pandas` `DataFrames` are always two-dimensional, rectangular arrays of data. So how is that different from a `numpy` array? Simply put, a `pandas` `DataFrame` is a \"reskinned\" or \"repurposed\" `numpy` array: `DataFrames` are actually built on top of `numpy` arrays. However, a few points make `DataFrames` more specialized for data science:\n",
15 | "\n",
16 | "- A `DataFrame` can contain different data types (numerics, strings, booleans) in different columns (though each column has to be homogeneous), whereas a `numpy` array is entirely homogeneous\n",
17 | "- A `DataFrame` can have named rows and columns, which makes inspecting data much more convenient\n",
18 | "- A `DataFrame` integrates well with several graphing/plotting packages, such as `matplotlib.pyplot`, `plotly`, `bokeh`, `seaborn`, and others to make data visualization easier\n",
19 | "- A `DataFrame` is always 2-dimensional, whereas a `numpy` array can be N-dimensional\n",
20 | "\n",
21 | "In conclusion, `numpy` arrays are extremely powerful objects for matrix math, whether that is basic linear algebra, image processing, or any other problem that can mathematically be expressed as an N-dimensional matrix. A `DataFrame` is a very specialized version of a matrix that is completely geared toward data science, summarizing and visualizing datasets, and extracting information from them. This means that each has a space of applications where it far outperforms the other."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#installing required packages\n",
31 | "!pip install numpy\n",
32 | "!pip install pandas\n",
33 | "!pip install matplotlib\n",
34 | "!pip install scipy"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Creating a `DataFrame` and accessing columns\n",
42 | "\n",
43 | "As hinted at during the previous module, `DataFrames` can be created manually from dictionaries (although in real life it's a lot more common to import them from csv files using `pd.read_csv()`)."
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "import pandas as pd\n",
53 | "my_df = pd.DataFrame(data={\n",
54 | " 'col_one': range(1, 5),\n",
55 | " 'col_two': range(11, 15),\n",
56 | " 'col_three': range(21, 25)\n",
57 | "}, index = [\n",
58 | " 'row_one', 'row_two', 'row_three', 'row_four'\n",
59 | "])\n",
60 | "display(my_df)"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "We can then access individual columns or lists of columns by their name. When we use single brackets `[]`, the column is extracted as a `pandas` `Series` object:"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "display(my_df['col_one'])\n",
77 | "display(type(my_df['col_one']))"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "When we use double brackets `[[]]`, the column of interest is extracted as a `DataFrame`:"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "display(my_df[['col_one']])\n",
94 | "display(type(my_df[['col_one']]))"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "**Challenge problem:** Can you write a line of code that will access both `col_one` **and** `col_two`?"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": []
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "## Accessing the index and column names\n",
116 | "\n",
117 | "If we want to access, or possibly overwrite, the index or column names of a `DataFrame`, we can do so by accessing a `DataFrame` object's `index` and `columns` attributes:"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "display(my_df.index)\n",
127 | "display(my_df.columns)"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "## Accessing elements by index\n",
135 | "\n",
136 | "We discussed before how to access columns. But how do we access individual cell values in a `DataFrame`? The `iloc[]` method accomplishes this by accessing the cell at a given row and column index. We can use single brackets `[]` to access the element directly and extract it as the type of object contained in the cell:"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "display(my_df.iloc[1,1])\n",
146 | "display(type(my_df.iloc[1, 1]))"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "Or we can use double brackets `[[],[]]` to extract the element as a `DataFrame` object:"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "display(my_df.iloc[[1],[1]])\n",
163 | "display(type(my_df.iloc[[1],[1]]))"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "We can use either notation with the `iloc[]` method to set individual cells to a new value:"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "my_df.iloc[1,1] = 0\n",
180 | "display(my_df)\n",
181 | "\n",
182 | "my_df.iloc[[1], [1]] = 12\n",
183 | "display(my_df)"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "## Accessing elements by name\n",
191 | "\n",
192 | "The `iloc[]` method accesses `DataFrame` elements by index, as shown above. The `loc[]` method does the same thing with row and column names, passed as strings to the method. Again, single brackets `[]` extract the cell as the data type contained within it:"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "display(my_df.loc['row_two','col_two'])\n",
202 | "display(type(my_df.loc['row_two', 'col_two']))"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "And double brackets `[[],[]]` extract the element as a `DataFrame`:"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "display(my_df.loc[['row_two'],['col_two']])\n",
219 | "display(type(my_df.loc[['row_two'],['col_two']]))"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "Just like the `iloc[]` method, `loc[]` can be used to assign new values to a cell:"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "my_df.loc['row_two','col_two'] = 0\n",
236 | "display(my_df)\n",
237 | "\n",
238 | "my_df.loc[['row_two'], ['col_two']] = 12\n",
239 | "display(my_df)"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "**Challenge problem:** Can you write a line of code that accesses **all** of `col_two`? You can use either `loc[]` or `iloc[]`"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": []
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "## Filtering rows with logical gates\n",
261 | "\n",
262 | "Just like a `numpy` array, a `DataFrame` can be filtered using a boolean mask (a logical condition). Here, we'll filter `my_df` to display only the rows that satisfy a certain logical condition. This can be used to filter a dataset based on the value of one of the variables within it. Note that this filters rows, not columns.\n",
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "# Filter rows by column value\n",
272 | "display(my_df[my_df['col_one'] == 1])\n",
273 | "display(my_df[my_df['col_one'] < 4])"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {},
279 | "source": [
280 | "## Discarding data\n",
281 | "\n",
282 | "Our dataset may contain null values, represented in `numpy` as `np.nan`. We can use the `dropna()` method to drop all rows that contain null values (we can also pass a subset of column names to `dropna()` to only filter null values from certain variables). Handling null values is a whole topic on its own that will be treated in depth later in the workshop."
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "import numpy as np"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "my_df.iloc[0,0] = np.nan\n",
301 | "display(my_df)\n",
302 | "my_df.dropna(inplace = True)\n",
303 | "display(my_df)"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "**Challenge question:** On line 3 above, what does the argument `inplace = True` do?"
311 | ]
312 | },
313 | {
314 | "cell_type": "markdown",
315 | "metadata": {},
316 | "source": [
317 | "If we want to discard an entire column of data, we can do so using the `drop()` method:"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "my_df.drop(labels = 'col_one', axis = 1, inplace = True)\n",
327 | "display(my_df)"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "## Counting values and sorting `DataFrames`\n",
335 | "\n",
336 | "The `count()` and `sort_values()` methods are very useful when we want to summarize or rank our data based on a certain metric. There are many other summary methods as well, for example to calculate the mean or other statistical metrics. Refer to the `pandas` documentation for more information on existing methods and how to use them.\n",
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "display(my_df.count())\n",
346 | "display(my_df.sort_values(by = 'col_two', ascending = False))"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "metadata": {},
352 | "source": [
353 | "## Merging two `DataFrames`\n",
354 | "\n",
355 | "It is often necessary to merge overlapping data from two separate sources. This can be done with the `merge()` method as shown below. The `how` argument specifies the kind of merge we want, such as the union of the two datasets (`outer`) or the intersection (`inner`):"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "my_df2 = pd.DataFrame(data={\n",
365 | " 'col_two': range(11, 15),\n",
366 | " 'col_four': range(31, 35),\n",
367 | " 'col_five': range(41, 45)\n",
368 | "}, index = [\n",
369 | " 'row_one', 'row_two', 'row_three', 'row_four'\n",
370 | "])\n",
371 | "display(my_df)\n",
372 | "display(my_df2)"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": [
381 | "my_df_merge = my_df.merge(my_df2, how = 'outer', on = 'col_two')\n",
382 | "display(my_df_merge)"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "my_df_merge = my_df.merge(my_df2, how = 'inner', on = 'col_two')\n",
392 | "display(my_df_merge)"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {},
398 | "source": [
399 | "## Simple data visualization\n",
400 | "\n",
401 | "Data visualization is a huge subfield of data science and is invaluable both in exploratory data analysis (which will be discussed in depth in later classes) and when presenting finished analyses. `Pandas` `DataFrames` integrate well with several popular plotting packages. Here, we use `matplotlib.pyplot` to create a basic scatter plot with the `plot()` method. Note that `plot()` can be used directly on a `DataFrame` object. Alternatively, the same plot can be made by calling `pyplot` directly:\n",
402 | "\n",
403 | "``` python\n",
404 | "plt.scatter(my_df2.col_two, my_df2.col_four)\n",
405 | "```\n",
406 | "\n",
407 | "We'll go further into graphing in the TV miniproject and the homework, but know that there are extensive plotting packages that create publication- and presentation-grade figures. Whenever we want to actually display a `pyplot` graph, we do so by calling `plt.show()`. We want to point out here that `matplotlib.pyplot` is a ubiquitous but complicated library. We're not going to get into the details here, but refer to [this](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Python_Matplotlib_Cheat_Sheet.pdf) for a cheat sheet if you want to learn more."
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": null,
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "from matplotlib import pyplot as plt\n",
417 | "%matplotlib inline\n",
418 | "\n",
419 | "display(my_df2)\n",
420 | "\n",
421 | "my_df2.plot(x = 'col_two', y = 'col_four', kind = 'scatter')\n",
422 | "plt.xlabel('column two')\n",
423 | "plt.ylabel('column four')\n",
424 | "plt.title('example graph')\n",
425 | "plt.show()"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "**Challenge problem:** Can you change the code above to create a line plot instead of a scatter plot? Hint: The word **line** is key. For your convenience, the code from above is duplicated below so you can modify it:"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "display(my_df2)\n",
442 | "\n",
443 | "my_df2.plot(x = 'col_two', y = 'col_four', kind = 'scatter')\n",
444 | "plt.xlabel('column two')\n",
445 | "plt.ylabel('column four')\n",
446 | "plt.title('example graph')\n",
447 | "plt.show()"
448 | ]
449 | },
450 | {
451 | "cell_type": "markdown",
452 | "metadata": {},
453 | "source": [
454 | "## Grouping data by a variable\n",
455 | "\n",
456 | "Grouping data by one of the variables in the dataset is done using the `groupby()` method. This allows us to summarize data more specifically, by introducing hierarchy. See the below example where we count values within groups:"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": null,
462 | "metadata": {},
463 | "outputs": [],
464 | "source": [
465 | "my_df2.iloc[1,0] = 11\n",
466 | "my_df2.iloc[0,2] = np.nan\n",
467 | "display(my_df2)"
468 | ]
469 | },
470 | {
471 | "cell_type": "markdown",
472 | "metadata": {},
473 | "source": [
474 | "Simply applying `count()` will return the number of values in each column."
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "metadata": {},
481 | "outputs": [],
482 | "source": [
483 | "display(my_df2.count())"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {},
489 | "source": [
490 | "Grouping the data using `groupby()` first allows us to count within groups based on the values in `col_two`. Note how the `object.method()` notation can be chained to perform multiple tasks sequentially on the same data in one line (`object.method1().method2()` etc):"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": null,
496 | "metadata": {},
497 | "outputs": [],
498 | "source": [
499 | "my_df_grouped = my_df2.groupby('col_two').count()\n",
500 | "display(my_df_grouped)"
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "## Intro to statistical testing\n",
508 | "\n",
509 | "The `scipy` package is a powerful tool for performing many statistical tests. Here we're just introducing a simple t-test; refer to the `scipy` documentation or Google for the more advanced tests needed in the homework assignments."
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": null,
515 | "metadata": {},
516 | "outputs": [],
517 | "source": [
518 | "from scipy.stats import ttest_ind\n",
519 | "\n",
520 | "display(my_df2)\n",
521 | "\n",
522 | "t, p = ttest_ind(my_df2.col_two, my_df2.col_four, equal_var = True)\n",
523 | "print(\"t statistic for the t-test between col_two and col_four: \" + str(t))\n",
524 | "print(\"p-value for the t-test between col_two and col_four: \" + str(p))"
525 | ]
526 | }
527 | ],
528 | "metadata": {
529 | "kernelspec": {
530 | "display_name": "Python 3",
531 | "language": "python",
532 | "name": "python3"
533 | },
534 | "language_info": {
535 | "codemirror_mode": {
536 | "name": "ipython",
537 | "version": 3
538 | },
539 | "file_extension": ".py",
540 | "mimetype": "text/x-python",
541 | "name": "python",
542 | "nbconvert_exporter": "python",
543 | "pygments_lexer": "ipython3",
544 | "version": "3.9.1"
545 | }
546 | },
547 | "nbformat": 4,
548 | "nbformat_minor": 4
549 | }
550 |
--------------------------------------------------------------------------------
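The Module_3 notebook above mentions that `dropna()` can also be restricted to a subset of columns, but it only demonstrates the default behavior. Below is a minimal sketch of that `subset` behavior, assuming a small hypothetical frame `df` (not data from the notebook):

```python
import numpy as np
import pandas as pd

# Hypothetical toy data with nulls in two different columns
df = pd.DataFrame({
    'a': [1.0, np.nan, 3.0],
    'b': [np.nan, 5.0, 6.0]
})

# Default: drop any row containing a null anywhere (as shown in the notebook)
print(df.dropna())

# With subset: drop rows only when column 'a' is null; nulls in 'b' are kept
print(df.dropna(subset=['a']))
```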
/Module_3/Module_3_homework.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Analysis of the Gapminder dataset\n",
8 | "\n",
9 | "In this miniproject for your homework, you will be analyzing demographic and economic data on various countries from the gapminder dataset. Please write your code in the provided code blocks and whenever appropriate, write a short answer to the question in the markdown block preceding the code.\n",
10 | "\n",
11 | "Not all of the functions and code necessary for this homework were explicitly discussed in class, so you will have to use your Googling skills to some extent. Make sure to import all the packages you will use. For readability of your code, make sure to include comments so you or anyone else reading this code in the future will know what's going on.\n",
12 | "\n",
13 | "Below is a helpful flow chart if you're trying to decide what statistical test to use for some of the problems in this homework assignment:\n",
14 | "\n",
15 | "\n",
16 | "\n",
17 | "## 1. Read in the `gapminder_clean.csv` data as a `pandas` `DataFrame` and inspect it\n",
18 | "\n",
19 | "Don't forget to import (and, if necessary, install) any packages you may need!\n",
20 | "\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": []
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## 2. Filter the data to include only rows where `Year` is `1962` and then make a scatter plot comparing `'CO2 emissions (metric tons per capita)'` and `gdpPercap` for the filtered data.\n",
35 | "\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": []
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## 3. On the filtered data, calculate the pearson correlation of `'CO2 emissions (metric tons per capita)'` and `gdpPercap`. What is the Pearson R value and associated p-value?\n",
50 | "\n",
51 | "Hint: Use your Googling skills to find an appropriate package and function to calculate the correlation and associated p-value.\n",
52 | "\n"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": []
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "## 4. On the unfiltered data, answer \"In what year is the correlation between `'CO2 emissions (metric tons per capita)'` and `gdpPercap` the strongest?\" Filter the dataset to that year for the next step...\n",
67 | "\n"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": []
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "## 5. Challenge: Using `plotly` or `bokeh`, create an interactive scatter plot comparing `'CO2 emissions (metric tons per capita)'` and `gdpPercap`, where the point size is determined by `pop` (population) and the color is determined by the `continent`.\n",
82 | "\n",
83 | "This question is harder than the others, since we have not discussed `plotly` or `bokeh` in class. Try and see if you can get the hang of it, refer to these guides: \n",
84 | "- https://www.kaggle.com/kanncaa1/plotly-tutorial-for-beginners\n",
85 | "- https://plotly.com/python/bubble-charts/#categorical-bubble-charts"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": []
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "## 6. What is the relationship between `continent` and `'Energy use (kg of oil equivalent per capita)'` in 2007? (Stats test needed)\n",
100 | "\n"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": []
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## 7. Is there a significant difference between Europe and Asia with respect to `'Imports of goods and services (% of GDP)'` in the years after 1990? (Stats test needed)\n",
115 | "\n"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": []
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "## 8. What is the country (or countries) that has the highest `'Population density (people per sq. km of land area)'` across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)\n",
130 | "\n",
131 | "Hint: First create a new column ranking each country's density within a year, then calculate the average rank for each country across all years in the dataset.\n",
132 | "\n"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": []
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "## 9. What country (or countries) has shown the greatest increase in `'Life expectancy at birth, total (years)'` since 1962?\n",
147 | "\n"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": []
156 | }
157 | ],
158 | "metadata": {
159 | "kernelspec": {
160 | "display_name": "Python 3",
161 | "language": "python",
162 | "name": "python3"
163 | },
164 | "language_info": {
165 | "codemirror_mode": {
166 | "name": "ipython",
167 | "version": 3
168 | },
169 | "file_extension": ".py",
170 | "mimetype": "text/x-python",
171 | "name": "python",
172 | "nbconvert_exporter": "python",
173 | "pygments_lexer": "ipython3",
174 | "version": "3.9.1"
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 4
179 | }
180 |
--------------------------------------------------------------------------------
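For the correlation questions in the homework above, one reasonable option (our assumption; the assignment deliberately leaves the choice of package to you) is `scipy.stats.pearsonr`, sketched here on toy arrays rather than the gapminder columns:

```python
from scipy.stats import pearsonr

# Toy data standing in for two numeric columns of the gapminder dataset
x = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 6]

# pearsonr returns the Pearson R statistic and its two-sided p-value
r, p = pearsonr(x, y)
print(f"Pearson R = {r:.3f}, p-value = {p:.3f}")
```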
/Module_3/Python_Pandas_Cheat_Sheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_3/Python_Pandas_Cheat_Sheet.pdf
--------------------------------------------------------------------------------
/Module_3/Python_Pandas_Cheat_Sheet_2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_3/Python_Pandas_Cheat_Sheet_2.pdf
--------------------------------------------------------------------------------
/Module_3/TV_project/datasets/halftime_musicians.csv:
--------------------------------------------------------------------------------
1 | super_bowl,musician,num_songs
2 | 52,Justin Timberlake,11
3 | 52,University of Minnesota Marching Band,1
4 | 51,Lady Gaga,7
5 | 50,Coldplay,6
6 | 50,Beyoncé,3
7 | 50,Bruno Mars,3
8 | 50,Mark Ronson,1
9 | 50,University of California Marching Band,3
10 | 50,Youth Orchestra Los Angeles,3
11 | 50,Gustavo Dudamel,3
12 | 49,Katy Perry,8
13 | 49,Lenny Kravitz,1
14 | 49,Missy Elliott,3
15 | 49,Arizona State University Sun Devil Marching Band,
16 | 48,Bruno Mars,6
17 | 48,Red Hot Chili Peppers,1
18 | 47,Beyoncé,7
19 | 47,Destiny's Child,2
20 | 47,Kelly Rowland,1
21 | 47,Michelle Williams,1
22 | 46,Madonna,5
23 | 46,LMFAO,1
24 | 46,Nicki Minaj,1
25 | 46,M.I.A.,1
26 | 46,Cee Lo Green,2
27 | 45,The Black Eyed Peas,6
28 | 45,Slash,1
29 | 45,Usher,1
30 | 45,will.i.am,1
31 | 45,Fergie,1
32 | 44,The Who,5
33 | 43,Bruce Springsteen and the E Street Band,4
34 | 42,Tom Petty & the Heartbreakers,4
35 | 41,Prince,7
36 | 41,Florida A&M University Marching 100 Band,
37 | 40,The Rolling Stones,3
38 | 39,Paul McCartney,4
39 | 38,Jessica Simpson,1
40 | 38,Janet Jackson,3
41 | 38,P. Diddy,3
42 | 38,Nelly,1
43 | 38,Kid Rock,2
44 | 38,Justin Timberlake,1
45 | 38,The Ocean of Soul Marching Band,
46 | 38,The Spirit of Houston Cougar Marching Band,
47 | 37,Shania Twain,2
48 | 37,No Doubt,2
49 | 37,Sting,1
50 | 36,U2,3
51 | 35,Aerosmith,3
52 | 35,NSYNC,3
53 | 35,Britney Spears,1
54 | 35,Mary J. Blige,1
55 | 35,Nelly,1
56 | 34,Phil Collins,1
57 | 34,Christina Aguilera,1
58 | 34,Enrique Iglesias,1
59 | 34,Toni Braxton,1
60 | 33,Gloria Estefan,3
61 | 33,Stevie Wonder,4
62 | 33,Big Bad Voodoo Daddy,1
63 | 32,Boyz II Men,3
64 | 32,Smokey Robinson,3
65 | 32,Martha Reeves,2
66 | 32,The Temptations,4
67 | 32,Queen Latifah,2
68 | 32,Grambling State University Tiger Marching Band,1
69 | 31,The Blues Brothers,3
70 | 31,ZZ Top,3
71 | 31,James Brown,3
72 | 30,Diana Ross,10
73 | 29,Patti Labelle,3
74 | 29,Tony Bennett,2
75 | 29,Arturo Sandoval,2
76 | 29,Miami Sound Machine,1
77 | 28,Clint Black,2
78 | 28,Tanya Tucker,2
79 | 28,Travis Tritt,2
80 | 28,The Judds,1
81 | 28,Wynonna Judd,2
82 | 27,Michael Jackson,5
83 | 26,Gloria Estefan,2
84 | 26,University of Minnesota Marching Band,
85 | 25,New Kids on the Block,2
86 | 24,Pete Fountain,1
87 | 24,Doug Kershaw,1
88 | 24,Irma Thomas,1
89 | 24,Pride of Nicholls Marching Band,
90 | 24,The Human Jukebox,
91 | 24,Pride of Acadiana,
92 | 23,Elvis Presto,7
93 | 22,Chubby Checker,2
94 | 22,San Diego State University Marching Aztecs,
95 | 22,Spirit of Troy,
96 | 21,Grambling State University Tiger Marching Band,8
97 | 21,Spirit of Troy,8
98 | 20,Up with People,
99 | 19,Tops In Blue,
100 | 18,The University of Florida Fightin' Gator Marching Band,7
101 | 18,The Florida State University Marching Chiefs,7
102 | 17,Los Angeles Unified School District All City Honor Marching Band,
103 | 16,Up with People,
104 | 15,The Human Jukebox,
105 | 15,Helen O'Connell,
106 | 14,Up with People,
107 | 14,Grambling State University Tiger Marching Band,
108 | 13,Ken Hamilton,
109 | 13,Gramacks,
110 | 12,Tyler Junior College Apache Band,
111 | 12,Pete Fountain,
112 | 12,Al Hirt,
113 | 11,Los Angeles Unified School District All City Honor Marching Band,
114 | 10,Up with People,
115 | 9,Mercer Ellington,
116 | 9,Grambling State University Tiger Marching Band,
117 | 8,University of Texas Longhorn Band,
118 | 8,Judy Mallett,
119 | 7,University of Michigan Marching Band,
120 | 7,Woody Herman,
121 | 7,Andy Williams,
122 | 6,Ella Fitzgerald,
123 | 6,Carol Channing,
124 | 6,Al Hirt,
125 | 6,United States Air Force Academy Cadet Chorale,
126 | 5,Southeast Missouri State Marching Band,
127 | 4,Marguerite Piazza,
128 | 4,Doc Severinsen,
129 | 4,Al Hirt,
130 | 4,The Human Jukebox,
131 | 3,Florida A&M University Marching 100 Band,
132 | 2,Grambling State University Tiger Marching Band,
133 | 1,University of Arizona Symphonic Marching Band,
134 | 1,Grambling State University Tiger Marching Band,
135 | 1,Al Hirt,
--------------------------------------------------------------------------------
/Module_3/TV_project/datasets/super_bowls.csv:
--------------------------------------------------------------------------------
1 | date,super_bowl,venue,city,state,attendance,team_winner,winning_pts,qb_winner_1,qb_winner_2,coach_winner,team_loser,losing_pts,qb_loser_1,qb_loser_2,coach_loser,combined_pts,difference_pts
2 | 2018-02-04,52,U.S. Bank Stadium,Minneapolis,Minnesota,67612,Philadelphia Eagles,41,Nick Foles,,Doug Pederson,New England Patriots,33,Tom Brady,,Bill Belichick,74,8
3 | 2017-02-05,51,NRG Stadium,Houston,Texas,70807,New England Patriots,34,Tom Brady,,Bill Belichick,Atlanta Falcons,28,Matt Ryan,,Dan Quinn,62,6
4 | 2016-02-07,50,Levi's Stadium,Santa Clara,California,71088,Denver Broncos,24,Peyton Manning,,Gary Kubiak,Carolina Panthers,10,Cam Newton,,Ron Rivera,34,14
5 | 2015-02-01,49,University of Phoenix Stadium,Glendale,Arizona,70288,New England Patriots,28,Tom Brady,,Bill Belichick,Seattle Seahawks,24,Russell Wilson,,Pete Carroll,52,4
6 | 2014-02-02,48,MetLife Stadium,East Rutherford,New Jersey,82529,Seattle Seahawks,43,Russell Wilson,,Pete Carroll,Denver Broncos,8,Peyton Manning,,John Fox,51,35
7 | 2013-02-03,47,Mercedes-Benz Superdome,New Orleans,Louisiana,71024,Baltimore Ravens,34,Joe Flacco,,John Harbaugh,San Francisco 49ers,31,Colin Kaepernick,,Jim Harbaugh,65,3
8 | 2012-02-05,46,Lucas Oil Stadium,Indianapolis,Indiana,68658,New York Giants,21,Eli Manning,,Tom Coughlin,New England Patriots,17,Tom Brady,,Bill Belichick,38,4
9 | 2011-02-06,45,Cowboys Stadium,Arlington,Texas,103219,Green Bay Packers,31,Aaron Rodgers,,Mike McCarthy,Pittsburgh Steelers,25,Ben Roethlisberger,,Mike Tomlin,56,6
10 | 2010-02-07,44,Sun Life Stadium,Miami Gardens,Florida,74059,New Orleans Saints,31,Drew Brees,,Sean Payton,Indianapolis Colts,17,Peyton Manning,,Jim Caldwell,48,14
11 | 2009-02-01,43,Raymond James Stadium,Tampa,Florida,70774,Pittsburgh Steelers,27,Ben Roethlisberger,,Mike Tomlin,Arizona Cardinals,23,Kurt Warner,,Ken Whisenhunt,50,4
12 | 2008-02-03,42,University of Phoenix Stadium,Glendale,Arizona,71101,New York Giants,17,Eli Manning,,Tom Coughlin,New England Patriots,14,Tom Brady,,Bill Belichick,31,3
13 | 2007-02-04,41,Dolphin Stadium,Miami Gardens,Florida,74512,Indianapolis Colts,29,Peyton Manning,,Tony Dungy,Chicago Bears,17,Rex Grossman,,Lovie Smith,46,12
14 | 2006-02-05,40,Ford Field,Detroit,Michigan,68206,Pittsburgh Steelers,21,Ben Roethlisberger,,Bill Cowher,Seattle Seahawks,10,Matt Hasselbeck,,Mike Holmgren,31,11
15 | 2005-02-06,39,Alltel Stadium,Jacksonville,Florida,78125,New England Patriots,24,Tom Brady,,Bill Belichick,Philadelphia Eagles,21,Donovan McNabb,,Andy Reid,45,3
16 | 2004-02-01,38,Reliant Stadium,Houston,Texas,71525,New England Patriots,32,Tom Brady,,Bill Belichick,Carolina Panthers,29,Jake Delhomme,,John Fox,61,3
17 | 2003-01-26,37,Qualcomm Stadium,San Diego,California,67603,Tampa Bay Buccaneers,48,Brad Johnson,,Jon Gruden,Oakland Raiders,21,Rich Gannon,,Bill Callahan,69,27
18 | 2002-02-03,36,Louisiana Superdome,New Orleans,Louisiana,72922,New England Patriots,20,Tom Brady,,Bill Belichick,St. Louis Rams,17,Kurt Warner,,Mike Martz,37,3
19 | 2001-01-28,35,Raymond James Stadium,Tampa,Florida,71921,Baltimore Ravens,34,Trent Dilfer,,Brian Billick,New York Giants,7,Kerry Collins,,Jim Fassel,41,27
20 | 2000-01-30,34,Georgia Dome,Atlanta,Georgia,72625,St. Louis Rams,23,Kurt Warner,,Dick Vermeil,Tennessee Titans,16,Steve McNair,,Jeff Fisher,39,7
21 | 1999-01-31,33,Pro Player Stadium,Miami Gardens,Florida,74803,Denver Broncos,34,John Elway,,Mike Shanahan,Atlanta Falcons,19,Chris Chandler,,Dan Reeves,53,15
22 | 1998-01-25,32,Qualcomm Stadium,San Diego,California,68912,Denver Broncos,31,John Elway,,Mike Shanahan,Green Bay Packers,24,Brett Favre,,Mike Holmgren,55,7
23 | 1997-01-26,31,Louisiana Superdome,New Orleans,Louisiana,72301,Green Bay Packers,35,Brett Favre,,Mike Holmgren,New England Patriots,21,Drew Bledsoe,,Bill Parcells,56,14
24 | 1996-01-28,30,Sun Devil Stadium,Tempe,Arizona,76347,Dallas Cowboys,27,Troy Aikman,,Barry Switzer,Pittsburgh Steelers,17,Neil O'Donnell,,Bill Cowher,44,10
25 | 1995-01-29,29,Joe Robbie Stadium,Miami Gardens,Florida,74107,San Francisco 49ers,49,Steve Young,,George Seifert,San Diego Chargers,26,Stan Humphreys,,Bobby Ross,75,23
26 | 1994-01-30,28,Georgia Dome,Atlanta,Georgia,72817,Dallas Cowboys,30,Troy Aikman,,Jimmy Johnson,Buffalo Bills,13,Jim Kelly,,Marv Levy,43,17
27 | 1993-01-31,27,Rose Bowl,Pasadena,California,98374,Dallas Cowboys,52,Troy Aikman,,Jimmy Johnson,Buffalo Bills,17,Jim Kelly,Frank Reich,Marv Levy,69,35
28 | 1992-01-26,26,Metrodome,Minneapolis,Minnesota,63130,Washington Redskins,37,Mark Rypien,,Joe Gibbs,Buffalo Bills,24,Jim Kelly,,Marv Levy,61,13
29 | 1991-01-27,25,Tampa Stadium,Tampa,Florida,73813,New York Giants,20,Jeff Hostetler,,Bill Parcells,Buffalo Bills,19,Jim Kelly,,Marv Levy,39,1
30 | 1990-01-28,24,Louisiana Superdome,New Orleans,Louisiana,72919,San Francisco 49ers,55,Joe Montana,,George Seifert,Denver Broncos,10,John Elway,,Dan Reeves,65,45
31 | 1989-01-22,23,Joe Robbie Stadium,Miami Gardens,Florida,75129,San Francisco 49ers,20,Joe Montana,,Bill Walsh,Cincinnati Bengals,16,Boomer Esiason,,Sam Wyche,36,4
32 | 1988-01-31,22,Jack Murphy Stadium,San Diego,California,73302,Washington Redskins,42,Doug Williams,,Joe Gibbs,Denver Broncos,10,John Elway,,Dan Reeves,52,32
33 | 1987-01-25,21,Rose Bowl,Pasadena,California,101063,New York Giants,39,Phil Simms,,Bill Parcells,Denver Broncos,20,John Elway,,Dan Reeves,59,19
34 | 1986-01-26,20,Louisiana Superdome,New Orleans,Louisiana,73818,Chicago Bears,46,Jim McMahon,,Mike Ditka,New England Patriots,10,Tony Eason,Steve Grogan,Raymond Berry,56,36
35 | 1985-01-20,19,Stanford Stadium,Palo Alto,California,84059,San Francisco 49ers,38,Joe Montana,,Bill Walsh,Miami Dolphins,16,Dan Marino,,Don Shula,54,22
36 | 1984-01-22,18,Tampa Stadium,Tampa,Florida,72920,Los Angeles Raiders,38,Jim Plunkett,,Tom Flores,Washington Redskins,9,Joe Theismann,,Joe Gibbs,47,29
37 | 1983-01-30,17,Rose Bowl,Pasadena,California,103667,Washington Redskins,27,Joe Theismann,,Joe Gibbs,Miami Dolphins,17,David Woodley,,Don Shula,44,10
38 | 1982-01-24,16,Pontiac Silverdome,Pontiac,Michigan,81270,San Francisco 49ers,26,Joe Montana,,Bill Walsh,Cincinnati Bengals,21,Ken Anderson,,Forrest Gregg,47,5
39 | 1981-01-25,15,Louisiana Superdome,New Orleans,Louisiana,76135,Oakland Raiders,27,Jim Plunkett,,Tom Flores,Philadelphia Eagles,10,Ron Jaworski,,Dick Vermeil,37,17
40 | 1980-01-20,14,Rose Bowl,Pasadena,California,103985,Pittsburgh Steelers,31,Terry Bradshaw,,Chuck Noll,Los Angeles Rams,19,Vince Ferragamo,,Ray Malavasi,50,12
41 | 1979-01-21,13,Orange Bowl,Miami,Florida,79484,Pittsburgh Steelers,35,Terry Bradshaw,,Chuck Noll,Dallas Cowboys,31,Roger Staubach,,Tom Landry,66,4
42 | 1978-01-15,12,Superdome,New Orleans,Louisiana,76400,Dallas Cowboys,27,Roger Staubach,,Tom Landry,Denver Broncos,10,Craig Morton,,Red Miller,37,17
43 | 1977-01-09,11,Rose Bowl,Pasadena,California,103438,Oakland Raiders,32,Kenny Stabler,,John Madden,Minnesota Vikings,14,Fran Tarkenton,,Bud Grant,46,18
44 | 1976-01-18,10,Orange Bowl,Miami,Florida,80187,Pittsburgh Steelers,21,Terry Bradshaw,,Chuck Noll,Dallas Cowboys,17,Roger Staubach,,Tom Landry,38,4
45 | 1975-01-12,9,Tulane Stadium,New Orleans,Louisiana,80997,Pittsburgh Steelers,16,Terry Bradshaw,,Chuck Noll,Minnesota Vikings,6,Fran Tarkenton,,Bud Grant,22,10
46 | 1974-01-13,8,Rice Stadium,Houston,Texas,71882,Miami Dolphins,24,Bob Griese,,Don Shula,Minnesota Vikings,7,Fran Tarkenton,,Bud Grant,31,17
47 | 1973-01-14,7,Memorial Coliseum,Los Angeles,California,90182,Miami Dolphins,14,Bob Griese,,Don Shula,Washington Redskins,7,Bill Kilmer,,George Allen,21,7
48 | 1972-01-16,6,Tulane Stadium,New Orleans,Louisiana,81023,Dallas Cowboys,24,Roger Staubach,,Tom Landry,Miami Dolphins,3,Bob Griese,,Don Shula,27,21
49 | 1971-01-17,5,Orange Bowl,Miami,Florida,79204,Baltimore Colts,16,Earl Morrall,Johnny Unitas,Don McCafferty,Dallas Cowboys,13,Craig Morton,,Tom Landry,29,3
50 | 1970-01-11,4,Tulane Stadium,New Orleans,Louisiana,80562,Kansas City Chiefs,23,Len Dawson,Mike Livingston,Hank Stram,Minnesota Vikings,7,Joe Kapp,,Bud Grant,30,16
51 | 1969-01-12,3,Orange Bowl,Miami,Florida,75389,New York Jets,16,Joe Namath,,Weeb Ewbank,Baltimore Colts,7,Earl Morrall,Johnny Unitas,Don Shula,23,9
52 | 1968-01-14,2,Orange Bowl,Miami,Florida,75546,Green Bay Packers,33,Bart Starr,,Vince Lombardi,Oakland Raiders,14,Daryle Lamonica,,John Rauch,47,19
53 | 1967-01-15,1,Memorial Coliseum,Los Angeles,California,61946,Green Bay Packers,35,Bart Starr,,Vince Lombardi,Kansas City Chiefs,10,Len Dawson,,Hank Stram,45,25
--------------------------------------------------------------------------------
/Module_3/TV_project/datasets/tv.csv:
--------------------------------------------------------------------------------
1 | super_bowl,network,avg_us_viewers,total_us_viewers,rating_household,share_household,rating_18_49,share_18_49,ad_cost
2 | 52,NBC,103390000,,43.1,68,33.4,78,5000000
3 | 51,Fox,111319000,172000000,45.3,73,37.1,79,5000000
4 | 50,CBS,111864000,167000000,46.6,72,37.7,79,5000000
5 | 49,NBC,114442000,168000000,47.5,71,39.1,79,4500000
6 | 48,Fox,112191000,167000000,46.7,69,39.3,77,4000000
7 | 47,CBS,108693000,164100000,46.3,69,39.7,77,4000000
8 | 46,NBC,111346000,163500000,47,71,40.5,,3500000
9 | 45,Fox,111041000,162900000,46,69,39.9,,3100000
10 | 44,CBS,106476000,153400000,45,68,38.6,,2800000
11 | 43,NBC,98732000,151600000,42,64,36.7,,3000000
12 | 42,Fox,97448000,148300000,43.1,65,37.5,,2699963
13 | 41,CBS,93184000,139800000,42.6,64,35.2,,2385365
14 | 40,ABC,90745000,141400000,41.6,62,,,2500000
15 | 39,Fox,86072000,,41.1,62,,,2400000
16 | 38,CBS,89795000,144400000,41.4,63,,,2302200
17 | 37,ABC,88637000,138500000,40.7,61,,,2200000
18 | 36,Fox,86801000,,40.4,61,,,2200000
19 | 35,CBS,84335000,,40.4,61,,,2200000
20 | 34,ABC,88465000,,43.3,63,37.9,,2100000
21 | 33,Fox,83720000,,40.2,61,36.4,,1600000
22 | 32,NBC,90000000,,44.5,67,,,1291100
23 | 31,Fox,87870000,,43.3,65,,,1200000
24 | 30,NBC,94080000,,46,68,41.2,,1085000
25 | 29,ABC,83420000,,41.3,62,,,1150000
26 | 28,NBC,90000000,,45.5,66,,,900000
27 | 27,NBC,90990000,,45.1,66,,,850000
28 | 26,CBS,79590000,,40.3,61,,,850000
29 | 25,ABC,79510000,,41.9,63,,,800000
30 | 24,CBS,73852000,,39,67,,,700400
31 | 23,NBC,81590000,,43.5,68,,,675000
32 | 22,ABC,80140000,,41.9,62,,,645000
33 | 21,CBS,87190000,,45.8,66,,,600000
34 | 20,NBC,92570000,,48.3,70,,,550000
35 | 19,ABC,85530000,,46.4,63,,,525000
36 | 18,CBS,77620000,,46.4,71,,,368200
37 | 17,NBC,81770000,,48.6,69,,,400000
38 | 16,CBS,85240000,,49.1,73,,,324300
39 | 15,NBC,68290000,,44.4,63,,,275000
40 | 14,CBS,76240000,,46.3,67,,,222000
41 | 13,NBC,74740000,,47.1,74,,,185000
42 | 12,CBS,78940000,,47.2,67,,,162300
43 | 11,NBC,62050000,,44.4,73,,,125000
44 | 10,CBS,57710000,,42.3,78,,,110000
45 | 9,NBC,56050000,,42.4,72,,,107000
46 | 8,CBS,51700000,,41.6,73,,,103500
47 | 7,NBC,53320000,,42.7,72,,,88100
48 | 6,CBS,56640000,,44.2,74,,,86100
49 | 5,NBC,46040000,,39.9,75,,,72500
50 | 4,CBS,44270000,,39.4,69,,,78200
51 | 3,NBC,41660000,,36,70,,,55000
52 | 2,CBS,39120000,,36.8,68,,,54500
53 | 1,CBS,26750000,51180000,22.6,43,,,42500
54 | 1,NBC,24430000,,18.5,36,,,37500
--------------------------------------------------------------------------------
/Module_3/TV_project/tv_project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "dc": {
7 | "key": "3"
8 | },
9 | "deletable": false,
10 | "editable": false,
11 | "run_control": {
12 | "frozen": true
13 | },
14 | "tags": [
15 | "context"
16 | ]
17 | },
18 | "source": [
19 | "## 1. TV, halftime shows, and the Big Game\n",
20 | "Whether or not you like football, the Super Bowl is a spectacle. There's a little something for everyone at your Super Bowl party. Drama in the form of blowouts, comebacks, and controversy for the sports fan. There are the ridiculously expensive ads, some hilarious, others gut-wrenching, thought-provoking, and weird. The half-time shows with the biggest musicians in the world, sometimes riding giant mechanical tigers or leaping from the roof of the stadium. It's a show, baby. And in this notebook, we're going to find out how some of the elements of this show interact with each other. After exploring and cleaning our data a little, we're going to answer questions like:\n",
21 | "\n",
22 | "- What are the most extreme game outcomes?\n",
23 | "- How does the game affect television viewership?\n",
24 | "- How have viewership, TV ratings, and ad cost evolved over time?\n",
25 | "- Who are the most prolific musicians in terms of halftime show performances?\n",
26 | "\n",
27 | "\n",
28 | "*Left Shark Steals The Show. Katy Perry performing at halftime of Super Bowl XLIX. Photo by Huntley Paton. Attribution-ShareAlike 2.0 Generic (CC BY-SA 2.0).*\n",
29 | "The dataset we'll use was scraped and polished from Wikipedia. It is made up of three CSV files, one with game data, one with TV data, and one with halftime musician data for all 52 Super Bowls through 2018. Let's take a look, using `display()` instead of `print()` since its output is much prettier in Jupyter Notebooks."
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "!pip install numpy\n",
39 | "!pip install pandas\n",
40 | "!pip install matplotlib\n",
41 | "!pip install seaborn"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "dc": {
49 | "key": "3"
50 | },
51 | "scrolled": true,
52 | "tags": [
53 | "sample_code"
54 | ]
55 | },
56 | "outputs": [],
57 | "source": [
58 | "# Import pandas\n",
59 | "import pandas as pd\n",
60 | "\n",
61 | "# Load the CSV data into DataFrames\n",
62 | "super_bowls = pd.read_csv('datasets/super_bowls.csv')\n",
63 | "tv = pd.read_csv('datasets/tv.csv')\n",
64 | "halftime_musicians = pd.read_csv('datasets/halftime_musicians.csv')\n",
65 | "\n",
66 | "# Display the first five rows of each DataFrame\n",
67 | "display(super_bowls.head())\n",
68 | "display(tv.head())\n",
69 | "display(halftime_musicians.head())"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {
75 | "dc": {
76 | "key": "10"
77 | },
78 | "deletable": false,
79 | "editable": false,
80 | "run_control": {
81 | "frozen": true
82 | },
83 | "tags": [
84 | "context"
85 | ]
86 | },
87 | "source": [
88 | "## 2. Taking note of dataset issues\n",
89 | "For the Super Bowl game data, we can see the dataset appears whole except for missing values in the backup quarterback columns (`qb_winner_2` and `qb_loser_2`), which make sense given most starting QBs in the Super Bowl (`qb_winner_1` and `qb_loser_1`) play the entire game.\n",
90 | "From the visual inspection of TV and halftime musicians data, there is only one missing value displayed, but I've got a hunch there are more. The Super Bowl goes all the way back to 1967, and the more granular columns (e.g. the number of songs for halftime musicians) probably weren't tracked reliably over time. Wikipedia is great but not perfect.\n",
91 | "An inspection of the `.info()` output for `tv` and `halftime_musicians` shows us that there are multiple columns with null values."
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {
98 | "dc": {
99 | "key": "10"
100 | },
101 | "tags": [
102 | "sample_code"
103 | ]
104 | },
105 | "outputs": [],
106 | "source": [
107 | "# Summary of the TV data to inspect\n",
108 | "tv.info()\n",
109 | "\n",
110 | "print('\\n')\n",
111 | "\n",
112 | "# Summary of the halftime musician data to inspect\n",
113 | "halftime_musicians.info()"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {
119 | "dc": {
120 | "key": "17"
121 | },
122 | "deletable": false,
123 | "editable": false,
124 | "run_control": {
125 | "frozen": true
126 | },
127 | "tags": [
128 | "context"
129 | ]
130 | },
131 | "source": [
132 | "## 3. Combined points distribution\n",
133 | "For the TV data, the following columns have missing values and a lot of them:\n",
134 | "\n",
135 | "- `total_us_viewers` (amount of U.S. viewers who watched at least some part of the broadcast)\n",
136 | "- `rating_18_49` (average % of U.S. adults 18-49 who live in a household with a TV that were watching for the entire broadcast)\n",
137 | "- `share_18_49` (average % of U.S. adults 18-49 who live in a household with a TV in use that were watching for the entire broadcast)\n",
138 | "\n",
139 | "For the halftime musician data, there are missing numbers of songs performed (`num_songs`) for about a third of the performances.\n",
140 | "There are a lot of potential reasons for these missing values. Was the data ever tracked? Was it lost in history? Is the research effort to make this data whole worth it? Maybe. Watching every Super Bowl halftime show to get song counts would be pretty fun. But we don't have the time to do that kind of stuff now! Let's take note of where the dataset isn't perfect and start uncovering some insights.\n",
141 | "Let's start by looking at combined points for each Super Bowl by visualizing the distribution. Let's also pinpoint the Super Bowls with the highest and lowest scores."
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {
148 | "dc": {
149 | "key": "17"
150 | },
151 | "tags": [
152 | "sample_code"
153 | ]
154 | },
155 | "outputs": [],
156 | "source": [
157 | "# Import matplotlib and set plotting style\n",
158 | "from matplotlib import pyplot as plt\n",
159 | "%matplotlib inline\n",
160 | "plt.style.use('seaborn')\n",
161 | "\n",
162 | "# Plot a histogram of combined points\n",
163 | "plt.hist(super_bowls.combined_pts)\n",
164 | "plt.xlabel('Combined Points')\n",
165 | "plt.ylabel('Number of Super Bowls')\n",
166 | "plt.show()\n",
167 | "\n",
168 | "# Display the Super Bowls with the highest and lowest combined scores\n",
169 | "display(super_bowls[super_bowls['combined_pts'] > 70])\n",
170 | "display(super_bowls[super_bowls['combined_pts'] < 25])"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {
176 | "dc": {
177 | "key": "24"
178 | },
179 | "deletable": false,
180 | "editable": false,
181 | "run_control": {
182 | "frozen": true
183 | },
184 | "tags": [
185 | "context"
186 | ]
187 | },
188 | "source": [
189 | "## 4. Point difference distribution\n",
190 | "Most combined scores are around 40-50 points, with the extremes being roughly equal distance away in opposite directions. Going up to the highest combined scores at 74 and 75, we find two games featuring dominant quarterback performances. One even happened recently in 2018's Super Bowl LII where Tom Brady's Patriots lost to Nick Foles' underdog Eagles 41-33 for a combined score of 74.
\n",
191 | "Going down to the lowest combined scores, we have Super Bowl III and VII, which featured tough defenses that dominated. We also have Super Bowl IX in New Orleans in 1975, whose 16-6 score can be attributed to inclement weather. The field was slick from overnight rain, and it was cold at 46 °F (8 °C), making it hard for the Steelers and Vikings to do much offensively. This was the second-coldest Super Bowl ever and the last to be played in inclement weather for over 30 years. The NFL realized people like points, I guess.
\n",
192 | "UPDATE: In Super Bowl LIII in 2019, the Patriots and Rams broke the record for the lowest-scoring Super Bowl with a combined score of 16 points (13-3 for the Patriots).
\n",
193 | "Let's take a look at point difference now.
"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {
200 | "dc": {
201 | "key": "24"
202 | },
203 | "tags": [
204 | "sample_code"
205 | ]
206 | },
207 | "outputs": [],
208 | "source": [
209 | "# Plot a histogram of point differences\n",
210 | "plt.hist(super_bowls.difference_pts)\n",
211 | "plt.xlabel('Point Difference')\n",
212 | "plt.ylabel('Number of Super Bowls')\n",
213 | "plt.show()\n",
214 | "\n",
215 | "# Display the closest game(s) and biggest blowouts\n",
216 | "display(super_bowls[super_bowls['difference_pts'] == 1])\n",
217 | "display(super_bowls[super_bowls['difference_pts'] >= 35])"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {
223 | "dc": {
224 | "key": "31"
225 | },
226 | "deletable": false,
227 | "editable": false,
228 | "run_control": {
229 | "frozen": true
230 | },
231 | "tags": [
232 | "context"
233 | ]
234 | },
235 | "source": [
236 | "## 5. Do blowouts translate to lost viewers?\n",
237 | "The vast majority of Super Bowls are close games. Makes sense. Both teams are likely to be deserving if they've made it this far. The closest game ever was when the Buffalo Bills lost to the New York Giants by 1 point in 1991, which was best remembered for Scott Norwood's last-second missed field goal attempt that went wide right, kicking off four Bills Super Bowl losses in a row. Poor Scott. The biggest point discrepancy ever was 45 points (!) where Hall of Famer Joe Montana's led the San Francisco 49ers to victory in 1990, one year before the closest game ever.
\n",
238 | "I remember watching the Seahawks crush the Broncos by 35 points (43-8) in 2014, which was a boring experience in my opinion. The game was never really close. I'm pretty sure we changed the channel at the end of the third quarter. Let's combine our game data and TV to see if this is a universal phenomenon. Do large point differences translate to lost viewers? We can plot household share (average percentage of U.S. households with a TV in use that were watching for the entire broadcast) vs. point difference to find out.
"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {
245 | "dc": {
246 | "key": "31"
247 | },
248 | "tags": [
249 | "sample_code"
250 | ]
251 | },
252 | "outputs": [],
253 | "source": [
254 | "# Join game and TV data, filtering out SB I because it was split over two networks\n",
255 | "games_tv = pd.merge(tv[tv['super_bowl'] > 1], super_bowls, on='super_bowl')\n",
256 | "\n",
257 | "# Import seaborn\n",
258 | "import seaborn as sns\n",
259 | "\n",
260 | "# Create a scatter plot with a linear regression model fit\n",
261 | "sns.regplot(x=games_tv.difference_pts, y=games_tv.share_household, data=games_tv)"
262 | ]
263 | },
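To put a number on the strength of the relationship discussed in the next cell, the Pearson correlation between the two columns can be computed directly; a one-line sketch using the merged games_tv DataFrame:

```python
# Pearson correlation between point difference and household share
games_tv['difference_pts'].corr(games_tv['share_household'])
```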
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {
267 | "dc": {
268 | "key": "38"
269 | },
270 | "deletable": false,
271 | "editable": false,
272 | "run_control": {
273 | "frozen": true
274 | },
275 | "tags": [
276 | "context"
277 | ]
278 | },
279 | "source": [
280 | "## 6. Viewership and the ad industry over time\n",
281 | "The downward sloping regression line and the 95% confidence interval for that regression suggest that bailing on the game if it is a blowout is common. Though it matches our intuition, we must take it with a grain of salt because the linear relationship in the data is weak due to our small sample size of 52 games.
\n",
282 | "Regardless of the score though, I bet most people stick it out for the halftime show, which is good news for the TV networks and advertisers. A 30-second spot costs a pretty \\$5 million now, but has it always been that way? And how have number of viewers and household ratings trended alongside ad cost? We can find out using line plots that share a \"Super Bowl\" x-axis.
"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {
289 | "dc": {
290 | "key": "38"
291 | },
292 | "tags": [
293 | "sample_code"
294 | ]
295 | },
296 | "outputs": [],
297 | "source": [
298 | "# Create a figure with 3x1 subplot and activate the top subplot\n",
299 | "plt.subplot(3, 1, 1)\n",
300 | "plt.plot(games_tv.super_bowl, games_tv.avg_us_viewers, color='#648FFF')\n",
301 | "plt.title('Average Number of US Viewers')\n",
302 | "\n",
303 | "# Activate the middle subplot\n",
304 | "plt.subplot(3, 1, 2)\n",
305 | "plt.plot(games_tv.super_bowl, games_tv.rating_household , color = '#DC267F')\n",
306 | "plt.title('Household Rating')\n",
307 | "\n",
308 | "# Activate the bottom subplot\n",
309 | "plt.subplot(3, 1, 3)\n",
310 | "plt.plot(games_tv.super_bowl, games_tv.ad_cost, color = '#FFB000')\n",
311 | "plt.title('Ad Cost')\n",
312 | "plt.xlabel('SUPER BOWL')\n",
313 | "\n",
314 | "# Improve the spacing between subplots\n",
315 | "plt.tight_layout()"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {
321 | "dc": {
322 | "key": "45"
323 | },
324 | "deletable": false,
325 | "editable": false,
326 | "run_control": {
327 | "frozen": true
328 | },
329 | "tags": [
330 | "context"
331 | ]
332 | },
333 | "source": [
334 | "## 7. Halftime shows weren't always this great\n",
335 | "We can see viewers increased before ad costs did. Maybe the networks weren't very data savvy and were slow to react? Makes sense since DataCamp didn't exist back then.
\n",
336 | "Another hypothesis: maybe halftime shows weren't that good in the earlier years? The modern spectacle of the Super Bowl has a lot to do with the cultural prestige of big halftime acts. I went down a YouTube rabbit hole and it turns out the old ones weren't up to today's standards. Some offenders:
\n",
337 | "\n",
338 | "- Super Bowl XXVI in 1992: A Frosty The Snowman rap performed by children.
\n",
339 | "- Super Bowl XXIII in 1989: An Elvis impersonator that did magic tricks and didn't even sing one Elvis song.
\n",
340 | "- Super Bowl XXI in 1987: Tap dancing ponies. (Okay, that's pretty awesome actually.)
\n",
341 | "
\n",
342 | "It turns out Michael Jackson's Super Bowl XXVII performance, one of the most watched events in American TV history, was when the NFL realized the value of Super Bowl airtime and decided they needed to sign big name acts from then on out. The halftime shows before MJ indeed weren't that impressive, which we can see by filtering our halftime_musician
data.
"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {
349 | "dc": {
350 | "key": "45"
351 | },
352 | "tags": [
353 | "sample_code"
354 | ]
355 | },
356 | "outputs": [],
357 | "source": [
358 | "# Display all halftime musicians for Super Bowls up to and including Super Bowl XXVII\n",
359 | "halftime_musicians[halftime_musicians['super_bowl'] <= 27]"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {
365 | "dc": {
366 | "key": "52"
367 | },
368 | "deletable": false,
369 | "editable": false,
370 | "run_control": {
371 | "frozen": true
372 | },
373 | "tags": [
374 | "context"
375 | ]
376 | },
377 | "source": [
378 | "## 8. Who has the most halftime show appearances?\n",
379 | "Lots of marching bands. American jazz clarinetist Pete Fountain. Miss Texas 1973 playing a violin. Nothing against those performers, they're just simply not Beyoncé. To be fair, no one is.
\n",
380 | "Let's see all of the musicians that have done more than one halftime show, including their performance counts.
"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": null,
386 | "metadata": {
387 | "dc": {
388 | "key": "52"
389 | },
390 | "tags": [
391 | "sample_code"
392 | ]
393 | },
394 | "outputs": [],
395 | "source": [
396 | "# Count halftime show appearances for each musician and sort them from most to least\n",
397 | "halftime_appearances = halftime_musicians.groupby('musician').count()['super_bowl'].reset_index()\n",
398 | "halftime_appearances = halftime_appearances.sort_values('super_bowl', ascending=False)\n",
399 | "\n",
400 | "# Display musicians with more than one halftime show appearance\n",
401 | "halftime_appearances[halftime_appearances.super_bowl > 1]"
402 | ]
403 | },
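The same appearance counts can also be obtained with value_counts(); a short sketch equivalent to the groupby above:

```python
# Count appearances per musician and keep only repeat performers
appearance_counts = halftime_musicians['musician'].value_counts()
appearance_counts[appearance_counts > 1]
```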
404 | {
405 | "cell_type": "markdown",
406 | "metadata": {
407 | "dc": {
408 | "key": "59"
409 | },
410 | "deletable": false,
411 | "editable": false,
412 | "run_control": {
413 | "frozen": true
414 | },
415 | "tags": [
416 | "context"
417 | ]
418 | },
419 | "source": [
420 | "## 9. Who performed the most songs in a halftime show?\n",
421 | "The world famous Grambling State University Tiger Marching Band takes the crown with six appearances. Beyoncé, Justin Timberlake, Nelly, and Bruno Mars are the only post-Y2K musicians with multiple appearances (two each).
\n",
422 | "From our previous inspections, the num_songs
column has lots of missing values:
\n",
423 | "\n",
424 | "- A lot of the marching bands don't have
num_songs
entries. \n",
425 | "- For non-marching bands, missing data starts occurring at Super Bowl XX.
\n",
426 | "
\n",
427 | "Let's filter out marching bands by filtering out musicians with the word \"Marching\" in them and the word \"Spirit\" (a common naming convention for marching bands is \"Spirit of [something]\"). Then we'll filter for Super Bowls after Super Bowl XX to address the missing data issue, then let's see who has the most number of songs.
"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": null,
433 | "metadata": {
434 | "dc": {
435 | "key": "59"
436 | },
437 | "tags": [
438 | "sample_code"
439 | ]
440 | },
441 | "outputs": [],
442 | "source": [
443 | "# Filter out most marching bands\n",
444 | "no_bands = halftime_musicians[~halftime_musicians.musician.str.contains('Marching')]\n",
445 | "no_bands = no_bands[~no_bands.musician.str.contains('Spirit')]\n",
446 | "\n",
447 | "# Plot a histogram of number of songs per performance\n",
448 | "most_songs = int(max(no_bands['num_songs'].values))\n",
449 | "plt.hist(no_bands.num_songs.dropna(), bins=most_songs)\n",
450 | "plt.xlabel('Number of Songs Per Halftime Show Performance')\n",
451 | "plt.ylabel('Number of Musicians')\n",
452 | "plt.show()\n",
453 | "\n",
454 | "# Sort the non-band musicians by number of songs per appearance...\n",
455 | "no_bands = no_bands.sort_values('num_songs', ascending=False)\n",
456 | "# ...and display the top 15\n",
457 | "display(no_bands.head(15))"
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "metadata": {
463 | "dc": {
464 | "key": "66"
465 | },
466 | "deletable": false,
467 | "editable": false,
468 | "run_control": {
469 | "frozen": true
470 | },
471 | "tags": [
472 | "context"
473 | ]
474 | },
475 | "source": [
476 | "## 10. Conclusion\n",
477 | "So most non-band musicians do 1-3 songs per halftime show. It's important to note that the duration of the halftime show is fixed (roughly 12 minutes) so songs per performance is more a measure of how many hit songs you have. JT went off in 2018, wow. 11 songs! Diana Ross comes in second with 10 in her medley in 1996.
\n",
478 | "In this notebook, we loaded, cleaned, then explored Super Bowl game, television, and halftime show data. We visualized the distributions of combined points, point differences, and halftime show performances using histograms. We used line plots to see how ad cost increases lagged behind viewership increases. And we discovered that blowouts do appear to lead to a drop in viewers.
\n",
479 | "This year's Big Game will be here before you know it. Who do you think will win Super Bowl LIII?
\n",
480 | "UPDATE: Spoiler alert.
"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": null,
486 | "metadata": {
487 | "dc": {
488 | "key": "66"
489 | },
490 | "tags": [
491 | "sample_code"
492 | ]
493 | },
494 | "outputs": [],
495 | "source": [
496 | "# 2018-2019 conference champions\n",
497 | "patriots = 'New England Patriots'\n",
498 | "rams = 'Los Angeles Rams'\n",
499 | "\n",
500 | "# Who will win Super Bowl LIII?\n",
501 | "super_bowl_LIII_winner = patriots\n",
502 | "print('The winner of Super Bowl LIII will be the', super_bowl_LIII_winner)"
503 | ]
504 | }
505 | ],
506 | "metadata": {
507 | "kernelspec": {
508 | "display_name": "Python 3",
509 | "language": "python",
510 | "name": "python3"
511 | },
512 | "language_info": {
513 | "codemirror_mode": {
514 | "name": "ipython",
515 | "version": 3
516 | },
517 | "file_extension": ".py",
518 | "mimetype": "text/x-python",
519 | "name": "python",
520 | "nbconvert_exporter": "python",
521 | "pygments_lexer": "ipython3",
522 | "version": "3.8.5"
523 | }
524 | },
525 | "nbformat": 4,
526 | "nbformat_minor": 2
527 | }
528 |
--------------------------------------------------------------------------------
/Module_5/Module_5.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_5/Module_5.pptx
--------------------------------------------------------------------------------
/Module_5/data/Autism-Screening-Child-Data Description.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_5/data/Autism-Screening-Child-Data Description.docx
--------------------------------------------------------------------------------
/Module_6/Module_6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1ace58e3-d3d7-4743-a14c-21a4f6e2aa21",
6 | "metadata": {},
7 | "source": [
8 | "# Module 6 - Feature Selection"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "2f3c5bdf-2ad0-4cd2-89e5-c9b4a01b141b",
14 | "metadata": {},
15 | "source": [
16 | "Feature Selection is the process of choosing which features to use to answer your central question. Why would anyone want to limit the information availbale to them! Think Ockham's razor - when presented with competing hypotheses about the same prediction, one should select the solution with the fewest assumptions. In short - \"the simplest explination is usually the best one\". This concept of fugality applied to describing nature is what we call parsimony. In practice, we aim to develop models with the least number of features.\n",
17 | "\n",
18 | "The advantages to this are that models train faster, are less prone to overfitting, and are usually more accurate. In this excercise we will apply various feature selection schemes to the Mobile Price Classification dataset distributed with this notebook to examine how it effects model performance."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "9c9c0e58-1d2a-4bf7-892a-dbc2f216828f",
24 | "metadata": {},
25 | "source": [
26 | " ## About the Mobile Price dataset \n",
27 | " 1. The data is already tidy and partitioned into training and testing csv files. \n",
28 | " 2. There are 2000 observations in the training set and 1000 in testing.\n",
29 | " 3. Each observation consisits of 20 phone features (columns) and one categorical label (final column) describing the phone's price range.\n",
30 | " 4. This is a classification problem. But for our case, it's an exercise in feature selection.\n",
31 | "\n",
32 | "### Data description\n",
33 | "| Feature | Description |\n",
34 | "| ------- | ----------- |\n",
35 | "| battery_power | Total energy a battery can store in one time measured in mAh |\n",
36 | "|blue | Has Bluetooth or not |\n",
37 | "|clock_speed | the speed at which microprocessor executes instructions |\n",
38 | "|dual_sim | Has dual sim support or not |\n",
39 | "| fc | Front Camera megapixels |\n",
40 | "| four_g | Has 4G or not |\n",
41 | "| int_memory | Internal Memory in Gigabytes |\n",
42 | "| m_dep | Mobile Depth in cm |\n",
43 | "| mobile_wt | Weight of mobile phone |\n",
44 | "| n_cores | Number of cores of the processor |\n",
45 | "| pc | Primary Camera megapixels |\n",
46 | "| px_height | Pixel Resolution Height |\n",
47 | "| px_width | Pixel Resolution Width |\n",
48 | "| ram | Random Access Memory in MegaBytes |\n",
49 | "| sc_h | Screen Height of mobile in cm |\n",
50 | "| sc_w | Screen Width of mobile in cm |\n",
51 | "| talk_time | the longest time that a single battery charge will last when you are |\n",
52 | "| three_g | Has 3G or not |\n",
53 | "| touch_screen | Has touch screen or not |\n",
54 | "| wifi | Has wifi or not |\n",
55 | "| price_range | This is the target variable with a value of 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost). |"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "id": "7103cd06",
61 | "metadata": {},
62 | "source": [
63 | "## Setup\n",
64 | "Let's get all the requirements sorted before we move on to the excercise. Most packages should be familiar at this point. Numpy, pandas, matplotlib, and seaborn where all introduced in Part I of the workshop in modules 1-3 and last week in module 5 we introduced tableone. Notice, today we will be using sklearn for the first time to do some machine learning. Don't worry too much about the models we'll be using or how to train them for now. This will the the topic for modules 7 & 8. "
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "id": "4a4fec48",
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# Requirements\n",
75 | "!pip install --upgrade ipykernel\n",
76 | "!pip install pandas\n",
77 | "!pip install numpy\n",
78 | "!pip install tableone\n",
79 | "!pip install matplotlib\n",
80 | "!pip install seaborn\n",
81 | "!pip install sklearn\n",
82 | "!pip install boruta\n",
83 | "\n",
84 | "# Globals\n",
85 | "seed = 1017\n",
86 | "\n",
87 | "#imports\n",
88 | "import pandas as pd\n",
89 | "import matplotlib.pyplot as plt\n",
90 | "import numpy as np\n",
91 | "import seaborn as sns\n",
92 | "from tableone import TableOne\n",
93 | "from boruta import BorutaPy\n",
94 | "from sklearn.model_selection import train_test_split\n",
95 | "from sklearn.linear_model import LinearRegression\n",
96 | "from sklearn.tree import DecisionTreeClassifier\n",
97 | "from sklearn.feature_selection import RFECV\n",
98 | "from sklearn.ensemble import RandomForestClassifier\n",
99 | "\n",
100 | "#magic\n",
101 | "%matplotlib inline"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "id": "35f925e6",
107 | "metadata": {},
108 | "source": [
109 | "## What question am I answering?\n",
110 | "Well, we want to demonstrate the utility of feature selection. I think a convincing approach would be to compare predictive power in a model with and without feature selection. So, for every parsimonious model we train let's compare its performance with that of its couterpart prodigious model (i.e. model that uses all the features). Let's get started."
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "id": "790bd5b1-61c0-435a-b1ae-3047d3519bae",
116 | "metadata": {},
117 | "source": [
118 | "## Loading the data\n",
119 | "As always we should have a look at how the features are distributed grouped by the labels. For this we'll generate a table 1."
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "id": "71bc3faa",
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "# download the data as a pandas dataframe\n",
130 | "df = pd.read_csv(\"data/train.csv\")\n",
131 | "df_test = pd.read_csv(\"data/test.csv\")\n",
132 | "\n",
133 | "# Generate table 1\n",
134 | "TableOne(df, groupby=df.columns[-1],\n",
135 | " pval=True,\n",
136 | " dip_test=True,\n",
137 | " normal_test=True,\n",
138 | " tukey_test=True)"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "id": "bd5ed772",
144 | "metadata": {},
145 | "source": [
146 | "## Comparing Models\n",
147 | "Let's define a function that will calculate the prodigious and parsimonious model performance."
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "id": "3bcf4875",
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "#define function that compares selected features to full model\n",
158 | "def compare_models(dataset, selfeat):\n",
159 | " \"\"\"compare parsimonious and full linear model\"\"\"\n",
160 | " \n",
161 | " # get predictors and labels\n",
162 | " X = dataset.drop('price_range',axis=1) #independent columns\n",
163 | " y = dataset['price_range'] #target column i.e price range\n",
164 | "\n",
165 | " #get selected feature indecies\n",
166 | " isel = [X.columns.get_loc(feat) for feat in selfeat if feat in X]\n",
167 | " \n",
168 | " #70-30 split\n",
169 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=seed)\n",
170 | " \n",
171 | "\n",
172 | " #define the prodigious and parsimonious logistic models\n",
173 | " prodmodel = LinearRegression()\n",
174 | " parsmodel = LinearRegression()\n",
175 | "\n",
176 | " #Fit the models\n",
177 | " prodmodel.fit(X_train, y_train)\n",
178 | " parsmodel.fit(X_train[selfeat], y_train) \n",
179 | "\n",
180 | " #Report errors\n",
181 | " display('Prodigious Model Score: %.2f' %prodmodel.score(X_test, y_test))\n",
182 | " display('Parsimonious Model Score: %.2f' %parsmodel.score(X_test[selfeat], y_test))\n",
183 | "\n",
184 | " return"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "id": "e667a1c1",
190 | "metadata": {},
191 | "source": [
192 | "## Filter Method\n",
193 | "The Table 1 conveniently has calculated the association of each feature with the outcome. Let's select only those features that are significatly (p<.05) associated. "
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "id": "afad8328",
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "selfeat = ['battery_power', 'int_memory', 'mobile_wt', 'px_height', 'px_width', 'ram', 'sc_h']\n",
204 | "compare_models(df, selfeat)"
205 | ]
206 | },
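For a more automated filter method, scikit-learn's univariate ANOVA F-test can score each feature against the label; this is only a sketch of an alternative to reading p-values off the Table 1, not the selection used above:

```python
from sklearn.feature_selection import SelectKBest, f_classif

# Score each feature against price_range with a univariate ANOVA F-test
# and keep the 7 highest-scoring ones (7 chosen to match the filter above)
X = df.drop('price_range', axis=1)
y = df['price_range']
selector = SelectKBest(score_func=f_classif, k=7).fit(X, y)
selfeat_univariate = X.columns[selector.get_support()].tolist()
print(selfeat_univariate)
```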
207 | {
208 | "cell_type": "markdown",
209 | "id": "5d02c583",
210 | "metadata": {},
211 | "source": [
212 | "By keeping only 7 features the parsimonious model has the same score as the full model that uses all 20 features. "
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "id": "b455bccb",
218 | "metadata": {},
219 | "source": [
220 | "## Usupervised Methods\n",
221 | "**Remove highly correlated features** To remove the correlated features, we can make use of the corr() method of the pandas dataframe. The corr() method returns a correlation matrix containing correlation between all the columns of the dataframe. A useful way to visualize the correlations is with a heatmap. We'll use the seaborn library for this."
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "id": "75ae557b",
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "#Create a correlation matrix for the columns in the dataset\n",
232 | "correlation_matrix = df.corr()\n",
233 | "\n",
234 | "#plot heat map\n",
235 | "plt.figure(figsize=(20,20))\n",
236 | "g=sns.heatmap(correlation_matrix, annot=True, cmap=\"RdYlGn\")"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "id": "00bb8a65",
242 | "metadata": {},
243 | "source": [
244 | "We can loop through all the columns in the correlation_matrix and keep track of the features with a correlation value > 0.5. This 0.5 cut-off is quite strict and chosen for demonstration purposes. A more reasonable value is 80-90%. "
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "id": "534fa397",
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "#init an empty set that will contain the names of the correlated features\n",
255 | "correlated_features = set()\n",
256 | "\n",
257 | "#loop over lower triangle of pairs of features\n",
258 | "# do not consider the last feature which is the label \n",
259 | "for i in range(len(correlation_matrix .columns) - 1):\n",
260 | " for j in range(i):\n",
261 | " if abs(correlation_matrix.iloc[i, j]) > 0.5:\n",
262 | " #accumulate the names of the second correlated feature\n",
263 | " colname = correlation_matrix.columns[j]\n",
264 | " correlated_features.add(colname)"
265 | ]
266 | },
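For larger feature sets the same pairwise screen can be done without the explicit double loop; a sketch of a vectorized equivalent that, like the loop above, flags the earlier-indexed member of each highly correlated pair:

```python
# Drop the label row/column, take absolute correlations, and keep only the
# strictly upper triangle (each pair appears once, with the earlier feature as the row)
corr_abs = correlation_matrix.iloc[:-1, :-1].abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape, dtype=bool), k=1))

# A feature is flagged if it correlates > 0.5 with any later-indexed feature
correlated_features_alt = {feat for feat in upper.index if (upper.loc[feat] > 0.5).any()}
display(correlated_features_alt)
```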
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "id": "7e7c084a",
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "#display the correlated features\n",
275 | "display(correlated_features)"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "id": "603a2287",
281 | "metadata": {},
282 | "source": [
283 | "These features are correlated to at least one other feature and can be considered redundant. Let's not include them in our parsimonious set and see how it effects model performance."
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "id": "355abac6",
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "#add label to the correlated features which we will drop\n",
294 | "correlated_features.add('price_range')\n",
295 | "selfeat = df.columns.drop(correlated_features)\n",
296 | "compare_models(df, selfeat)"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "id": "0833f61f",
302 | "metadata": {},
303 | "source": [
304 | "In this case the parsimonious model scores (goodness of fit) lower than the full model."
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "id": "abc5abb7",
310 | "metadata": {},
311 | "source": [
312 | "## Wrapper Methods\n",
313 | "**Recursive feature elimination (RFE)** is a stepwise feature selection process implemented in sklearn. Recall, the model used for feature selection does not have to be the same as the predictive model. Here we will use a tree based model for RFE."
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "id": "d0e8d4e3",
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "# get predictors and labels\n",
324 | "X = df.drop('price_range', axis=1) \n",
325 | "y = df['price_range']\n",
326 | "\n",
327 | "# use tree based model for RFE\n",
328 | "rfe = RFECV(estimator=DecisionTreeClassifier())\n",
329 | "\n",
330 | "# fit RFE\n",
331 | "rfe.fit(X, y)\n",
332 | "\n",
333 | "# summarize all features\n",
334 | "for i in range(X.shape[1]):\n",
335 | " display('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "id": "41b7164f",
341 | "metadata": {},
342 | "source": [
343 | "We can see which features were selected by thier column index. They correspond to features 'battery_power', 'px_height', 'px_width', and 'ram' . Let's compare the parsimonious linear model with the full model."
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "id": "d7b2f0cf",
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "#get the column indecies\n",
354 | "selcol = [0, 11, 12, 13]\n",
355 | "#get the column names\n",
356 | "selfeat = df.columns[selcol]\n",
357 | "#compare models\n",
358 | "compare_models(df, selfeat)"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "id": "709ec102",
364 | "metadata": {},
365 | "source": [
366 | "**Boruta** is another wrapper method I like to use. It can be faster than RFE as the number of features increases and stands on a more solid statistical footing. To improve RFE statistics one could employ a repeated k-fold cross vaildation scheme but that would increase the computation time even more."
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "id": "3ebf7e6f",
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "# get predictors and labels\n",
377 | "X = np.array(df.drop('price_range', axis=1)) \n",
378 | "y = np.array(df['price_range'])\n",
379 | "\n",
380 | "# define random forest classifier for boruta\n",
381 | "forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)\n",
382 | "forest.fit(X, y)\n",
383 | "\n",
384 | "# define Boruta feature selection method\n",
385 | "feat_selector = BorutaPy(forest, n_estimators='auto', verbose=0, random_state=seed)\n",
386 | "\n",
387 | "# find all relevant features\n",
388 | "feat_selector.fit(X, y)\n",
389 | "\n",
390 | "# zip my names, ranks, and decisions in a single iterable\n",
391 | "feature_ranks = list(zip(df.columns.drop('price_range'), \n",
392 | " feat_selector.ranking_, \n",
393 | " feat_selector.support_))\n",
394 | "\n",
395 | "# iterate through and print out the results\n",
396 | "for feat in feature_ranks:\n",
397 | " display('Feature: {:<25} Rank: {}, Keep: {}'.format(feat[0], feat[1], feat[2]))\n"
398 | ]
399 | },
400 | {
401 | "cell_type": "markdown",
402 | "id": "b5249a03",
403 | "metadata": {},
404 | "source": [
405 | "Looks like bortua selected battery_power, px_height, px_width, and ram. These are the same features selected by RFE so we'll move on."
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "id": "246afc55",
411 | "metadata": {},
412 | "source": [
413 | "## Embedded methods\n",
414 | "**LASSO**"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "id": "1de19942",
421 | "metadata": {},
422 | "outputs": [],
423 | "source": [
424 | "from sklearn.linear_model import LassoCV\n",
425 | "\n",
426 | "# get predictors and labels\n",
427 | "X = np.array(df.drop('price_range', axis=1)) \n",
428 | "y = np.array(df['price_range'])\n",
429 | "\n",
430 | "#train lasso model with 5-fold cross validataion\n",
431 | "lasso = LassoCV(cv=5, random_state=0).fit(X, y)\n",
432 | "\n",
433 | "#display the model score\n",
434 | "lasso.score(X, y)\n",
435 | "\n",
436 | "#plot feature importance based on coeficients\n",
437 | "importance = np.abs(lasso.coef_)\n",
438 | "feature_names = np.array(df.columns.drop('price_range'))\n",
439 | "plt.bar(height=importance, x=feature_names)\n",
440 | "plt.xticks(rotation=90)\n",
441 | "plt.title(\"Feature importances via coefficients\")\n",
442 | "plt.show()"
443 | ]
444 | },
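The LASSO-selected features can likewise be pulled straight from the coefficients computed above; a sketch that keeps any feature whose coefficient was not shrunk to (effectively) zero and re-runs the model comparison:

```python
# Keep features with a non-negligible absolute coefficient
selfeat = list(feature_names[importance > 1e-6])
display(selfeat)
compare_models(df, selfeat)
```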
445 | {
446 | "cell_type": "markdown",
447 | "id": "2556afc4",
448 | "metadata": {},
449 | "source": [
450 | "Again we see battery power, px_height, px_width, and ram are the most important features that influence price.\n",
451 | "\n",
452 | "## Conclusions\n",
453 | "I hope I have given you a fair overview of different feature selection schemes. Notice, I have not used the testing set to validate any relationships we have found. The next step would be to aggregate the information you have gained from the various feature selection schemes and use them to decide which features to include in your final model. Also, Notice there were some warnings raised by the table 1 when we first loaded the data. Addressing these errors could improve your final model's performance; remember garbage in garbage out. I'll leave that as an excercise to you."
454 | ]
455 | }
456 | ],
457 | "metadata": {
458 | "kernelspec": {
459 | "display_name": "Python 3 (ipykernel)",
460 | "language": "python",
461 | "name": "python3"
462 | },
463 | "language_info": {
464 | "codemirror_mode": {
465 | "name": "ipython",
466 | "version": 3
467 | },
468 | "file_extension": ".py",
469 | "mimetype": "text/x-python",
470 | "name": "python",
471 | "nbconvert_exporter": "python",
472 | "pygments_lexer": "ipython3",
473 | "version": "3.9.4"
474 | }
475 | },
476 | "nbformat": 4,
477 | "nbformat_minor": 5
478 | }
479 |
--------------------------------------------------------------------------------
/Module_6/Module_6.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_6/Module_6.pptx
--------------------------------------------------------------------------------
/Module_6/Module_6_Homework.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1ace58e3-d3d7-4743-a14c-21a4f6e2aa21",
6 | "metadata": {},
7 | "source": [
8 | "# Module 6 - Feature Selection - Homework"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "2f3c5bdf-2ad0-4cd2-89e5-c9b4a01b141b",
14 | "metadata": {},
15 | "source": [
16 | "For Homework I would like you to conduct your own feature selection proceedure on the PIMA native american dataset distributed with this module."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "9c9c0e58-1d2a-4bf7-892a-dbc2f216828f",
22 | "metadata": {},
23 | "source": [
24 | " ## About the PIMA dataset \n",
25 | "+ Number of Instances: 768\n",
26 | "+ Number of Attributes: 8 plus class \n",
27 | "+ For Each Attribute: (all numeric-valued)\n",
28 | " 1. Number of times pregnant\n",
29 | " 2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test\n",
30 | " 3. Diastolic blood pressure (mm Hg)\n",
31 | " 4. Triceps skin fold thickness (mm)\n",
32 | " 5. 2-Hour serum insulin (mu U/ml)\n",
33 | " 6. Body mass index (weight in kg/(height in m)^2)\n",
34 | " 7. Diabetes pedigree function\n",
35 | " 8. Age (years)\n",
36 | " 9. Class variable (0 or 1)\n",
37 | "\n",
38 | "+ Missing Attribute Values: Yes\n",
39 | "\n",
40 | "+ Class Distribution: (class value 1 is interpreted as \"tested positive for diabetes\")\n",
41 | "\n",
42 | "+ The datafile does not contain any column names you will have to generate them your self!\n",
43 | "\n",
44 | "This is a binary classification problem. To complete this homework you will need to load and tidy the data. Notice there are missing data that need to be addressed. Use the table 1 to help reveal any issues with the data distributions. The data is also not partitioned. You will have to conduct a 70:30 split before proceeding with feature selection. I would like you to compare filter method, Boruta, and LASSO feature selection and validate your results in a final linear model using your reserved testing set."
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "id": "7103cd06",
50 | "metadata": {},
51 | "source": [
52 | "## Setup\n",
53 | "Let's get all the requirements sorted before we move on to the excercise. Most packages should be familiar at this point. Numpy, pandas, matplotlib, and seaborn where all introduced in Part I of the workshop in modules 1-3 and last week in module 5 we introduced tableone. Notice, today we will be using sklearn for the first time to do some machine learning. Don't worry too much about the models we'll be using or how to train them for now. This will the the topic for modules 7 & 8. "
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "id": "4a4fec48",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Requirements\n",
64 | "!pip install --upgrade ipykernel\n",
65 | "!pip install pandas\n",
66 | "!pip install numpy\n",
67 | "!pip install tableone\n",
68 | "!pip install matplotlib\n",
69 | "!pip install sklearn\n",
70 | "!pip install boruta\n",
71 | "\n",
72 | "# Globals\n",
73 | "seed = 1017\n",
74 | "\n",
75 | "#imports\n",
76 | "import pandas as pd\n",
77 | "import matplotlib.pyplot as plt\n",
78 | "import numpy as np\n",
79 | "from tableone import TableOne\n",
80 | "from boruta import BorutaPy\n",
81 | "from sklearn.model_selection import train_test_split\n",
82 | "from sklearn.linear_model import LinearRegression\n",
83 | "from sklearn.tree import DecisionTreeClassifier\n",
84 | "from sklearn.feature_selection import RFECV\n",
85 | "from sklearn.ensemble import RandomForestClassifier\n",
86 | "\n",
87 | "#magic\n",
88 | "%matplotlib inline"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "id": "790bd5b1-61c0-435a-b1ae-3047d3519bae",
94 | "metadata": {},
95 | "source": [
96 | "## Loading the data\n",
97 | "Use table 1 to look at how the features are distributed grouped by the outcome. I have used the `` notation to indcate where you have to fill."
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "id": "71bc3faa",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# download the data as a pandas dataframe \n",
108 | "# Note, the datafile has no column names\n",
109 | "df = pd.read_csv()\n",
110 | "\n",
111 | "# Generate table 1 - group by the outcome index\n",
112 | "TableOne(df, groupby=df.columns[],\n",
113 | " pval=True,\n",
114 | " dip_test=True,\n",
115 | " normal_test=True,\n",
116 | " tukey_test=True)"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "id": "fb3d5a51",
122 | "metadata": {},
123 | "source": [
124 | "Let's address the 2 warnings raised by the table 1 and see if we have to reformat some of the features.\n",
125 | "\n",
126 | "### Addressing the warnings\n",
127 | "Let's have a look at the disributions for those features that appeared in the warnings."
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "id": "ae75a9a2",
134 | "metadata": {
135 | "scrolled": false
136 | },
137 | "outputs": [],
138 | "source": [
139 | "#plot the feature distributions\n",
140 | "for feat in df.columns: \n",
141 | " df[[feat]].dropna().plot.kde(bw_method='scott') #use bw_method=.02 for a lower bandwidth gaussian representation\n",
142 | " plt.legend([feat])\n",
143 | " plt.show()"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "id": "9151012a",
149 | "metadata": {},
150 | "source": [
151 | "### Tasks:\n",
152 | "1. Impute missing values with the feature mean.\n",
153 | "2. Tuck in any features with long tails by log2 transform?\n",
154 | "3. Partition your data into 70% training and 30% testing"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "id": "7f634a16",
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "#Impute any missing values with their column median\n",
165 | "df.fillna(, inplace=True)"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "id": "27f5fac3",
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "#log2 transform - you will need to identify any features with long tails\n",
176 | "df[cols] = np.log2()"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "id": "09bd82ba",
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "#70-30 partition\n",
187 | "df_test = \n",
188 | "df_train = "
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "id": "0bb5a09a",
194 | "metadata": {},
195 | "source": [
196 | "## Comparing Models\n",
197 | "Let's define a function that will calculate the prodigious and parsimonious model performance."
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "id": "bee1a3a6",
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "#define function that compares selected features to full model\n",
208 | "def compare_models(dataset, selfeat):\n",
209 | " \"\"\"compare parsimonious and full linear model\"\"\"\n",
210 | " \n",
211 | " # get predictors and labels\n",
212 | " X = dataset.drop(,axis=1) #independent columns\n",
213 | " y = dataset[] #outcome\n",
214 | "\n",
215 | " #get selected feature indecies\n",
216 | " isel = [X.columns.get_loc(feat) for feat in selfeat if feat in X]\n",
217 | " \n",
218 | " #70-30 split\n",
219 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=seed)\n",
220 | " \n",
221 | "\n",
222 | " #define the prodigious and parsimonious logistic models\n",
223 | " prodmodel = linear_model.LinearRegression()\n",
224 | " parsmodel = linear_model.LinearRegression()\n",
225 | "\n",
226 | " #Fit the models\n",
227 | " prodmodel.fit(X_train, y_train)\n",
228 | " parsmodel.fit(X_train[selfeat], y_train) \n",
229 | "\n",
230 | " #Report errors\n",
231 | " display('Prodigious Model Score: %.2f' %prodmodel.score(X_test, y_test))\n",
232 | " display('Parsimonious Model Score: %.2f' %parsmodel.score(X_test[selfeat], y_test))\n",
233 | "\n",
234 | " return"
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "id": "05d92207",
240 | "metadata": {},
241 | "source": [
242 | "## Filter Method\n",
243 | "The Table 1 conveniently has calculated the association of each feature with the outcome. Let's select only those features that are significatly (p<.05) associated. "
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "id": "a21e6e1e",
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "selfeat = []\n",
254 | "compare_models(df_train, selfeat)"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "id": "ea3da975",
260 | "metadata": {},
261 | "source": [
262 | "## Boruta"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "id": "3f711f9c",
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "# get predictors and labels\n",
273 | "X = np.array(df.drop(, axis=1)) \n",
274 | "y = np.array(df[])\n",
275 | "\n",
276 | "# define random forest classifier for boruta\n",
277 | "forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)\n",
278 | "forest.fit(X, y)\n",
279 | "\n",
280 | "# define Boruta feature selection method\n",
281 | "feat_selector = \n",
282 | "\n",
283 | "# find all relevant features\n",
284 | "feat_selector.fit(X, y)\n",
285 | "\n",
286 | "# zip my names, ranks, and decisions in a single iterable\n",
287 | "feature_ranks = list(zip(df.columns.drop(), \n",
288 | " feat_selector.ranking_, \n",
289 | " feat_selector.support_))\n",
290 | "\n",
291 | "# iterate through and print out the results\n",
292 | "for feat in feature_ranks:\n",
293 | " display('Feature: {:<25} Rank: {}, Keep: {}'.format(feat[0], feat[1], feat[2]))\n"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "id": "f3269faa",
299 | "metadata": {},
300 | "source": [
301 | "## LASSO"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "id": "6064a8c2",
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "from sklearn.linear_model import LassoCV\n",
312 | "\n",
313 | "# get predictors and labels\n",
314 | "X = np.array(df.drop(, axis=1)) \n",
315 | "y = np.array(df[])\n",
316 | "\n",
317 | "#train lasso model with 5-fold cross validataion\n",
318 | "lasso = \n",
319 | "\n",
320 | "#display the model score\n",
321 | "lasso.score(X, y)\n",
322 | "\n",
323 | "#plot feature importance based on coeficients\n",
324 | "importance = np.abs(lasso.coef_)\n",
325 | "feature_names = np.array(df.columns.drop())\n",
326 | "plt.bar(height=importance, x=feature_names)\n",
327 | "plt.xticks(rotation=90)\n",
328 | "plt.title(\"Feature importances via coefficients\")\n",
329 | "plt.show()"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "id": "78d990aa",
335 | "metadata": {},
336 | "source": [
337 | "## Report\n",
338 | "Create a final logistic regression model with your selected features and compute the accuracy to predict outcomes in the reserved testing set. "
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "id": "b5159c0a",
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "#train a logistic regression model and report accuracy\n",
349 | ""
350 | ]
351 | }
352 | ],
353 | "metadata": {
354 | "kernelspec": {
355 | "display_name": "Python 3 (ipykernel)",
356 | "language": "python",
357 | "name": "python3"
358 | },
359 | "language_info": {
360 | "codemirror_mode": {
361 | "name": "ipython",
362 | "version": 3
363 | },
364 | "file_extension": ".py",
365 | "mimetype": "text/x-python",
366 | "name": "python",
367 | "nbconvert_exporter": "python",
368 | "pygments_lexer": "ipython3",
369 | "version": "3.9.4"
370 | }
371 | },
372 | "nbformat": 4,
373 | "nbformat_minor": 5
374 | }
375 |
--------------------------------------------------------------------------------
/Module_6/data/pima_description.txt:
--------------------------------------------------------------------------------
1 | 1. Title: Pima Indians Diabetes Database
2 |
3 | 2. Sources:
4 | (a) Original owners: National Institute of Diabetes and Digestive and
5 | Kidney Diseases
6 | (b) Donor of database: Vincent Sigillito (vgs@aplcen.apl.jhu.edu)
7 | Research Center, RMI Group Leader
8 | Applied Physics Laboratory
9 | The Johns Hopkins University
10 | Johns Hopkins Road
11 | Laurel, MD 20707
12 | (301) 953-6231
13 | (c) Date received: 9 May 1990
14 |
15 | 3. Past Usage:
16 | 1. Smith,~J.~W., Everhart,~J.~E., Dickson,~W.~C., Knowler,~W.~C., \&
17 | Johannes,~R.~S. (1988). Using the ADAP learning algorithm to forecast
18 | the onset of diabetes mellitus. In {\it Proceedings of the Symposium
19 | on Computer Applications and Medical Care} (pp. 261--265). IEEE
20 | Computer Society Press.
21 |
22 | The diagnostic, binary-valued variable investigated is whether the
23 | patient shows signs of diabetes according to World Health Organization
24 | criteria (i.e., if the 2 hour post-load plasma glucose was at least
25 | 200 mg/dl at any survey examination or if found during routine medical
26 | care). The population lives near Phoenix, Arizona, USA.
27 |
28 | Results: Their ADAP algorithm makes a real-valued prediction between
29 | 0 and 1. This was transformed into a binary decision using a cutoff of
30 | 0.448. Using 576 training instances, the sensitivity and specificity
31 | of their algorithm was 76% on the remaining 192 instances.
32 |
33 | 4. Relevant Information:
34 | Several constraints were placed on the selection of these instances from
35 | a larger database. In particular, all patients here are females at
36 | least 21 years old of Pima Indian heritage. ADAP is an adaptive learning
37 | routine that generates and executes digital analogs of perceptron-like
38 | devices. It is a unique algorithm; see the paper for details.
39 |
40 | 5. Number of Instances: 768
41 |
42 | 6. Number of Attributes: 8 plus class
43 |
44 | 7. For Each Attribute: (all numeric-valued)
45 | 1. Number of times pregnant
46 | 2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
47 | 3. Diastolic blood pressure (mm Hg)
48 | 4. Triceps skin fold thickness (mm)
49 | 5. 2-Hour serum insulin (mu U/ml)
50 | 6. Body mass index (weight in kg/(height in m)^2)
51 | 7. Diabetes pedigree function
52 | 8. Age (years)
53 | 9. Class variable (0 or 1)
54 |
55 | 8. Missing Attribute Values: Yes
56 |
57 | 9. Class Distribution: (class value 1 is interpreted as "tested positive for
58 | diabetes")
59 |
60 | Class Value Number of instances
61 | 0 500
62 | 1 268
63 |
64 | 10. Brief statistical analysis:
65 |
66 | Attribute number: Mean: Standard Deviation:
67 | 1. 3.8 3.4
68 | 2. 120.9 32.0
69 | 3. 69.1 19.4
70 | 4. 20.5 16.0
71 | 5. 79.8 115.2
72 | 6. 32.0 7.9
73 | 7. 0.5 0.3
74 | 8. 33.2 11.8
--------------------------------------------------------------------------------
/Module_7/Module_7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1ace58e3-d3d7-4743-a14c-21a4f6e2aa21",
6 | "metadata": {},
7 | "source": [
8 | "# Module 7 - Classification"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "2f3c5bdf-2ad0-4cd2-89e5-c9b4a01b141b",
14 | "metadata": {},
15 | "source": [
16 | "Today we will adapt the Module 6 Homework on the PIMA native american dataset to better undersant the classification concepts introduced today."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "9c9c0e58-1d2a-4bf7-892a-dbc2f216828f",
22 | "metadata": {},
23 | "source": [
24 | " ## About the PIMA dataset \n",
25 | "+ Number of Instances: 768\n",
26 | "+ Number of Attributes: 8 plus class \n",
27 | "+ For Each Attribute: (all numeric-valued)\n",
28 | " 1. Number of times pregnant\n",
29 | " 2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test\n",
30 | " 3. Diastolic blood pressure (mm Hg)\n",
31 | " 4. Triceps skin fold thickness (mm)\n",
32 | " 5. 2-Hour serum insulin (mu U/ml)\n",
33 | " 6. Body mass index (weight in kg/(height in m)^2)\n",
34 | " 7. Diabetes pedigree function\n",
35 | " 8. Age (years)\n",
36 | " 9. Class variable (0 or 1)\n",
37 | "\n",
38 | "+ Missing Attribute Values: Yes\n",
39 | "\n",
40 | "+ Class Distribution: (class value 1 is interpreted as \"tested positive for diabetes\")\n",
41 | "\n",
42 | "+ The datafile does not contain any column names you will have to generate them your self!\n",
43 | "\n",
44 | "This is a binary classification problem. We need to load and tidy the data. Notice there are missing data that need to be addressed. Use the table 1 to help reveal any issues with the data distributions. The data is also not partitioned. You will have to conduct a 70:30 split before proceeding. I would like you to compare filter method, Boruta, and LASSO feature selection and compare your results in a linear and random forest model using the reserved testing set."
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "id": "7103cd06",
50 | "metadata": {},
51 | "source": [
52 | "## Setup\n",
53 | "Let's get all the requirements sorted before we move on to the excercise. Most packages should be familiar at this point. Numpy, pandas, matplotlib, and seaborn where all introduced in Part I of the workshop in modules 1-3 and last week in module 5 we introduced tableone. Notice, today we will be using sklearn for the first time to do some machine learning. Don't worry too much about the models we'll be using or how to train them for now. This will the the topic for modules 7 & 8. "
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "id": "4a4fec48",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Requirements\n",
64 | "!pip install --upgrade ipykernel\n",
65 | "!pip install pandas\n",
66 | "!pip install numpy\n",
67 | "!pip install tableone\n",
68 | "!pip install matplotlib\n",
69 | "!pip install sklearn\n",
70 | "!pip install boruta\n",
71 | "\n",
72 | "# Globals\n",
73 | "seed = 1017\n",
74 | "\n",
75 | "#imports\n",
76 | "import pandas as pd\n",
77 | "import matplotlib.pyplot as plt\n",
78 | "import numpy as np\n",
79 | "from tableone import TableOne\n",
80 | "from boruta import BorutaPy\n",
81 | "from sklearn.model_selection import train_test_split\n",
82 | "from sklearn.linear_model import LinearRegression\n",
83 | "from sklearn.tree import DecisionTreeClassifier\n",
84 | "from sklearn.feature_selection import RFECV\n",
85 | "from sklearn.ensemble import RandomForestClassifier\n",
86 | "\n",
87 | "#magic\n",
88 | "%matplotlib inline"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "id": "790bd5b1-61c0-435a-b1ae-3047d3519bae",
94 | "metadata": {},
95 | "source": [
96 | "## Loading the data\n",
97 | "Use table 1 to look at how the features are distributed grouped by the outcome."
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "id": "71bc3faa",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# download the data as a pandas dataframe \n",
108 | "# Note, the datafile has no column names\n",
109 | "df = pd.read_csv(\"~/Documents/GitHub/Biomedical-ML-with-Python/Module_7/data/pima.csv\")\n",
110 | "#assign column names\n",
111 | "df.columns = ['npreg','blood_glucose', 'DBP', 'skin_thickness', 'blood_insulin', 'BMI', 'family_history', 'age', 'db']\n",
112 | "df.index = df.index + 1\n",
113 | "\n",
114 | "# Generate table 1 - group by the outcome index\n",
115 | "TableOne(df, groupby=df.columns[-1],\n",
116 | " pval=True,\n",
117 | " dip_test=True,\n",
118 | " normal_test=True,\n",
119 | " tukey_test=True)"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "id": "fb3d5a51",
125 | "metadata": {},
126 | "source": [
127 | "Let's address the warnings raised by the table 1 and see if we have to reformat some of the features.\n",
128 | "\n",
129 | "### Addressing the warnings\n",
130 | "Let's have a look at the disributions for those features that appeared in the warnings."
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "id": "ae75a9a2",
137 | "metadata": {
138 | "scrolled": false
139 | },
140 | "outputs": [],
141 | "source": [
142 | "#plot the feature distributions\n",
143 | "for feat in df.columns: \n",
144 | " df[[feat]].dropna().plot.kde(bw_method='scott') #use bw_method=.02 for a lower bandwidth gaussian representation\n",
145 | " plt.legend([feat])\n",
146 | " plt.show()"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "id": "9151012a",
152 | "metadata": {},
153 | "source": [
154 | "### Tasks:\n",
155 | "1. Impute missing values with the feature mean.\n",
156 | "2. Tuck in any features with long tails by log2 transform?\n",
157 | "3. Partition your data into 70% training and 30% testing"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "7f634a16",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "#Impute any missing values with their column median\n",
168 | "df.fillna(value=df.median(axis=1, skipna=True), inplace=True)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "27f5fac3",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "#log2 transform - you will need to identify any features with long tails\n",
179 | "cols = ['age']\n",
180 | "df[cols] = np.log(df[cols])"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "id": "09bd82ba",
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "#70-30 partition\n",
191 | "df_test = df.sample(frac=0.3)\n",
192 | "df_train = df.drop(df_test.index)\n",
193 | "display(df_train.shape)\n",
194 | "display(df_test.shape)"
195 | ]
196 | },
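An alternative partition worth knowing: scikit-learn's train_test_split can stratify on the outcome so both splits keep a similar class balance; a sketch using the df and seed defined above (not the split this notebook actually uses):

```python
# Stratified, reproducible 70:30 partition on the diabetes label 'db'
df_train, df_test = train_test_split(df, test_size=0.30, stratify=df['db'], random_state=seed)
display(df_train.shape, df_test.shape)
```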
197 | {
198 | "cell_type": "markdown",
199 | "id": "5b391104",
200 | "metadata": {},
201 | "source": [
202 | "## Comparing Models\n",
203 | "Let's define a function that will calculate the prodigious and parsimonious model performance."
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "id": "bee1a3a6",
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "#define function that compares selected features to full model\n",
214 | "def compare_models(dataset, selfeat):\n",
215 | " \"\"\"compare parsimonious and full linear model\"\"\"\n",
216 | " \n",
217 | " # get predictors and labels\n",
218 | " X = dataset.drop(,axis=1) #independent columns\n",
219 | " y = dataset[] #outcome\n",
220 | "\n",
221 | " #get selected feature indecies\n",
222 | " isel = [X.columns.get_loc(feat) for feat in selfeat if feat in X]\n",
223 | " \n",
224 | " #70-30 split\n",
225 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=seed)\n",
226 | " \n",
227 | "\n",
228 | " #define the prodigious and parsimonious logistic models\n",
229 | " prodmodel = linear_model.LinearRegression()\n",
230 | " parsmodel = linear_model.LinearRegression()\n",
231 | "\n",
232 | " #Fit the models\n",
233 | " prodmodel.fit(X_train, y_train)\n",
234 | " parsmodel.fit(X_train[selfeat], y_train) \n",
235 | "\n",
236 | " #Report errors\n",
237 | " display('Prodigious Model Score: %.2f' %prodmodel.score(X_test, y_test))\n",
238 | " display('Parsimonious Model Score: %.2f' %parsmodel.score(X_test[selfeat], y_test))\n",
239 | "\n",
240 | " return"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "id": "05d92207",
246 | "metadata": {},
247 | "source": [
248 | "## Filter Method\n",
249 | "The Table 1 conveniently has calculated the association of each feature with the outcome. Let's select only those features that are significatly (p<.05) associated. "
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "id": "a21e6e1e",
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "selfeat = []\n",
260 | "compare_models(df_train, selfeat)"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "id": "ea3da975",
266 | "metadata": {},
267 | "source": [
268 | "## Boruta"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "id": "3f711f9c",
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "# get predictors and labels\n",
279 | "X = np.array(df.drop(, axis=1)) \n",
280 | "y = np.array(df[])\n",
281 | "\n",
282 | "# define random forest classifier for boruta\n",
283 | "forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)\n",
284 | "forest.fit(X, y)\n",
285 | "\n",
286 | "# define Boruta feature selection method\n",
287 | "feat_selector = \n",
288 | "\n",
289 | "# find all relevant features\n",
290 | "feat_selector.fit(X, y)\n",
291 | "\n",
292 | "# zip my names, ranks, and decisions in a single iterable\n",
293 | "feature_ranks = list(zip(df.columns.drop(), \n",
294 | " feat_selector.ranking_, \n",
295 | " feat_selector.support_))\n",
296 | "\n",
297 | "# iterate through and print out the results\n",
298 | "for feat in feature_ranks:\n",
299 | " display('Feature: {:<25} Rank: {}, Keep: {}'.format(feat[0], feat[1], feat[2]))\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "id": "f3269faa",
305 | "metadata": {},
306 | "source": [
307 | "## LASSO"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "id": "6064a8c2",
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "from sklearn.linear_model import LassoCV\n",
318 | "\n",
319 | "# get predictors and labels\n",
320 | "X = np.array(df.drop(, axis=1)) \n",
321 | "y = np.array(df[])\n",
322 | "\n",
323 | "#train lasso model with 5-fold cross validataion\n",
324 | "lasso = \n",
325 | "\n",
326 | "#display the model score\n",
327 | "lasso.score(X, y)\n",
328 | "\n",
329 | "#plot feature importance based on coeficients\n",
330 | "importance = np.abs(lasso.coef_)\n",
331 | "feature_names = np.array(df.columns.drop())\n",
332 | "plt.bar(height=importance, x=feature_names)\n",
333 | "plt.xticks(rotation=90)\n",
334 | "plt.title(\"Feature importances via coefficients\")\n",
335 | "plt.show()"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "id": "78d990aa",
341 | "metadata": {},
342 | "source": [
343 | "## Report\n",
344 | "Create a final logistic regression model with your selected features and compute the accuracy to predict outcomes in the reserved testing set. "
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "id": "b5159c0a",
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 | "#train a logistic regression model and report accuracy\n",
355 | ""
356 | ]
357 | }
358 | ],
359 | "metadata": {
360 | "kernelspec": {
361 | "display_name": "Python 3 (ipykernel)",
362 | "language": "python",
363 | "name": "python3"
364 | },
365 | "language_info": {
366 | "codemirror_mode": {
367 | "name": "ipython",
368 | "version": 3
369 | },
370 | "file_extension": ".py",
371 | "mimetype": "text/x-python",
372 | "name": "python",
373 | "nbconvert_exporter": "python",
374 | "pygments_lexer": "ipython3",
375 | "version": "3.9.4"
376 | }
377 | },
378 | "nbformat": 4,
379 | "nbformat_minor": 5
380 | }
381 |
--------------------------------------------------------------------------------
/Module_7/Module_7.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_7/Module_7.pptx
--------------------------------------------------------------------------------
/Module_7/data/pima_description.txt:
--------------------------------------------------------------------------------
1 | 1. Title: Pima Indians Diabetes Database
2 |
3 | 2. Sources:
4 | (a) Original owners: National Institute of Diabetes and Digestive and
5 | Kidney Diseases
6 | (b) Donor of database: Vincent Sigillito (vgs@aplcen.apl.jhu.edu)
7 | Research Center, RMI Group Leader
8 | Applied Physics Laboratory
9 | The Johns Hopkins University
10 | Johns Hopkins Road
11 | Laurel, MD 20707
12 | (301) 953-6231
13 | (c) Date received: 9 May 1990
14 |
15 | 3. Past Usage:
16 | 1. Smith,~J.~W., Everhart,~J.~E., Dickson,~W.~C., Knowler,~W.~C., \&
17 | Johannes,~R.~S. (1988). Using the ADAP learning algorithm to forecast
18 | the onset of diabetes mellitus. In {\it Proceedings of the Symposium
19 | on Computer Applications and Medical Care} (pp. 261--265). IEEE
20 | Computer Society Press.
21 |
22 | The diagnostic, binary-valued variable investigated is whether the
23 | patient shows signs of diabetes according to World Health Organization
24 | criteria (i.e., if the 2 hour post-load plasma glucose was at least
25 | 200 mg/dl at any survey examination or if found during routine medical
26 | care). The population lives near Phoenix, Arizona, USA.
27 |
28 | Results: Their ADAP algorithm makes a real-valued prediction between
29 | 0 and 1. This was transformed into a binary decision using a cutoff of
30 | 0.448. Using 576 training instances, the sensitivity and specificity
31 | of their algorithm was 76% on the remaining 192 instances.
32 |
33 | 4. Relevant Information:
34 | Several constraints were placed on the selection of these instances from
35 | a larger database. In particular, all patients here are females at
36 | least 21 years old of Pima Indian heritage. ADAP is an adaptive learning
37 | routine that generates and executes digital analogs of perceptron-like
38 | devices. It is a unique algorithm; see the paper for details.
39 |
40 | 5. Number of Instances: 768
41 |
42 | 6. Number of Attributes: 8 plus class
43 |
44 | 7. For Each Attribute: (all numeric-valued)
45 | 1. Number of times pregnant
46 | 2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
47 | 3. Diastolic blood pressure (mm Hg)
48 | 4. Triceps skin fold thickness (mm)
49 | 5. 2-Hour serum insulin (mu U/ml)
50 | 6. Body mass index (weight in kg/(height in m)^2)
51 | 7. Diabetes pedigree function
52 | 8. Age (years)
53 | 9. Class variable (0 or 1)
54 |
55 | 8. Missing Attribute Values: Yes
56 |
57 | 9. Class Distribution: (class value 1 is interpreted as "tested positive for
58 | diabetes")
59 |
60 | Class Value Number of instances
61 | 0 500
62 | 1 268
63 |
64 | 10. Brief statistical analysis:
65 |
66 | Attribute number: Mean: Standard Deviation:
67 | 1. 3.8 3.4
68 | 2. 120.9 32.0
69 | 3. 69.1 19.4
70 | 4. 20.5 16.0
71 | 5. 79.8 115.2
72 | 6. 32.0 7.9
73 | 7. 0.5 0.3
74 | 8. 33.2 11.8
--------------------------------------------------------------------------------
/Module_8/Module_8.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1ace58e3-d3d7-4743-a14c-21a4f6e2aa21",
6 | "metadata": {},
7 | "source": [
8 | "# Module 8 - Regression"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "f461dda1",
14 | "metadata": {},
15 | "source": [
16 | "Today's data set consists of time series data measured at 10 min intervals for about 4.5 months. A house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions every 3.3 min. Then, the wireless data was averaged for 10 minutes periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru), and merged together with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and to filter out non predictive attributes (parameters). We will use this data to predict the current energy usage of the appliances in the home based on the indoor and outdoor predictors. Notice, this is not a forecasting excercise. "
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "7103cd06",
22 | "metadata": {},
23 | "source": [
24 | "## Setup\n",
25 | "Let's get all the requirements sorted before we move on to the excercise. Notice, today we will be using the datetime package to deal with timestamps. "
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "id": "4a4fec48",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# Requirements\n",
36 | "!pip install --upgrade ipykernel\n",
37 | "!pip install datetime\n",
38 | "!pip install pandas\n",
39 | "!pip install tableone\n",
40 | "!pip install numpy\n",
41 | "!pip install matplotlib\n",
42 | "!pip install scipy\n",
43 | "!pip install boruta\n",
44 | "!pip install sklearn\n",
45 | "\n",
46 | "# Globals\n",
47 | "seed = 1017\n",
48 | "\n",
49 | "#imports\n",
50 | "import pandas as pd\n",
51 | "from datetime import datetime\n",
52 | "from tableone import TableOne\n",
53 | "import numpy as np\n",
54 | "import matplotlib.pyplot as plt\n",
55 | "from scipy import stats\n",
56 | "from boruta import BorutaPy\n",
57 | "from sklearn.ensemble import RandomForestRegressor\n",
58 | "\n",
59 | "#magic\n",
60 | "%matplotlib inline"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "id": "790bd5b1-61c0-435a-b1ae-3047d3519bae",
66 | "metadata": {},
67 | "source": [
68 | "## Loading the data\n",
69 | "The data for today can be found in the `data` folder distributed along with this notebook."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "id": "08d23b16",
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "# download the data as a pandas dataframe\n",
80 | "df = pd.read_csv(\"data/KAG_energydata_complete.csv\")"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "0301be4c",
86 | "metadata": {},
87 | "source": [
88 | "## Data description\n",
89 | "|column | description |\n",
90 | "|-------|-------------|\n",
91 | "|date | time year-month-day hour:minute:second|\n",
92 | "|Appliances | energy use in Wh |\n",
93 | "|lights| energy use of light fixtures in the house in Wh |\n",
94 | "|T1| Temperature in kitchen area, in Celsius|\n",
95 | "|RH1| Relative humidity in kitchen area, in %|\n",
96 | "|T2| Temperature in living room area, in Celsius |\n",
97 | "|RH2| Relative humidity in living room area, in % |\n",
98 | "|T3| Temperature in laundry room area, in Celsius |\n",
99 | "|RH3| Humidity in laundry room area, in % |\n",
100 | "|T4| Temperature in office room, in Celsius|\n",
101 | "|RH4| Humidity in office room, in %|\n",
102 | "|T5| Temperature in bathroom, in Celsius|\n",
103 | "|RH5| Humidity in bathroom, in % |\n",
104 | "|T6| Temperature outside the building (north side), in Celsius |\n",
105 | "|RH6| Humidity outside the building (north side), in %|\n",
106 | "|T7| Temperature in ironing room, in Celsius|\n",
107 | "|RH7| Humidity in ironing room, in % |\n",
108 | "|T8| Temperature in teenager room 2, in Celsius |\n",
109 | "|RH8| Humidity in teenager room 2, in %|\n",
110 | "|T9| Temperature in parents room, in Celsius|\n",
111 | "|RH9| Humidity in parents room, in % |\n",
112 | "|Tout| Temperature outside (from Chievres weather station), in Celsius|\n",
113 | "|Press_mm_hg| Barometric Pressure at Chievres weather station, in mm Hg |\n",
114 | "|RHout| Humidity outside (from Chievres weather station), in %|\n",
115 | "|Windspeed| Wind speed at Chievres weather station, in m/s|\n",
116 | "|Visibility| Ground visibility at Chievres weather station, in km|\n",
117 | "|Tdewpoint| Condensation Temperature at Chievres weather station, in Celsius|\n",
118 | "|rv1| Random variable 1, nondimensional|\n",
119 | "|rv2| Random variable 2, nondimensional|"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "id": "60583afe",
125 | "metadata": {},
126 | "source": [
127 | "## Formatting\n",
128 | "As is, the date column is acting like a bookeeping index for each observation. Maybe we can get some useful perdictors out of it. Let's engineer a time of day numeric feauture and a weekend/weekday binary variable."
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "id": "db9c49fc",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "#convert strings in date column into datetime objects\n",
139 | "datetimelist = [ datetime.strptime(date, \"%Y-%m-%d %H:%M:%S\") for date in df['date'] ]\n",
140 | "\n",
141 | "#extract time of day in minutes\n",
142 | "df['time'] = [ obs.hour * 24.0 + obs.minute for obs in datetimelist ]\n",
143 | "\n",
144 | "#extract the day of week as integer 0-6 for Monday-Sunday\n",
145 | "df['day'] = [ obs.weekday() for obs in datetimelist ]\n",
146 | "\n",
147 | "#remove the date column\n",
148 | "df.drop('date', axis=1, inplace=True)"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "id": "71bc3faa",
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "#Bin appliance energy usage into above and below the median to comapre features\n",
159 | "df['bin'] = df['Appliances']>np.median(df['Appliances'])\n",
160 | "\n",
161 | "# Generate table 1 - group by the Appliance energy use bins\n",
162 | "tbl1 = TableOne(df, groupby='bin', categorical=['day', 'lights'], \n",
163 | " pval=True,\n",
164 | " dip_test=True,\n",
165 | " normal_test=True,\n",
166 | " tukey_test=True)\n",
167 | "\n",
168 | "#Remove the bin variable we created for diagnostic puroposes\n",
169 | "df.drop('bin', axis=1, inplace=True)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "id": "c506217d",
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "#display the table 1\n",
180 | "display(tbl1)"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "id": "fb3d5a51",
186 | "metadata": {},
187 | "source": [
188 | "### Feature distributions\n",
189 | "Let's plot the feature distributions and see if we can address the warnings raised by the table 1."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "id": "ae75a9a2",
196 | "metadata": {
197 | "scrolled": false
198 | },
199 | "outputs": [],
200 | "source": [
201 | "#plot the feature distributions\n",
202 | "for feat in df.columns: \n",
203 | " df[[feat]].dropna().plot.kde(bw_method='scott') #use bw_method=.02 for a lower bandwidth gaussian representation\n",
204 | " plt.legend([feat])\n",
205 | " plt.show()"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "id": "7f634a16",
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "#Impute any missing values with their column median\n",
216 | "df.fillna(value=df.median(axis=1, skipna=True), inplace=True)"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "id": "27f5fac3",
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "#log2 transform - you will need to identify any features with long tails\n",
227 | "cols = ['Appliances', 'lights']\n",
228 | "df[cols] = np.log(df[cols]+1)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "id": "bfa49661",
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "#mean center and unit scale the standard deviation for the numerical variables\n",
239 | "cols = df.columns[~df.columns.isin(['day','lights'])]\n",
240 | "df[cols] = stats.zscore(df[cols])"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "id": "09bd82ba",
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "#70-30 partition\n",
251 | "df_test = df.sample(frac=0.3)\n",
252 | "df_train = df.drop(df_test.index)\n",
253 | "display(df_train.shape)\n",
254 | "display(df_test.shape)"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "id": "f3269faa",
260 | "metadata": {},
261 | "source": [
262 | "## Linear Regression \n",
263 | "Let's explore linear regression with regularization using the ridge, lasso, and elasticnt regression models."
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "id": "dded7467",
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "from sklearn.linear_model import RidgeCV\n",
274 | "\n",
275 | "# get predictors and labels\n",
276 | "X = np.array(df_train.drop('Appliances', axis=1)) \n",
277 | "y = np.array(df_train['Appliances'])\n",
278 | "\n",
279 | "\n",
280 | "#train train ridge regression model with 10-fold cross validataion\n",
281 | "ridge = RidgeCV(cv=5).fit(X,y)\n",
282 | "\n",
283 | "#plot feature importance based on coeficients\n",
284 | "importance = np.abs(ridge.coef_)\n",
285 | "feature_names = np.array(df.columns.drop('Appliances'))\n",
286 | "plt.bar(height=importance, x=feature_names)\n",
287 | "plt.xticks(rotation=90)\n",
288 | "plt.title(\"Feature importances via coefficients\")\n",
289 | "plt.show()\n",
290 | "\n",
291 | "# Get model score in the testing set\n",
292 | "X_test = np.array(df_test.drop('Appliances', axis=1)) \n",
293 | "y_test = np.array(df_test['Appliances'])\n",
294 | "\n",
295 | "#display the model score\n",
296 | "display(ridge.score(X_test, y_test))"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "id": "6064a8c2",
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "from sklearn.linear_model import LassoCV\n",
307 | "\n",
308 | "# get predictors and labels\n",
309 | "X = np.array(df_train.drop('Appliances', axis=1)) \n",
310 | "y = np.array(df_train['Appliances'])\n",
311 | "\n",
312 | "#train lasso model with 10-fold cross validataion\n",
313 | "lasso = LassoCV(max_iter=1000, tol=0.0001, cv=5,\n",
314 | " verbose=0, n_jobs=-1, random_state=seed, selection='random').fit(X,y)\n",
315 | "\n",
316 | "#display the model score\n",
317 | "display(lasso.score(X, y))\n",
318 | "\n",
319 | "#plot feature importance based on coeficients\n",
320 | "importance = np.abs(lasso.coef_)\n",
321 | "feature_names = np.array(df.columns.drop('Appliances'))\n",
322 | "plt.bar(height=importance, x=feature_names)\n",
323 | "plt.xticks(rotation=90)\n",
324 | "plt.title(\"Feature importances via coefficients\")\n",
325 | "plt.show()\n",
326 | "\n",
327 | "# Get model score in the testing set\n",
328 | "X_test = np.array(df_test.drop('Appliances', axis=1)) \n",
329 | "y_test = np.array(df_test['Appliances'])\n",
330 | "\n",
331 | "#display the model score\n",
332 | "display(lasso.score(X_test, y_test))\n"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "id": "ba9f2747",
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "from sklearn.linear_model import ElasticNetCV\n",
343 | "\n",
344 | "# get predictors and labels\n",
345 | "X = np.array(df_train.drop('Appliances', axis=1)) \n",
346 | "y = np.array(df_train['Appliances'])\n",
347 | "\n",
348 | "#train lasso model with 10-fold cross validataion\n",
349 | "elastic = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=1000, tol=0.0001, cv=5,\n",
350 | " verbose=0, n_jobs=-1, random_state=seed, selection='random').fit(X,y)\n",
351 | "\n",
352 | "#display the model score\n",
353 | "display(elastic.score(X, y))\n",
354 | "\n",
355 | "#plot feature importance based on coeficients\n",
356 | "importance = np.abs(elastic.coef_)\n",
357 | "feature_names = np.array(df.columns.drop('Appliances'))\n",
358 | "plt.bar(height=importance, x=feature_names)\n",
359 | "plt.xticks(rotation=90)\n",
360 | "plt.title(\"Feature importances via coefficients\")\n",
361 | "plt.show()\n",
362 | "\n",
363 | "# Get model score in the testing set\n",
364 | "X_test = np.array(df_test.drop('Appliances', axis=1)) \n",
365 | "y_test = np.array(df_test['Appliances'])\n",
366 | "\n",
367 | "#display the model score\n",
368 | "display(elastic.score(X_test, y_test))"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "id": "334d5cb7",
374 | "metadata": {},
375 | "source": [
376 | "## Non-Linear Regression\n",
377 | "Let's explore non-linear regression with a multi layer perceptron model"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "id": "320a30bd",
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "from sklearn.neural_network import MLPRegressor\n",
388 | "\n",
389 | "# get predictors and labels\n",
390 | "X = np.array(df_train.drop('Appliances', axis=1)) \n",
391 | "y = np.array(df_train['Appliances'])\n",
392 | "\n",
393 | "#train AdaBoost model \n",
394 | "mlp = MLPRegressor(hidden_layer_sizes=(10,10), activation='relu',\n",
395 | " solver='adam', alpha=0.0001, batch_size='auto',\n",
396 | " learning_rate='constant', learning_rate_init=0.001,\n",
397 | " max_iter=200, shuffle=True, random_state=seed, tol=0.0001,\n",
398 | " verbose=False, warm_start=False, \n",
399 | " early_stopping=True, validation_fraction=0.1,\n",
400 | " beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10).fit(X,y)\n",
401 | "\n",
402 | "# Get model score in the testing set\n",
403 | "X_test = np.array(df_test.drop('Appliances', axis=1)) \n",
404 | "y_test = np.array(df_test['Appliances'])\n",
405 | "\n",
406 | "#display the model score\n",
407 | "display(mlp.score(X_test, y_test))\n"
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "id": "aac95fb7",
413 | "metadata": {},
414 | "source": [
415 | "# Homework\n",
416 | "\n",
417 | "### Loading the data via kaggle API\n",
418 | "The data set we used today was sourced from kaggle. Although one can manually download data this way, it is much more efficient and safer to aquire your source data programmatically. \n",
419 | "To download the data directly from [kaggle](kaggle.com) you will need to have a kaggle account. **It's free.** Once you create your kaggle account you can generate an API token. After you log in you should see a circular account icon in the upper-right of any kaggle page. Clicking on your account icon will open a right-sidebar where you can select \"Account\" to edit your account. Scroll down to the API section and click on the \"create new api token\" button. An API token should automatically download and a prompt will also appear telling you which directory to put this token so python knows where find it. For MacOS users this location is \"~/.kaggle/kaggle.json\". Once you have done this modify the code below to download the dataset to the `data` folder distributed with this notebook."
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "id": "4e2fe231",
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "#log in to kaggle using your api token\n",
430 | "kaggle.api.authenticate()\n",
431 | "\n",
432 | "#path relative to this notebook to put the data\n",
433 | "datadir = \n",
434 | "\n",
435 | "#name of the dataset on kaggle\n",
436 | "dataset = 'loveall/appliances-energy-prediction'\n",
437 | "\n",
438 | "#downlaod the data\n",
439 | "kaggle.api.dataset_download_files(dataset, path=datadir, unzip=True)"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "id": "1b7dfc6b",
445 | "metadata": {},
446 | "source": [
447 | "## Linear vs non-linear models\n",
448 | "Notice the non-linear model scored higher than the 3 regularized linear models. Is the non-linear model more appropriate because is scored higher on the testing set? How would you demonstrate the non-linear model is infact a more appropriate?"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": null,
454 | "id": "82ab8fe7",
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "#Generate some code to prove your point.\n",
459 | ""
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "id": "85b4d054",
465 | "metadata": {},
466 | "source": [
467 | "## Advanced models\n",
468 | "Can you think of an even more appropriate model to use other than the deep-network and regularized models we used today? If the simplest model is the best model then argue why should we consider more complicated model to treat this dataset."
469 | ]
470 | }
471 | ],
472 | "metadata": {
473 | "kernelspec": {
474 | "display_name": "Python 3 (ipykernel)",
475 | "language": "python",
476 | "name": "python3"
477 | },
478 | "language_info": {
479 | "codemirror_mode": {
480 | "name": "ipython",
481 | "version": 3
482 | },
483 | "file_extension": ".py",
484 | "mimetype": "text/x-python",
485 | "name": "python",
486 | "nbconvert_exporter": "python",
487 | "pygments_lexer": "ipython3",
488 | "version": "3.9.4"
489 | }
490 | },
491 | "nbformat": 4,
492 | "nbformat_minor": 5
493 | }
494 |
--------------------------------------------------------------------------------
/Module_8/Module_8.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_8/Module_8.pptx
--------------------------------------------------------------------------------
/Module_8/data/KAG_energydata_complete.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Module_8/data/KAG_energydata_complete.csv.zip
--------------------------------------------------------------------------------
/Modules_11&12/Module_11.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1ace58e3-d3d7-4743-a14c-21a4f6e2aa21",
6 | "metadata": {},
7 | "source": [
8 | "# Module 11 - Image Proccessing"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "f461dda1",
14 | "metadata": {},
15 | "source": [
16 | "Today's data set comes from [kaggle](https://www.kaggle.com/c/data-science-bowl-2018/overview) as part of the 2018 Data Science Bowl. The task is to spot nuclei in under varied conditions. The dataset contains a large number of segmented nuclei images. The images were acquired under a variety of conditions and vary in the cell type, magnification, and imaging modality (brightfield vs. fluorescence).\n",
17 | "\n",
18 | "Each image is represented by an associated ImageId. Files belonging to an image are contained in a folder with this ImageId. Within this folder are two subfolders:\n",
19 | "\n",
20 | "+ images contains the image file.\n",
21 | "+ masks contains the segmented masks of each nucleus. This folder is only included in the training set. Each mask contains one nucleus. Masks are not allowed to overlap (no pixel belongs to two masks).\n",
22 | "\n",
23 | "We will be working only with the Competition's stage 1 labeled training data for today."
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "7103cd06",
29 | "metadata": {},
30 | "source": [
31 | "## Setup\n",
32 | "Let's get all the requirements sorted before we move on to the excercise. Notice, today we will be using the datetime package to deal with timestamps. "
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "4a4fec48",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# Requirements\n",
43 | "!pip install --upgrade ipykernel\n",
44 | "!pip install pandas\n",
45 | "!pip install numpy\n",
46 | "!pip install matplotlib\n",
47 | "!pip install tqdm #for progress bar\n",
48 | "!pip install scikit-image\n",
49 | "!pip install scipy\n",
50 | "\n",
51 | "\n",
52 | "# Globals\n",
53 | "seed = 1017\n",
54 | "\n",
55 | "#imports\n",
56 | "import os\n",
57 | "import random\n",
58 | "import pandas as pd\n",
59 | "import numpy as np\n",
60 | "import matplotlib.pyplot as plt\n",
61 | "from scipy import ndimage as nd\n",
62 | "from tqdm import tqdm\n",
63 | "from skimage import img_as_float, img_as_ubyte\n",
64 | "from skimage.io import imread, imshow\n",
65 | "from skimage.transform import resize\n",
66 | "from skimage.restoration import denoise_nl_means, estimate_sigma\n",
67 | "from skimage.exposure import equalize_adapthist\n",
68 | "from skimage.morphology import disk, diameter_closing, diameter_opening\n",
69 | "from skimage.segmentation import clear_border\n",
70 | "from skimage.color import rgb2gray\n",
71 | "from skimage.feature import canny\n",
72 | "\n",
73 | "#magic\n",
74 | "%matplotlib inline"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "id": "790bd5b1-61c0-435a-b1ae-3047d3519bae",
80 | "metadata": {},
81 | "source": [
82 | "## Loading the data\n",
83 | "The data for today can be found in the `data` folder distributed along with this notebook. You will have to unzip it manually."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "id": "4ca00c56",
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "#set path to training data\n",
94 | "TRAIN_PATH=\"data/stage1_train/\"\n",
95 | "#get sample IDs\n",
96 | "train_ids=next(os.walk(TRAIN_PATH))[1]\n",
97 | "print(str(len(train_ids)) + \" Samples found!\")"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "id": "bcd56cc1",
103 | "metadata": {},
104 | "source": [
105 | "## Formatting\n",
106 | "Let's resize the images to something managable so we can speed up calculations."
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "id": "e49587a4",
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "#Declare image shape\n",
117 | "IMG_HEIGHT=128\n",
118 | "IMG_WIDTH=128\n",
119 | "IMG_CHANNELS=3"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "id": "3bf9a687",
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "#resize images to speedup calculations\n",
130 | "X_train = np.zeros((len(train_ids), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)\n",
131 | "Y_train = np.zeros((len(train_ids), IMG_HEIGHT, IMG_WIDTH, 1), dtype=np.bool)\n",
132 | "\n",
133 | "print('Resizing training images and masks')\n",
134 | "for n, id_ in tqdm(enumerate(train_ids), total=len(train_ids)): \n",
135 | " path = TRAIN_PATH + id_\n",
136 | " img = imread(path + '/images/' + id_ + '.png')[:,:,:IMG_CHANNELS] \n",
137 | " img = resize(img, (IMG_HEIGHT, IMG_WIDTH), mode='constant', preserve_range=True)\n",
138 | " X_train[n] = img #Fill empty X_train with values from img\n",
139 | " mask = np.zeros((IMG_HEIGHT, IMG_WIDTH, 1), dtype=np.bool)\n",
140 | " for mask_file in next(os.walk(path + '/masks/'))[2]:\n",
141 | " mask_ = imread(path + '/masks/' + mask_file)\n",
142 | " mask_ = np.expand_dims(resize(mask_, (IMG_HEIGHT, IMG_WIDTH), mode='constant', \n",
143 | " preserve_range=True), axis=-1)\n",
144 | " mask = np.maximum(mask, mask_) \n",
145 | " \n",
146 | " Y_train[n] = mask "
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "id": "bd08bcba",
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "#Have a look at a random image\n",
157 | "image_x=random.randint(0, len(train_ids))\n",
158 | "print(X_train.shape)\n",
159 | "imshow(X_train[image_x])\n",
160 | "plt.show()\n"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "id": "9bd82f34",
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "print(Y_train.shape)\n",
171 | "imshow(img_as_ubyte(Y_train[image_x, :, :, 0]))\n",
172 | "plt.show()"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "id": "2ecb3b54",
178 | "metadata": {},
179 | "source": [
180 | "## Denoising\n",
181 | "We will use a non-local means filter to denoise the images."
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "id": "2c975a91",
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "#apply non-local filter\n",
192 | "patch_kw = dict(patch_size=5, #patch size\n",
193 | " patch_distance=6, #search area\n",
194 | " multichannel=True)\n",
195 | "for ith in range(len(train_ids)):\n",
196 | " img = X_train[ith]\n",
197 | " sigma_est = np.mean(estimate_sigma(img, multichannel=True))\n",
198 | " X_train[ith] = denoise_nl_means(img, h=1.15 * sigma_est, fast_mode=True, **patch_kw)"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "id": "c0aa6bc2",
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "#replot the random image\n",
209 | "print(X_train.shape)\n",
210 | "imshow(X_train[image_x])\n",
211 | "plt.show()"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "id": "3d64f807",
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "#apply adaptive histogram equalization\n",
222 | "#for ith in range(len(train_ids)):\n",
223 | "# X_train[ith] = equalize_adapthist(X_train[ith], kernel_size=1, clip_limit=0.01, nbins=100)\n"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "id": "662464ee",
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "#replot the random image\n",
234 | "print(X_train.shape)\n",
235 | "imshow(img_as_float(X_train[image_x]))\n",
236 | "plt.show()"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "id": "01cdf695",
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "#clean masked regions with open/close ops\n",
247 | "\n",
248 | "#Dilation enlarges bright regions and shrinks dark regions.\n",
249 | "#Erosion shrinks bright regions and enlarges dark regions.\n",
250 | "\n",
251 | "#declare operation size\n",
252 | "opsize=3\n",
253 | "\n",
254 | "for ith in range(len(train_ids)):\n",
255 | " mask = Y_train[ith, :, :, 0]\n",
256 | " \n",
257 | " #Closing on an image is defined as a dilation followed by an erosion.\n",
258 | " #Closing can remove small dark spots (i.e. “pepper”) and connect small bright cracks.\n",
259 | " #This tends to “close” up (dark) gaps between (bright) features.\n",
260 | " mask = nd.binary_closing(mask, disk(opsize//2))\n",
261 | "\n",
262 | " #diamerter closing will remove dark spots but leave dark cracks\n",
263 | " #mask = diameter_closing(mask, opsize, connectivity=2)\n",
264 | "\n",
265 | " \n",
266 | " #Opening on an image is defined as an erosion followed by a dilation.\n",
267 | " #Opening can remove small bright spots (i.e. “salt”) and connect small dark cracks.\n",
268 | " #This tends to “open” up (dark) gaps between (bright) features\n",
269 | " mask = nd.binary_opening(mask, disk(opsize//2))\n",
270 | " \n",
271 | " #diameter opening will remove bright spots but leave bright lines\n",
272 | " #mask = diameter_opening(mask, opsize, connectivity=2)\n",
273 | "\n",
274 | "\n",
275 | " #fill in enclosed regions\n",
276 | " mask = nd.binary_fill_holes(mask)\n",
277 | "\n",
278 | " #remove segments connected to image border\n",
279 | " mask = clear_border(mask)\n",
280 | " \n",
281 | " Y_train[ith, :, :, 0] = mask\n"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "id": "68570815",
288 | "metadata": {
289 | "scrolled": true
290 | },
291 | "outputs": [],
292 | "source": [
293 | "print(Y_train.shape)\n",
294 | "imshow(img_as_ubyte(Y_train[image_x, :, :, 0]))\n",
295 | "plt.show()"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "id": "9b0d4b78",
301 | "metadata": {},
302 | "source": [
303 | "## Feature detection"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "id": "c23388ec",
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "#create a dummy channel\n",
314 | "Z = np.zeros(Y_train.shape)\n",
315 | "\n",
316 | "#add two channels for img intensity and canny edges\n",
317 | "X_train = np.concatenate((X_train, Z), axis = 3)\n",
318 | "X_train = np.concatenate((X_train, Z), axis = 3)\n",
319 | "\n",
320 | "print(X_train.shape)"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "id": "5089aca0",
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "for ith in range(len(train_ids)):\n",
331 | " #get grayscale image\n",
332 | " img_gray = rgb2gray(X_train[ith, :, :, 0:3])\n",
333 | " X_train[ith, :, :, 3] #add new feature\n",
334 | " #caluculate canny edges\n",
335 | " edges = canny(image=img_gray, sigma=2)\n",
336 | " X_train[ith, :, :, 4]=edges #add new feature\n"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "id": "5f106a5e",
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "#replot the random image graysacle\n",
347 | "print(X_train.shape)\n",
348 | "imshow(X_train[image_x, :, :, 3])\n",
349 | "plt.show()\n",
350 | "\n",
351 | "#replot the random image canny edges\n",
352 | "print(X_train.shape)\n",
353 | "imshow(X_train[image_x, :, :, 4])\n",
354 | "plt.show()"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": null,
360 | "id": "d8b746ec",
361 | "metadata": {},
362 | "outputs": [],
363 | "source": []
364 | }
365 | ],
366 | "metadata": {
367 | "kernelspec": {
368 | "display_name": "Python 3 (ipykernel)",
369 | "language": "python",
370 | "name": "python3"
371 | },
372 | "language_info": {
373 | "codemirror_mode": {
374 | "name": "ipython",
375 | "version": 3
376 | },
377 | "file_extension": ".py",
378 | "mimetype": "text/x-python",
379 | "name": "python",
380 | "nbconvert_exporter": "python",
381 | "pygments_lexer": "ipython3",
382 | "version": "3.9.5"
383 | }
384 | },
385 | "nbformat": 4,
386 | "nbformat_minor": 5
387 | }
388 |
--------------------------------------------------------------------------------
/Modules_11&12/Module_11.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Modules_11&12/Module_11.pptx
--------------------------------------------------------------------------------
/Modules_11&12/Module_12.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1ace58e3-d3d7-4743-a14c-21a4f6e2aa21",
6 | "metadata": {},
7 | "source": [
8 | "# Module 12 - Image Proccessing with Unet"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "f461dda1",
14 | "metadata": {},
15 | "source": [
16 | "We will look at a Unet implementation developed by [Dr. Sreenivas Bhattiprolu]( \n",
17 | "https://github.com/bnsreenu/python_for_microscopists/blob/master/076-077-078-Unet_nuclei_tutorial.py) This Unet will be applied to the [nuclei dataset](https://www.kaggle.com/c/data-science-bowl-2018/overview) introduced last week."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "id": "7731611b",
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "##!/usr/bin/env python\n",
28 | "__author__ = \"Sreenivas Bhattiprolu\"\n",
29 | "__license__ = \"Feel free to copy, I appreciate if you acknowledge Python for Microscopists\"\n",
30 | "\n",
31 | "# https://www.youtube.com/watch?v=0kiroPnV1tM\n",
32 | "# https://www.youtube.com/watch?v=cUHPL_dk17E\n",
33 | "# https://www.youtube.com/watch?v=RaswBvMnFxk\n",
34 | "\n",
35 | "\n",
36 | "\"\"\"\n",
37 | "@author: Sreenivas Bhattiprolu\n",
38 | "\"\"\"\n",
39 | "\n",
40 | "import tensorflow as tf\n",
41 | "import os\n",
42 | "import random\n",
43 | "import numpy as np\n",
44 | " \n",
45 | "from tqdm import tqdm \n",
46 | "\n",
47 | "from skimage.io import imread, imshow\n",
48 | "from skimage.transform import resize\n",
49 | "import matplotlib.pyplot as plt\n",
50 | "\n",
51 | "seed = 42\n",
52 | "np.random.seed = seed\n",
53 | "\n",
54 | "IMG_WIDTH = 128\n",
55 | "IMG_HEIGHT = 128\n",
56 | "IMG_CHANNELS = 3\n",
57 | "\n",
58 | "TRAIN_PATH = 'stage1_train/'\n",
59 | "TEST_PATH = 'stage1_test/'\n",
60 | "\n",
61 | "train_ids = next(os.walk(TRAIN_PATH))[1]\n",
62 | "test_ids = next(os.walk(TEST_PATH))[1]\n",
63 | "\n",
64 | "X_train = np.zeros((len(train_ids), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)\n",
65 | "Y_train = np.zeros((len(train_ids), IMG_HEIGHT, IMG_WIDTH, 1), dtype=np.bool)\n",
66 | "\n",
67 | "print('Resizing training images and masks')\n",
68 | "for n, id_ in tqdm(enumerate(train_ids), total=len(train_ids)): \n",
69 | " path = TRAIN_PATH + id_\n",
70 | " img = imread(path + '/images/' + id_ + '.png')[:,:,:IMG_CHANNELS] \n",
71 | " img = resize(img, (IMG_HEIGHT, IMG_WIDTH), mode='constant', preserve_range=True)\n",
72 | " X_train[n] = img #Fill empty X_train with values from img\n",
73 | " mask = np.zeros((IMG_HEIGHT, IMG_WIDTH, 1), dtype=np.bool)\n",
74 | " for mask_file in next(os.walk(path + '/masks/'))[2]:\n",
75 | " mask_ = imread(path + '/masks/' + mask_file)\n",
76 | " mask_ = np.expand_dims(resize(mask_, (IMG_HEIGHT, IMG_WIDTH), mode='constant', \n",
77 | " preserve_range=True), axis=-1)\n",
78 | " mask = np.maximum(mask, mask_) \n",
79 | " \n",
80 | " Y_train[n] = mask \n",
81 | "\n",
82 | "# test images\n",
83 | "X_test = np.zeros((len(test_ids), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)\n",
84 | "sizes_test = []\n",
85 | "print('Resizing test images') \n",
86 | "for n, id_ in tqdm(enumerate(test_ids), total=len(test_ids)):\n",
87 | " path = TEST_PATH + id_\n",
88 | " img = imread(path + '/images/' + id_ + '.png')[:,:,:IMG_CHANNELS]\n",
89 | " sizes_test.append([img.shape[0], img.shape[1]])\n",
90 | " img = resize(img, (IMG_HEIGHT, IMG_WIDTH), mode='constant', preserve_range=True)\n",
91 | " X_test[n] = img\n",
92 | "\n",
93 | "print('Done!')\n",
94 | "\n",
95 | "image_x = random.randint(0, len(train_ids))\n",
96 | "imshow(X_train[image_x])\n",
97 | "plt.show()\n",
98 | "imshow(np.squeeze(Y_train[image_x]))\n",
99 | "plt.show()\n",
100 | "\n",
101 | "\n",
102 | "#Build the model\n",
103 | "inputs = tf.keras.layers.Input((IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS))\n",
104 | "s = tf.keras.layers.Lambda(lambda x: x / 255)(inputs)\n",
105 | "\n",
106 | "#Contraction path\n",
107 | "c1 = tf.keras.layers.Conv2D(16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(s)\n",
108 | "c1 = tf.keras.layers.Dropout(0.1)(c1)\n",
109 | "c1 = tf.keras.layers.Conv2D(16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c1)\n",
110 | "p1 = tf.keras.layers.MaxPooling2D((2, 2))(c1)\n",
111 | "\n",
112 | "c2 = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(p1)\n",
113 | "c2 = tf.keras.layers.Dropout(0.1)(c2)\n",
114 | "c2 = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c2)\n",
115 | "p2 = tf.keras.layers.MaxPooling2D((2, 2))(c2)\n",
116 | " \n",
117 | "c3 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(p2)\n",
118 | "c3 = tf.keras.layers.Dropout(0.2)(c3)\n",
119 | "c3 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c3)\n",
120 | "p3 = tf.keras.layers.MaxPooling2D((2, 2))(c3)\n",
121 | " \n",
122 | "c4 = tf.keras.layers.Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(p3)\n",
123 | "c4 = tf.keras.layers.Dropout(0.2)(c4)\n",
124 | "c4 = tf.keras.layers.Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c4)\n",
125 | "p4 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(c4)\n",
126 | " \n",
127 | "c5 = tf.keras.layers.Conv2D(256, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(p4)\n",
128 | "c5 = tf.keras.layers.Dropout(0.3)(c5)\n",
129 | "c5 = tf.keras.layers.Conv2D(256, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c5)\n",
130 | "\n",
131 | "#Expansive path \n",
132 | "u6 = tf.keras.layers.Conv2DTranspose(128, (2, 2), strides=(2, 2), padding='same')(c5)\n",
133 | "u6 = tf.keras.layers.concatenate([u6, c4])\n",
134 | "c6 = tf.keras.layers.Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(u6)\n",
135 | "c6 = tf.keras.layers.Dropout(0.2)(c6)\n",
136 | "c6 = tf.keras.layers.Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c6)\n",
137 | " \n",
138 | "u7 = tf.keras.layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(c6)\n",
139 | "u7 = tf.keras.layers.concatenate([u7, c3])\n",
140 | "c7 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(u7)\n",
141 | "c7 = tf.keras.layers.Dropout(0.2)(c7)\n",
142 | "c7 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c7)\n",
143 | " \n",
144 | "u8 = tf.keras.layers.Conv2DTranspose(32, (2, 2), strides=(2, 2), padding='same')(c7)\n",
145 | "u8 = tf.keras.layers.concatenate([u8, c2])\n",
146 | "c8 = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(u8)\n",
147 | "c8 = tf.keras.layers.Dropout(0.1)(c8)\n",
148 | "c8 = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c8)\n",
149 | " \n",
150 | "u9 = tf.keras.layers.Conv2DTranspose(16, (2, 2), strides=(2, 2), padding='same')(c8)\n",
151 | "u9 = tf.keras.layers.concatenate([u9, c1], axis=3)\n",
152 | "c9 = tf.keras.layers.Conv2D(16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(u9)\n",
153 | "c9 = tf.keras.layers.Dropout(0.1)(c9)\n",
154 | "c9 = tf.keras.layers.Conv2D(16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(c9)\n",
155 | " \n",
156 | "outputs = tf.keras.layers.Conv2D(1, (1, 1), activation='sigmoid')(c9)\n",
157 | " \n",
158 | "model = tf.keras.Model(inputs=[inputs], outputs=[outputs])\n",
159 | "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
160 | "model.summary()\n",
161 | "\n",
162 | "################################\n",
163 | "#Modelcheckpoint\n",
164 | "checkpointer = tf.keras.callbacks.ModelCheckpoint('model_for_nuclei.h5', verbose=1, save_best_only=True)\n",
165 | "\n",
166 | "callbacks = [\n",
167 | " tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),\n",
168 | " tf.keras.callbacks.TensorBoard(log_dir='logs')]\n",
169 | "\n",
170 | "results = model.fit(X_train, Y_train, validation_split=0.1, batch_size=16, epochs=25, callbacks=callbacks)\n",
171 | "\n",
172 | "####################################\n",
173 | "\n",
174 | "idx = random.randint(0, len(X_train))\n",
175 | "\n",
176 | "preds_train = model.predict(X_train[:int(X_train.shape[0]*0.9)], verbose=1)\n",
177 | "preds_val = model.predict(X_train[int(X_train.shape[0]*0.9):], verbose=1)\n",
178 | "preds_test = model.predict(X_test, verbose=1)\n",
179 | "\n",
180 | " \n",
181 | "preds_train_t = (preds_train > 0.5).astype(np.uint8)\n",
182 | "preds_val_t = (preds_val > 0.5).astype(np.uint8)\n",
183 | "preds_test_t = (preds_test > 0.5).astype(np.uint8)\n",
184 | "\n",
185 | "\n",
186 | "# Perform a sanity check on some random training samples\n",
187 | "ix = random.randint(0, len(preds_train_t))\n",
188 | "imshow(X_train[ix])\n",
189 | "plt.show()\n",
190 | "imshow(np.squeeze(Y_train[ix]))\n",
191 | "plt.show()\n",
192 | "imshow(np.squeeze(preds_train_t[ix]))\n",
193 | "plt.show()\n",
194 | "\n",
195 | "# Perform a sanity check on some random validation samples\n",
196 | "ix = random.randint(0, len(preds_val_t))\n",
197 | "imshow(X_train[int(X_train.shape[0]*0.9):][ix])\n",
198 | "plt.show()\n",
199 | "imshow(np.squeeze(Y_train[int(Y_train.shape[0]*0.9):][ix]))\n",
200 | "plt.show()\n",
201 | "imshow(np.squeeze(preds_val_t[ix]))\n",
202 | "plt.show()"
203 | ]
204 | }
205 | ],
206 | "metadata": {
207 | "kernelspec": {
208 | "display_name": "Python 3 (ipykernel)",
209 | "language": "python",
210 | "name": "python3"
211 | },
212 | "language_info": {
213 | "codemirror_mode": {
214 | "name": "ipython",
215 | "version": 3
216 | },
217 | "file_extension": ".py",
218 | "mimetype": "text/x-python",
219 | "name": "python",
220 | "nbconvert_exporter": "python",
221 | "pygments_lexer": "ipython3",
222 | "version": "3.9.4"
223 | }
224 | },
225 | "nbformat": 4,
226 | "nbformat_minor": 5
227 | }
228 |
--------------------------------------------------------------------------------
/Modules_11&12/Module_12.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Modules_11&12/Module_12.pptx
--------------------------------------------------------------------------------
/Modules_11&12/data/stage1_train.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Modules_11&12/data/stage1_train.zip
--------------------------------------------------------------------------------
/Modules_9&10/Module_9-InstructorVersion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1ace58e3-d3d7-4743-a14c-21a4f6e2aa21",
6 | "metadata": {},
7 | "source": [
8 | "# Module 9 - Leukemia project week 1\n",
9 | "\n",
10 | "Today we will put everything we ahve learned so far to develop a complete end to end analysis. We will be working with gene expression data of leukemia patients aquired in 1999 and later [published in Science](https://doi.org/10.1126/science.286.5439.531). The paper demonstrated how new cases of cancer could be classified by gene expression monitoring (via DNA microarray) and thereby provided a general approach for identifying new cancer classes and assigning tumors to known classes. The data was used to classify patients with acute myeloid leukemia (AML) and acute lymphoblastic leukemia (ALL). Our excersice is to develop a model that discriminates between AML and ALL patients based only on the this gene expression data and compare our model with the published results of the 1999 paper."
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "7103cd06",
16 | "metadata": {},
17 | "source": [
18 | "## Setup\n",
19 | "Let's get all the requirements sorted before we move on to the excercise. "
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "id": "4a4fec48",
26 | "metadata": {
27 | "scrolled": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "# Requirements\n",
32 | "!pip install --upgrade ipykernel\n",
33 | "!pip install kaggle\n",
34 | "!pip install pandas\n",
35 | "!pip install tableone\n",
36 | "!pip install numpy\n",
37 | "!pip install matplotlib\n",
38 | "!pip install scipy\n",
39 | "!pip install seaborn\n",
40 | "\n",
41 | "# Globals\n",
42 | "seed = 1017\n",
43 | "\n",
44 | "#imports\n",
45 | "import kaggle\n",
46 | "import pandas as pd\n",
47 | "import seaborn as sns\n",
48 | "from tableone import TableOne\n",
49 | "import numpy as np\n",
50 | "import matplotlib.pyplot as plt\n",
51 | "from scipy import stats\n",
52 | "\n",
53 | "\n",
54 | "#magic\n",
55 | "%matplotlib inline"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "id": "040877a2",
61 | "metadata": {},
62 | "source": [
63 | "## Loading the data via kaggle API\n",
64 | "The leukemia data set was sourced from kaggle. \n",
65 | "To download the data directly from [kaggle](kaggle.com) you will need to have a kaggle account. **It's free.** Once you create your kaggle account you can generate an API token. After you log in you should see a circular account icon in the upper-right of any kaggle page. Clicking on your account icon will open a right-sidebar where you can select \"Account\" to edit your account. Scroll down to the API section and click on the \"create new api token\" button. An API token should automatically download and a prompt will also appear telling you which directory to put this token so python knows where find it. For MacOS users this location is \"~/.kaggle/kaggle.json\". Once you have done this modify the code below to download the dataset to the `data` folder distributed with this notebook."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "95bd8a2a",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "#log in to kaggle using your api token\n",
76 | "kaggle.api.authenticate()\n",
77 | "\n",
78 | "#path relative to this notebook to put the data\n",
79 | "datadir = 'data'\n",
80 | "\n",
81 | "#name of the dataset on kaggle\n",
82 | "dataset = 'crawford/gene-expression'\n",
83 | "\n",
84 | "#downlaod the data\n",
85 | "kaggle.api.dataset_download_files(dataset, path=datadir, unzip=True)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "id": "790bd5b1-61c0-435a-b1ae-3047d3519bae",
91 | "metadata": {},
92 | "source": [
93 | "## Loading the data\n",
94 | "There are two datasets containing the initial (training, 38 samples) and independent (test, 34 samples) datasets used in the paper. These datasets contain measurements corresponding to ALL and AML samples from Bone Marrow and Peripheral Blood. Intensity values have been re-scaled such that overall intensities for each chip are equivalent."
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "id": "08d23b16",
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "# download the data as a pandas dataframe\n",
105 | "labels = pd.read_csv('data/actual.csv', index_col = 'patient')\n",
106 | "test = pd.read_csv('data/data_set_ALL_AML_independent.csv')\n",
107 | "train = pd.read_csv('data/data_set_ALL_AML_train.csv')"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "id": "8366f740",
113 | "metadata": {},
114 | "source": [
115 | "***Task*** Use the head() function to quickly have a look at the training data frame."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "1faeb96f",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "#Use the head() function display th first few rows of the training data frame.\n",
126 | "train.head()"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "id": "b5b6cf3b",
132 | "metadata": {},
133 | "source": [
134 | "The testing set is formatted the same as the training set. Notice, that the gene description and accession numbers are given along with the count and outcome (call) for each patient. The patient outcomes are also provided in the file `actual.csv`. I think it will be more convientient to use the outcomes in this file and delete the 'call' columns in both the training and testing sets. ***Question: What are the observational units of interest?*** "
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "id": "60583afe",
140 | "metadata": {},
141 | "source": [
142 | "## Formatting\n",
143 | "***Task*** Remove the 'call' columns from the training and testing sets."
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "id": "db9c49fc",
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "#<--remove this for student verssion--> \n",
154 | "cols = [col for col in test.columns if 'call' in col]\n",
155 | "test = test.drop(cols, 1)\n",
156 | "\n",
157 | "cols = [col for col in train.columns if 'call' in col]\n",
158 | "train = train.drop(cols, 1)\n",
159 | "\n",
160 | "train.head()"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "id": "8d97cd6a",
166 | "metadata": {},
167 | "source": [
168 | "Let's consider what the observational unit should be. ***Task*** Format the data to have observations in rows and features in columns."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "fc9f1ecc",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "# remove this for stutent version \n",
179 | "train = train.T\n",
180 | "test = test.T\n",
181 | "train.head()"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "id": "ac1f645b",
187 | "metadata": {},
188 | "source": [
189 | "We can also remove the gene bookkeeping data because we will not use it today."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "id": "44d5879c",
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "#remove the gene bookkeeping data.\n",
200 | "train = train.drop(['Gene Description', 'Gene Accession Number'])\n",
201 | "test = test.drop(['Gene Description', 'Gene Accession Number'])\n",
202 | "train.head()"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "id": "954102d7",
208 | "metadata": {},
209 | "source": [
210 | "Now let's encode the outcomes for binary classification. We'll use Zeros for the ALL outcomes and Ones for AML. Remember the first 38 patients were partitioned for the training set the remainder are in the testing set."
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "id": "432c0f09",
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "#remove for student version\n",
221 | "labels = labels.replace({'ALL':0,'AML':1})\n",
222 | "labels_train = labels[labels.index <= 38]\n",
223 | "labels_test = labels[labels.index > 38]"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "id": "13caf6ea",
229 | "metadata": {},
230 | "source": [
231 | "### Treat missing data\n",
232 | "Before moving on to a table 1. Let's look for and treat any missing data this. Remember to check for values that don't make sense. I think replacing witht the mean value would be a reasonable imputation strategy.***Task*** check for unreasonable and missing values and impute them with the column mean. "
233 | ]
234 | },
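{
"cell_type": "markdown",
"id": "3f2a9c10",
"metadata": {},
"source": [
"Before imputing, it helps to quantify what is actually wrong. The cell below is a minimal sketch of such a check: it counts missing, non-finite, and negative entries and reports the value range for each set. It assumes `train` and `test` hold only numeric values at this point."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c7d1e22",
"metadata": {},
"outputs": [],
"source": [
"#sketch: count missing, non-finite, and negative values in each set\n",
"for name, df in {'train': train, 'test': test}.items():\n",
"    values = df.to_numpy(dtype=float)\n",
"    print(name,\n",
"          '| missing:', int(np.isnan(values).sum()),\n",
"          '| infinite:', int(np.isinf(values).sum()),\n",
"          '| negative:', int((values < 0).sum()),\n",
"          '| min:', np.nanmin(values), '| max:', np.nanmax(values))"
]
},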
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "id": "ae5e1d94",
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "#remove in student version\n",
243 | "\n",
244 | "#remove zeros\n",
245 | "#train = train.replace(0, np.nan)\n",
246 | "#test = test.replace(0, np.nan)\n",
247 | "\n",
248 | "#remove inf\n",
249 | "train = train.replace(np.inf, np.nan)\n",
250 | "test = test.replace(np.inf, np.nan)\n",
251 | "\n",
252 | "#impute with mean\n",
253 | "train = train.fillna(value = train.values.mean())\n",
254 | "test = test.fillna(value = test.values.mean())\n",
255 | "\n",
256 | "#KNN imputation -- don't bother\n"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "id": "74577963",
262 | "metadata": {},
263 | "source": [
264 | "***Question*** How would you go about visualizing the data we just formatted? Why can't I just make a table 1?"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "id": "71bc3faa",
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "#remove from student version\n",
275 | "from sklearn import preprocessing\n",
276 | "from sklearn.decomposition import PCA\n",
277 | "\n",
278 | "#Do a PCA for all data (this should probably be done only with the training data)\n",
279 | "nPC=10\n",
280 | "df_all = train.append(test, ignore_index=True)\n",
281 | "X_all = preprocessing.StandardScaler().fit_transform(df_all)\n",
282 | "pca = PCA(n_components=nPC, random_state=seed)\n",
283 | "X_pca = pca.fit_transform(X_all)\n",
284 | "print(X_pca.shape)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "id": "653e5c70",
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "#remove from student version\n",
295 | "from sklearn import preprocessing\n",
296 | "from sklearn.decomposition import PCA\n",
297 | "\n",
298 | "#Do a PCA for all data (this should probably be done only with the training data)\n",
299 | "nPC=10\n",
300 | "pca = PCA(n_components=nPC, random_state=seed)\n",
301 | "\n",
302 | "#define standard scaler\n",
303 | "scaler = preprocessing.StandardScaler()\n",
304 | "\n",
305 | "#fit pca on the training data\n",
306 | "#df_all = train.append(test, ignore_index=True)\n",
307 | "X_scaled = scaler.fit_transform(train)\n",
308 | "X_train_pca = pca.fit_transform(X_scaled)\n",
309 | "\n",
310 | "#transform the testing data using the fit scaler and pca objects\n",
311 | "X_test_pca = pca.transform(scaler.transform(test))\n",
312 | "\n",
313 | "print(X_train_pca.shape)"
314 | ]
315 | },
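{
"cell_type": "markdown",
"id": "7b3c4d55",
"metadata": {},
"source": [
"With thousands of raw gene columns a table 1 is impractical, but a low-dimensional projection is easy to read. As one possible answer to the question above, the sketch below scatters the training samples on the first two principal components and colors them by outcome, reusing `X_train_pca` and `labels_train` from the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d8e2f41",
"metadata": {},
"outputs": [],
"source": [
"#sketch: scatter the first two training PCs, colored by outcome (0 = ALL, 1 = AML)\n",
"outcome = labels_train.iloc[:, 0].to_numpy()\n",
"for value, name in [(0, 'ALL'), (1, 'AML')]:\n",
"    mask = outcome == value\n",
"    plt.scatter(X_train_pca[mask, 0], X_train_pca[mask, 1], label=name)\n",
"plt.xlabel('PC1')\n",
"plt.ylabel('PC2')\n",
"plt.legend()\n",
"plt.show()"
]
},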
316 | {
317 | "cell_type": "markdown",
318 | "id": "fb3d5a51",
319 | "metadata": {},
320 | "source": [
321 | "### Visualize Engineered Features\n",
322 | "Let's plot the feature distributions."
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "id": "5a3eb1ab",
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "#plot the principle component distributions\n",
333 | "for PC in range(nPC):\n",
334 | " sns.kdeplot(data=X_train_pca[:, PC])\n",
335 | " #X_pca[:,nPC].plot.kde(bw_method='scott') #use bw_method=.02 for a lower bandwidth gaussian representation\n",
336 | " plt.legend([\"PC\" + str(PC)])\n",
337 | " plt.show()"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "id": "b7b796e5",
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "# remove in student version \n",
348 | "\n",
349 | "# rescale the engineered features if neccessary\n",
350 | "X_train_pca = scaler.fit_transform(X_train_pca)\n",
351 | "X_test_pca = scaler.transform(X_test_pca)\n",
352 | "\n",
353 | "#plot the principle component distributions\n",
354 | "for PC in range(nPC):\n",
355 | " sns.kdeplot(data=X_test_pca[:, PC])\n",
356 | " #X_pca[:,nPC].plot.kde(bw_method='scott') #use bw_method=.02 for a lower bandwidth gaussian representation\n",
357 | " plt.legend([\"PC\" + str(PC)])\n",
358 | " plt.show()"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "id": "db80c954",
364 | "metadata": {},
365 | "source": [
366 | "## Capstone Project\n",
367 | "We have chosen a more [challenging Acute Myeloid Leukemia dataset] (https://www.synapse.org/#!Synapse:syn2455683/wiki/64007) to test your ML skills. You will have to request access to this data through the link provided and follow directions to the 'request access' form. Originally DREAM challenge, this dataset contains multiple outcomes such as disease relapse and response to treatment. Both classificationa and regression tasks are possible. The data represents over 200 Leukemia patients and includes highly dimensional gemonics data in addition to clinical covariates. Your challenge is to develop models to address these outcomes completely End-to-End over the next few weeks. We will review your analysis in the final week of the course. We will periodically check in to make sure your analyses are goinig smoothly but for the mean time your analysis should generally follow these beats.\n",
368 | "+ ***Format*** the data with observations in rows and features in columns\n",
369 | "+ ***Manually exclude data*** like book keeping variables\n",
370 | "+ ***Normalize*** Data\n",
371 | "+ ***Treat missing*** Data\n",
372 | "+ Choose and employ a ***Data Partitioning*** scheme\n",
373 | "+ Do ***Feature selection*** for Clinical covariates and ***Dimension Reduction*** for omics data.\n",
374 | "+ ***Choose a model*** with the highest performance potential\n",
375 | "+ ***Tune your model***\n",
376 | "+ Caclulate ***Performance metrics***\n",
377 | "+ Report ***Model Predicitions***\n",
378 | "+ ***Interpret*** results\n"
379 | ]
380 | },
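{
"cell_type": "markdown",
"id": "2e6f8a13",
"metadata": {},
"source": [
"The cell below is a schematic sketch of the kind of end-to-end pipeline these steps describe, illustrated on the ALL/AML data already loaded in this notebook (the capstone data will need its own loading and cleaning). The model choice, parameter grid, and cross-validation settings are illustrative rather than prescriptive."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a5b6c77",
"metadata": {},
"outputs": [],
"source": [
"#sketch: an end-to-end modeling pipeline on the ALL/AML data (all choices here are illustrative)\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.metrics import classification_report\n",
"\n",
"#data partitioning: reuse the published 38/34 split already encoded in labels_train / labels_test\n",
"y_train = labels_train.iloc[:, 0].to_numpy()\n",
"y_test = labels_test.iloc[:, 0].to_numpy()\n",
"\n",
"#normalize, reduce dimension, then classify, all inside one pipeline\n",
"pipe = Pipeline([\n",
"    ('scale', StandardScaler()),\n",
"    ('pca', PCA(random_state=seed)),\n",
"    ('clf', LogisticRegression(max_iter=1000))\n",
"])\n",
"\n",
"#tune a small grid with cross-validation on the training set only\n",
"grid = GridSearchCV(pipe,\n",
"                    param_grid={'pca__n_components': [5, 10, 20],\n",
"                                'clf__C': [0.01, 0.1, 1.0]},\n",
"                    cv=5, scoring='accuracy')\n",
"grid.fit(train, y_train)\n",
"\n",
"#performance metrics and predictions on the held-out test set\n",
"print('best parameters:', grid.best_params_)\n",
"print(classification_report(y_test, grid.predict(test)))"
]
},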
381 | {
382 | "cell_type": "markdown",
383 | "id": "dce14c9e",
384 | "metadata": {},
385 | "source": [
386 | "## Pathway Representation\n",
387 | "PCA is one of the most common dimension reduction schemes, however interpreting the biology from principle components is difficult. The PCs explain fluctuations in the data itself not necessarily the biological relationships at play. It would be better to choose a representation that is more convienient for interpretation in a clinical setting. Let us aggregate the data of related genes into a value that represents the pathways they participate in. Finding the predictive pathways may be more useful to explain the biology, design future invivio models, and drive treatment development.\n",
388 | "\n",
389 | "The goal is to assign a number to each pathway represented in our dataset and use these as the predictive features for each patient. For each pathway we will need to find all the genes in our dataset that participate in this pathway and describe how only these releated genes fluctuate. We'll use a PCA to do this and use the value of the first PC as the value for the pathway feature. We will then normalize this pathway representation data before moving on to feature selection.\n",
390 | "\n",
391 | "### Convert Affymetrix Gene IDs to Kegg IDs\n",
392 | "The source data has genes in the Affymetrix format. We would like to use KEGG IDs because of the convientient pathway analysis python tools KEGG provides.\n",
393 | "\n",
394 | "[g:Convert](http://www.protocol-online.org/cgi-bin/prot/jump.cgi?ID=4145)\n",
395 | " is a gene identifier tool that allows conversion of genes, proteins, microarray probes, standard names, various database identifiers, etc. A mix of IDs of different types may be inserted to g:Convert. The user needs to select a target database; all input IDs will be converted into target database format. Input IDs that have no corresponding entry in target database will be displayed as N/A.\n",
396 | " \n",
397 | "Let's use the g:Convert web tool to convert the affymetrix IDs to KEGG enzyme IDs. Copy and paste the \"Gene Accession Number\" column in the training set CSV file into the Query box in g:Convert. Exclude the column header when you paste. Select \"Homo sapiens (Human)\" in the organism box and select \"KEGG_ENZYME\" in the target namespace box. Press the \"Run query\" button to convert the IDs. When the query finishes you can download the results as a CSV file and save it in the data folder. "
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": null,
403 | "id": "4081c8f4",
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "#Let's look at the g:Convert file\n",
408 | "gprof = pd.read_csv('data/gProfiler_hsapiens.csv')\n",
409 | "gprof.head()"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "id": "9f67dd25",
415 | "metadata": {},
416 | "source": [
417 | "The corresponding gene names can be found in the 'name' column.Notice, that many initial aliases map to the same gene name. For every unique gene name let's get the pathways they participate in. We'll follow closely the [KEGG tutorial](https://bioservices.readthedocs.io/en/master/kegg_tutorial.html) to show us how."
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "id": "18896749",
424 | "metadata": {},
425 | "outputs": [],
426 | "source": [
427 | "#install bioservices and import KEGG\n",
428 | "!pip install bioservices\n",
429 | "from bioservices.kegg import KEGG"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": null,
435 | "id": "8a34449e",
436 | "metadata": {},
437 | "outputs": [],
438 | "source": [
439 | "#start the kegg interface\n",
440 | "ntrface = KEGG() \n",
441 | "\n",
442 | "#Example search for pathways by gene name\n",
443 | "ntrface.get_pathway_by_gene(\"GAPDH\", \"hsa\")"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": null,
449 | "id": "d6b3bf19",
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "#get unique genes in gprof dataframe\n",
454 | "genes = gprof['name'].unique()\n",
455 | "genes = genes[0:50] #limit to first 50 genes so it runs quickly\n",
456 | "len(genes)"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": null,
462 | "id": "e035b768",
463 | "metadata": {},
464 | "outputs": [],
465 | "source": [
466 | "%%capture\n",
467 | "#init dictionary to hold pathway to gene mapping\n",
468 | "pathway_dict = {}\n",
469 | "\n",
470 | "#loop over unique genes in the dataset\n",
471 | "for gene in genes:\n",
472 | " #get pathways associated with gene\n",
473 | " pathways = ntrface.get_pathway_by_gene(str(gene), \"hsa\")\n",
474 | " #loop over associated pathways\n",
475 | " try:\n",
476 | " for pathway, desc in pathways.items():\n",
477 | " #check if pathway is new\n",
478 | " if not (pathway in pathway_dict):\n",
479 | " #append new pathway and init gene list\n",
480 | " pathway_dict[pathway] = [str(gene)] \n",
481 | " else:\n",
482 | " # add gene to the known pathway's list of genes\n",
483 | " pathway_dict[pathway].extend([str(gene)])\n",
484 | " except AttributeError:\n",
485 | " print(\"No pathways found for \" + str(gene))"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": null,
491 | "id": "fd20b047",
492 | "metadata": {},
493 | "outputs": [],
494 | "source": [
495 | "#display dictionary as pandas Series so we can show the head\n",
496 | "display(pd.Series(pathway_dict).head())"
497 | ]
498 | },
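{
"cell_type": "markdown",
"id": "6d7e9f21",
"metadata": {},
"source": [
"Before filling in the loop in the next cell, here is a minimal sketch of one way to build the pathway features: map each pathway's gene names back to probe positions using the g:Convert table, then summarize each pathway's expression columns with a one-component PCA fit on the training data. It assumes the g:Convert export keeps the original Affymetrix query IDs in a column named 'initial_alias'; check the column names in your own download."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c1d2e33",
"metadata": {},
"outputs": [],
"source": [
"#sketch: one-component PCA per pathway\n",
"#assumes gprof has an 'initial_alias' column holding the queried Affymetrix IDs\n",
"from sklearn import preprocessing\n",
"from sklearn.decomposition import PCA\n",
"\n",
"#original probe order, so column positions in train/test line up with Affymetrix IDs\n",
"probe_ids = pd.read_csv('data/data_set_ALL_AML_train.csv')['Gene Accession Number'].str.upper()\n",
"id_to_name = gprof.assign(initial_alias=gprof['initial_alias'].str.upper()).set_index('initial_alias')['name']\n",
"\n",
"train_pathways = pd.DataFrame(index=train.index)\n",
"test_pathways = pd.DataFrame(index=test.index)\n",
"\n",
"for pathway, gene_names in pathway_dict.items():\n",
"    #probes whose converted gene name belongs to this pathway (case-insensitive match)\n",
"    member_ids = id_to_name[id_to_name.isin(gene_names)].index\n",
"    cols = probe_ids.index[probe_ids.isin(member_ids)]\n",
"    if len(cols) == 0:\n",
"        continue\n",
"    scaler = preprocessing.StandardScaler()\n",
"    pca = PCA(n_components=1, random_state=seed)\n",
"    #fit on the training columns for this pathway, then transform the test columns\n",
"    train_pathways[pathway] = pca.fit_transform(scaler.fit_transform(train[cols]))[:, 0]\n",
"    test_pathways[pathway] = pca.transform(scaler.transform(test[cols]))[:, 0]\n",
"\n",
"train_pathways.head()"
]
},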
499 | {
500 | "cell_type": "code",
501 | "execution_count": null,
502 | "id": "482c73b6",
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "#remove from student version\n",
507 | "from sklearn import preprocessing\n",
508 | "from sklearn.decomposition import PCA\n",
509 | "\n",
510 | "#Do a PCA for all data (this should probably be done only with the training data)\n",
511 | "nPC=1\n",
512 | "pca = PCA(n_components=nPC, random_state=seed)\n",
513 | "\n",
514 | "#define standard scaler\n",
515 | "scaler = preprocessing.StandardScaler()\n",
516 | "\n",
517 | "#loop over pathways\n",
518 | "for pathway in pathway_dict:\n",
519 | " #get Gene Accession names for every Gene in pathway\n",
520 | " \n",
521 | " #select rows representing the genes in the pathway\n",
522 | " \n",
523 | " #fit pca on the training data\n",
524 | " #df_all = train.append(test, ignore_index=True)\n",
525 | " X_scaled = scaler.fit_transform(train)\n",
526 | " X_train_pca = pca.fit_transform(X_scaled)\n",
527 | "\n",
528 | " #transform the testing data using the fit scaler and pca objects\n",
529 | " X_test_pca = pca.transform(scaler.transform(test))\n",
530 | "\n",
531 | " print(X_train_pca.shape)"
532 | ]
533 | }
534 | ],
535 | "metadata": {
536 | "kernelspec": {
537 | "display_name": "Python 3 (ipykernel)",
538 | "language": "python",
539 | "name": "python3"
540 | },
541 | "language_info": {
542 | "codemirror_mode": {
543 | "name": "ipython",
544 | "version": 3
545 | },
546 | "file_extension": ".py",
547 | "mimetype": "text/x-python",
548 | "name": "python",
549 | "nbconvert_exporter": "python",
550 | "pygments_lexer": "ipython3",
551 | "version": "3.9.4"
552 | }
553 | },
554 | "nbformat": 4,
555 | "nbformat_minor": 5
556 | }
557 |
--------------------------------------------------------------------------------
/Modules_9&10/Module_9.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1ace58e3-d3d7-4743-a14c-21a4f6e2aa21",
6 | "metadata": {},
7 | "source": [
8 | "# Module 9 - Leukemia project week 1\n",
9 | "\n",
10 | "Today we will put everything we ahve learned so far to develop a complete end to end analysis. We will be working with gene expression data of leukemia patients aquired in 1999 and later [published in Science](https://doi.org/10.1126/science.286.5439.531). The paper demonstrated how new cases of cancer could be classified by gene expression monitoring (via DNA microarray) and thereby provided a general approach for identifying new cancer classes and assigning tumors to known classes. The data was used to classify patients with acute myeloid leukemia (AML) and acute lymphoblastic leukemia (ALL). Our excersice is to develop a model that discriminates between AML and ALL patients based only on the this gene expression data and compare our model with the published results of the 1999 paper."
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "7103cd06",
16 | "metadata": {},
17 | "source": [
18 | "## Setup\n",
19 | "Let's get all the requirements sorted before we move on to the excercise. "
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "id": "4a4fec48",
26 | "metadata": {
27 | "scrolled": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "# Requirements\n",
32 | "!pip install --upgrade ipykernel\n",
33 | "!pip install kaggle\n",
34 | "!pip install pandas\n",
35 | "!pip install tableone\n",
36 | "!pip install numpy\n",
37 | "!pip install matplotlib\n",
38 | "!pip install scipy\n",
39 | "!pip install seaborn\n",
40 | "\n",
41 | "# Globals\n",
42 | "seed = 1017\n",
43 | "\n",
44 | "#imports\n",
45 | "import kaggle\n",
46 | "import pandas as pd\n",
47 | "import seaborn as sns\n",
48 | "from tableone import TableOne\n",
49 | "import numpy as np\n",
50 | "import matplotlib.pyplot as plt\n",
51 | "from scipy import stats\n",
52 | "\n",
53 | "\n",
54 | "#magic\n",
55 | "%matplotlib inline"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "id": "040877a2",
61 | "metadata": {},
62 | "source": [
63 | "## Loading the data via kaggle API\n",
64 | "The leukemia data set was sourced from kaggle. \n",
65 | "To download the data directly from [kaggle](https://kaggle.com) you will need to have a kaggle account. **It's free.** Once you create your kaggle account you can generate an API token. After you log in you should see a circular account icon in the upper-right of any kaggle page. Clicking on your account icon will open a right-sidebar where you can select \"Account\" to edit your account. Scroll down to the API section and click on the \"create new api token\" button. An API token should automatically download and a prompt will also appear telling you which directory to put this token so python knows where find it. For MacOS users this location is \"~/.kaggle/kaggle.json\". Once you have done this modify the code below to download the dataset to the `data` folder distributed with this notebook."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "95bd8a2a",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "#log in to kaggle using your api token\n",
76 | "kaggle.api.authenticate()\n",
77 | "\n",
78 | "#path relative to this notebook to put the data\n",
79 | "datadir = 'data'\n",
80 | "\n",
81 | "#name of the dataset on kaggle\n",
82 | "dataset = 'crawford/gene-expression'\n",
83 | "\n",
84 | "#downlaod the data\n",
85 | "kaggle.api.dataset_download_files(dataset, path=datadir, unzip=True)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "id": "790bd5b1-61c0-435a-b1ae-3047d3519bae",
91 | "metadata": {},
92 | "source": [
93 | "## Loading the data\n",
94 | "There are two datasets containing the initial (training, 38 samples) and independent (test, 34 samples) datasets used in the paper. These datasets contain measurements corresponding to ALL and AML samples from Bone Marrow and Peripheral Blood. Intensity values have been re-scaled such that overall intensities for each chip are equivalent."
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "id": "08d23b16",
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "# download the data as a pandas dataframe\n",
105 | "labels = pd.read_csv('data/actual.csv', index_col = 'patient')\n",
106 | "test = pd.read_csv('data/data_set_ALL_AML_independent.csv')\n",
107 | "train = pd.read_csv('data/data_set_ALL_AML_train.csv')"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "id": "8366f740",
113 | "metadata": {},
114 | "source": [
115 | "***Task*** Use the head() function to quickly have a look at the training data frame."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "1faeb96f",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "#Use the head() function display th first few rows of the training data frame.\n",
126 | ""
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "id": "b5b6cf3b",
132 | "metadata": {},
133 | "source": [
134 | "The testing set is formatted the same as the training set. Notice, that the gene description and accession numbers are given along with the count and outcome (call) for each patient. The patient outcomes are also provided in the file `actual.csv`. I think it will be more convientient to use the outcomes in this file and delete the 'call' columns in both the training and testing sets. ***Question: What are the observational units of interest?*** "
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "id": "60583afe",
140 | "metadata": {},
141 | "source": [
142 | "## Formatting\n",
143 | "***Task*** Remove the 'call' columns from the training and testing sets."
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "id": "db9c49fc",
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "#remove the call columns\n",
154 | "test = \n",
155 | "train = \n",
156 | "\n",
157 | "train.head()"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "id": "8d97cd6a",
163 | "metadata": {},
164 | "source": [
165 | "Let's consider what the observational unit should be. ***Task*** Format the data to have observations in rows and features in columns."
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "id": "fc9f1ecc",
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "#Format the training and testing sets\n",
176 | "train = \n",
177 | "test = \n",
178 | "train.head()"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "id": "ac1f645b",
184 | "metadata": {},
185 | "source": [
186 | "We can also remove the gene bookkeeping data. However, it's probably a good idea to make a copy of the Gene Accession Numbers for later analyses."
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "id": "41caa04e",
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "#Save the Gene Accession Numbers\n",
197 | ""
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "id": "44d5879c",
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "#remove the gene bookkeeping data.\n",
208 | "train = \n",
209 | "test = \n",
210 | "train.head()"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "id": "954102d7",
216 | "metadata": {},
217 | "source": [
218 | "Now let's encode the outcomes for binary classification. We'll use Zeros for the ALL outcomes and Ones for AML. Remember the first 38 patients were partitioned for the training set the remainder are in the testing set."
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "id": "432c0f09",
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "labels = \n",
229 | "labels_train = \n",
230 | "labels_test = "
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "id": "13caf6ea",
236 | "metadata": {},
237 | "source": [
238 | "### Treat missing data\n",
239 | "Before moving on to a table 1. Let's look for and treat any missing data this. Remember to check for values that don't make sense. I think replacing witht the mean value would be a reasonable imputation strategy.***Task*** check for unreasonable and missing values and impute them with the column mean. "
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "id": "ae5e1d94",
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "#remove inf\n",
250 | "train = \n",
251 | "test = \n",
252 | "\n",
253 | "#impute with mean\n",
254 | "train = \n",
255 | "test = "
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "id": "74577963",
261 | "metadata": {},
262 | "source": [
263 | "***Question*** How would you go about visualizing the data we just formatted? Why can't I just make a table 1?"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "id": "653e5c70",
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "#load preprocessing and PCA from sklearn\n",
274 | "\n",
275 | "\n",
276 | "#Do a PCA for all data (this should probably be done only with the training data)\n",
277 | "nPC=10\n",
278 | "pca = \n",
279 | "\n",
280 | "\n",
281 | "#define standard scaler\n",
282 | "scaler = \n",
283 | "\n",
284 | "\n",
285 | "#fit pca on the training data\n",
286 | "X_scaled = \n",
287 | "X_train_pca = \n",
288 | "\n",
289 | "#transform the testing data using the fit scaler and pca objects\n",
290 | "X_test_pca = \n",
291 | "\n",
292 | "print(X_train_pca.shape)"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "id": "fb3d5a51",
298 | "metadata": {},
299 | "source": [
300 | "### Visualize Engineered Features\n",
301 | "Let's plot the feature distributions."
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "id": "b7b796e5",
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "# rescale the engineered features\n",
312 | "\n",
313 | "\n",
314 | "#plot the principle component distributions\n",
315 | "for PC in range(nPC):\n",
316 | " sns.kdeplot(data=X_test_pca[:, PC])\n",
317 | " #X_pca[:,nPC].plot.kde(bw_method='scott') #use bw_method=.02 for a lower bandwidth gaussian representation\n",
318 | " plt.legend([\"PC\" + str(PC)])\n",
319 | " plt.show()"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "id": "db80c954",
325 | "metadata": {},
326 | "source": [
327 | "## Capstone Project\n",
328 | "We have chosen a more [challenging Acute Myeloid Leukemia dataset](https://www.synapse.org/#!Synapse:syn2455683/wiki/64007) to test your ML skills. You will have to request access to this data through the link provided and follow directions to the 'request access' form. Originally DREAM challenge, this dataset contains multiple outcomes such as disease relapse and response to treatment. Both classificationa and regression tasks are possible. The data represents over 200 Leukemia patients and includes highly dimensional gemonics data in addition to clinical covariates. Your challenge is to develop models to address these outcomes completely End-to-End over the next few weeks. We will review your analysis in the final week of the course. We will periodically check in to make sure your analyses are goinig smoothly but for the mean time your analysis should generally follow these beats.\n",
329 | "+ ***Format*** the data with observations in rows and features in columns\n",
330 | "+ ***Manually exclude data*** like book keeping variables\n",
331 | "+ ***Normalize*** Data\n",
332 | "+ ***Treat missing*** Data\n",
333 | "+ Choose and employ a ***Data Partitioning*** scheme\n",
334 | "+ Do ***Feature selection*** for Clinical covariates and ***Dimension Reduction*** for omics data.\n",
335 | "+ ***Choose a model*** with the highest performance potential\n",
336 | "+ ***Tune your model***\n",
337 | "+ Caclulate ***Performance metrics***\n",
338 | "+ Report ***Model Predicitions***\n",
339 | "+ ***Interpret*** results\n"
340 | ]
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "id": "d11b2f87",
345 | "metadata": {},
346 | "source": [
347 | "## Pathway Representation\n",
348 | "PCA is one of the most common dimension reduction schemes, however interpreting the biology from principle components is difficult. The PCs explain fluctuations in the data itself not necessarily the biological relationships at play. It would be better to choose a representation that is more convienient for interpretation in a clinical setting. Let us aggregate the data of related genes into a value that represents the pathways they participate in. Finding the predictive pathways may be more useful to explain the biology, design future invivio models, and drive treatment development.\n",
349 | "\n",
350 | "The goal is to assign a number to each pathway represented in our dataset and use these as the predictive features for each patient. For each pathway we will need to find all the genes in our dataset that participate in this pathway and describe how only these releated genes fluctuate. We'll use a PCA to do this and use the value of the first PC as the value for the pathway feature. We will then normalize this pathway representation data before moving on to feature selection.\n",
351 | "\n",
352 | "### Convert Affymetrix Gene IDs to Kegg IDs\n",
353 | "The source data has genes in the Affymetrix format. We would like to use KEGG IDs because of the convientient pathway analysis python tools KEGG provides.\n",
354 | "\n",
355 | "[g:Convert](http://www.protocol-online.org/cgi-bin/prot/jump.cgi?ID=4145)\n",
356 | " is a gene identifier tool that allows conversion of genes, proteins, microarray probes, standard names, various database identifiers, etc. A mix of IDs of different types may be inserted to g:Convert. The user needs to select a target database; all input IDs will be converted into target database format. Input IDs that have no corresponding entry in target database will be displayed as N/A.\n",
357 | " \n",
358 | "Let's use the g:Convert web tool to convert the affymetrix IDs to KEGG enzyme IDs. Copy and paste the \"Gene Accession Number\" column in the training set CSV file into the Query box in g:Convert. Exclude the column header when you paste. Select \"Homo sapiens (Human)\" in the organism box and select \"KEGG_ENZYME\" in the target namespace box. Press the \"Run query\" button to convert the IDs. When the query finishes you can download the results as a CSV file and save it in the data folder. "
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "id": "87024f24",
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "#Let's look at the g:Convert file\n",
369 | "gprof = pd.read_csv('data/gProfiler_hsapiens.csv')\n",
370 | "gprof.head()"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "id": "f8f651a7",
376 | "metadata": {},
377 | "source": [
378 | "The corresponding gene names can be found in the 'name' column.Notice, that many initial aliases map to the same gene name. For every unique gene name let's get the pathways they participate in. We'll follow closely the [KEGG tutorial](https://bioservices.readthedocs.io/en/master/kegg_tutorial.html) to show us how."
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "id": "8fb24fbb",
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "#install bioservices and import KEGG\n",
389 | "!pip install bioservices\n",
390 | "from bioservices.kegg import KEGG"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": null,
396 | "id": "5bdfc955",
397 | "metadata": {},
398 | "outputs": [],
399 | "source": [
400 | "#start the kegg interface\n",
401 | "ntrface = KEGG() \n",
402 | "\n",
403 | "#Example search for pathways by gene name\n",
404 | "ntrface.get_pathway_by_gene(\"GAPDH\", \"hsa\")"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "id": "1bc89b85",
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "#get unique genes in gprof dataframe\n",
415 | "genes = gprof['name'].unique()\n",
416 | "genes = genes[0:50] #limit to first 50 genes so it runs quickly\n",
417 | "len(genes)"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "id": "a603f0cd",
424 | "metadata": {
425 | "scrolled": false
426 | },
427 | "outputs": [],
428 | "source": [
429 | "%%capture\n",
430 | "#init dictionary to hold pathway to gene mapping\n",
431 | "pathway_dict = {}\n",
432 | "\n",
433 | "#loop over unique genes in the dataset\n",
434 | "for gene in genes:\n",
435 | " #get pathways associated with gene\n",
436 | " \n",
437 | " #loop over associated pathways\n",
438 | " try:\n",
439 | " for pathway, desc in pathways.items():\n",
440 | " #check if pathway is new\n",
441 | " if not (pathway in pathway_dict):\n",
442 | " #append new pathway and init gene list\n",
443 | " \n",
444 | " else:\n",
445 | " # extend gene to the known pathway's list of genes\n",
446 | " \n",
447 | " except AttributeError:\n",
448 | " print(\"No pathways found for \" + str(gene))"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": null,
454 | "id": "3931448d",
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "#display dictionary as pandas Series so we can show the head\n",
459 | "display(pd.Series(pathway_dict).head())"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "id": "69b76dfc",
466 | "metadata": {},
467 | "outputs": [],
468 | "source": [
469 | "#define a PCA object\n",
470 | "nPC=1\n",
471 | "pca = PCA(n_components=nPC, random_state=seed)\n",
472 | "\n",
473 | "#define standard scaler\n",
474 | "scaler = preprocessing.StandardScaler()\n",
475 | "\n",
476 | "#loop over pathways\n",
477 | "for pathway in pathway_dict:\n",
478 | " #get Gene Accession names for every Gene in pathway\n",
479 | " \n",
480 | "\n",
481 | " #select rows representing the genes in the pathway\n",
482 | " \n",
483 | " \n",
484 | " #fit pca on the training data\n",
485 | " \n",
486 | "\n",
487 | " #transform the testing data using the fit scaler and pca objects\n",
488 | " \n"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "id": "8985dcca",
495 | "metadata": {},
496 | "outputs": [],
497 | "source": [
498 | "#display distributions of the pathway values\n",
499 | ""
500 | ]
501 | }
502 | ],
503 | "metadata": {
504 | "kernelspec": {
505 | "display_name": "Python 3 (ipykernel)",
506 | "language": "python",
507 | "name": "python3"
508 | },
509 | "language_info": {
510 | "codemirror_mode": {
511 | "name": "ipython",
512 | "version": 3
513 | },
514 | "file_extension": ".py",
515 | "mimetype": "text/x-python",
516 | "name": "python",
517 | "nbconvert_exporter": "python",
518 | "pygments_lexer": "ipython3",
519 | "version": "3.9.4"
520 | }
521 | },
522 | "nbformat": 4,
523 | "nbformat_minor": 5
524 | }
525 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Biomedical-ML-with-Python
2 | Introductory workshop teaching Python for Data Science and Biomedical Machine Learning
3 |
4 | Please see the [course website](https://www.bigbioinformatics.org/workshops) for details.
5 |
--------------------------------------------------------------------------------
/Syllabus.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigbioinformatics/Biomedical-ML-with-Python/b014b310d93806b93a711227a00f8e4488bedfbb/Syllabus.docx
--------------------------------------------------------------------------------