├── .gitignore
├── 10_python_data_intro.ipynb
├── 11_python_data.ipynb
├── 12_python_data.ipynb
├── 13_python_data.ipynb
├── 20_python_data_intro.ipynb
├── 21_live_python_data.ipynb
├── 21_python_data.ipynb
├── 22_python_data.ipynb
├── 23_live_python_data.ipynb
├── 23_python_data.ipynb
├── 24_project.ipynb
├── 24_python_data.ipynb
├── LICENSE
├── README.md
├── cheat_sheet_basic_python.ipynb
├── contributing.md
├── data.txt
├── data
├── AilMel.gff3
├── GRCh38.gff3
├── GRCm38.gff3
├── GRCz11.gff3
├── gapminder.csv
├── gapminder_gdp_africa.csv
├── gapminder_gdp_americas.csv
├── gapminder_gdp_asia.csv
├── gapminder_gdp_europe.csv
├── gapminder_gdp_oceania.csv
├── genes.txt
├── genes_withstrand.txt
├── glpa.fa
├── mydata.txt
└── sample.fa
├── dict_data.txt
├── img
├── mind_maps.key
├── mind_maps
│ ├── mind_maps.001.jpeg
│ ├── mind_maps.002.jpeg
│ ├── mind_maps.003.jpeg
│ └── mind_maps.004.jpeg
└── python_shell.png
├── install
├── 2to3_nb.py
├── Dockerfile
└── vbox_installer.sh
├── my_first_module.py
├── programming.txt
├── resources.ipynb
├── scripts
├── getIDs.py
└── hello.py
└── solutions
├── 24_solution.ipynb
├── ex_11_3.ipynb
├── ex_12_1.ipynb
├── ex_12_2.ipynb
├── ex_12_3.ipynb
├── ex_13_1.ipynb
├── ex_13_2.ipynb
├── ex_13_3.ipynb
├── ex_21_1.ipynb
├── ex_22_1.ipynb
├── ex_23_1.ipynb
└── gapminder.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | .idea
7 | .ipynb_checkpoints
8 |
9 | .DS_Store
10 |
11 | venv
12 |
13 | biopython.fa
14 |
15 | csvdata.tsv
16 |
17 | csvdictdata.tsv
18 |
19 | data/mydata.csv
20 |
21 | gene_lengths_csv.tsv
22 |
23 | gene_lengths.tsv
24 |
25 | out.txt
26 |
27 | sample.long.fa
28 |
29 | mySeqFile.fa
30 |
31 | # C extensions
32 | *.so
33 |
34 | # Distribution / packaging
35 | .Python
36 | env/
37 | build/
38 | develop-eggs/
39 | dist/
40 | downloads/
41 | eggs/
42 | .eggs/
43 | lib/
44 | lib64/
45 | parts/
46 | sdist/
47 | var/
48 | *.egg-info/
49 | .installed.cfg
50 | *.egg
51 |
52 | # PyInstaller
53 | # Usually these files are written by a python script from a template
54 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
55 | *.manifest
56 | *.spec
57 |
58 | # Installer logs
59 | pip-log.txt
60 | pip-delete-this-directory.txt
61 |
62 | # Unit test / coverage reports
63 | htmlcov/
64 | .tox/
65 | .coverage
66 | .coverage.*
67 | .cache
68 | nosetests.xml
69 | coverage.xml
70 | *,cover
71 | .hypothesis/
72 |
73 | # Translations
74 | *.mo
75 | *.pot
76 |
77 | # Django stuff:
78 | *.log
79 | local_settings.py
80 |
81 | # Flask stuff:
82 | instance/
83 | .webassets-cache
84 |
85 | # Scrapy stuff:
86 | .scrapy
87 |
88 | # Sphinx documentation
89 | docs/_build/
90 |
91 | # PyBuilder
92 | target/
93 |
94 | # IPython Notebook
95 | .ipynb_checkpoints
96 |
97 | # pyenv
98 | .python-version
99 |
100 | # celery beat schedule file
101 | celerybeat-schedule
102 |
103 | # dotenv
104 | .env
105 |
106 | # virtualenv
107 | venv/
108 | ENV/
109 |
110 | # Spyder project settings
111 | .spyderproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | gene_lengths_csv_module.txt
117 |
118 | gene_lengths.txt
119 |
120 | long_sequences.fa
121 |
--------------------------------------------------------------------------------
/11_python_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "-"
8 | }
9 | },
10 | "source": [
11 | "# Data science in Python\n",
12 | "\n",
13 | "- Course GitHub repo: https://github.com/pycam/python-data-science\n",
14 | "- Python website: https://www.python.org/ \n",
15 | "\n",
16 | "## Session 1.1: Starting with data and Python\n",
17 | "- [Jupyter notebook](#Jupyter-notebook)\n",
18 | " - [Exercise 1.1.1](#Exercise-1.1.1)\n",
19 | "- [Shell commands](#Shell-commands)\n",
20 | " - [Exercise 1.1.2](#Exercise-1.1.2)\n",
21 | "- [Basic Python](#Basic-Python)\n",
22 | " - [Exercise 1.1.3](#Exercise-1.1.3)"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Mind map\n",
30 | "\n",
31 | "
"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "## Jupyter notebook\n",
39 | "\n",
40 | "
\n",
41 | "\n",
42 | "- The [Jupyter Notebook](http://jupyter.org/) is a web application that allows you to create and share documents that contain live code, equations, visualizations and explanatory text. \n",
43 | "\n",
44 | "- Jupyter provides a rich architecture for interactive data science and scientific computing with: \n",
45 | " - Over 40 programming languages such as Python, R, Julia and Scala.\n",
46 | " - A browser-based notebook with support for code, rich text, math expressions, plots and other rich media.\n",
47 | " - Support for interactive data visualization."
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### How to install Jupyter on your own computer?\n",
55 | "\n",
56 | "- We recommend using a virtual environment after having installed [Python 3](https://www.python.org/) on your computer\n",
57 | "```bash\n",
58 | "python3 -m venv venv\n",
59 | "source venv/bin/activate # activate your virtual environment\n",
60 | "```\n",
61 | "- Install Jupyter:\n",
62 | "```\n",
63 | "pip install jupyter\n",
64 | "```\n",
65 | "- Start the notebook server from the command line:\n",
66 | "```\n",
67 | "jupyter notebook\n",
68 | "```\n",
69 | "- You should see the notebook home page open in your web browser."
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "### How to run Python in a Jupyter notebook?\n",
77 | "\n",
78 | "- See [Jupyter Notebook Basics](http://nbviewer.jupyter.org/github/jupyter/notebook/blob/master/docs/source/examples/Notebook/Notebook%20Basics.ipynb)"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "## Exercise 1.1.1\n",
86 | "- Create a new Jupyter notebook with one Markdown cell and one Python Code cell. Run the code.\n",
87 | "- Download the Python file associated with the newly created notebook"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "## Shell commands \n",
95 | "\n",
96 | "- Three commands\n",
97 | " - `pwd` to print working directory\n",
98 | " - `ls` to list content of a directory\n",
99 | " - `cd` to change directory"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "### Run the Python interpreter\n",
107 | "\n",
108 | "On a Mac or Linux machine you first have to open a terminal and then type the command `python3`.\n",
109 | "![Python shell](img/python_shell.png)"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "### Run Python code from a file\n",
117 | "\n",
118 | "For running Python code from a file, open a Terminal window and type the command `python3` or just `python` followed by the name of the script or file that contains Python code.\n",
119 | "\n",
120 | "```\n",
121 | "python3 scripts/hello.py\n",
122 | "```\n",
123 | "\n",
124 | "Please make sure that you are running version 3 of Python:\n",
125 | "```\n",
126 | "python --version\n",
127 | "```"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "\n",
135 | "## Exercise 1.1.2\n",
136 | "- Find in a terminal window the Python script downloaded from Jupyter notebook and execute it.\n",
137 | "```\n",
138 | "python3 my-script.py\n",
139 | "```\n",
140 | "- List all data files in `data/` folder from the course materials and find our first data file `gapminder.csv`."
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "## Basic Python\n",
148 | "\n",
149 | "### Cheat Sheet\n",
150 | "\n",
151 | "- [Cheat Sheet](cheat_sheet_basic_python.ipynb)"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "### For loops\n",
159 | "\n",
160 | "The **`for` loop** in Python iterates over each item in a collection (such as a list) in the order that they appear in the collection. What this means is that a variable (`colour` in the below example) is set to each item from the collection of values in turn, and each time this happens the indented block of code is executed again."
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "all_colours = ['red', 'blue', 'green']\n",
170 | "for colour in all_colours:\n",
171 | " print(colour)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "### Files\n",
179 | "\n",
180 | "To read from a file, your program needs to open the file and then read the contents of the file. You can read the entire contents of the file at once, or read the file line by line. The **`with`** statement makes sure the file is closed properly when the program has finished accessing the file.\n",
181 | "\n",
182 | "\n",
183 | "Passing the `'w'` argument to `open()` tells Python you want to write to the file. Be careful; this will erase the contents of the file if it already exists. Passing the `'a'` argument tells Python you want to append to the end of an existing file."
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "# reading from file\n",
193 | "with open(\"data/genes.txt\") as f:\n",
194 | " for line in f:\n",
195 | " print(line.strip())"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "# printing only the gene name and the chromosome columns\n",
205 | "with open(\"data/genes.txt\") as f:\n",
206 | " for line in f:\n",
207 | " data = line.strip().split()\n",
208 | " print(data[0], data[1])"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "### Conditional execution\n",
216 | "\n",
217 | "A conditional **`if/elif`** statement is used to specify that some block of code should only be executed if a conditional expression evaluates to `True`. There can be a final **`else`** statement to do something if all of the conditions are `False`.\n",
218 | "Python uses **indentation** to show which statements are in a block of code. "
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "# printing only the gene name and its position for chromosome 6\n",
228 | "with open(\"data/genes.txt\") as f:\n",
229 | " for line in f:\n",
230 | " data = line.strip().split()\n",
231 | " if data[1] == '6':\n",
232 | " print(data[0], data[2], data[3])"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### Getting help\n",
240 | "\n",
241 | "[The Python 3 Standard Library](https://docs.python.org/3/library/index.html) is the reference documentation of all libraries included in Python as well as built-in functions and data types.\n",
242 | "\n",
243 | "For example, to get help for the `split()` function, you can look at the [Python documentation](https://docs.python.org/3/library/index.html) and search for [`str.split()`](https://docs.python.org/3/library/stdtypes.html?highlight=split#str.split).\n",
244 | "\n",
245 | "The Basic Python [Cheat Sheet](cheat_sheet_basic_python.ipynb) is a quick summary based on the course ['Introduction to solving biological problems with Python'](http://pycam.github.io/)."
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "Every Python object from the standard library should have documentation (a *docstring*) available that can be viewed in an interactive Python session. Well developed 3rd party packages will have this too, but it depends on the developers.\n",
253 | "\n",
254 | "**Shift+Tab** will pop up a little window giving this information on the object under the cursor. This is good for getting a quick reminder about how a function works, for example.\n",
255 | "\n",
256 | "To print the full docstring:"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "# Works in any python instance\n",
266 | "help(len)"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "# Works in Jupyter only, but has better formatting\n",
276 | "len?"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "*Any* object may have a docstring."
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "# Methods\n",
293 | "str.split?"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "# variables\n",
303 | "x = 123\n",
304 | "x?"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "## Exercise 1.1.3\n",
312 | "\n",
313 | "We are going to look at a [Gapminder](https://www.gapminder.org/) dataset, made famous by Hans Rosling from his Ted presentation [‘The best stats you’ve ever seen’](http://www.ted.com/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen).\n",
314 | "\n",
315 | "- Read data from the file `data/gapminder.csv`.\n",
316 | "- Find which European countries have the largest population in 1957 and 2007."
317 | ]
318 | },
319 | {
320 | "cell_type": "markdown",
321 | "metadata": {
322 | "slideshow": {
323 | "slide_type": "-"
324 | }
325 | },
326 | "source": [
327 | "## Next session\n",
328 | "\n",
329 | "Go to our next notebook: [Session 1.2: Using existing python modules to explore data in files](12_python_data.ipynb)"
330 | ]
331 | }
332 | ],
333 | "metadata": {
334 | "celltoolbar": "Slideshow",
335 | "kernelspec": {
336 | "display_name": "Python 3",
337 | "language": "python",
338 | "name": "python3"
339 | },
340 | "language_info": {
341 | "codemirror_mode": {
342 | "name": "ipython",
343 | "version": 3
344 | },
345 | "file_extension": ".py",
346 | "mimetype": "text/x-python",
347 | "name": "python",
348 | "nbconvert_exporter": "python",
349 | "pygments_lexer": "ipython3",
350 | "version": "3.7.5"
351 | }
352 | },
353 | "nbformat": 4,
354 | "nbformat_minor": 4
355 | }
356 |
--------------------------------------------------------------------------------
/12_python_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data science in Python\n",
8 | "\n",
9 | "- Course GitHub repo: https://github.com/pycam/python-data-science\n",
10 | "- Python website: https://www.python.org/ \n",
11 | "\n",
12 | "## Session 1.2: Using existing python modules to explore data in files\n",
13 | "\n",
14 | "- [Importing module `statistics`](#Importing-module-statistics)\n",
15 | " - [Exercise 1.2.1](#Exercise-1.2.1)\n",
16 | "- [Python file and directory manipulations](#Python-file-and-directory-manipulations)\n",
17 | " - [Exercise 1.2.2](#Exercise-1.2.2)\n",
18 | "- [Using the `csv` module](#Using-the-csv-module)\n",
19 | " - [Exercise 1.2.3](#Exercise-1.2.3)"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Mind map\n",
27 | "\n",
28 | "
"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Importing module `statistics`"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Like other languages, Python has the ability to import external modules (or libraries) into the current program. These modules may be part of the standard library that is automatically included with the Python installation, they may be extra libraries which you install separately or they may be other Python programs you have written yourself. Whatever the source of the module, they are imported into a program via an **`import`** command.\n",
43 | "\n",
44 | "For example, if we wish to access the `mean()` and `median()` functions in Python, we can use the **`import`** keyword to get [the module named `statistics`](https://docs.python.org/3/library/statistics.html) and access its contents with the dot notation:"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import statistics\n",
54 | "statistics.mean([1, 2, 3, 4, 4])"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "Also we can use the `as` keyword to give the module a different name in our code, which can be useful for brevity and avoiding name conflicts:"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "import statistics as stats\n",
71 | "stats.mean([1, 2, 3, 4, 4])"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "Alternatively we can import the separate components using the `from … import` keyword combination:"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "from statistics import mean, median\n",
88 | "\n",
89 | "mean([1, 2, 3, 4, 4])"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "### Listing module contents\n",
97 | "\n",
98 | "Using the [function `dir()`](https://docs.python.org/3/library/functions.html?highlight=dir#dir) and passing the module name:"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "import statistics\n",
108 | "dir(statistics)"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "### Getting help directly from Jupyter notebook\n"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "statistics?"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "## Exercise 1.2.1\n",
132 | "\n",
133 | "- Calculate the average GDP per capita per country in Europe in 1962, its median and standard deviation using `data/gapminder.csv` data; and compare these figures with those from Americas."
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": []
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "## Python file and directory manipulations\n",
148 | "\n",
149 | "The two modules `os.path` and `os` implement useful functions for manipulating pathnames and for accessing the filesystem. To read or write files, we use `open()`. "
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "### [`os.path` — Common pathname manipulations](https://docs.python.org/3/library/os.path.html)\n",
157 | "\n",
158 | "- `join(*paths)` : joins the paths together into one long path\n",
159 | "- `exists(path)` : returns whether path exists\n",
160 | "- `isfile(path)` : returns whether path is a “regular” file (as opposed to a directory)\n",
161 | "- `isdir(path)` : returns whether path is a directory\n",
162 | "- `dirname(path)` : returns directory containing the path\n",
163 | "- `basename(path)` : returns the path minus the dirname(path) in front\n",
164 | "- `split(path)` : returns (dirname(path), basename(path))\n",
165 | "\n",
166 | "### [`os` — Miscellaneous operating system interfaces](https://docs.python.org/3/library/os.html)\n",
167 | "\n",
168 | "- `listdir(path)` : returns a list of files/directories in the directory path"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "Building the path to your file from a list of directory and file names makes your script able to run on any platform."
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "import os.path\n",
185 | "data_filepath = os.path.join(\"data\", \"gapminder.csv\")\n",
186 | "# data/gapminder.csv - Unix\n",
187 | "# data\\gapminder.csv - Windows\n",
188 | "print(data_filepath)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "Checking if a file exists before opening it:"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "os.path.exists(data_filepath)"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "Checking if it is a file:"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "os.path.isfile(data_filepath)"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {},
226 | "source": [
227 | "or a directory:"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "os.path.isdir(data_filepath)"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "Extracting the directory of the file path:"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "data_dirname = os.path.dirname(data_filepath)\n",
253 | "print(data_dirname)"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "Checking if it is a directory:"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "os.path.isdir(data_dirname)"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "Extracting the file name from the file path:"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "data_filename = os.path.basename(data_filepath)\n",
286 | "print(data_filename)"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "Getting the directory and the file name from the file path using `os.path.split()`, which returns two values, the directory and the file name:"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "data_dirname, data_filename = os.path.split(data_filepath)\n",
303 | "print(data_dirname, data_filename)"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "Listing the content of a directory using `os.listdir()` is equivalent to `ls` in the shell:"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "import os\n",
320 | "print(os.listdir(data_dirname))"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "## Exercise 1.2.2\n",
328 | "\n",
329 | "- Print the name of each `.csv` file in the `data` folder.\n",
330 | "- Print the first 3 lines from each `.csv` file in the `data` folder.\n",
331 | "\n",
332 | "You may wish to use the [`enumerate`](https://docs.python.org/3/library/functions.html?highlight=enumerate#enumerate) function, along with a `break` statement to avoid printing every line in the file."
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": [
341 | "for i, letter in enumerate(['A', 'B', 'C']):\n",
342 | " print(i, letter)"
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {},
348 | "source": [
349 | "## Using the `csv` module\n",
350 | "\n",
351 | "The so-called CSV (Comma Separated Values) format is the most common import and export format for spreadsheets and databases. The `csv` module implements methods to read and write tabular data in CSV format.\n",
352 | "\n",
353 | "The csv module’s `reader()` and `writer()` methods read and write CSV files. You can also read and write data into dictionary form using the `DictReader()` and `DictWriter()` methods.\n",
354 | "\n",
355 | "For more information about this built-in Python library go to [CSV File Reading and Writing documentation](https://docs.python.org/3/library/csv.html).\n",
356 | "\n",
357 | "Let's now read our `data/genes.txt` tab separated file using the `csv` module into a dictionary based on the column headers using `csv.DictReader()`.\n",
358 | "\n",
359 | "|gene |\tchrom |\tstart |\tend |\n",
360 | "|-- | -- | -- | -- | \n",
361 | "|BRCA2 |\t13 |\t32889611 |\t32973805 |\n",
362 | "|TNFAIP3 |\t6 |\t138188351 |\t138204449 |\n",
363 | "|TCF7 |\t5 |\t133450402 |\t133487556 |"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "First, import the `csv` module:"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {},
377 | "outputs": [],
378 | "source": [
379 | "import csv"
380 | ]
381 | },
382 | {
383 | "cell_type": "markdown",
384 | "metadata": {},
385 | "source": [
386 | "Read the data and store each dictionary into a list. Note that `DictReader()` returns an [ordered dictionary](https://docs.python.org/3/library/collections.html#ordereddict-objects).\n",
387 | "\n",
388 | "Ordered dictionaries are like regular dictionaries but they remember the order that items were inserted. When iterating over an ordered dictionary, the items are returned in the order their keys were first added."
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "data = []\n",
398 | "with open(\"data/genes.txt\") as f:\n",
399 | " reader = csv.DictReader(f, delimiter = \"\\t\")\n",
400 | " for row in reader: \n",
401 | " data.append(row)\n",
402 | "data"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": null,
408 | "metadata": {},
409 | "outputs": [],
410 | "source": [
411 | "for d in data:\n",
412 | " print(d['chrom'], d['gene'], d['start'], d['end'])"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "`data` is a list of ordered dictionaries, each representing one row of the data file:"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "metadata": {},
426 | "outputs": [],
427 | "source": [
428 | "# accessing first dictionary from the list\n",
429 | "print(data[0])\n",
430 | "\n",
431 | "# printing its keys\n",
432 | "print(data[0].keys())\n",
433 | "\n",
434 | "# its values\n",
435 | "print(data[0].values())\n",
436 | "\n",
437 | "# the value associated with the key 'gene'\n",
438 | "print(data[0]['gene'])"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "metadata": {},
445 | "outputs": [],
446 | "source": [
447 | "# looping over the list to print each gene\n",
448 | "for d in data:\n",
449 | " print(d['gene'])"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "# calculating the length of each gene and adding its value into the dictionary\n",
459 | "for d in data:\n",
460 | " d['len'] = int(d['end']) - int(d['start']) + 1\n",
461 | " print(d)"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {},
467 | "source": [
468 | "The main advantage of using the `DictReader()` method and the `csv` module is to write code that is easier to read and more flexible. Using the name of the column instead of its index makes the code more meaningful to read, and reading comma or tab separated files this way gives you the flexibility to add columns and change their order without having to modify your code.\n",
469 | "\n",
470 | "Let's have a look now at the file `data/genes_withstrand.txt` and spot the differences with `data/genes.txt`. Even though columns `chrom` and `gene` have been swapped and column `strand` added, the code written previously is still working."
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "data_withstrand = []\n",
480 | "with open(\"data/genes_withstrand.txt\") as f:\n",
481 | " reader = csv.DictReader(f, delimiter = \"\\t\")\n",
482 | " for row in reader:\n",
483 | " print(row)\n",
484 | " data_withstrand.append(row)\n",
485 | "\n",
486 | "for d in data_withstrand:\n",
487 | " print(d['chrom'], d['gene'], d['start'], d['end'])"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": null,
493 | "metadata": {},
494 | "outputs": [],
495 | "source": [
496 | "# Write a delimited file using the csv module from a list of dictionaries \n",
497 | "with open(\"gene_lengths.txt\", \"w\") as f:\n",
498 | " writer = csv.DictWriter(f, data[0].keys(), delimiter='\\t')\n",
499 | " writer.writeheader() # write header\n",
500 | "\n",
501 | " for d in data:\n",
502 | " writer.writerow(d) # write row\n",
503 | "\n",
504 | "# Open the output file and print out its content\n",
505 | "with open(\"gene_lengths.txt\") as f:\n",
506 | " for line in f:\n",
507 | " print(line.strip())"
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "## Getting help from the official Python documentation\n",
515 | "\n",
516 | "The most useful information is online on the https://www.python.org/ website and should be used as a reference guide.\n",
517 | "\n",
518 | "- [Python3 documentation](https://docs.python.org/3/) is the starting page with links to tutorials and libraries' documentation for Python 3\n",
519 | " - [The Python Tutorial](https://docs.python.org/3/tutorial/index.html)\n",
520 | " - [Modules](https://docs.python.org/3/tutorial/modules.html)\n",
521 | " - [Brief Tour of the Standard Library: Mathematics](https://docs.python.org/3/tutorial/stdlib.html#mathematics)\n",
522 | " - [The Python Standard Library Reference](https://docs.python.org/3/library/index.html) is the reference documentation of all libraries included in Python like:\n",
523 | " - [`statistics` - Mathematical statistics functions](https://docs.python.org/3/library/statistics.html)\n",
524 | " - [`os.path` — Common pathname manipulations](https://docs.python.org/3/library/os.path.html)\n",
525 | " - [`os` — Miscellaneous operating system interfaces](https://docs.python.org/3/library/os.html)\n",
526 | " - [`csv` — CSV File Reading and Writing](https://docs.python.org/3/library/csv.html)"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "## Exercise 1.2.3\n",
534 | "\n",
535 | "- Change the script you wrote for [Exercise 1.2.1](#Exercise-1.2.1) to make use of the `csv` module to calculate the average GDP per capita per country in Europe in 1962, its median and standard deviation using `data/gapminder.csv` data; and compare these figures with those from Americas."
536 | ]
537 | },
538 | {
539 | "cell_type": "markdown",
540 | "metadata": {},
541 | "source": [
542 | "## Next session\n",
543 | "\n",
544 | "Go to our next notebook: [Session 1.3: Creating functions and modules to write reusable code](13_python_data.ipynb)"
545 | ]
546 | }
547 | ],
548 | "metadata": {
549 | "kernelspec": {
550 | "display_name": "Python 3",
551 | "language": "python",
552 | "name": "python3"
553 | },
554 | "language_info": {
555 | "codemirror_mode": {
556 | "name": "ipython",
557 | "version": 3
558 | },
559 | "file_extension": ".py",
560 | "mimetype": "text/x-python",
561 | "name": "python",
562 | "nbconvert_exporter": "python",
563 | "pygments_lexer": "ipython3",
564 | "version": "3.7.4"
565 | }
566 | },
567 | "nbformat": 4,
568 | "nbformat_minor": 4
569 | }
570 |
--------------------------------------------------------------------------------
/20_python_data_intro.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "nbpresent": {
7 | "id": "dc7a1635-0bbd-4bf7-a07e-7a36f58e258b"
8 | },
9 | "slideshow": {
10 | "slide_type": "slide"
11 | }
12 | },
13 | "source": [
14 | "# Data science in Python\n",
15 | "\n",
16 | "
"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "slideshow": {
23 | "slide_type": "slide"
24 | }
25 | },
26 | "source": [
27 | "## Aims\n",
28 | "\n",
29 | "This course will cover concepts and strategies for working with data **more effectively** in Python with the aim of: \n",
30 | "\n",
31 | "- Writing **reusable** code, using **functions** and **libraries**\n",
32 | "- Acquiring a working knowledge of **key concepts** which are prerequisites for advanced programming in Python like writing classes to build objects\n",
33 | "- During this course you will learn about:\n",
34 | " - Manipulating data using Pandas\n",
35 | " - Visualising data with Matplotlib \n",
36 | " - Working with biological data using BioPython"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {
42 | "nbpresent": {
43 | "id": "21082cb9-e1b9-4fe9-80d5-9d9e8418937b"
44 | },
45 | "slideshow": {
46 | "slide_type": "slide"
47 | }
48 | },
49 | "source": [
50 | "## Learning objectives\n",
51 | "- **Use** third-party Python libraries: Pandas, Matplotlib & BioPython\n",
52 | "- **Solve** more complex exercises using these concepts"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "nbpresent": {
59 | "id": "ceb5f5a0-a5e8-435e-ae16-23c2ba8c6ab2"
60 | },
61 | "slideshow": {
62 | "slide_type": "slide"
63 | }
64 | },
65 | "source": [
66 | "## Course schedule - day 2\n",
67 | "\n",
68 | "\n",
69 | "- 09:30-09:45: [0:15] **Introduction**\n",
70 | "- 09:45-11:00: [1:15] **Session 2.1** - Manipulating data with Pandas\n",
71 | "- 11:00-11:15: [0:15] *break*\n",
72 | "- 11:15-12:30: [1:15] **Session 2.2** - Visualising data with Matplotlib\n",
73 | "- 12:30-13:30: [1:00] *lunch break*\n",
74 | "- 13:30-14:45: [1:15] **Session 2.3** - Working with biological data using BioPython\n",
75 | "- 14:45-15:00: [0:15] *break*\n",
76 | "- 15:00-16:15: [1:15] **Session 2.4** - Creating a data project report using Jupyter\n",
77 | "- 16:15-16:30: [0:15] **Conclusion**"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {
83 | "nbpresent": {
84 | "id": "8458de53-35b5-405e-a372-5db5d2e2c2c5"
85 | },
86 | "slideshow": {
87 | "slide_type": "slide"
88 | }
89 | },
90 | "source": [
91 | "## Course materials\n",
92 | "\n",
93 | "- The course materials are accessible on GitHub:\n",
94 | " - https://github.com/pycam/python-data-science\n",
95 | "- We’d like you to follow along with the example code as we go through the material, and attempt the exercises to practice what you’ve learned\n",
96 | "- If you have specific projects/problems that you think could be attempted using Python, we are happy to (try to) help during the exercises. Just let us know!"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "## Feedback\n",
104 | "\n",
105 | "- Questions are welcome at any point!\n",
106 | "- We use a two colour post-it system: \n",
107 | " - **green** when exercise is done; and \n",
108 | " - **red/pink** when you need help"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {
114 | "nbpresent": {
115 | "id": "0e25dad3-add0-466e-8f71-e771d6ec4500"
116 | },
117 | "slideshow": {
118 | "slide_type": "slide"
119 | }
120 | },
121 | "source": [
122 | "## Next session\n",
123 | "\n",
124 | "Go to our next notebook: [Session 2.1: Data manipulation with Pandas](21_python_data.ipynb)"
125 | ]
126 | }
127 | ],
128 | "metadata": {
129 | "anaconda-cloud": {},
130 | "celltoolbar": "Slideshow",
131 | "kernelspec": {
132 | "display_name": "Python 3",
133 | "language": "python",
134 | "name": "python3"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 3
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython3",
146 | "version": "3.6.4"
147 | },
148 | "nbpresent": {
149 | "slides": {
150 | "152c5a3b-78f9-4183-bce2-379a4012baf6": {
151 | "id": "152c5a3b-78f9-4183-bce2-379a4012baf6",
152 | "layout": "grid",
153 | "prev": "5613e857-5b4e-42e4-9feb-df0440592ca2",
154 | "regions": {
155 | "20d6059c-7745-410d-a5fb-0b91cacbc2e2": {
156 | "attrs": {
157 | "height": 0.6666666666666666,
158 | "pad": 0.01,
159 | "treemap:weight": 1,
160 | "width": 0.5,
161 | "x": 0,
162 | "y": 0
163 | },
164 | "id": "20d6059c-7745-410d-a5fb-0b91cacbc2e2"
165 | },
166 | "300e6ccd-ecf4-425e-8574-3debe305aafb": {
167 | "attrs": {
168 | "height": 0.3333333333333333,
169 | "pad": 0.01,
170 | "treemap:weight": 1,
171 | "width": 1,
172 | "x": 0,
173 | "y": 0.6666666666666666
174 | },
175 | "content": {
176 | "cell": "9814e8d7-60e0-43e6-aee0-3c33cc2cc809",
177 | "part": "whole"
178 | },
179 | "id": "300e6ccd-ecf4-425e-8574-3debe305aafb"
180 | },
181 | "df2dd6ff-570b-4b75-9cb7-1ff1dbdd4f55": {
182 | "attrs": {
183 | "height": 0.6666666666666666,
184 | "pad": 0.01,
185 | "treemap:weight": 1,
186 | "width": 0.5,
187 | "x": 0.5,
188 | "y": 0
189 | },
190 | "id": "df2dd6ff-570b-4b75-9cb7-1ff1dbdd4f55"
191 | }
192 | }
193 | },
194 | "2586ca7d-5091-40ea-b566-ccc5fbf833c6": {
195 | "id": "2586ca7d-5091-40ea-b566-ccc5fbf833c6",
196 | "prev": "f001d476-5814-4664-a722-f04f5d23cd52",
197 | "regions": {
198 | "d6011048-43db-4990-a82e-768683aa4fe5": {
199 | "attrs": {
200 | "height": 0.8,
201 | "width": 0.8,
202 | "x": 0.1,
203 | "y": 0.1
204 | },
205 | "content": {
206 | "cell": "ceb5f5a0-a5e8-435e-ae16-23c2ba8c6ab2",
207 | "part": "whole"
208 | },
209 | "id": "d6011048-43db-4990-a82e-768683aa4fe5"
210 | }
211 | }
212 | },
213 | "27ee4130-d0bb-4287-b8fe-75a7b0ecf178": {
214 | "id": "27ee4130-d0bb-4287-b8fe-75a7b0ecf178",
215 | "prev": "2586ca7d-5091-40ea-b566-ccc5fbf833c6",
216 | "regions": {
217 | "7a689d66-0c9d-4492-928b-f35bfd2ffc4c": {
218 | "attrs": {
219 | "height": 0.8,
220 | "width": 0.8,
221 | "x": 0.1,
222 | "y": 0.1
223 | },
224 | "content": {
225 | "cell": "e6c2e441-eb7b-4a4c-9c9c-b88cc9a2527f",
226 | "part": "whole"
227 | },
228 | "id": "7a689d66-0c9d-4492-928b-f35bfd2ffc4c"
229 | }
230 | }
231 | },
232 | "2de0c027-7a07-4f7e-8594-a98d36125372": {
233 | "id": "2de0c027-7a07-4f7e-8594-a98d36125372",
234 | "prev": "75e76bd9-24ae-4c42-b6bc-5f58a0550ba8",
235 | "regions": {
236 | "868fd842-e6fb-48b2-9ac5-95e8fe20927e": {
237 | "attrs": {
238 | "height": 0.8,
239 | "width": 0.8,
240 | "x": 0.1,
241 | "y": 0.1
242 | },
243 | "content": {
244 | "cell": "0e25dad3-add0-466e-8f71-e771d6ec4500",
245 | "part": "whole"
246 | },
247 | "id": "868fd842-e6fb-48b2-9ac5-95e8fe20927e"
248 | }
249 | }
250 | },
251 | "5613e857-5b4e-42e4-9feb-df0440592ca2": {
252 | "id": "5613e857-5b4e-42e4-9feb-df0440592ca2",
253 | "prev": "564dae42-4185-46c1-b156-e503f475e25c",
254 | "regions": {
255 | "17e888b0-050b-406a-a5a3-0d5c1605b8df": {
256 | "attrs": {
257 | "height": 0.8,
258 | "width": 0.8,
259 | "x": 0.1,
260 | "y": 0.1
261 | },
262 | "content": {
263 | "cell": "f5bcbcb5-4352-4674-a7b6-c8e576220422",
264 | "part": "whole"
265 | },
266 | "id": "17e888b0-050b-406a-a5a3-0d5c1605b8df"
267 | }
268 | }
269 | },
270 | "564dae42-4185-46c1-b156-e503f475e25c": {
271 | "id": "564dae42-4185-46c1-b156-e503f475e25c",
272 | "prev": "ba285213-f645-4314-afd5-0a656fa35631",
273 | "regions": {
274 | "328d4d72-cd9e-4e5b-aaa8-175833f5bfdb": {
275 | "attrs": {
276 | "height": 0.8,
277 | "width": 0.8,
278 | "x": 0.1,
279 | "y": 0.1
280 | },
281 | "content": {
282 | "cell": "8a4ac456-6c4b-4249-8662-b1cabfd7cee4",
283 | "part": "whole"
284 | },
285 | "id": "328d4d72-cd9e-4e5b-aaa8-175833f5bfdb"
286 | }
287 | }
288 | },
289 | "6ff94ac3-8ded-442e-ae43-aa0a5c14d468": {
290 | "id": "6ff94ac3-8ded-442e-ae43-aa0a5c14d468",
291 | "prev": "27ee4130-d0bb-4287-b8fe-75a7b0ecf178",
292 | "regions": {
293 | "ad759b3a-6080-4356-a9fd-87f2b1b90bc2": {
294 | "attrs": {
295 | "height": 0.8,
296 | "width": 0.8,
297 | "x": 0.1,
298 | "y": 0.1
299 | },
300 | "content": {
301 | "cell": "8458de53-35b5-405e-a372-5db5d2e2c2c5",
302 | "part": "whole"
303 | },
304 | "id": "ad759b3a-6080-4356-a9fd-87f2b1b90bc2"
305 | }
306 | }
307 | },
308 | "75e76bd9-24ae-4c42-b6bc-5f58a0550ba8": {
309 | "id": "75e76bd9-24ae-4c42-b6bc-5f58a0550ba8",
310 | "prev": "152c5a3b-78f9-4183-bce2-379a4012baf6",
311 | "regions": {
312 | "4afd3b41-071f-44eb-a8f6-9a7f780041c2": {
313 | "attrs": {
314 | "height": 0.8,
315 | "width": 0.8,
316 | "x": 0.1,
317 | "y": 0.1
318 | },
319 | "content": {
320 | "cell": "62fdd00c-a006-4f11-b9dc-e2ca072225d7",
321 | "part": "whole"
322 | },
323 | "id": "4afd3b41-071f-44eb-a8f6-9a7f780041c2"
324 | }
325 | }
326 | },
327 | "8c46fa2c-d5dc-4ef7-8d99-f504e2c3a4a1": {
328 | "id": "8c46fa2c-d5dc-4ef7-8d99-f504e2c3a4a1",
329 | "prev": "e2f5626f-0d60-47cb-967f-0edababb0329",
330 | "regions": {
331 | "af33776f-ec36-45be-a627-39573a78b1d6": {
332 | "attrs": {
333 | "height": 0.8,
334 | "width": 0.8,
335 | "x": 0.1,
336 | "y": 0.1
337 | },
338 | "content": {
339 | "cell": "0d61b4b4-163f-47fe-80f1-092287218273",
340 | "part": "whole"
341 | },
342 | "id": "af33776f-ec36-45be-a627-39573a78b1d6"
343 | }
344 | }
345 | },
346 | "ae3f4c01-80dc-4add-889a-05c74f7155a5": {
347 | "id": "ae3f4c01-80dc-4add-889a-05c74f7155a5",
348 | "prev": "6ff94ac3-8ded-442e-ae43-aa0a5c14d468",
349 | "regions": {
350 | "15f00a98-7b04-439d-996d-851b773b060a": {
351 | "attrs": {
352 | "height": 0.8,
353 | "width": 0.8,
354 | "x": 0.1,
355 | "y": 0.1
356 | },
357 | "content": {
358 | "cell": "96ca5c44-2cfc-471c-8da7-39870c822e20",
359 | "part": "whole"
360 | },
361 | "id": "15f00a98-7b04-439d-996d-851b773b060a"
362 | }
363 | }
364 | },
365 | "ba285213-f645-4314-afd5-0a656fa35631": {
366 | "id": "ba285213-f645-4314-afd5-0a656fa35631",
367 | "prev": "8c46fa2c-d5dc-4ef7-8d99-f504e2c3a4a1",
368 | "regions": {
369 | "6cddb9f2-8e39-4010-8fab-3e70b3a8993f": {
370 | "attrs": {
371 | "height": 0.8,
372 | "width": 0.8,
373 | "x": 0.1,
374 | "y": 0.1
375 | },
376 | "content": {
377 | "cell": "b878a4f9-4345-4abb-81f4-5a731c639ab8",
378 | "part": "whole"
379 | },
380 | "id": "6cddb9f2-8e39-4010-8fab-3e70b3a8993f"
381 | }
382 | }
383 | },
384 | "cd587236-8a19-444d-8b18-69d782dbf725": {
385 | "id": "cd587236-8a19-444d-8b18-69d782dbf725",
386 | "prev": null,
387 | "regions": {
388 | "ef377bfe-ff45-49db-b471-f79ecb10b580": {
389 | "attrs": {
390 | "height": 0.8,
391 | "width": 0.8,
392 | "x": 0.1,
393 | "y": 0.1
394 | },
395 | "content": {
396 | "cell": "dc7a1635-0bbd-4bf7-a07e-7a36f58e258b",
397 | "part": "whole"
398 | },
399 | "id": "ef377bfe-ff45-49db-b471-f79ecb10b580"
400 | }
401 | }
402 | },
403 | "e2f5626f-0d60-47cb-967f-0edababb0329": {
404 | "id": "e2f5626f-0d60-47cb-967f-0edababb0329",
405 | "prev": "ae3f4c01-80dc-4add-889a-05c74f7155a5",
406 | "regions": {
407 | "eef49fa0-0f9b-4228-8fb8-79e079bf7682": {
408 | "attrs": {
409 | "height": 0.8,
410 | "width": 0.8,
411 | "x": 0.1,
412 | "y": 0.1
413 | },
414 | "content": {
415 | "cell": "9110098b-9675-4d64-adf3-c947073d4c4d",
416 | "part": "whole"
417 | },
418 | "id": "eef49fa0-0f9b-4228-8fb8-79e079bf7682"
419 | }
420 | }
421 | },
422 | "f001d476-5814-4664-a722-f04f5d23cd52": {
423 | "id": "f001d476-5814-4664-a722-f04f5d23cd52",
424 | "prev": "cd587236-8a19-444d-8b18-69d782dbf725",
425 | "regions": {
426 | "5a176076-c5a5-4b50-ab2c-9cd0baedad45": {
427 | "attrs": {
428 | "height": 0.8,
429 | "width": 0.8,
430 | "x": 0.1,
431 | "y": 0.1
432 | },
433 | "content": {
434 | "cell": "53eee250-b3d0-4262-ad09-e87fb2acf82e",
435 | "part": "whole"
436 | },
437 | "id": "5a176076-c5a5-4b50-ab2c-9cd0baedad45"
438 | }
439 | }
440 | }
441 | },
442 | "themes": {
443 | "default": "c6b5d1ad-d691-4000-9f62-de7fc0e83644",
444 | "theme": {
445 | "586a6e7a-f661-4d6c-90d0-1392715bea27": {
446 | "id": "586a6e7a-f661-4d6c-90d0-1392715bea27",
447 | "palette": {
448 | "19cc588f-0593-49c9-9f4b-e4d7cc113b1c": {
449 | "id": "19cc588f-0593-49c9-9f4b-e4d7cc113b1c",
450 | "rgb": [
451 | 252,
452 | 252,
453 | 252
454 | ]
455 | },
456 | "31af15d2-7e15-44c5-ab5e-e04b16a89eff": {
457 | "id": "31af15d2-7e15-44c5-ab5e-e04b16a89eff",
458 | "rgb": [
459 | 68,
460 | 68,
461 | 68
462 | ]
463 | },
464 | "50f92c45-a630-455b-aec3-788680ec7410": {
465 | "id": "50f92c45-a630-455b-aec3-788680ec7410",
466 | "rgb": [
467 | 155,
468 | 177,
469 | 192
470 | ]
471 | },
472 | "c5cc3653-2ee1-402a-aba2-7caae1da4f6c": {
473 | "id": "c5cc3653-2ee1-402a-aba2-7caae1da4f6c",
474 | "rgb": [
475 | 43,
476 | 126,
477 | 184
478 | ]
479 | },
480 | "efa7f048-9acb-414c-8b04-a26811511a21": {
481 | "id": "efa7f048-9acb-414c-8b04-a26811511a21",
482 | "rgb": [
483 | 25.118061674008803,
484 | 73.60176211453744,
485 | 107.4819383259912
486 | ]
487 | }
488 | },
489 | "rules": {
490 | "blockquote": {
491 | "color": "50f92c45-a630-455b-aec3-788680ec7410"
492 | },
493 | "code": {
494 | "font-family": "Anonymous Pro"
495 | },
496 | "h1": {
497 | "color": "c5cc3653-2ee1-402a-aba2-7caae1da4f6c",
498 | "font-family": "Lato",
499 | "font-size": 8
500 | },
501 | "h2": {
502 | "color": "c5cc3653-2ee1-402a-aba2-7caae1da4f6c",
503 | "font-family": "Lato",
504 | "font-size": 6
505 | },
506 | "h3": {
507 | "color": "50f92c45-a630-455b-aec3-788680ec7410",
508 | "font-family": "Lato",
509 | "font-size": 5.5
510 | },
511 | "h4": {
512 | "color": "c5cc3653-2ee1-402a-aba2-7caae1da4f6c",
513 | "font-family": "Lato",
514 | "font-size": 5
515 | },
516 | "h5": {
517 | "font-family": "Lato"
518 | },
519 | "h6": {
520 | "font-family": "Lato"
521 | },
522 | "h7": {
523 | "font-family": "Lato"
524 | },
525 | "pre": {
526 | "font-family": "Anonymous Pro",
527 | "font-size": 4
528 | }
529 | },
530 | "text-base": {
531 | "font-family": "Merriweather",
532 | "font-size": 4
533 | }
534 | },
535 | "c6b5d1ad-d691-4000-9f62-de7fc0e83644": {
536 | "backgrounds": {
537 | "dc7afa04-bf90-40b1-82a5-726e3cff5267": {
538 | "background-color": "31af15d2-7e15-44c5-ab5e-e04b16a89eff",
539 | "id": "dc7afa04-bf90-40b1-82a5-726e3cff5267"
540 | }
541 | },
542 | "id": "c6b5d1ad-d691-4000-9f62-de7fc0e83644",
543 | "palette": {
544 | "19cc588f-0593-49c9-9f4b-e4d7cc113b1c": {
545 | "id": "19cc588f-0593-49c9-9f4b-e4d7cc113b1c",
546 | "rgb": [
547 | 252,
548 | 252,
549 | 252
550 | ]
551 | },
552 | "31af15d2-7e15-44c5-ab5e-e04b16a89eff": {
553 | "id": "31af15d2-7e15-44c5-ab5e-e04b16a89eff",
554 | "rgb": [
555 | 68,
556 | 68,
557 | 68
558 | ]
559 | },
560 | "50f92c45-a630-455b-aec3-788680ec7410": {
561 | "id": "50f92c45-a630-455b-aec3-788680ec7410",
562 | "rgb": [
563 | 197,
564 | 226,
565 | 245
566 | ]
567 | },
568 | "c5cc3653-2ee1-402a-aba2-7caae1da4f6c": {
569 | "id": "c5cc3653-2ee1-402a-aba2-7caae1da4f6c",
570 | "rgb": [
571 | 43,
572 | 126,
573 | 184
574 | ]
575 | },
576 | "efa7f048-9acb-414c-8b04-a26811511a21": {
577 | "id": "efa7f048-9acb-414c-8b04-a26811511a21",
578 | "rgb": [
579 | 25.118061674008803,
580 | 73.60176211453744,
581 | 107.4819383259912
582 | ]
583 | }
584 | },
585 | "rules": {
586 | "a": {
587 | "color": "19cc588f-0593-49c9-9f4b-e4d7cc113b1c"
588 | },
589 | "blockquote": {
590 | "color": "50f92c45-a630-455b-aec3-788680ec7410",
591 | "font-size": 3
592 | },
593 | "code": {
594 | "font-family": "Anonymous Pro"
595 | },
596 | "h1": {
597 | "color": "19cc588f-0593-49c9-9f4b-e4d7cc113b1c",
598 | "font-family": "Merriweather",
599 | "font-size": 8
600 | },
601 | "h2": {
602 | "color": "19cc588f-0593-49c9-9f4b-e4d7cc113b1c",
603 | "font-family": "Merriweather",
604 | "font-size": 6
605 | },
606 | "h3": {
607 | "color": "50f92c45-a630-455b-aec3-788680ec7410",
608 | "font-family": "Lato",
609 | "font-size": 5.5
610 | },
611 | "h4": {
612 | "color": "c5cc3653-2ee1-402a-aba2-7caae1da4f6c",
613 | "font-family": "Lato",
614 | "font-size": 5
615 | },
616 | "h5": {
617 | "font-family": "Lato"
618 | },
619 | "h6": {
620 | "font-family": "Lato"
621 | },
622 | "h7": {
623 | "font-family": "Lato"
624 | },
625 | "li": {
626 | "color": "50f92c45-a630-455b-aec3-788680ec7410",
627 | "font-size": 3.25
628 | },
629 | "pre": {
630 | "font-family": "Anonymous Pro",
631 | "font-size": 4
632 | }
633 | },
634 | "text-base": {
635 | "color": "19cc588f-0593-49c9-9f4b-e4d7cc113b1c",
636 | "font-family": "Lato",
637 | "font-size": 4
638 | }
639 | }
640 | }
641 | }
642 | }
643 | },
644 | "nbformat": 4,
645 | "nbformat_minor": 1
646 | }
647 |
--------------------------------------------------------------------------------
/21_live_python_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Session 2.1: Manipulating data with Pandas (live coding session)"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import pandas\n",
19 | "mouse_data = pandas.read_csv('data/GRCm38.gff3', sep='\\t')"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "mouse_data.head()"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "mouse_data.info()"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "mouse_data.describe()"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "mouse_data.head()"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "mouse_data_type = mouse_data.iloc[:,[2]]\n",
65 | "type(mouse_data_type)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "mouse_data_type.head()"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "mouse_data_type = mouse_data.iloc[:,2:3]\n",
84 | "type(mouse_data_type)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "mouse_data_type.head()"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "mouse_data['type'].head()"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "type(mouse_data['type'])"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "mouse_data['type'].unique()"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "feature_counts = mouse_data['type'].value_counts()\n",
130 | "print(feature_counts)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "%matplotlib inline\n",
140 | "feature_counts.plot(kind='bar')"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "mouse_data.head()"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "mouse_data['len'] = mouse_data.end - mouse_data.start + 1\n",
159 | "mouse_data.head()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "mouse_exons = mouse_data[mouse_data.type=='exon']\n",
169 | "mouse_exons.head()"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "long_mouse_exons = mouse_data[(mouse_data.type=='exon') & (mouse_data.len > 500)]\n",
179 | "long_mouse_exons.head()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "len(long_mouse_exons)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "## Exercise\n",
196 | "Plot the length of exons from the mouse dataframe."
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "mouse_exons.head()"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "%matplotlib inline\n",
215 | "mouse_exons['len'].plot()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "%matplotlib inline\n",
225 | "mouse_exons.plot.scatter(x='start', y='len', title='mouse exons')"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "## Re-ordering columns and re-indexing data frame"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "mouse_data.head()"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "reordered_mouse_data = mouse_data[['source', 'type', 'len']]\n",
251 | "reordered_mouse_data.head()"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "mouse_exons.head()"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "reindexed_mouse_exons = mouse_exons.reset_index()\n",
270 | "reindexed_mouse_exons.head()"
271 | ]
272 | }
273 | ],
274 | "metadata": {
275 | "kernelspec": {
276 | "display_name": "Python 3",
277 | "language": "python",
278 | "name": "python3"
279 | },
280 | "language_info": {
281 | "codemirror_mode": {
282 | "name": "ipython",
283 | "version": 3
284 | },
285 | "file_extension": ".py",
286 | "mimetype": "text/x-python",
287 | "name": "python",
288 | "nbconvert_exporter": "python",
289 | "pygments_lexer": "ipython3",
290 | "version": "3.6.4"
291 | }
292 | },
293 | "nbformat": 4,
294 | "nbformat_minor": 2
295 | }
296 |
--------------------------------------------------------------------------------
/22_python_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Matplotlib\n",
8 | "\n",
9 | "[`matplotlib`](http://matplotlib.org/) is probably the single most-used Python package for graphics. It provides both a very quick way to visualize data from Python and publication-quality figures in many formats. It is also highly customizable, allowing you to create [rich visualizations](http://droettboom.com/jhepc2018-judge-packet/) of complex data.\n",
10 | "\n",
11 | "The `matplotlib.pyplot` module is a collection of command style functions that make `matplotlib` work like MATLAB. Each `pyplot` function makes some change to a figure: e.g., creates a figure, creates a plotting area in a figure, plots some lines in a plotting area, decorates the plot with labels, etc. \n",
12 | "\n",
13 | "Let's start with a very simple plot. First we set the `%matplotlib inline` option, which tells the Jupyter notebook to embed all plots as static images. Next, we import the `matplotlib.pyplot` module using the shorter alias `plt`, so that we don't have to use the full module name every time we call a `pyplot` function. Then we call the `.plot()` command on a list of integers to create a plot. Finally, we use the `.show()` command to render the plot and embed it underneath the code block."
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Data science in Python\n",
21 | "\n",
22 | "## Session 2.2: Data visualisation with Matplotlib\n",
23 | "\n",
24 | "- [Matplotlib](#Matplotlib)\n",
25 | "- [Exercise 2.2.1](#Exercise-2.2.1)"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "%matplotlib inline\n",
35 | "\n",
36 | "import matplotlib.pyplot as plt\n",
37 | "plt.plot([1,2,3,4])\n",
38 | "plt.show()"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "After we create a plot, we can use the `.xlabel()` and `.ylabel()` commands to decorate it with axis labels, and add a title using the `.title()` command."
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "plt.plot([1,2,3,4])\n",
55 | "plt.xlabel(\"X axis label\")\n",
56 | "plt.ylabel(\"Y axis label\")\n",
57 | "plt.title(\"Plot title\")\n",
58 | "plt.show()"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "`.plot()` is a versatile command. To plot x versus y, we can input two lists of integers:"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "plt.plot([1,2,3,4], [1,4,9,16])"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "For every x, y pair of arguments, there is an **optional third argument** which is the format string that indicates the color and line type of the plot. The letters and symbols of the format string are from MATLAB, and you concatenate a color string with a line style string. The default format string is `'b-'`, which is a solid blue line. For example, to plot the above with red circles, you would choose `'ro'`."
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "%matplotlib inline\n",
91 | "\n",
92 | "import matplotlib.pyplot as plt\n",
93 | "plt.plot([1,2,3,4], [1,4,9,16], 'ro')\n",
94 | "plt.axis([0, 6, 0, 20])\n",
95 | "plt.show()"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "These formatting options can be especially useful when displaying data from different sources on the same plot. Once the plot is created, additional lines with different options can be added by calling the `.plot()` command multiple times before calling `.show()`. "
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "%matplotlib inline\n",
112 | "\n",
113 | "import matplotlib.pyplot as plt\n",
114 | "year = [1960, 1970, 1980, 1990, 2000]\n",
115 | "rainfall_Manchester = [200, 190, 160, 185, 170]\n",
116 | "rainfall_London = [160, 155, 140, 145, 140]\n",
117 | "plt.plot(year, rainfall_Manchester, 'b-')\n",
118 | "plt.plot(year, rainfall_London, 'r--')\n",
119 | "plt.show()"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "With more than one dataset on the same plot, we often need to add a legend to the plot. This can be done by passing the `label` argument to each `.plot()` command, and then calling the `.legend()` command on the plot."
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "%matplotlib inline\n",
136 | "\n",
137 | "import matplotlib.pyplot as plt\n",
138 | "year = [1960, 1970, 1980, 1990, 2000]\n",
139 | "rainfall_Manchester = [200, 190, 160, 185, 170]\n",
140 | "rainfall_London = [160, 155, 140, 145, 140]\n",
141 | "plt.plot(year, rainfall_Manchester, 'b-', label = 'Manchester')\n",
142 | "plt.plot(year, rainfall_London, 'r--', label = 'London')\n",
143 | "plt.legend()\n",
144 | "plt.show()"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "Once we have made a plot, we can write it to disk by using the `.savefig()` command instead of the `.show()` command. This will infer the file format from the ending of the file name e.g. \"myplot.svg\" will write in SVG format, \"myplot.pdf\" will write in PDF format etc. **Note: the range of file types that can be written may vary between different operating systems.**"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "%matplotlib inline\n",
161 | "\n",
162 | "import matplotlib.pyplot as plt\n",
163 | "plt.plot([1,2,3,4], [1,4,9,16])\n",
164 | "plt.savefig(\"myplot.svg\")"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "## Functions for specific plot types\n",
172 | "`matplotlib.pyplot` has a few functions that make creating common types of plots faster and more convenient because they automatically create a Figure and an Axes object (more on this in the next section). The most widely used are:\n",
173 | "\n",
174 | "- [`plt.bar()`](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.bar) – creates a bar chart.\n",
175 | "- [`plt.boxplot()`](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.boxplot) – makes a box and whisker plot.\n",
176 | "- [`plt.hist()`](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hist) – makes a histogram.\n",
177 | "- [`plt.plot()`](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot) – creates a line plot.\n",
178 | "- [`plt.scatter()`](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.scatter) – makes a scatter plot.\n",
179 | "\n",
180 | "Each of these methods has different parameters that can be passed in to modify the resulting plot. For example, let's plot a bar chart of the population size of different European capital cities:"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "%matplotlib inline\n",
190 | "\n",
191 | "import matplotlib.pyplot as plt\n",
192 | "city = [\"Dublin\", \"Paris\", \"Madrid\", \"Berlin\", \"Brussels\"]\n",
193 | "population = [553165, 2206488, 3182981, 3711930, 1191604]\n",
194 | "plt.bar(range(len(city)), population)\n",
195 | "plt.xticks(range(len(city)), city)\n",
196 | "plt.ylabel(\"Population size\")\n",
197 | "plt.title(\"Population size of European capital cities\")\n",
198 | "plt.ylim(0,4000000)\n",
199 | "plt.show()"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "A full list of functions and details of how to use them is available from the `pyplot` [API documentation](https://matplotlib.org/api/_as_gen/matplotlib.pyplot.html)."
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "## Storing the Figure and Axes objects"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "So far, we have been using `matplotlib`'s **state-based interface**, calling `pyplot` functions such as `.plot()`, `.title()`, and `.xlabel()` directly, and using them to add elements to a single, currently-active plot. \n",
221 | "\n",
222 | "It is also common to use `matplotlib`'s **object-oriented interface**. To do this, we first create a Figure object, and store it using the variable name `fig`. Next, we use the `.gca()` method to retrieve the Axes object associated with the Figure, which we store using the variable name `ax`. Finally, we decorate the Axes object, adding a title, labels, etc.\n",
223 | "\n",
224 | "This is often a point of confusion when first using `matplotlib`, because the methods used to decorate the plot change slightly in the object-oriented case. When using this approach, we now have to call the object methods `.set_title()`, `.set_xlabel()`, `.set_ylabel()`, etc.\n",
225 | "\n",
226 | "For example, let's plot the GC content along a gene using the quick state-based approach:"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "%matplotlib inline\n",
236 | "seq = 'ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTG'\n",
237 | "gc = [40.0, 60.0, 80.0, 60.0, 40.0, 60.0, 40.0, 40.0, 40.0, 60.0, \n",
238 | " 40.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, \n",
239 | " 60.0, 40.0, 40.0, 40.0, 40.0, 40.0, 60.0, 60.0, 80.0, 80.0, \n",
240 | " 80.0, 60.0, 40.0, 40.0, 20.0, 40.0, 60.0, 80.0, 80.0, 80.0, \n",
241 | " 80.0, 60.0, 60.0, 60.0, 80.0, 80.0, 100.0, 80.0, 60.0, 60.0, \n",
242 | " 60.0, 40.0, 60.0]\n",
243 | "\n",
244 | "import matplotlib.pyplot as plt\n",
245 | "plt.plot(gc, '--')\n",
246 | "plt.xlabel('Window along the sequence (5bp)')\n",
247 | "plt.ylabel('%GC')\n",
248 | "plt.title('GC plot for sequence\\n' + seq)\n",
249 | "plt.show()"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "Now let's generate the same plot using the object-oriented approach:"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "%matplotlib inline\n",
266 | "seq = 'ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTG'\n",
267 | "gc = [40.0, 60.0, 80.0, 60.0, 40.0, 60.0, 40.0, 40.0, 40.0, 60.0, \n",
268 | " 40.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, \n",
269 | " 60.0, 40.0, 40.0, 40.0, 40.0, 40.0, 60.0, 60.0, 80.0, 80.0, \n",
270 | " 80.0, 60.0, 40.0, 40.0, 20.0, 40.0, 60.0, 80.0, 80.0, 80.0, \n",
271 | " 80.0, 60.0, 60.0, 60.0, 80.0, 80.0, 100.0, 80.0, 60.0, 60.0, \n",
272 | " 60.0, 40.0, 60.0]\n",
273 | "\n",
274 | "import matplotlib.pyplot as plt\n",
275 | "fig = plt.figure()\n",
276 | "ax = fig.gca()\n",
277 | "ax.plot(gc, '--')\n",
278 | "ax.set_xlabel('Window along the sequence (5bp)')\n",
279 | "ax.set_ylabel('%GC')\n",
280 | "ax.set_title('GC plot for sequence\\n' + seq)\n",
281 | "plt.show()"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "Using the object-oriented approach, you can create one Figure object with multiple Axes for different sub-panels. Giving each Axes a distinct variable name allows you to plot different information in each panel. "
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "## Plotting directly from `Pandas` objects\n",
296 | "\n",
297 | "As seen in the previous session, it is possible to plot directly from `pandas`. We first create a `pandas` DataFrame object from the GC data above:"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "import pandas\n",
307 | "gc_df = pandas.DataFrame(gc, columns=['gc'])\n",
308 | "gc_df.head()"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "metadata": {},
314 | "source": [
315 | "Next, we call the `.plot()` method on the `gc` column of the `pandas` DataFrame, which is a wrapper around `matplotlib.pyplot.plot()`. We store the returned Axes object using the variable name `thisplot`:"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "%matplotlib inline\n",
325 | "\n",
326 | "thisplot = gc_df.gc.plot(style='--')\n",
327 | "thisplot.set_xlabel('Window along the sequence (5bp)')\n",
328 | "thisplot.set_ylabel('%GC')\n",
329 | "thisplot.set_title('GC plot for sequence\\n' + seq)"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {},
335 | "source": [
336 | "Note that the `pandas` plots use the object-oriented interface, so we have to call `.set_xlabel()` on our stored Axes object rather than `.xlabel()` to decorate the plot. "
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "## Modifying figure aesthetics with `Seaborn`"
344 | ]
345 | },
346 | {
347 | "cell_type": "markdown",
348 | "metadata": {},
349 | "source": [
350 | "[`seaborn`](https://seaborn.pydata.org) is a data visualization library based on `matplotlib`. It provides easy ways to customize the aesthetics of basic `matplotlib` figures, as well as a range of additional plot types (see [examples](https://seaborn.pydata.org/examples/index.html)). We import `seaborn` and give it the alias `sns`.\n",
351 | "\n",
352 | "Let's now modify the style of our previous `matplotlib` code. \n",
353 | "\n",
354 | "Using the `.set_style()` method with the option `'darkgrid'` we can create a similar aesthetic to `R`'s `ggplot2` package. Other [available styles](https://seaborn.pydata.org/tutorial/aesthetics.html) include `'whitegrid'`, `'dark'`, `'white'`, and `'ticks'`."
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": null,
360 | "metadata": {},
361 | "outputs": [],
362 | "source": [
363 | "%matplotlib inline\n",
364 | "\n",
365 | "import matplotlib.pyplot as plt\n",
366 | "\n",
367 | "import seaborn as sns\n",
368 | "sns.set_style('darkgrid')\n",
369 | "\n",
370 | "year = [1960, 1970, 1980, 1990, 2000]\n",
371 | "rainfall_Manchester = [200, 190, 160, 185, 170]\n",
372 | "rainfall_London = [160, 155, 140, 145, 140]\n",
373 | "plt.plot(year, rainfall_Manchester, label = 'Manchester')\n",
374 | "plt.plot(year, rainfall_London, label = 'London')\n",
375 | "plt.legend()\n",
376 | "plt.show()"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "To modify a plot so that it will be clearly visible on a slide or poster, we can use the `.set_context()` method. For example, here is the same plot with the `'ticks'` style, formatted using the `'talk'` context. Note how both the font size and thickness of the lines have increased with a single command."
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": null,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "%matplotlib inline\n",
393 | "\n",
394 | "import matplotlib.pyplot as plt\n",
395 | "\n",
396 | "import seaborn as sns\n",
397 | "sns.set_style('ticks')\n",
398 | "sns.set_context('talk')\n",
399 | "\n",
400 | "year = [1960, 1970, 1980, 1990, 2000]\n",
401 | "rainfall_Manchester = [200, 190, 160, 185, 170]\n",
402 | "rainfall_London = [160, 155, 140, 145, 140]\n",
403 | "plt.plot(year, rainfall_Manchester, label = 'Manchester')\n",
404 | "plt.plot(year, rainfall_London, label = 'London')\n",
405 | "plt.legend()\n",
406 | "\n",
407 | "# removes the border on the right and top of the plot\n",
408 | "sns.despine()\n",
409 | "\n",
410 | "plt.show()"
411 | ]
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "metadata": {},
416 | "source": [
417 | "## Exercise 2.2.1\n",
418 | "\n",
419 | "Re-use the GapMinder dataset to plot, in Jupyter using Matplotlib, a scatter plot of world life expectancy against GDP per capita for 1952, 1977 and 2007. Add a title, axis labels and legend to your figure.\n",
420 | "\n",
421 | "Find the country with the highest GDP per capita for 1952, 1977 and 2007.\n",
422 | "\n",
423 | "Re-write the function `gdp_stats_by_continent_and_year()` using Pandas."
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "## Next session\n",
431 | "\n",
432 | "Go to our next notebook: [Session 2.3: Biological data with BioPython](23_python_data.ipynb)"
433 | ]
434 | }
435 | ],
436 | "metadata": {
437 | "kernelspec": {
438 | "display_name": "Python 3",
439 | "language": "python",
440 | "name": "python3"
441 | },
442 | "language_info": {
443 | "codemirror_mode": {
444 | "name": "ipython",
445 | "version": 3
446 | },
447 | "file_extension": ".py",
448 | "mimetype": "text/x-python",
449 | "name": "python",
450 | "nbconvert_exporter": "python",
451 | "pygments_lexer": "ipython3",
452 | "version": "3.6.4"
453 | }
454 | },
455 | "nbformat": 4,
456 | "nbformat_minor": 1
457 | }
458 |
--------------------------------------------------------------------------------
/24_project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data science project in Python"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "For the 4 species gff files stored in the `data/` folder (mouse: `GRCm38.gff3`, human: `GRCh38.gff3`, zebrafish: `GRCz11.gff3` and panda: `AilMel.gff3`), load these files into a DataFrame, filter the exons, and calculate their length, their GC content as well as their molecular weight. Store these calculations into new columns in their respective DataFrame. Plot these three values for the four species onto three graphs using a boxplot.\n",
15 | "\n",
16 | "Start by working with the mouse data for exploratory analysis, working through the problem step by step. Then, create a re-usable function to apply the analysis onto all datasets. Load all four datasets and apply the newly created function to calculate the new values. Once all DataFrames have the three new columns, visualise the data in three boxplot graphs.\n",
17 | "\n",
18 | "Present the results in a Jupyter notebook, using Pandas, Matplotlib and Biopython. Write reusable and modular code as much as possible using functions."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import pandas\n",
28 | "from Bio import SeqIO\n",
29 | "from Bio import Entrez\n",
30 | "from Bio.SeqUtils import GC, molecular_weight"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "def get_gc_and_mw_from_gbid(id='NM_177676.6'):\n",
40 | " Entrez.email = 'A.N.Other@example.com' # Always tell NCBI who you are\n",
41 | " handle = Entrez.efetch(db=\"nucleotide\", id=id, rettype=\"gb\")\n",
42 | " seq_record = SeqIO.read(handle, \"gb\")\n",
43 | " handle.close()\n",
44 | " return GC(seq_record.seq), molecular_weight(seq_record.seq)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "print(get_gc_and_mw_from_gbid('NM_177676.6'))"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "def get_gc_and_mw_from_gbids(df_ids):\n",
63 | " gcs = []\n",
64 | " mws = []\n",
65 | " for i in df_ids:\n",
66 | " gc, mw = get_gc_and_mw_from_gbid(i)\n",
67 | " gcs.append(gc)\n",
68 | " mws.append(mw)\n",
69 | " return gcs, mws"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "## Working with mouse data: exploratory analysis\n",
77 | "\n",
78 | "- remove rows with null values\n",
79 | "- filter all exons\n",
80 | "- calculate GC contents and molecular weights"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "mouse = pandas.read_csv('data/GRCm38.gff3', sep='\\t')\n",
90 | "mouse.head()"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "mouse.dropna(inplace=True)\n",
100 | "print(mouse.type.unique())"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "exon_mouse = mouse[mouse['type']=='exon']"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "exon_mouse.head()"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "small_exon_mouse = exon_mouse.iloc[:9,]"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "gcs, mws = get_gc_and_mw_from_gbids(small_exon_mouse['gbid'])"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "small_exon_mouse.insert(len(small_exon_mouse.columns), 'len', small_exon_mouse['end'] - small_exon_mouse['start'] + 1)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "small_exon_mouse.insert(len(small_exon_mouse.columns), 'gc', gcs)"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "small_exon_mouse.insert(len(small_exon_mouse.columns), 'mw', mws)"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": null,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "small_exon_mouse.head()"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "%matplotlib inline\n",
182 | "import matplotlib.pyplot as plt\n",
183 | "plt.boxplot([small_exon_mouse['gc']], labels=['mouse'])\n",
184 | "plt.ylabel('GC content (%)')\n",
185 | "plt.show()"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "## Creating a re-usable function to apply the analysis onto other datasets"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "def get_exons(data):\n",
202 | " data.dropna(inplace=True)\n",
203 | " exons = data[data['type']=='exon']\n",
204 | " exons = exons[:9]\n",
205 | " gcs, mws = get_gc_and_mw_from_gbids(exons['gbid'])\n",
206 | " exons.insert(len(exons.columns), 'len', exons['end'] - exons['start'] + 1)\n",
207 | " exons.insert(len(exons.columns), 'gc', gcs)\n",
208 | " exons.insert(len(exons.columns), 'mw', mws)\n",
209 | " return exons"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "## Loading all four datasets and calculating new values"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "mouse = pandas.read_csv('data/GRCm38.gff3', sep='\\t')\n",
226 | "mouse_small_exons = get_exons(mouse)\n",
227 | "mouse_small_exons.head()"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "human = pandas.read_csv('data/GRCh38.gff3', sep='\\t')\n",
237 | "human_small_exons = get_exons(human)\n",
238 | "human_small_exons.head()"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "zebrafish = pandas.read_csv('data/GRCz11.gff3', sep='\\t')\n",
248 | "zebrafish_small_exons = get_exons(zebrafish)\n",
249 | "zebrafish_small_exons.head()"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "panda = pandas.read_csv('data/AilMel.gff3', sep='\\t')\n",
259 | "panda_small_exons = get_exons(panda)\n",
260 | "panda_small_exons.head()"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "## Visualising data"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "### Comparing exon length, GC content and molecular weight across four species"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "%matplotlib inline\n",
284 | "import matplotlib.pyplot as plt\n",
285 | "plt.boxplot([mouse_small_exons['len'], \n",
286 | " human_small_exons['len'], \n",
287 | " zebrafish_small_exons['len'],\n",
288 | " panda_small_exons['len']], \n",
289 | " labels=['mouse', 'human', 'zebrafish', 'panda']\n",
290 | " )\n",
291 | "plt.ylabel('Feature length (bp)')\n",
292 | "plt.show()"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "%matplotlib inline\n",
302 | "import matplotlib.pyplot as plt\n",
303 | "plt.boxplot([mouse_small_exons['gc'], \n",
304 | " human_small_exons['gc'], \n",
305 | " zebrafish_small_exons['gc'],\n",
306 | " panda_small_exons['gc']], \n",
307 | " labels=['mouse', 'human', 'zebrafish', 'panda']\n",
308 | " )\n",
309 | "plt.ylabel('GC content (%)')\n",
310 | "plt.show()"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "%matplotlib inline\n",
320 | "import matplotlib.pyplot as plt\n",
321 | "plt.boxplot([mouse_small_exons['mw'], \n",
322 | " human_small_exons['mw'], \n",
323 | " zebrafish_small_exons['mw'],\n",
324 | " panda_small_exons['mw']], \n",
325 | " labels=['mouse', 'human', 'zebrafish', 'panda']\n",
326 | " )\n",
327 | "plt.ylabel('Molecular weight')\n",
328 | "plt.show()"
329 | ]
330 | }
331 | ],
332 | "metadata": {
333 | "kernelspec": {
334 | "display_name": "Python 3",
335 | "language": "python",
336 | "name": "python3"
337 | },
338 | "language_info": {
339 | "codemirror_mode": {
340 | "name": "ipython",
341 | "version": 3
342 | },
343 | "file_extension": ".py",
344 | "mimetype": "text/x-python",
345 | "name": "python",
346 | "nbconvert_exporter": "python",
347 | "pygments_lexer": "ipython3",
348 | "version": "3.6.4"
349 | }
350 | },
351 | "nbformat": 4,
352 | "nbformat_minor": 2
353 | }
354 |
--------------------------------------------------------------------------------
/24_python_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data science in Python\n",
8 | "\n",
9 | "## Session 2.4: Data project report in Jupyter\n",
10 | "\n"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "For the 4 species gff files stored in the `data/` folder (mouse: `GRCm38.gff3`, human: `GRCh38.gff3`, zebrafish: `GRCz11.gff3` and panda: `AilMel.gff3`), load these files into a DataFrame, filter the exons, and calculate their length, their GC content as well as their molecular weight. Store these calculations into new columns in their respective DataFrame. Plot these three values for the four species onto three graphs using a boxplot.\n",
18 | "\n",
19 | "Start by working with the mouse data for exploratory analysis, working through the problem step by step. Then, create a re-usable function to apply the analysis onto all datasets. Load all four datasets and apply the newly created function to calculate the new values. Once all DataFrames have the three new columns, visualise the data in three boxplot graphs.\n",
20 | "\n",
21 | "Present the results in a Jupyter notebook, using Pandas, Matplotlib and Biopython. Write reusable and modular code as much as possible using functions.\n",
22 | "\n",
23 | "Have fun!"
24 | ]
25 | }
26 | ],
27 | "metadata": {
28 | "kernelspec": {
29 | "display_name": "Python 3",
30 | "language": "python",
31 | "name": "python3"
32 | },
33 | "language_info": {
34 | "codemirror_mode": {
35 | "name": "ipython",
36 | "version": 3
37 | },
38 | "file_extension": ".py",
39 | "mimetype": "text/x-python",
40 | "name": "python",
41 | "nbconvert_exporter": "python",
42 | "pygments_lexer": "ipython3",
43 | "version": "3.6.4"
44 | }
45 | },
46 | "nbformat": 4,
47 | "nbformat_minor": 2
48 | }
49 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This is free and unencumbered software released into the public domain.
2 |
3 | Anyone is free to copy, modify, publish, use, compile, sell, or
4 | distribute this software, either in source code form or as a compiled
5 | binary, for any purpose, commercial or non-commercial, and by any
6 | means.
7 |
8 | In jurisdictions that recognize copyright laws, the author or authors
9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 |
24 | For more information, please refer to <https://unlicense.org>
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data science in Python - course materials
2 |
3 | [Join the chat on Gitter](https://gitter.im/pycam/Lobby?utm_source=share-link&utm_medium=link&utm_campaign=share-link)
4 |
5 | New course based on 'Working with Python: functions and modules' course
6 |
7 | Materials for the course run by the Graduate School of Life Sciences, University of Cambridge.
8 |
9 | - Course website: http://pycam.github.io/
10 | - Booking website: http://www.training.cam.ac.uk/
11 |
12 | If you wish to run the course on your personal computer, here are the steps to follow to get up and running.
13 |
14 | ## Clone this github project
15 |
16 | For Windows, you have to install GitBash https://gitforwindows.org/ to be able to execute the command.
17 | Note down the folder where you have downloaded the git repository.
18 |
19 | ```bash
20 | git clone https://github.com/pycam/python-data-science.git
21 | cd python-data-science
22 | ```
23 |
24 | ## Dependencies
25 |
26 | ### Mac OSX
27 | Install Python 3 by downloading the latest version from https://www.python.org/. For Mac OSX, just run `brew install python3`.
28 |
29 | Python 2.x is legacy, Python 3.x is the present and future of the language.
30 |
31 | First, create a virtual environment using the [`venv` library](https://docs.python.org/3/library/venv.html). Update pip if needed, then install [jupyter](http://jupyter.org/) and [RISE](https://github.com/damianavila/RISE) to add a slideshow extension to jupyter.
32 |
33 | ***Note*** A virtual environment is a Python environment such that the Python interpreter, libraries and scripts installed into it are isolated from those installed in other virtual environments.
34 |
35 | ```bash
36 | python3 -m venv venv
37 | # activate your virtual environment
38 | source venv/bin/activate
39 | # update pip if needed
40 | pip install --upgrade pip
41 | # install package dependencies
42 | pip install jupyter pandas matplotlib seaborn biopython
43 |
44 | # slideshow extension
45 | pip install rise
46 | jupyter-nbextension install rise --py --sys-prefix
47 | jupyter nbextension enable rise --py --sys-prefix
48 |
49 | ```
50 |
51 | On Mac OSX you may need to run this command to accept the XCode license before installing biopython:
52 |
53 | ```bash
54 | sudo xcodebuild -license
55 | ```
56 |
57 | ### Windows
58 | Install Python 3 by downloading the latest version of Anaconda https://www.anaconda.com/distribution/. Take the graphical installer or the command line installer.
59 |
60 | Python 2.x is legacy, Python 3.x is the present and future of the language.
61 |
62 | After installation, open an Anaconda prompt, which is a Windows command line configured for Anaconda.
63 |
64 | ```bash
65 | (base) C:\Users\your_username>conda create --name python-course
66 | (base) C:\Users\your_username>conda activate python-course
67 | (python-course) C:\Users\your_username>conda install pandas matplotlib seaborn biopython
68 | (python-course) C:\Users\your_username> conda install jupyter rise
69 | (python-course) C:\Users\your_username>jupyter-nbextension install rise --py --sys-prefix
70 | (python-course) C:\Users\your_username>jupyter nbextension enable rise --py --sys-prefix
71 | ```
72 |
73 | ## Usage
74 |
75 | ### Mac OSX
76 | Go to the directory where you've cloned this repository, activate your virtual environment and run jupyter.
77 |
78 | Your web browser should automatically open with this url http://localhost:8888/tree where you see the directory tree of the course with all the jupyter notebooks.
79 |
80 | ```bash
81 | cd python-data-science
82 | source venv/bin/activate
83 | jupyter notebook
84 | ```
85 |
86 | To shut down jupyter, press Ctrl-C in the terminal where you ran `jupyter notebook`, answer `y` and press `enter`.
87 |
88 | You may wish to deactivate the virtual environment, by entering into the terminal:
89 | ```
90 | deactivate
91 | ```
92 |
93 | ### Windows
94 | Launch the Anaconda prompt.
95 |
96 | Go to the directory where you've cloned this repository (here: `C:\Users\your_username\python-data-science`), activate your Anaconda environment and run jupyter.
97 |
98 | Your web browser should automatically open with this url http://localhost:8888/tree where you see the directory tree of the course with all the jupyter notebooks.
99 |
100 | ```bash
101 | (base) C:\Users\your_username>cd python-data-science
102 | (base) C:\Users\your_username\python-data-science>conda activate python-course
103 | (python-course) C:\Users\your_username\python-data-science>jupyter notebook
104 | ```
105 |
106 | To shut down jupyter, press Ctrl-C in the terminal where you ran `jupyter notebook`, answer `y` and press `enter`.
107 |
108 | You may wish to deactivate the virtual environment, by entering into the terminal:
109 | ```
110 | conda deactivate
111 | ```
112 |
113 | ## Resources used
114 |
115 | - [Data Carpentry lesson](https://datacarpentry.org/python-ecology-lesson/)
116 | - [Software Carpentry lesson](http://swcarpentry.github.io/python-novice-gapminder/)
117 | - [Biopython tutorial on sickle cell](https://krother.gitbooks.io/biopython-tutorial/content/sicklecell.html)
118 | - [Python tutorial](https://docs.python.org/3/tutorial/index.html)
119 | - [Pandas tutorials](http://pandas.pydata.org/pandas-docs/stable/tutorials.html)
120 | - [Matplotlib tutorials](https://matplotlib.org/tutorials/index.html)
121 | - [Biopython tutorial](http://biopython.org/DIST/docs/tutorial/Tutorial.html)
122 |
123 | ## Contributing
124 |
125 | - See [contributing.md](contributing.md)
126 |
--------------------------------------------------------------------------------
/cheat_sheet_basic_python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "-"
8 | }
9 | },
10 | "source": [
11 | "## Cheat Sheet: Basic Python\n",
12 | "- [Printing](#Printing)\n",
13 | "- [Variables](#Variables)\n",
14 | "- [Simple data types](#Simple-data-types)\n",
15 | "- [Arithmetic operations](#Arithmetic-operations)\n",
16 | "- Collections: [Lists](#Lists) | [Dictionaries](#Dictionnaries) | [Sets](#Sets) | [Tuples](#Tuples) | [Strings](#Strings)\n",
17 | "- [Conditional execution](#Conditional-execution) \n",
18 | "- [Comparison operations](#Comparison-operations)\n",
19 | "- [Loops](#Loops)\n",
20 | "- [Files](#Files)\n",
21 | "- [Getting help](#Getting-help)\n",
22 | "- [Exercise 1.1](#Exercise-1.1)"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Printing\n",
30 | "\n",
31 | "You can include a ***comment*** in python by prefixing some text with a **`#` character**. All text following the `#` will then be ignored by the interpreter."
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "print('Hello from python!') # to print some text, enclose it between quotation marks - single\n",
41 | "print(\"I'm here today!\") # or double\n",
42 | "print(34) # print an integer\n",
43 | "print(2 + 4) # print the result of an arithmetic operation\n",
44 | "print(\"The answer is\", 42) # print multiple expressions, separated by comma"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "## Variables\n",
52 | "\n",
53 | "A variable can be assigned to a simple value or the outcome of a more complex expression.\n",
54 | "The **`=` operator** is used to assign a value to a variable."
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "x = 3 # assignment of a simple value\n",
64 | "print(x)\n",
65 | "y = x + 5 # assignment of a more complex expression\n",
66 | "print(y)\n",
67 | "i = 12\n",
68 | "print(i)\n",
69 | "i = i + 1 # assignment of the current value of a variable incremented by 1 to itself\n",
70 | "print(i)\n",
71 | "i += 1 # shorter version with the special += operator\n",
72 | "print(i)"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {
78 | "slideshow": {
79 | "slide_type": "-"
80 | }
81 | },
82 | "source": [
83 | "## Simple data types\n",
84 | "\n",
85 | "Python has 4 main basic data types."
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "slideshow": {
93 | "slide_type": "-"
94 | }
95 | },
96 | "outputs": [],
97 | "source": [
98 | "a = 2 # integer\n",
99 | "b = 5.0 # float\n",
100 | "c = 'word' # string\n",
101 | "d = 4 > 5 # boolean True or False\n",
102 | "e = None # special built-in value to create a variable that has not been set to anything specific\n",
103 | "print(a, b, c, d, e)\n",
104 | "print(a, 'is of type', type(a)) # to check the type of a variable "
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Arithmetic operations"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "a = 2 # assignment\n",
121 | "a += 1 # change and assign (*=, /=)\n",
122 | "3 + 2 # addition\n",
123 | "3 - 2 # subtraction\n",
124 | "3 * 2 # multiplication\n",
125 | "3 / 2 # integer (python2) or float (python3) division\n",
126 | "\n",
127 | "3 // 2 # integer division\n",
128 | "3 % 2 # remainder\n",
129 | "3 ** 2 # exponent"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {
135 | "slideshow": {
136 | "slide_type": "-"
137 | }
138 | },
139 | "source": [
140 | "## Lists\n",
141 | "\n",
142 | "A list is an ordered, mutable collection of elements."
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "slideshow": {
150 | "slide_type": "-"
151 | }
152 | },
153 | "outputs": [],
154 | "source": [
155 | "a = ['red', 'blue', 'green'] # manual initialisation\n",
156 | "copy_of_a = a[:] # copy of a \n",
157 | "another_a = a # same as a\n",
158 | "b = list(range(5)) # initialise from iterable\n",
159 | "c = [1, 2, 3, 4, 5, 6] # manual initialisation\n",
160 | "len(c) # length of the list\n",
161 | "d = c[0] # access first element at index 0\n",
162 | "e = c[1:3] # access a slice of the list, \n",
163 | " # including element at index 1 up to but not including element at index 3\n",
164 | "f = c[-1] # access last element\n",
165 | "c[1] = 8 # assign new value at index position 1\n",
166 | "g = ['re', 'bl'] + ['gr'] # list concatenation\n",
167 | "['re', 'bl'].index('re') # returns index of 're'\n",
168 | "a.append('yellow') # add new element to end of list\n",
169 | "a.extend(b) # add elements from list `b` to end of list `a`\n",
170 | "a.insert(1, 'yellow') # insert element in specified position\n",
171 | "'re' in ['re', 'bl'] # true if 're' in list\n",
172 | "'fi' not in ['re', 'bl'] # true if 'fi' not in list\n",
173 | "c.sort() # sort list in place\n",
174 | "h = sorted([3, 2, 1]) # returns sorted list\n",
175 | "i = a.pop(2) # remove and return item at index (default last)\n",
176 | "print(a, b, c, d, e, f, g, h, i)\n",
177 | "print(a, copy_of_a, another_a)"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "## Dictionaries\n",
185 | "\n",
186 | "A dictionary is an unordered collection of key-value pairs where keys must be unique."
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {
193 | "slideshow": {
194 | "slide_type": "-"
195 | }
196 | },
197 | "outputs": [],
198 | "source": [
199 | "a = {'A': 'Adenine', 'C': 'Cytosine'} # dictionary\n",
200 | "b = a['A'] # translate item\n",
201 | "c = a.get('N', 'no value found') # return default value\n",
202 | "'A' in a # true if dictionary a contains key 'A'\n",
203 | "a['G'] = 'Guanine' # assign new key, value pair to dictionary a\n",
204 | "a['T'] = 'Thymine' # assign new key, value pair to dictionary a\n",
205 | "print(a)\n",
206 | "d = a.keys() # get list of keys\n",
207 | "e = a.values() # get list of values\n",
208 | "f = a.items() # get list of key-value pairs\n",
209 | "print(b, c, d, e, f)\n",
210 | "del a['A'] # delete key and associated value\n",
211 | "print(a)"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "## Sets\n",
219 | "\n",
220 | "A set is an unordered collection of unique elements. "
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {
227 | "slideshow": {
228 | "slide_type": "-"
229 | }
230 | },
231 | "outputs": [],
232 | "source": [
233 | "a = {1, 2, 3} # initialise manually\n",
234 | "b = set(range(5)) # initialise from iterable\n",
235 | "c = set([1,2,2,2,2,4,5,6,6,6]) # initialise from list\n",
236 | "a.add(13) # add new element to set\n",
237 | "a.remove(13) # remove element from set\n",
238 | "2 in {1, 2, 3} # true if 2 in set\n",
239 | "5 not in {1, 2, 3} # true if 5 not in set\n",
240 | "d = a.union(b) # return the union of sets as a new set\n",
241 | "e = a.intersection(b) # return the intersection of sets as a new set\n",
242 | "print(a, b, c, d, e)"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "## Tuples\n",
250 | "\n",
251 | "A tuple is an ordered, immutable collection of elements. Tuples are similar to lists, but the elements in a tuple cannot be modified. Most of the list operations seen above can be used on tuples, except the assignment of a new value at a certain index position."
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "a = (123, 54, 92) # initialise manually\n",
261 | "b = () # empty tuple\n",
262 | "c = (\"Ala\",) # tuple of a single string (note the trailing \",\")\n",
263 | "d = (2, 3, False, \"Arg\", None) # a tuple of mixed types\n",
264 | "print(a, b, c, d)\n",
265 | "t = a, c, d # tuple packing\n",
266 | "x, y, z = t # tuple unpacking\n",
267 | "print(t, x, y, z)"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "## Strings\n",
275 | "\n",
276 | "A string is an ordered, immutable sequence of characters, much like a tuple of characters."
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {
283 | "slideshow": {
284 | "slide_type": "-"
285 | }
286 | },
287 | "outputs": [],
288 | "source": [
289 | "a = 'red' # assignment\n",
290 | "char = a[2] # access individual characters\n",
291 | "b = 'red' + 'blue' # string concatenation\n",
292 | "c = '1, 2, three'.split(',') # split string into list\n",
293 | "d = '.'.join(['1', '2', 'three']) # concatenate list into string\n",
294 | "print(a, char, b, c, d) \n",
295 | "dna = 'ATGTCACCGTTT' # assignment\n",
296 | "seq = list(dna) # convert string into list of characters\n",
297 | "e = len(dna) # return string length\n",
298 | "f = dna[2:5] # slice string\n",
299 | "g = dna.find('TGA') # substring location, return -1 when not found\n",
300 | "print(dna, seq, e, f, g)\n",
301 | "text = ' chrom start end ' # assignment\n",
302 | "print('>', text, '<')\n",
303 | "print('>', text.strip(), '<') # remove unwanted whitespace at both ends of the string\n",
304 | "print('{:.2f}'.format(0.4567)) # formatting string\n",
305 | "print('{gene:s}\\t{exp:+.2f}'.format(gene='Beta-Actin', exp=1.7))"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "## Conditional execution\n",
313 | "\n",
314 | "A conditional **`if/elif`** statement is used to specify that some block of code should only be executed if a conditional expression evaluates to `True`. There can be a final **`else`** statement to do something if all of the conditions are `False`.\n",
315 | "Python uses **indentation** to show which statements are in a block of code. "
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "a, b = 1, 2 # assign different values to a and b\n",
325 | "if a + b == 3:\n",
326 | " print('True')\n",
327 | "elif a + b == 1:\n",
328 | " print('False')\n",
329 | "else:\n",
330 | " print('?')"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "## Comparison operations"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "1 == 1 # equal value\n",
347 | "1 != 2 # not equal\n",
348 | "2 > 1 # larger\n",
349 | "2 < 1 # smaller\n",
350 | "\n",
351 | "1 != 2 and 2 < 3 # logical AND\n",
352 | "1 != 2 or 2 < 3 # logical OR\n",
353 | "not 1 == 2 # logical NOT\n",
354 | "\n",
355 | "a = list('ATGTCACCGTTT')\n",
356 | "b = a # same as a\n",
357 | "c = a[:] # copy of a\n",
358 | "'N' in a # test if character 'N' is in a\n",
359 | "\n",
360 | "print('a', a) # print a\n",
361 | "print('b', b) # print b\n",
362 | "print('c', c) # print c\n",
363 | "print('Is N in a?', 'N' in a)\n",
364 | "print('Are objects b and a point to the same memory address?', b is a)\n",
365 | "print('Are objects c and a point to the same memory address?', c is a)\n",
366 | "print('Are values of b and a identical?', b == a)\n",
367 | "print('Are values of c and a identical?', c == a)\n",
368 | "a[0] = 'N' # modify a \n",
369 | "print('a', a) # print a\n",
370 | "print('b', b) # print b\n",
371 | "print('c', c) # print c\n",
372 | "print('Is N in a?', 'N' in a)\n",
373 | "print('Are objects b and a point to the same memory address?', b is a)\n",
374 | "print('Are objects c and a point to the same memory address?', c is a)\n",
375 | "print('Are values of b and a identical?', b == a)\n",
376 | "print('Are values of c and a identical?', c == a)"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "## Loops\n",
384 | "\n",
385 | "There are two ways of creating loops in Python, the **`for` loop** and the **`while` loop**."
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "a = ['red', 'blue', 'green']\n",
395 | "for color in a:\n",
396 | " print(color)"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {},
403 | "outputs": [],
404 | "source": [
405 | "number = 1\n",
406 | "while number < 10:\n",
407 | " print(number)\n",
408 | " number += 1"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {},
414 | "source": [
415 | "Python has two ways of affecting the flow of the `for` or `while` loop inside the block. The **`break`** statement immediately causes all looping to finish, and execution is resumed at the next statement after the loop. The **`continue`** statement means that the rest of the code in the block is skipped for this particular item in the collection."
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "metadata": {},
422 | "outputs": [],
423 | "source": [
424 | "# break\n",
425 | "sequence = ['CAG','TAC','CAA','TAG','TAC','CAG','CAA']\n",
426 | "for codon in sequence:\n",
427 | " if codon == 'TAG':\n",
428 | " break # Quit looping at this point\n",
429 | " else:\n",
430 | " print(codon)\n",
431 | "\n",
432 | "# continue\n",
433 | "values = [10, -5, 3, -1, 7]\n",
434 | "total = 0\n",
435 | "for v in values:\n",
436 | " if v < 0:\n",
437 | " continue # Skip this iteration \n",
438 | " total += v\n",
439 | "print(values, 'sum:', sum(values), 'total:', total)"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "## Files\n",
447 | "\n",
448 | "To read from a file, your program needs to open the file and then read the contents of the file. You can read the entire contents of the file at once, or read the file line by line. The **`with`** statement makes sure the file is closed properly when the program has finished accessing the file.\n",
449 | "\n",
450 | "\n",
451 | "Passing the `'w'` argument to `open()` tells Python you want to write to the file. Be careful; this will erase the contents of the file if it already exists. Passing the `'a'` argument tells Python you want to append to the end of an existing file."
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": null,
457 | "metadata": {},
458 | "outputs": [],
459 | "source": [
460 | "# reading from file\n",
461 | "with open(\"data/genes.txt\") as f:\n",
462 | " for line in f:\n",
463 | " print(line.strip())\n",
464 | "\n",
465 | "# writing to a file\n",
466 | "with open('programming.txt', 'w') as f:\n",
467 | " f.write(\"I love programming in Python!\\n\")\n",
468 | " f.write(\"I love making scripts.\\n\")\n",
469 | " \n",
470 | "# appending to a file \n",
471 | "with open('programming.txt', 'a') as f:\n",
472 | " f.write(\"I love working with data.\\n\")"
473 | ]
474 | },
475 | {
476 | "cell_type": "markdown",
477 | "metadata": {},
478 | "source": [
479 | "## Getting help\n",
480 | "\n",
481 | "[The Python 3 Standard Library](https://docs.python.org/3/library/index.html) is the reference documentation of all libraries included in Python as well as built-in functions and data types."
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": null,
487 | "metadata": {},
488 | "outputs": [],
489 | "source": [
490 | "help(len) # help on built-in function\n",
491 | "help(list.extend) # help on list function"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": null,
497 | "metadata": {},
498 | "outputs": [],
499 | "source": [
500 | "# help within jupyter\n",
501 | "len?"
502 | ]
503 | }
504 | ],
505 | "metadata": {
506 | "celltoolbar": "Slideshow",
507 | "kernelspec": {
508 | "display_name": "Python 3",
509 | "language": "python",
510 | "name": "python3"
511 | },
512 | "language_info": {
513 | "codemirror_mode": {
514 | "name": "ipython",
515 | "version": 3
516 | },
517 | "file_extension": ".py",
518 | "mimetype": "text/x-python",
519 | "name": "python",
520 | "nbconvert_exporter": "python",
521 | "pygments_lexer": "ipython3",
522 | "version": "3.6.4"
523 | }
524 | },
525 | "nbformat": 4,
526 | "nbformat_minor": 1
527 | }
528 |
--------------------------------------------------------------------------------
/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to this project
2 |
3 | ## Steps for updating the course materials.
4 | When updating and changing the content of the course in jupyter notebook, make sure:
5 | - the course is not going to be run tomorrow by someone else who is not aware of your changes
6 | - your Jupyter notebook kernel is set to **Python3**
7 | - you remove all outputs of cells before committing your code (except in very special circumstances where you may wish to display the output)
8 | - Menu: Cell > All Outputs > Clear
9 | - you update the installation instructions if you are using a new library
10 |
11 | ## Steps for creating good issues or pull requests.
12 |
13 | ## Links to external documentation, mailing lists, or a code of conduct.
14 |
15 | ## Community and behavioral expectations.
16 |
--------------------------------------------------------------------------------
/data.txt:
--------------------------------------------------------------------------------
1 | Index Organism Score
2 | 1 Human 1.076
3 | 2 Mouse 1.202
4 | 3 Frog 2.2362
5 | 4 Fly 0.9853
6 |
--------------------------------------------------------------------------------
/data/gapminder_gdp_africa.csv:
--------------------------------------------------------------------------------
1 | country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
2 | Algeria,2449.008185,3013.976023,2550.81688,3246.991771,4182.663766,4910.416756,5745.160213,5681.358539,5023.216647,4797.295051,5288.040382,6223.367465
3 | Angola,3520.610273,3827.940465,4269.276742,5522.776375,5473.288005,3008.647355,2756.953672,2430.208311,2627.845685,2277.140884,2773.287312,4797.231267
4 | Benin,1062.7522,959.6010805,949.4990641,1035.831411,1085.796879,1029.161251,1277.897616,1225.85601,1191.207681,1232.975292,1372.877931,1441.284873
5 | Botswana,851.2411407,918.2325349,983.6539764,1214.709294,2263.611114,3214.857818,4551.14215,6205.88385,7954.111645,8647.142313,11003.60508,12569.85177
6 | Burkina Faso,543.2552413,617.1834648,722.5120206,794.8265597,854.7359763,743.3870368,807.1985855,912.0631417,931.7527731,946.2949618,1037.645221,1217.032994
7 | Burundi,339.2964587,379.5646281,355.2032273,412.9775136,464.0995039,556.1032651,559.603231,621.8188189,631.6998778,463.1151478,446.4035126,430.0706916
8 | Cameroon,1172.667655,1313.048099,1399.607441,1508.453148,1684.146528,1783.432873,2367.983282,2602.664206,1793.163278,1694.337469,1934.011449,2042.09524
9 | Central African Republic,1071.310713,1190.844328,1193.068753,1136.056615,1070.013275,1109.374338,956.7529907,844.8763504,747.9055252,740.5063317,738.6906068,706.016537
10 | Chad,1178.665927,1308.495577,1389.817618,1196.810565,1104.103987,1133.98495,797.9081006,952.386129,1058.0643,1004.961353,1156.18186,1704.063724
11 | Comoros,1102.990936,1211.148548,1406.648278,1876.029643,1937.577675,1172.603047,1267.100083,1315.980812,1246.90737,1173.618235,1075.811558,986.1478792
12 | Congo Dem. Rep.,780.5423257,905.8602303,896.3146335,861.5932424,904.8960685,795.757282,673.7478181,672.774812,457.7191807,312.188423,241.1658765,277.5518587
13 | Congo Rep.,2125.621418,2315.056572,2464.783157,2677.939642,3213.152683,3259.178978,4879.507522,4201.194937,4016.239529,3484.164376,3484.06197,3632.557798
14 | Cote d'Ivoire,1388.594732,1500.895925,1728.869428,2052.050473,2378.201111,2517.736547,2602.710169,2156.956069,1648.073791,1786.265407,1648.800823,1544.750112
15 | Djibouti,2669.529475,2864.969076,3020.989263,3020.050513,3694.212352,3081.761022,2879.468067,2880.102568,2377.156192,1895.016984,1908.260867,2082.481567
16 | Egypt,1418.822445,1458.915272,1693.335853,1814.880728,2024.008147,2785.493582,3503.729636,3885.46071,3794.755195,4173.181797,4754.604414,5581.180998
17 | Equatorial Guinea,375.6431231,426.0964081,582.8419714,915.5960025,672.4122571,958.5668124,927.8253427,966.8968149,1132.055034,2814.480755,7703.4959,12154.08975
18 | Eritrea,328.9405571,344.1618859,380.9958433,468.7949699,514.3242082,505.7538077,524.8758493,521.1341333,582.8585102,913.47079,765.3500015,641.3695236
19 | Ethiopia,362.1462796,378.9041632,419.4564161,516.1186438,566.2439442,556.8083834,577.8607471,573.7413142,421.3534653,515.8894013,530.0535319,690.8055759
20 | Gabon,4293.476475,4976.198099,6631.459222,8358.761987,11401.94841,21745.57328,15113.36194,11864.40844,13522.15752,14722.84188,12521.71392,13206.48452
21 | Gambia,485.2306591,520.9267111,599.650276,734.7829124,756.0868363,884.7552507,835.8096108,611.6588611,665.6244126,653.7301704,660.5855997,752.7497265
22 | Ghana,911.2989371,1043.561537,1190.041118,1125.69716,1178.223708,993.2239571,876.032569,847.0061135,925.060154,1005.245812,1111.984578,1327.60891
23 | Guinea,510.1964923,576.2670245,686.3736739,708.7595409,741.6662307,874.6858643,857.2503577,805.5724718,794.3484384,869.4497668,945.5835837,942.6542111
24 | Guinea-Bissau,299.850319,431.7904566,522.0343725,715.5806402,820.2245876,764.7259628,838.1239671,736.4153921,745.5398706,796.6644681,575.7047176,579.231743
25 | Kenya,853.540919,944.4383152,896.9663732,1056.736457,1222.359968,1267.613204,1348.225791,1361.936856,1341.921721,1360.485021,1287.514732,1463.249282
26 | Lesotho,298.8462121,335.9971151,411.8006266,498.6390265,496.5815922,745.3695408,797.2631074,773.9932141,977.4862725,1186.147994,1275.184575,1569.331442
27 | Liberia,575.5729961,620.9699901,634.1951625,713.6036483,803.0054535,640.3224383,572.1995694,506.1138573,636.6229191,609.1739508,531.4823679,414.5073415
28 | Libya,2387.54806,3448.284395,6757.030816,18772.75169,21011.49721,21951.21176,17364.27538,11770.5898,9640.138501,9467.446056,9534.677467,12057.49928
29 | Madagascar,1443.011715,1589.20275,1643.38711,1634.047282,1748.562982,1544.228586,1302.878658,1155.441948,1040.67619,986.2958956,894.6370822,1044.770126
30 | Malawi,369.1650802,416.3698064,427.9010856,495.5147806,584.6219709,663.2236766,632.8039209,635.5173634,563.2000145,692.2758103,665.4231186,759.3499101
31 | Mali,452.3369807,490.3821867,496.1743428,545.0098873,581.3688761,686.3952693,618.0140641,684.1715576,739.014375,790.2579846,951.4097518,1042.581557
32 | Mauritania,743.1159097,846.1202613,1055.896036,1421.145193,1586.851781,1497.492223,1481.150189,1421.603576,1361.369784,1483.136136,1579.019543,1803.151496
33 | Mauritius,1967.955707,2034.037981,2529.067487,2475.387562,2575.484158,3710.982963,3688.037739,4783.586903,6058.253846,7425.705295,9021.815894,10956.99112
34 | Morocco,1688.20357,1642.002314,1566.353493,1711.04477,1930.194975,2370.619976,2702.620356,2755.046991,2948.047252,2982.101858,3258.495584,3820.17523
35 | Mozambique,468.5260381,495.5868333,556.6863539,566.6691539,724.9178037,502.3197334,462.2114149,389.8761846,410.8968239,472.3460771,633.6179466,823.6856205
36 | Namibia,2423.780443,2621.448058,3173.215595,3793.694753,3746.080948,3876.485958,4191.100511,3693.731337,3804.537999,3899.52426,4072.324751,4811.060429
37 | Niger,761.879376,835.5234025,997.7661127,1054.384891,954.2092363,808.8970728,909.7221354,668.3000228,581.182725,580.3052092,601.0745012,619.6768924
38 | Nigeria,1077.281856,1100.592563,1150.927478,1014.514104,1698.388838,1981.951806,1576.97375,1385.029563,1619.848217,1624.941275,1615.286395,2013.977305
39 | Reunion,2718.885295,2769.451844,3173.72334,4021.175739,5047.658563,4319.804067,5267.219353,5303.377488,6101.255823,6071.941411,6316.1652,7670.122558
40 | Rwanda,493.3238752,540.2893983,597.4730727,510.9637142,590.5806638,670.0806011,881.5706467,847.991217,737.0685949,589.9445051,785.6537648,863.0884639
41 | Sao Tome and Principe,879.5835855,860.7369026,1071.551119,1384.840593,1532.985254,1737.561657,1890.218117,1516.525457,1428.777814,1339.076036,1353.09239,1598.435089
42 | Senegal,1450.356983,1567.653006,1654.988723,1612.404632,1597.712056,1561.769116,1518.479984,1441.72072,1367.899369,1392.368347,1519.635262,1712.472136
43 | Sierra Leone,879.7877358,1004.484437,1116.639877,1206.043465,1353.759762,1348.285159,1465.010784,1294.447788,1068.696278,574.6481576,699.489713,862.5407561
44 | Somalia,1135.749842,1258.147413,1369.488336,1284.73318,1254.576127,1450.992513,1176.807031,1093.244963,926.9602964,930.5964284,882.0818218,926.1410683
45 | South Africa,4725.295531,5487.104219,5768.729717,7114.477971,7765.962636,8028.651439,8568.266228,7825.823398,7225.069258,7479.188244,7710.946444,9269.657808
46 | Sudan,1615.991129,1770.337074,1959.593767,1687.997641,1659.652775,2202.988423,1895.544073,1507.819159,1492.197043,1632.210764,1993.398314,2602.394995
47 | Swaziland,1148.376626,1244.708364,1856.182125,2613.101665,3364.836625,3781.410618,3895.384018,3984.839812,3553.0224,3876.76846,4128.116943,4513.480643
48 | Tanzania,716.6500721,698.5356073,722.0038073,848.2186575,915.9850592,962.4922932,874.2426069,831.8220794,825.682454,789.1862231,899.0742111,1107.482182
49 | Togo,859.8086567,925.9083202,1067.53481,1477.59676,1649.660188,1532.776998,1344.577953,1202.201361,1034.298904,982.2869243,886.2205765,882.9699438
50 | Tunisia,1468.475631,1395.232468,1660.30321,1932.360167,2753.285994,3120.876811,3560.233174,3810.419296,4332.720164,4876.798614,5722.895655,7092.923025
51 | Uganda,734.753484,774.3710692,767.2717398,908.9185217,950.735869,843.7331372,682.2662268,617.7244065,644.1707969,816.559081,927.7210018,1056.380121
52 | Zambia,1147.388831,1311.956766,1452.725766,1777.077318,1773.498265,1588.688299,1408.678565,1213.315116,1210.884633,1071.353818,1071.613938,1271.211593
53 | Zimbabwe,406.8841148,518.7642681,527.2721818,569.7950712,799.3621758,685.5876821,788.8550411,706.1573059,693.4207856,792.4499603,672.0386227,469.7092981
54 |
--------------------------------------------------------------------------------
/data/gapminder_gdp_americas.csv:
--------------------------------------------------------------------------------
1 | continent,country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
2 | Americas,Argentina,5911.315053,6856.856212,7133.166023,8052.953021,9443.038526,10079.02674,8997.897412,9139.671389,9308.41871,10967.28195,8797.640716,12779.37964
3 | Americas,Bolivia,2677.326347,2127.686326,2180.972546,2586.886053,2980.331339,3548.097832,3156.510452,2753.69149,2961.699694,3326.143191,3413.26269,3822.137084
4 | Americas,Brazil,2108.944355,2487.365989,3336.585802,3429.864357,4985.711467,6660.118654,7030.835878,7807.095818,6950.283021,7957.980824,8131.212843,9065.800825
5 | Americas,Canada,11367.16112,12489.95006,13462.48555,16076.58803,18970.57086,22090.88306,22898.79214,26626.51503,26342.88426,28954.92589,33328.96507,36319.23501
6 | Americas,Chile,3939.978789,4315.622723,4519.094331,5106.654313,5494.024437,4756.763836,5095.665738,5547.063754,7596.125964,10118.05318,10778.78385,13171.63885
7 | Americas,Colombia,2144.115096,2323.805581,2492.351109,2678.729839,3264.660041,3815.80787,4397.575659,4903.2191,5444.648617,6117.361746,5755.259962,7006.580419
8 | Americas,Costa Rica,2627.009471,2990.010802,3460.937025,4161.727834,5118.146939,5926.876967,5262.734751,5629.915318,6160.416317,6677.045314,7723.447195,9645.06142
9 | Americas,Cuba,5586.53878,6092.174359,5180.75591,5690.268015,5305.445256,6380.494966,7316.918107,7532.924763,5592.843963,5431.990415,6340.646683,8948.102923
10 | Americas,Dominican Republic,1397.717137,1544.402995,1662.137359,1653.723003,2189.874499,2681.9889,2861.092386,2899.842175,3044.214214,3614.101285,4563.808154,6025.374752
11 | Americas,Ecuador,3522.110717,3780.546651,4086.114078,4579.074215,5280.99471,6679.62326,7213.791267,6481.776993,7103.702595,7429.455877,5773.044512,6873.262326
12 | Americas,El Salvador,3048.3029,3421.523218,3776.803627,4358.595393,4520.246008,5138.922374,4098.344175,4140.442097,4444.2317,5154.825496,5351.568666,5728.353514
13 | Americas,Guatemala,2428.237769,2617.155967,2750.364446,3242.531147,4031.408271,4879.992748,4820.49479,4246.485974,4439.45084,4684.313807,4858.347495,5186.050003
14 | Americas,Haiti,1840.366939,1726.887882,1796.589032,1452.057666,1654.456946,1874.298931,2011.159549,1823.015995,1456.309517,1341.726931,1270.364932,1201.637154
15 | Americas,Honduras,2194.926204,2220.487682,2291.156835,2538.269358,2529.842345,3203.208066,3121.760794,3023.096699,3081.694603,3160.454906,3099.72866,3548.330846
16 | Americas,Jamaica,2898.530881,4756.525781,5246.107524,6124.703451,7433.889293,6650.195573,6068.05135,6351.237495,7404.923685,7121.924704,6994.774861,7320.880262
17 | Americas,Mexico,3478.125529,4131.546641,4581.609385,5754.733883,6809.40669,7674.929108,9611.147541,8688.156003,9472.384295,9767.29753,10742.44053,11977.57496
18 | Americas,Nicaragua,3112.363948,3457.415947,3634.364406,4643.393534,4688.593267,5486.371089,3470.338156,2955.984375,2170.151724,2253.023004,2474.548819,2749.320965
19 | Americas,Panama,2480.380334,2961.800905,3536.540301,4421.009084,5364.249663,5351.912144,7009.601598,7034.779161,6618.74305,7113.692252,7356.031934,9809.185636
20 | Americas,Paraguay,1952.308701,2046.154706,2148.027146,2299.376311,2523.337977,3248.373311,4258.503604,3998.875695,4196.411078,4247.400261,3783.674243,4172.838464
21 | Americas,Peru,3758.523437,4245.256698,4957.037982,5788.09333,5937.827283,6281.290855,6434.501797,6360.943444,4446.380924,5838.347657,5909.020073,7408.905561
22 | Americas,Puerto Rico,3081.959785,3907.156189,5108.34463,6929.277714,9123.041742,9770.524921,10330.98915,12281.34191,14641.58711,16999.4333,18855.60618,19328.70901
23 | Americas,Trinidad and Tobago,3023.271928,4100.3934,4997.523971,5621.368472,6619.551419,7899.554209,9119.528607,7388.597823,7370.990932,8792.573126,11460.60023,18008.50924
24 | Americas,United States,13990.48208,14847.12712,16173.14586,19530.36557,21806.03594,24072.63213,25009.55914,29884.35041,32003.93224,35767.43303,39097.09955,42951.65309
25 | Americas,Uruguay,5716.766744,6150.772969,5603.357717,5444.61962,5703.408898,6504.339663,6920.223051,7452.398969,8137.004775,9230.240708,7727.002004,10611.46299
26 | Americas,Venezuela,7689.799761,9802.466526,8422.974165,9541.474188,10505.25966,13143.95095,11152.41011,9883.584648,10733.92631,10165.49518,8605.047831,11415.80569
27 |
--------------------------------------------------------------------------------
/data/gapminder_gdp_asia.csv:
--------------------------------------------------------------------------------
1 | country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
2 | Afghanistan,779.4453145,820.8530296,853.10071,836.1971382,739.9811058,786.11336,978.0114388,852.3959448,649.3413952,635.341351,726.7340548,974.5803384
3 | Bahrain,9867.084765,11635.79945,12753.27514,14804.6727,18268.65839,19340.10196,19211.14731,18524.02406,19035.57917,20292.01679,23403.55927,29796.04834
4 | Bangladesh,684.2441716,661.6374577,686.3415538,721.1860862,630.2336265,659.8772322,676.9818656,751.9794035,837.8101643,972.7700352,1136.39043,1391.253792
5 | Cambodia,368.4692856,434.0383364,496.9136476,523.4323142,421.6240257,524.9721832,624.4754784,683.8955732,682.3031755,734.28517,896.2260153,1713.778686
6 | China,400.4486107,575.9870009,487.6740183,612.7056934,676.9000921,741.2374699,962.4213805,1378.904018,1655.784158,2289.234136,3119.280896,4959.114854
7 | Hong Kong China,3054.421209,3629.076457,4692.648272,6197.962814,8315.928145,11186.14125,14560.53051,20038.47269,24757.60301,28377.63219,30209.01516,39724.97867
8 | India,546.5657493,590.061996,658.3471509,700.7706107,724.032527,813.337323,855.7235377,976.5126756,1164.406809,1458.817442,1746.769454,2452.210407
9 | Indonesia,749.6816546,858.9002707,849.2897701,762.4317721,1111.107907,1382.702056,1516.872988,1748.356961,2383.140898,3119.335603,2873.91287,3540.651564
10 | Iran,3035.326002,3290.257643,4187.329802,5906.731805,9613.818607,11888.59508,7608.334602,6642.881371,7235.653188,8263.590301,9240.761975,11605.71449
11 | Iraq,4129.766056,6229.333562,8341.737815,8931.459811,9576.037596,14688.23507,14517.90711,11643.57268,3745.640687,3076.239795,4390.717312,4471.061906
12 | Israel,4086.522128,5385.278451,7105.630706,8393.741404,12786.93223,13306.61921,15367.0292,17122.47986,18051.52254,20896.60924,21905.59514,25523.2771
13 | Japan,3216.956347,4317.694365,6576.649461,9847.788607,14778.78636,16610.37701,19384.10571,22375.94189,26824.89511,28816.58499,28604.5919,31656.06806
14 | Jordan,1546.907807,1886.080591,2348.009158,2741.796252,2110.856309,2852.351568,4161.415959,4448.679912,3431.593647,3645.379572,3844.917194,4519.461171
15 | Korea Dem. Rep.,1088.277758,1571.134655,1621.693598,2143.540609,3701.621503,4106.301249,4106.525293,4106.492315,3726.063507,1690.756814,1646.758151,1593.06548
16 | Korea Rep.,1030.592226,1487.593537,1536.344387,2029.228142,3030.87665,4657.22102,5622.942464,8533.088805,12104.27872,15993.52796,19233.98818,23348.13973
17 | Kuwait,108382.3529,113523.1329,95458.11176,80894.88326,109347.867,59265.47714,31354.03573,28118.42998,34932.91959,40300.61996,35110.10566,47306.98978
18 | Lebanon,4834.804067,6089.786934,5714.560611,6006.983042,7486.384341,8659.696836,7640.519521,5377.091329,6890.806854,8754.96385,9313.93883,10461.05868
19 | Malaysia,1831.132894,1810.066992,2036.884944,2277.742396,2849.09478,3827.921571,4920.355951,5249.802653,7277.912802,10132.90964,10206.97794,12451.6558
20 | Mongolia,786.5668575,912.6626085,1056.353958,1226.04113,1421.741975,1647.511665,2000.603139,2338.008304,1785.402016,1902.2521,2140.739323,3095.772271
21 | Myanmar,331,350,388,349,357,371,424,385,347,415,611,944
22 | Nepal,545.8657229,597.9363558,652.3968593,676.4422254,674.7881296,694.1124398,718.3730947,775.6324501,897.7403604,1010.892138,1057.206311,1091.359778
23 | Oman,1828.230307,2242.746551,2924.638113,4720.942687,10618.03855,11848.34392,12954.79101,18115.22313,18616.70691,19702.05581,19774.83687,22316.19287
24 | Pakistan,684.5971438,747.0835292,803.3427418,942.4082588,1049.938981,1175.921193,1443.429832,1704.686583,1971.829464,2049.350521,2092.712441,2605.94758
25 | Philippines,1272.880995,1547.944844,1649.552153,1814.12743,1989.37407,2373.204287,2603.273765,2189.634995,2279.324017,2536.534925,2650.921068,3190.481016
26 | Saudi Arabia,6459.554823,8157.591248,11626.41975,16903.04886,24837.42865,34167.7626,33693.17525,21198.26136,24841.61777,20586.69019,19014.54118,21654.83194
27 | Singapore,2315.138227,2843.104409,3674.735572,4977.41854,8597.756202,11210.08948,15169.16112,18861.53081,24769.8912,33519.4766,36023.1054,47143.17964
28 | Sri Lanka,1083.53203,1072.546602,1074.47196,1135.514326,1213.39553,1348.775651,1648.079789,1876.766827,2153.739222,2664.477257,3015.378833,3970.095407
29 | Syria,1643.485354,2117.234893,2193.037133,1881.923632,2571.423014,3195.484582,3761.837715,3116.774285,3340.542768,4014.238972,4090.925331,4184.548089
30 | Taiwan,1206.947913,1507.86129,1822.879028,2643.858681,4062.523897,5596.519826,7426.354774,11054.56175,15215.6579,20206.82098,23235.42329,28718.27684
31 | Thailand,757.7974177,793.5774148,1002.199172,1295.46066,1524.358936,1961.224635,2393.219781,2982.653773,4616.896545,5852.625497,5913.187529,7458.396327
32 | Vietnam,605.0664917,676.2854478,772.0491602,637.1232887,699.5016441,713.5371196,707.2357863,820.7994449,989.0231487,1385.896769,1764.456677,2441.576404
33 | West Bank and Gaza,1515.592329,1827.067742,2198.956312,2649.715007,3133.409277,3682.831494,4336.032082,5107.197384,6017.654756,7110.667619,4515.487575,3025.349798
34 | Yemen Rep.,781.7175761,804.8304547,825.6232006,862.4421463,1265.047031,1829.765177,1977.55701,1971.741538,1879.496673,2117.484526,2234.820827,2280.769906
35 |
--------------------------------------------------------------------------------
/data/gapminder_gdp_europe.csv:
--------------------------------------------------------------------------------
1 | country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
2 | Albania,1601.056136,1942.284244,2312.888958,2760.196931,3313.422188,3533.00391,3630.880722,3738.932735,2497.437901,3193.054604,4604.211737,5937.029526
3 | Austria,6137.076492,8842.59803,10750.72111,12834.6024,16661.6256,19749.4223,21597.08362,23687.82607,27042.01868,29095.92066,32417.60769,36126.4927
4 | Belgium,8343.105127,9714.960623,10991.20676,13149.04119,16672.14356,19117.97448,20979.84589,22525.56308,25575.57069,27561.19663,30485.88375,33692.60508
5 | Bosnia and Herzegovina,973.5331948,1353.989176,1709.683679,2172.352423,2860.16975,3528.481305,4126.613157,4314.114757,2546.781445,4766.355904,6018.975239,7446.298803
6 | Bulgaria,2444.286648,3008.670727,4254.337839,5577.0028,6597.494398,7612.240438,8224.191647,8239.854824,6302.623438,5970.38876,7696.777725,10680.79282
7 | Croatia,3119.23652,4338.231617,5477.890018,6960.297861,9164.090127,11305.38517,13221.82184,13822.58394,8447.794873,9875.604515,11628.38895,14619.22272
8 | Czech Republic,6876.14025,8256.343918,10136.86713,11399.44489,13108.4536,14800.16062,15377.22855,16310.4434,14297.02122,16048.51424,17596.21022,22833.30851
9 | Denmark,9692.385245,11099.65935,13583.31351,15937.21123,18866.20721,20422.9015,21688.04048,25116.17581,26406.73985,29804.34567,32166.50006,35278.41874
10 | Finland,6424.519071,7545.415386,9371.842561,10921.63626,14358.8759,15605.42283,18533.15761,21141.01223,20647.16499,23723.9502,28204.59057,33207.0844
11 | France,7029.809327,8662.834898,10560.48553,12999.91766,16107.19171,18292.63514,20293.89746,22066.44214,24703.79615,25889.78487,28926.03234,30470.0167
12 | Germany,7144.114393,10187.82665,12902.46291,14745.62561,18016.18027,20512.92123,22031.53274,24639.18566,26505.30317,27788.88416,30035.80198,32170.37442
13 | Greece,3530.690067,4916.299889,6017.190733,8513.097016,12724.82957,14195.52428,15268.42089,16120.52839,17541.49634,18747.69814,22514.2548,27538.41188
14 | Hungary,5263.673816,6040.180011,7550.359877,9326.64467,10168.65611,11674.83737,12545.99066,12986.47998,10535.62855,11712.7768,14843.93556,18008.94444
15 | Iceland,7267.688428,9244.001412,10350.15906,13319.89568,15798.06362,19654.96247,23269.6075,26923.20628,25144.39201,28061.09966,31163.20196,36180.78919
16 | Ireland,5210.280328,5599.077872,6631.597314,7655.568963,9530.772896,11150.98113,12618.32141,13872.86652,17558.81555,24521.94713,34077.04939,40675.99635
17 | Italy,4931.404155,6248.656232,8243.58234,10022.40131,12269.27378,14255.98475,16537.4835,19207.23482,22013.64486,24675.02446,27968.09817,28569.7197
18 | Montenegro,2647.585601,3682.259903,4649.593785,5907.850937,7778.414017,9595.929905,11222.58762,11732.51017,7003.339037,6465.613349,6557.194282,9253.896111
19 | Netherlands,8941.571858,11276.19344,12790.84956,15363.25136,18794.74567,21209.0592,21399.46046,23651.32361,26790.94961,30246.13063,33724.75778,36797.93332
20 | Norway,10095.42172,11653.97304,13450.40151,16361.87647,18965.05551,23311.34939,26298.63531,31540.9748,33965.66115,41283.16433,44683.97525,49357.19017
21 | Poland,4029.329699,4734.253019,5338.752143,6557.152776,8006.506993,9508.141454,8451.531004,9082.351172,7738.881247,10159.58368,12002.23908,15389.92468
22 | Portugal,3068.319867,3774.571743,4727.954889,6361.517993,9022.247417,10172.48572,11753.84291,13039.30876,16207.26663,17641.03156,19970.90787,20509.64777
23 | Romania,3144.613186,3943.370225,4734.997586,6470.866545,8011.414402,9356.39724,9605.314053,9696.273295,6598.409903,7346.547557,7885.360081,10808.47561
24 | Serbia,3581.459448,4981.090891,6289.629157,7991.707066,10522.06749,12980.66956,15181.0927,15870.87851,9325.068238,7914.320304,7236.075251,9786.534714
25 | Slovak Republic,5074.659104,6093.26298,7481.107598,8412.902397,9674.167626,10922.66404,11348.54585,12037.26758,9498.467723,12126.23065,13638.77837,18678.31435
26 | Slovenia,4215.041741,5862.276629,7402.303395,9405.489397,12383.4862,15277.03017,17866.72175,18678.53492,14214.71681,17161.10735,20660.01936,25768.25759
27 | Spain,3834.034742,4564.80241,5693.843879,7993.512294,10638.75131,13236.92117,13926.16997,15764.98313,18603.06452,20445.29896,24835.47166,28821.0637
28 | Sweden,8527.844662,9911.878226,12329.44192,15258.29697,17832.02464,18855.72521,20667.38125,23586.92927,23880.01683,25266.59499,29341.63093,33859.74835
29 | Switzerland,14734.23275,17909.48973,20431.0927,22966.14432,27195.11304,26982.29052,28397.71512,30281.70459,31871.5303,32135.32301,34480.95771,37506.41907
30 | Turkey,1969.10098,2218.754257,2322.869908,2826.356387,3450.69638,4269.122326,4241.356344,5089.043686,5678.348271,6601.429915,6508.085718,8458.276384
31 | United Kingdom,9979.508487,11283.17795,12477.17707,14142.85089,15895.11641,17428.74846,18232.42452,21664.78767,22705.09254,26074.53136,29478.99919,33203.26128
32 |
--------------------------------------------------------------------------------
/data/gapminder_gdp_oceania.csv:
--------------------------------------------------------------------------------
1 | country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
2 | Australia,10039.59564,10949.64959,12217.22686,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744
3 | New Zealand,10556.57566,12247.39532,13175.678,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911
4 |
--------------------------------------------------------------------------------
/data/genes.txt:
--------------------------------------------------------------------------------
1 | gene chrom start end
2 | BRCA2 13 32889611 32973805
3 | TNFAIP3 6 138188351 138204449
4 | TCF7 5 133450402 133487556
5 |
--------------------------------------------------------------------------------
/data/genes_withstrand.txt:
--------------------------------------------------------------------------------
1 | chrom gene start end strand
2 | 13 BRCA2 32889611 32973805 +
3 | 6 TNFAIP3 138188351 138204449 +
4 | 5 TCF7 133450402 133487556 -
5 |
--------------------------------------------------------------------------------
/data/glpa.fa:
--------------------------------------------------------------------------------
1 | >swissprot|P02724|GLPA_HUMAN Glycophorin-A;
2 | MYGKIIFVLLLSEIVSISASSTTGVAMHTSTSSSVTKSYISSQTNDTHKRDTYAATPRAH
3 | EVSEISVRTVYPPEEETGERVQLAHHFSEPEITLIIFGVMAGVIGTILLISYGIRRLIKK
4 | SPSDVKPLPSPDTDVPLSSVEIENPETSDQ
5 |
6 |
--------------------------------------------------------------------------------
/data/mydata.txt:
--------------------------------------------------------------------------------
1 | Index Organism Score
2 | 1 Human 1.076
3 | 2 Mouse 1.202
4 | 3 Frog 2.2362
5 | 4 Fly 0.9853
6 |
--------------------------------------------------------------------------------
/dict_data.txt:
--------------------------------------------------------------------------------
1 | Index Score Organism
2 | 1 1.076 Human
3 | 2 1.202 Mouse
4 | 3 2.2362 Frog
5 | 4 0.9853 Fly
6 |
--------------------------------------------------------------------------------
/img/mind_maps.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycam/python-data-science/73e50c3fb5e0b942d83487d19c42245a8715c9d3/img/mind_maps.key
--------------------------------------------------------------------------------
/img/mind_maps/mind_maps.001.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycam/python-data-science/73e50c3fb5e0b942d83487d19c42245a8715c9d3/img/mind_maps/mind_maps.001.jpeg
--------------------------------------------------------------------------------
/img/mind_maps/mind_maps.002.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycam/python-data-science/73e50c3fb5e0b942d83487d19c42245a8715c9d3/img/mind_maps/mind_maps.002.jpeg
--------------------------------------------------------------------------------
/img/mind_maps/mind_maps.003.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycam/python-data-science/73e50c3fb5e0b942d83487d19c42245a8715c9d3/img/mind_maps/mind_maps.003.jpeg
--------------------------------------------------------------------------------
/img/mind_maps/mind_maps.004.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycam/python-data-science/73e50c3fb5e0b942d83487d19c42245a8715c9d3/img/mind_maps/mind_maps.004.jpeg
--------------------------------------------------------------------------------
/img/python_shell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pycam/python-data-science/73e50c3fb5e0b942d83487d19c42245a8715c9d3/img/python_shell.png
--------------------------------------------------------------------------------
/install/2to3_nb.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | To run: python3 2to3_nb.py notebook-or-directory
4 | """
5 | # Authors: Thomas Kluyver, Fernando Perez
6 | # See: https://gist.github.com/takluyver/c8839593c615bb2f6e80
7 |
8 | import argparse
9 | import pathlib
10 | from nbformat import read, write
11 |
12 | import lib2to3
13 | from lib2to3.refactor import RefactoringTool, get_fixers_from_package
14 |
15 |
def refactor_notebook_inplace(rt, path):
    """Run 2to3 over every code cell of a notebook and overwrite the file.

    Besides the cell sources, the notebook metadata is switched to a
    Python 3 kernelspec and, when present, the ``language_info`` entry is
    updated to match.

    Args:
        rt: refactoring tool whose ``refactor_string`` converts one cell.
        path: notebook file that is read and then written back in place.
    """

    def refactor_cell(src, name):
        """Return ``src`` converted by 2to3, or ``src`` itself if unparsable.

        ``name`` is the label handed to ``refactor_string`` for this cell.
        """
        try:
            # A newline is appended for the refactoring call and stripped
            # again from the result below.
            tree = rt.refactor_string(src + '\n', name)
        except (lib2to3.pgen2.parse.ParseError,
                lib2to3.pgen2.tokenize.TokenError):
            # Cells that are not valid Python are left untouched.
            return src
        return str(tree)[:-1]

    print("Refactoring:", path)
    nb = read(str(path), as_version=4)

    # Run 2to3 on code
    for i, cell in enumerate(nb.cells, start=1):
        if cell.cell_type != 'code':
            continue

        # Normalise non-numeric execution counts to None.
        if cell.execution_count in (' ', '*'):
            cell.execution_count = None

        name = str(path) + '/cell-%d' % i
        if cell.source.startswith('%%'):
            # For cell magics, try to refactor the body, in case it's
            # valid python. A magic line without any body has nothing to
            # refactor, so such a cell is kept exactly as it is.
            head, newline, body = cell.source.partition('\n')
            if newline:
                cell.source = head + '\n' + refactor_cell(body, name)
        else:
            cell.source = refactor_cell(cell.source, name)

    # Update notebook metadata
    nb.metadata.kernelspec = {
        'display_name': 'Python 3',
        'name': 'python3',
        'language': 'python',
    }
    if 'language_info' in nb.metadata:
        nb.metadata.language_info.codemirror_mode = {
            'name': 'ipython',
            'version': 3,
        }
        nb.metadata.language_info.pygments_lexer = 'ipython3'
        nb.metadata.language_info.pop('version', None)

    write(nb, str(path))
62 |
def main(argv=None):
    """Convert one notebook, or every notebook found under a directory.

    Args:
        argv: argument list handed to ``parse_args``; defaults to ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=pathlib.Path,
                        help="Notebook or directory containing notebooks")
    target = parser.parse_args(argv).path

    tool = RefactoringTool(set(get_fixers_from_package('lib2to3.fixes')))

    # A directory is searched recursively; anything else is treated as a
    # single notebook.
    notebooks = target.rglob('*.ipynb') if target.is_dir() else [target]
    for nb_path in notebooks:
        refactor_notebook_inplace(tool, nb_path)
78 |
79 | if __name__ == '__main__':
80 | main()
81 |
--------------------------------------------------------------------------------
/install/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu
LABEL maintainer="Mark Dunning"

# Build steps already run as root, so no sudo is needed. Updating and
# installing in one layer keeps the package index and the install in sync.
RUN apt-get update && \
    apt-get install -y ipython ipython-notebook git
RUN git clone https://github.com/pycam/python-intro.git

EXPOSE 8888
ENV USE_HTTP=0

WORKDIR python-intro/
# Start the notebook server when a container is run, not while the image is
# being built. The exec form also passes --ip=* through without shell globbing.
CMD ["ipython", "notebook", "--no-browser", "--port", "8888", "--ip=*", "Introduction_to_python_session_1.ipynb"]
13 |
--------------------------------------------------------------------------------
/install/vbox_installer.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# lubuntu LTS 14.04 VirtualBox installer based on lubuntu-14.04.2-desktop-i386
# computer name: crukci-training-vm; user: training; password: admin123
#
# NOTE(review): this reads as a checklist to be followed by hand rather than
# an unattended script - `sudo su -` starts an interactive root shell, the
# apt-get calls are not given -y, and several steps below are GUI actions that
# exist only as comments. Confirm before running it non-interactively.

# Become root, then install editors, git and the Python packages.
sudo su -
apt-get install gedit
apt-get install vim
apt-get install git
apt-get install python-pip
apt-get install python-zmq
apt-get install python-matplotlib
apt-get install python-biopython
apt-get install ncbi-blast+

# Install VirtualBox Additions
# From the VirtualBox menu of lubuntu go to Devices > Insert Guest Additions CD image... and do
cd /media/training/VBOXADDITIONS_4.3.26_98988
sudo ./VBoxLinuxAdditions.run

# To increase screen resolution
# Start > Preferences > Additional Drivers: Using x86 virtualization solution... and click Apply Changes
# Then Start > Preferences > Monitor Settings and select 1440x1050 and click Save and Apply

pip install ipython[notebook]

# Remove packages that are no longer needed and clear the apt cache.
apt-get autoremove
apt-get clean


# Create the account used for the steps after "login as pycam" below.
adduser pycam # password: pycam123

# presumably leaves the root shell opened by `sudo su -` above
exit

# login as pycam --------------------------------------------------------------

git clone https://github.com/pycam/python-intro.git course

# Add ipython at startup from lubuntu menu do to...
# Preferences > Default applications for LXSession then tab Autostart and add:
# /usr/local/bin/ipython notebook --no-browser --port=8888 --ip=127.0.0.1 /home/pycam/course/

# Add bookmarks into firefox: (1) pycam.github.io (2) 127.0.0.1:8888
43 |
44 |
45 |
--------------------------------------------------------------------------------
/my_first_module.py:
--------------------------------------------------------------------------------
def say_hello(user):
    """Print a greeting for ``user`` on standard output."""
    greeting = ('Hello', user, '!')
    # print() joins the parts with single spaces, e.g. "Hello Ada !".
    print(*greeting)
3 |
--------------------------------------------------------------------------------
/programming.txt:
--------------------------------------------------------------------------------
1 | I love programming in Python!
2 | I love making scripts.
3 | I love working with data.
4 |
--------------------------------------------------------------------------------
/resources.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Online resources"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "- Documentation and tutorials from the Python and package websites\n",
15 | " - Python: [documentation](https://docs.python.org/3/library/index.html) and [tutorial](https://docs.python.org/3/tutorial/index.html)\n",
16 | " - Pandas: [documentation](http://pandas.pydata.org/pandas-docs/stable/) and [tutorials](http://pandas.pydata.org/pandas-docs/stable/tutorials.html)\n",
17 | " - Matplotlib: [documentation](https://matplotlib.org/api/api_overview.html) and [tutorials](https://matplotlib.org/tutorials/index.html)\n",
18 | " - Biopython: [documentation](https://biopython.org/wiki/Documentation), [API documentation](http://biopython.org/DIST/docs/api/) and [tutorial](http://biopython.org/DIST/docs/tutorial/Tutorial.html)\n",
19 | "- Courses: \n",
20 | " - https://www.datacamp.com/\n",
21 | " - https://www.edx.org/\n",
22 | " - https://www.coursera.org/ (from Johns Hopkins University)\n",
23 | " - https://www.codecademy.com/\n",
24 | "- Questions/answers:\n",
25 | " - https://stackoverflow.com/\n",
26 | " - https://www.biostars.org/\n",
27 | " - https://bioinformatics.stackexchange.com/\n",
28 | " - https://twitter.com"
29 | ]
30 | }
31 | ],
32 | "metadata": {
33 | "kernelspec": {
34 | "display_name": "Python 3",
35 | "language": "python",
36 | "name": "python3"
37 | },
38 | "language_info": {
39 | "codemirror_mode": {
40 | "name": "ipython",
41 | "version": 3
42 | },
43 | "file_extension": ".py",
44 | "mimetype": "text/x-python",
45 | "name": "python",
46 | "nbconvert_exporter": "python",
47 | "pygments_lexer": "ipython3",
48 | "version": "3.6.4"
49 | }
50 | },
51 | "nbformat": 4,
52 | "nbformat_minor": 2
53 | }
54 |
--------------------------------------------------------------------------------
/scripts/getIDs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | GTF.py
4 | Kamil Slowikowski
5 | December 24, 2013
6 | Read GFF/GTF files. Works with gzip compressed files and pandas.
7 | http://useast.ensembl.org/info/website/upload/gff.html
8 | LICENSE
9 | This is free and unencumbered software released into the public domain.
10 | Anyone is free to copy, modify, publish, use, compile, sell, or
11 | distribute this software, either in source code form or as a compiled
12 | binary, for any purpose, commercial or non-commercial, and by any
13 | means.
14 | In jurisdictions that recognize copyright laws, the author or authors
15 | of this software dedicate any and all copyright interest in the
16 | software to the public domain. We make this dedication for the benefit
17 | of the public at large and to the detriment of our heirs and
18 | successors. We intend this dedication to be an overt act of
19 | relinquishment in perpetuity of all present and future rights to this
20 | software under copyright law.
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
25 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
26 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 | OTHER DEALINGS IN THE SOFTWARE.
28 | For more information, please refer to <http://unlicense.org/>
29 | """
30 |
31 |
32 | from collections import defaultdict
33 | import gzip
34 | import pandas as pd
35 | import re
36 |
# Captures the text between "Genbank:" and the following comma in an
# attributes string, e.g. ",Genbank:XP_006541352.1," -> "XP_006541352.1".
p = re.compile(r'.*Genbank:(.*?),.*')

# The file is read without a header row; lines starting with '#' are skipped.
gff = pd.read_csv('input/ref_GRCm38.p4_top_level.10000features.random.gff3', header=None, delimiter='\t', comment='#')

gff.columns = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

attributes = gff.attributes

# One entry per row: the captured Genbank id, or '' when the row has none.
IDlist = []
for attribute in attributes:
    # A missing attributes value is not a string and cannot be matched,
    # so it is treated like a row without a Genbank id.
    m = p.match(attribute) if isinstance(attribute, str) else None
    IDlist.append('' if m is None else m.group(1))

gff['gbID'] = IDlist

print(gff.head())

# Written tab-separated, including the new gbID column.
gff.to_csv('input/ref_GRCm38.p4_top_level.10000features.randomWithGBid.gff3', sep='\t')
63 |
64 | # from Bio import SeqIO
65 | # from Bio import Entrez
66 | #
67 | # Entrez.email = 'A.N.Other@example.com' # Always tell NCBI who you are
68 | # handle = Entrez.efetch(db="nucleotide", id="XP_006541352.1", rettype="gb")
69 | # seq_record = SeqIO.read(handle, "gb")
70 | # handle.close()
71 | #
72 | # print(seq_record.id, 'with', len(seq_record.features), 'features')
73 | # print(seq_record.seq)
74 | # print(seq_record.format("fasta"))
75 |
76 | # ,Genbank:XP_006541352.1,
77 |
78 | # GTF_HEADER = ['seqname', 'source', 'feature', 'start', 'end', 'score',
79 | # 'strand', 'frame']
80 | # R_SEMICOLON = re.compile(r'\s*;\s*')
81 | # R_COMMA = re.compile(r'\s*,\s*')
82 | # R_KEYVALUE = re.compile(r'(\s+|\s*=\s*)')
83 | #
84 | #
85 | # def dataframe(filename):
86 | # """Open an optionally gzipped GTF file and return a pandas.DataFrame.
87 | # """
88 | # # Each column is a list stored as a value in this dict.
89 | # result = defaultdict(list)
90 | #
91 | # for i, line in enumerate(lines(filename)):
92 | # for key in line.keys():
93 | # # This key has not been seen yet, so set it to None for all
94 | # # previous lines.
95 | # if key not in result:
96 | # result[key] = [None] * i
97 | #
98 | # # Ensure this row has some value for each column.
99 | # for key in result.keys():
100 | # result[key].append(line.get(key, None))
101 | #
102 | # return pd.DataFrame(result)
103 | #
104 | #
105 | # def lines(filename):
106 | # """Open an optionally gzipped GTF file and generate a dict for each line.
107 | # """
108 | # fn_open = gzip.open if filename.endswith('.gz') else open
109 | #
110 | # with fn_open(filename) as fh:
111 | # for line in fh:
112 | # if line.startswith('#'):
113 | # continue
114 | # else:
115 | # yield parse(line)
116 | #
117 | #
118 | # def parse(line):
119 | # """Parse a single GTF line and return a dict.
120 | # """
121 | # result = {}
122 | #
123 | # fields = line.rstrip().split('\t')
124 | #
125 | # for i, col in enumerate(GTF_HEADER):
126 | # result[col] = _get_value(fields[i])
127 | #
128 | # # INFO field consists of "key1=value;key2=value;...".
129 | # infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()]
130 | #
131 | # for i, info in enumerate(infos, 1):
132 | # # It should be key="value".
133 | # try:
134 | # key, _, value = re.split(R_KEYVALUE, info, 1)
135 | # # But sometimes it is just "value".
136 | # except ValueError:
137 | # key = 'INFO{}'.format(i)
138 | # value = info
139 | # # Ignore the field if there is no value.
140 | # if value:
141 | # result[key] = _get_value(value)
142 | #
143 | # return result
144 | #
145 | #
146 | # def _get_value(value):
147 | # if not value:
148 | # return None
149 | #
150 | # # Strip double and single quotes.
151 | # value = value.strip('"\'')
152 | #
153 | # # Return a list if the value has a comma.
154 | # if ',' in value:
155 | # value = re.split(R_COMMA, value)
156 | # # These values are equivalent to None.
157 | # elif value in ['', '.', 'NA']:
158 | # return None
159 | #
160 | # return value
161 | #
162 | # gff = dataframe('input/ref_GRCm38.p4_top_level.10000features.random.gff3')
163 | #
164 | # print(gff.head())
165 |
--------------------------------------------------------------------------------
/scripts/hello.py:
--------------------------------------------------------------------------------
1 | print("Hello world!")
2 |
--------------------------------------------------------------------------------
/solutions/24_solution.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "from Bio import Entrez, SeqIO\n",
11 | "from Bio.SeqUtils import GC, molecular_weight\n",
12 | "import matplotlib.pyplot as plt"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "def plot_gff_stats(fn, label, your_email, n_records = None):\n",
22 | " \"\"\"Draw box plots of exon lengths, GC% & molecular weight\n",
23 | " for a given .gff file.\n",
24 | " \n",
25 | " Args:\n",
26 | " fn: file path of the gff.\n",
27 | " label: the title to be given to each boxplot.\n",
28 | " your_email: so that NCBI knows who you are.\n",
29 | " n_records: for trial runs, set this to the number of exons\n",
30 | " you wish to plot.\"\"\"\n",
31 | " gff_df = get_exon_data(fn, n_records, your_email)\n",
32 | " plot_stats_from_df(gff_df, label)\n",
33 | "\n",
34 | "def get_exon_data(fn, n_records, your_email):\n",
35 | " \"\"\"From a gff file, obtain a dataframe filtered for exons\n",
36 | " only and excluding rows without GBIDs\n",
37 | " \n",
38 | " Args:\n",
39 | " fn: file path of the gff.\n",
40 | " your_email: so that NCBI knows who you are.\n",
41 | " n_records: for trial runs, set this to the number of exons\n",
42 | " you wish to plot.\"\"\"\n",
43 | " Entrez.email = your_email\n",
44 | " \n",
45 | " # read the table\n",
46 | " gff = pd.read_csv(fn, '\\t')\n",
47 | " # filter the exons\n",
48 | " gff_exons = gff.loc[gff['type'] == 'exon']\n",
49 | " # remove the rows with missing gbid\n",
50 | " gff_exons = gff_exons.dropna()\n",
51 | "\n",
52 | " # shorten the table if desired\n",
53 | " if n_records:\n",
54 | " gff_exons = gff_exons.head(n_records)\n",
55 | "\n",
56 | " # obtain sequence records for each exon with a gbid\n",
57 | " records = []\n",
58 | " for gbid in gff_exons['gbid']:\n",
59 | " handle = Entrez.efetch('nucleotide', id=gbid, rettype='gb')\n",
60 | " record = SeqIO.read(handle, 'gb')\n",
61 | " records.append(record)\n",
62 | "\n",
63 | " # calculate the stats\n",
64 | " # list comprehensions are used here \n",
65 | " # https://realpython.com/list-comprehension-python/\n",
66 | " lengths = [len(r.seq) for r in records]\n",
67 | " gcs = [GC(r.seq) for r in records]\n",
68 | " weights = [molecular_weight(r.seq) for r in records]\n",
69 | " \n",
70 | " # assign the values to the \n",
71 | " gff_exons.loc[:, 'lengths'] = lengths\n",
72 | " gff_exons.loc[:, 'GC'] = gcs\n",
73 | " gff_exons.loc[:, 'molecular_weight'] = weights\n",
74 | " return gff_exons\n",
75 | "\n",
76 | "def plot_stats_from_df(df, label):\n",
77 | " \"\"\"Plot boxplots of values in columns named:\n",
78 | " 'lengths', 'GC' or 'molecular_weight'\"\"\"\n",
79 | " for col in ['lengths', 'GC', 'molecular_weight']:\n",
80 | " gff_exons[col].plot(kind='box')\n",
81 | " plt.title(label)\n",
82 | " plt.show()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "animal_files = {'mouse': 'GRCm38.gff3', \n",
92 | " 'human': 'GRCh38.gff3', \n",
93 | " 'zebrafish': 'GRCz11.gff3',\n",
94 | " 'panda': 'AilMel.gff3'}"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "for animal, file_name in animal_files.items():\n",
104 | " plot_gff_stats('data/'+file_name, animal, 'jct61@cam.ac.uk', 5)"
105 | ]
106 | }
107 | ],
108 | "metadata": {
109 | "kernelspec": {
110 | "display_name": "Python 3",
111 | "language": "python",
112 | "name": "python3"
113 | },
114 | "language_info": {
115 | "codemirror_mode": {
116 | "name": "ipython",
117 | "version": 3
118 | },
119 | "file_extension": ".py",
120 | "mimetype": "text/x-python",
121 | "name": "python",
122 | "nbconvert_exporter": "python",
123 | "pygments_lexer": "ipython3",
124 | "version": "3.6.9"
125 | }
126 | },
127 | "nbformat": 4,
128 | "nbformat_minor": 4
129 | }
130 |
--------------------------------------------------------------------------------
/solutions/ex_11_3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 1.1.3\n",
8 | "\n",
9 | "\n",
10 | "- Read data from the file `data/gapminder.csv`.\n",
11 | "- Find which European countries have the largest population in 1957 and 2007."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# Step 1: read data\n",
21 | "with open('../data/gapminder.csv') as f:\n",
22 | " for line in f:\n",
23 | " data = line.strip().split(',')\n",
24 | " print(data)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "# Step 2: find European countries\n",
34 | "with open('../data/gapminder.csv') as f:\n",
35 | " for line in f:\n",
36 | " data = line.strip().split(',')\n",
37 | " if data[1] == \"Europe\":\n",
38 | " print(data)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "# Step 3: find population for all European countries for 1957\n",
48 | "with open('../data/gapminder.csv') as f:\n",
49 | " for line in f:\n",
50 | " data = line.strip().split(',')\n",
51 | " if data[1] == \"Europe\" and data[2] == \"1957\":\n",
52 | " print(data)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# Step 4: find European country with the largest population in 1957\n",
62 | "pop_1957 = 0\n",
63 | "largest_eu_country_1957 = ''\n",
64 | "with open('../data/gapminder.csv') as f:\n",
65 | " for line in f:\n",
66 | " data = line.strip().split(',')\n",
67 | " if data[1] == \"Europe\" and data[2] == \"1957\":\n",
68 | " if int(data[4]) > pop_1957:\n",
69 | " pop_1957 = int(data[4])\n",
70 | " largest_eu_country_1957 = data[0]\n",
71 | " \n",
72 | "print(largest_eu_country_1957, pop_1957)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "# Step 5: find European country with the largest population in 2007\n",
82 | "pop_2007 = 0\n",
83 | "largest_eu_country_2007 = ''\n",
84 | "with open('../data/gapminder.csv') as f:\n",
85 | " for line in f:\n",
86 | " data = line.strip().split(',')\n",
87 | " if data[1] == \"Europe\" and data[2] == \"2007\":\n",
88 | " if int(data[4]) > pop_2007:\n",
89 | " pop_2007 = int(data[4])\n",
90 | " largest_eu_country_2007 = data[0]\n",
91 | " \n",
92 | "print(largest_eu_country_2007, pop_2007) "
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "# Step 6: combine all steps\n",
102 | "pop_1957 = 0\n",
103 | "largest_eu_country_1957 = ''\n",
104 | "\n",
105 | "pop_2007 = 0\n",
106 | "largest_eu_country_2007 = ''\n",
107 | "\n",
108 | "with open('../data/gapminder.csv') as f:\n",
109 | " for line in f:\n",
110 | " data = line.strip().split(',')\n",
111 | " if data[1] == \"Europe\":\n",
112 | " if data[2] == \"1957\" and int(data[4]) > pop_1957:\n",
113 | " pop_1957 = int(data[4])\n",
114 | " largest_eu_country_1957 = data[0]\n",
115 | " if data[2] == \"2007\" and int(data[4]) > pop_2007:\n",
116 | " pop_2007 = int(data[4])\n",
117 | " largest_eu_country_2007 = data[0]\n",
118 | " \n",
119 | "print(largest_eu_country_1957, pop_1957)\n",
120 | "print(largest_eu_country_2007, pop_2007)"
121 | ]
122 | }
123 | ],
124 | "metadata": {
125 | "kernelspec": {
126 | "display_name": "Python 3",
127 | "language": "python",
128 | "name": "python3"
129 | },
130 | "language_info": {
131 | "codemirror_mode": {
132 | "name": "ipython",
133 | "version": 3
134 | },
135 | "file_extension": ".py",
136 | "mimetype": "text/x-python",
137 | "name": "python",
138 | "nbconvert_exporter": "python",
139 | "pygments_lexer": "ipython3",
140 | "version": "3.6.4"
141 | }
142 | },
143 | "nbformat": 4,
144 | "nbformat_minor": 2
145 | }
146 |
--------------------------------------------------------------------------------
/solutions/ex_12_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 1.2.1\n",
8 | "\n",
9 | "- Calculate the average GDP per capita per country in Europe in 1962, its median and standard deviation using `data/gapminder.csv` data; and compare these figures with those from Americas."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import statistics as stats\n",
19 | "eu_gdppercap_1962 = []\n",
20 | "americas_gdppercap_1962 = []\n",
21 | "with open('../data/gapminder.csv') as f:\n",
22 | " for line in f:\n",
23 | " data = line.strip().split(',')\n",
24 | " if data[2] == \"1962\":\n",
25 | " if data[1] == \"Europe\":\n",
26 | " eu_gdppercap_1962.append(float(data[5]))\n",
27 | " if data[1] == 'Americas':\n",
28 | " americas_gdppercap_1962.append(float(data[5]))\n",
29 | " \n",
30 | "\n",
31 | "print('European GDP per Capita in 1962')\n",
32 | "print(eu_gdppercap_1962)\n",
33 | "print('average:', stats.mean(eu_gdppercap_1962))\n",
34 | "print('median:', stats.median(eu_gdppercap_1962))\n",
35 | "print('standard deviation:', stats.stdev(eu_gdppercap_1962))\n",
36 | "\n",
37 | "print('American GDP per Capita in 1962')\n",
38 | "print(americas_gdppercap_1962)\n",
39 | "print('average:', stats.mean(americas_gdppercap_1962))\n",
40 | "print('median:', stats.median(americas_gdppercap_1962))\n",
41 | "print('standard deviation:', stats.stdev(americas_gdppercap_1962))"
42 | ]
43 | }
44 | ],
45 | "metadata": {
46 | "kernelspec": {
47 | "display_name": "Python 3",
48 | "language": "python",
49 | "name": "python3"
50 | },
51 | "language_info": {
52 | "codemirror_mode": {
53 | "name": "ipython",
54 | "version": 3
55 | },
56 | "file_extension": ".py",
57 | "mimetype": "text/x-python",
58 | "name": "python",
59 | "nbconvert_exporter": "python",
60 | "pygments_lexer": "ipython3",
61 | "version": "3.6.4"
62 | }
63 | },
64 | "nbformat": 4,
65 | "nbformat_minor": 2
66 | }
67 |
--------------------------------------------------------------------------------
/solutions/ex_12_2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Exercise 1.2.2\n",
8 | "\n",
9 | "- Print the first 3 lines from each `.csv` file in the `data` folder.\n",
10 | "\n",
11 | "You may wish to use the `enumerate` function, along with the `break` statement to avoid printing every line in the file."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 11,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import os\n",
21 | "# os.chdir changes the working directory. Here we use ../ to go up one directory, out of the solutions directory\n",
22 | "if os.getcwd().split('/')[-1] == 'solutions':\n",
23 | " os.chdir('../')"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 15,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stdout",
33 | "output_type": "stream",
34 | "text": [
35 | "gapminder_gdp_americas.csv\n",
36 | "continent,country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
37 | "Americas,Argentina,5911.315053,6856.856212,7133.166023,8052.953021,9443.038526,10079.02674,8997.897412,9139.671389,9308.41871,10967.28195,8797.640716,12779.37964\n",
38 | "Americas,Bolivia,2677.326347,2127.686326,2180.972546,2586.886053,2980.331339,3548.097832,3156.510452,2753.69149,2961.699694,3326.143191,3413.26269,3822.137084\n",
39 | "Americas,Brazil,2108.944355,2487.365989,3336.585802,3429.864357,4985.711467,6660.118654,7030.835878,7807.095818,6950.283021,7957.980824,8131.212843,9065.800825\n",
40 | "\n",
41 | "\n",
42 | "\n",
43 | "gapminder_gdp_europe.csv\n",
44 | "country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
45 | "Albania,1601.056136,1942.284244,2312.888958,2760.196931,3313.422188,3533.00391,3630.880722,3738.932735,2497.437901,3193.054604,4604.211737,5937.029526\n",
46 | "Austria,6137.076492,8842.59803,10750.72111,12834.6024,16661.6256,19749.4223,21597.08362,23687.82607,27042.01868,29095.92066,32417.60769,36126.4927\n",
47 | "Belgium,8343.105127,9714.960623,10991.20676,13149.04119,16672.14356,19117.97448,20979.84589,22525.56308,25575.57069,27561.19663,30485.88375,33692.60508\n",
48 | "\n",
49 | "\n",
50 | "\n",
51 | "gapminder_gdp_oceania.csv\n",
52 | "country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
53 | "Australia,10039.59564,10949.64959,12217.22686,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744\n",
54 | "New Zealand,10556.57566,12247.39532,13175.678,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911\n",
55 | "\n",
56 | "\n",
57 | "\n",
58 | "gapminder_gdp_africa.csv\n",
59 | "country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
60 | "Algeria,2449.008185,3013.976023,2550.81688,3246.991771,4182.663766,4910.416756,5745.160213,5681.358539,5023.216647,4797.295051,5288.040382,6223.367465\n",
61 | "Angola,3520.610273,3827.940465,4269.276742,5522.776375,5473.288005,3008.647355,2756.953672,2430.208311,2627.845685,2277.140884,2773.287312,4797.231267\n",
62 | "Benin,1062.7522,959.6010805,949.4990641,1035.831411,1085.796879,1029.161251,1277.897616,1225.85601,1191.207681,1232.975292,1372.877931,1441.284873\n",
63 | "\n",
64 | "\n",
65 | "\n",
66 | "gapminder.csv\n",
67 | "country,continent,year,lifeExp,pop,gdpPercap\n",
68 | "Afghanistan,Asia,1952,28.801,8425333,779.4453145\n",
69 | "Afghanistan,Asia,1957,30.332,9240934,820.8530296\n",
70 | "Afghanistan,Asia,1962,31.997,10267083,853.10071\n",
71 | "\n",
72 | "\n",
73 | "\n",
74 | "gapminder_gdp_asia.csv\n",
75 | "country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
76 | "Afghanistan,779.4453145,820.8530296,853.10071,836.1971382,739.9811058,786.11336,978.0114388,852.3959448,649.3413952,635.341351,726.7340548,974.5803384\n",
77 | "Bahrain,9867.084765,11635.79945,12753.27514,14804.6727,18268.65839,19340.10196,19211.14731,18524.02406,19035.57917,20292.01679,23403.55927,29796.04834\n",
78 | "Bangladesh,684.2441716,661.6374577,686.3415538,721.1860862,630.2336265,659.8772322,676.9818656,751.9794035,837.8101643,972.7700352,1136.39043,1391.253792\n",
79 | "\n",
80 | "\n",
81 | "\n"
82 | ]
83 | }
84 | ],
85 | "source": [
86 | "# go through all the files in the directory\n",
87 | "for file_name in os.listdir('data'):\n",
88 | " \n",
89 | " # only do anything with files of the right type\n",
90 | " if file_name.endswith('.csv'):\n",
91 | " if not os.path.isdir('data/'+file_name):\n",
92 | " print(file_name)\n",
93 | " \n",
94 | " # print the first 3 lines\n",
95 | " with open('data/'+file_name) as f:\n",
96 | " print(file_name)\n",
97 | " # line_idx is the number of the line we are on\n",
98 | " for line_idx, line in enumerate(f):\n",
99 | " print(line.strip())\n",
100 | " if line_idx == 3:\n",
101 | " break\n",
102 | " \n",
103 | " # print some blank lines so we can see what is happening\n",
104 | " print('\\n\\n')\n",
105 | " "
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Jupyter has the ability to execute Bash commands using `!`. These can be mixed with python code, so the above task can be completed with the following code:"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 14,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "data/gapminder.csv\n",
125 | "country,continent,year,lifeExp,pop,gdpPercap\n",
126 | "Afghanistan,Asia,1952,28.801,8425333,779.4453145\n",
127 | "Afghanistan,Asia,1957,30.332,9240934,820.8530296\n",
128 | "\n",
129 | "data/gapminder_gdp_africa.csv\n",
130 | "country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
131 | "Algeria,2449.008185,3013.976023,2550.81688,3246.991771,4182.663766,4910.416756,5745.160213,5681.358539,5023.216647,4797.295051,5288.040382,6223.367465\n",
132 | "Angola,3520.610273,3827.940465,4269.276742,5522.776375,5473.288005,3008.647355,2756.953672,2430.208311,2627.845685,2277.140884,2773.287312,4797.231267\n",
133 | "\n",
134 | "data/gapminder_gdp_americas.csv\n",
135 | "continent,country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
136 | "Americas,Argentina,5911.315053,6856.856212,7133.166023,8052.953021,9443.038526,10079.02674,8997.897412,9139.671389,9308.41871,10967.28195,8797.640716,12779.37964\n",
137 | "Americas,Bolivia,2677.326347,2127.686326,2180.972546,2586.886053,2980.331339,3548.097832,3156.510452,2753.69149,2961.699694,3326.143191,3413.26269,3822.137084\n",
138 | "\n",
139 | "data/gapminder_gdp_asia.csv\n",
140 | "country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
141 | "Afghanistan,779.4453145,820.8530296,853.10071,836.1971382,739.9811058,786.11336,978.0114388,852.3959448,649.3413952,635.341351,726.7340548,974.5803384\n",
142 | "Bahrain,9867.084765,11635.79945,12753.27514,14804.6727,18268.65839,19340.10196,19211.14731,18524.02406,19035.57917,20292.01679,23403.55927,29796.04834\n",
143 | "\n",
144 | "data/gapminder_gdp_europe.csv\n",
145 | "country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
146 | "Albania,1601.056136,1942.284244,2312.888958,2760.196931,3313.422188,3533.00391,3630.880722,3738.932735,2497.437901,3193.054604,4604.211737,5937.029526\n",
147 | "Austria,6137.076492,8842.59803,10750.72111,12834.6024,16661.6256,19749.4223,21597.08362,23687.82607,27042.01868,29095.92066,32417.60769,36126.4927\n",
148 | "\n",
149 | "data/gapminder_gdp_oceania.csv\n",
150 | "country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\n",
151 | "Australia,10039.59564,10949.64959,12217.22686,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744\n",
152 | "New Zealand,10556.57566,12247.39532,13175.678,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911\n",
153 | "\n",
154 | "\n",
155 | "^C\n",
156 | "\n",
157 | "data/decoy.csv:\n",
158 | "head: data/decoy.csv:: No such file or directory\n",
159 | "\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "# bash commands can create python objects, here `ls` creates a list of strings\n",
165 | "csv_files = !ls data/*.csv\n",
166 | "\n",
167 | "for file_name in csv_files:\n",
168 | " print(file_name)\n",
169 | " if not os.path.isdir(file_name):\n",
170 | " # python variables can be passed to bash using the $var syntax\n",
171 | " !head -3 $file_name\n",
172 | " !echo \"\""
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 17,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "country,continent,year,lifeExp,pop,gdpPercap\n",
185 | "Afghanistan,Asia,1952,28.801,8425333,779.4453145\n",
186 | "Afghanistan,Asia,1957,30.332,9240934,820.8530296\n",
187 | "Afghanistan,Asia,1962,31.997,10267083,853.10071\n",
188 | "Afghanistan,Asia,1967,34.02,11537966,836.1971382\n",
189 | "Afghanistan,Asia,1972,36.088,13079460,739.9811058\n",
190 | "Afghanistan,Asia,1977,38.438,14880372,786.11336\n",
191 | "Afghanistan,Asia,1982,39.854,12881816,978.0114388\n",
192 | "Afghanistan,Asia,1987,40.822,13867957,852.3959448\n",
193 | "Afghanistan,Asia,1992,41.674,16317921,649.3413952\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "!head -3 data/gapminder.csv"
199 | ]
200 | }
201 | ],
202 | "metadata": {
203 | "kernelspec": {
204 | "display_name": "Python 3",
205 | "language": "python",
206 | "name": "python3"
207 | },
208 | "language_info": {
209 | "codemirror_mode": {
210 | "name": "ipython",
211 | "version": 3
212 | },
213 | "file_extension": ".py",
214 | "mimetype": "text/x-python",
215 | "name": "python",
216 | "nbconvert_exporter": "python",
217 | "pygments_lexer": "ipython3",
218 | "version": "3.7.5"
219 | }
220 | },
221 | "nbformat": 4,
222 | "nbformat_minor": 4
223 | }
224 |
--------------------------------------------------------------------------------
/solutions/ex_12_3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 1.2.3\n",
8 | "\n",
9 | "- Change the script you wrote for [Exercise 1.2.1](#Exercise-1.2.1) to make use of the `csv` module to calculate the average GDP per capita per country in Europe in 1962, its median and standard deviation using `data/gapminder.csv` data; and compare these figures with those from Americas."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "import statistics as stats\n",
20 | "import csv\n",
21 | "\n",
22 | "# data will be stored here\n",
23 | "eu_gdppercap_1962 = []\n",
24 | "americas_gdppercap_1962 = []\n",
25 | "\n",
26 | "# open a file\n",
27 | "with open(os.path.join('..', 'data', 'gapminder.csv')) as f:\n",
28 | " \n",
29 | " # create a DictReader that will produce \n",
30 | " reader = csv.DictReader(f, delimiter = \",\")\n",
31 | " \n",
32 | " # iterate through the rows of the DictReader\n",
33 | " for data in reader:\n",
34 | " # first check the year\n",
35 | " if data['year'] == \"1962\":\n",
36 | " # If the year matches, check if the row matches one of the continents \n",
37 | " # and put the GDP value at the end of the appropriate list\n",
38 | " if data['continent'] == \"Europe\":\n",
39 | " eu_gdppercap_1962.append(float(data['gdpPercap']))\n",
40 | " if data['continent'] == 'Americas':\n",
41 | " americas_gdppercap_1962.append(float(data['gdpPercap']))\n",
42 | " \n",
43 | "# print the data\n",
44 | "print('European GDP per Capita in 1962')\n",
45 | "print(eu_gdppercap_1962)\n",
46 | "print('average:', stats.mean(eu_gdppercap_1962))\n",
47 | "print('median:', stats.median(eu_gdppercap_1962))\n",
48 | "print('standard deviation:', stats.stdev(eu_gdppercap_1962))\n",
49 | "\n",
50 | "print('American GDP per Capita in 1962')\n",
51 | "print(americas_gdppercap_1962)\n",
52 | "print('average:', stats.mean(americas_gdppercap_1962))\n",
53 | "print('median:', stats.median(americas_gdppercap_1962))\n",
54 | "print('standard deviation:', stats.stdev(americas_gdppercap_1962))"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "Alternative solution:"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "import os\n",
71 | "import statistics as stats\n",
72 | "import csv\n",
73 | "\n",
74 | "gdps = []\n",
75 | "\n",
76 | "# we define the year we are interested in; the continents are looped over below\n",
77 | "\n",
78 | "year = '1962'\n",
79 | "\n",
80 | "for continent in ['Europe', 'Americas']:\n",
81 | " with open('../data/gapminder.csv') as f:\n",
82 | " reader = csv.DictReader(f, delimiter = \",\")\n",
83 | " for data in reader:\n",
84 | " # we check the year and continent at the same time\n",
85 | " if data['continent'] == continent and data['year'] == year:\n",
86 | " gdps.append(float(data['gdpPercap']))\n",
87 | " \n",
88 | " # print results for each continent\n",
89 | " print('The median and std GDP for', continent, 'in', year, 'median =', stats.median(gdps), 'std =', stats.stdev(gdps))"
90 | ]
91 | }
92 | ],
93 | "metadata": {
94 | "kernelspec": {
95 | "display_name": "Python 3",
96 | "language": "python",
97 | "name": "python3"
98 | },
99 | "language_info": {
100 | "codemirror_mode": {
101 | "name": "ipython",
102 | "version": 3
103 | },
104 | "file_extension": ".py",
105 | "mimetype": "text/x-python",
106 | "name": "python",
107 | "nbconvert_exporter": "python",
108 | "pygments_lexer": "ipython3",
109 | "version": "3.7.4"
110 | }
111 | },
112 | "nbformat": 4,
113 | "nbformat_minor": 4
114 | }
115 |
--------------------------------------------------------------------------------
/solutions/ex_13_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 1.3.1\n",
8 | "\n",
9 | "- Write a function that takes two arguments and returns their mean. \n",
10 | " - Give your function a meaningful name and good documentation. \n",
11 | " - Call your function multiple times with different values, and once using the keyword arguments with their associated values.\n",
12 | " - Print the result of these different function calls.\n",
13 | "- Write another function that takes a list as argument and returns the mean and the median of all the numbers in the list."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "def mean_of_two_values(value1, value2):\n",
23 | " \"\"\"\n",
24 | " Returns the mean of the two arguments.\n",
25 | " \n",
26 | " value1 --- first value \n",
27 | " value2 --- second value\n",
28 | " \"\"\"\n",
29 | " print('calculating the mean of', value1, 'and', value2)\n",
30 | " return (value1 + value2)/2"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "mean_of_variables = mean_of_two_values(2, 5)\n",
40 | "print(mean_of_variables)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "mean_of_variables = mean_of_two_values(-6, 5)\n",
50 | "print(mean_of_variables)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "mean_of_variables = mean_of_two_values('4', 5)\n",
60 | "print(mean_of_variables)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "mean_of_variables = mean_of_two_values(value2=2, value1=5)\n",
70 | "print(mean_of_variables)"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "import statistics\n",
80 | "def mean_and_median(values):\n",
81 | " \"\"\"\n",
82 | " Returns the mean and median of a list of values\n",
83 | " \n",
84 | " values --- list of values\n",
85 | " \"\"\"\n",
86 | " return statistics.mean(values), statistics.median(values)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "results = mean_and_median([2, 5])\n",
96 | "print(results)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "results = mean_and_median([6, 9, 14])\n",
106 | "print(results)"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "**BEWARE** Do not use [Python built-in names](https://docs.python.org/3/library/functions.html#built-in-funcs) for your variables and functions, otherwise you will overwrite the built-ins and change their behaviour."
114 | ]
115 | }
116 | ],
117 | "metadata": {
118 | "kernelspec": {
119 | "display_name": "Python 3",
120 | "language": "python",
121 | "name": "python3"
122 | },
123 | "language_info": {
124 | "codemirror_mode": {
125 | "name": "ipython",
126 | "version": 3
127 | },
128 | "file_extension": ".py",
129 | "mimetype": "text/x-python",
130 | "name": "python",
131 | "nbconvert_exporter": "python",
132 | "pygments_lexer": "ipython3",
133 | "version": "3.6.4"
134 | }
135 | },
136 | "nbformat": 4,
137 | "nbformat_minor": 2
138 | }
139 |
--------------------------------------------------------------------------------
/solutions/ex_13_2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 1.3.2\n",
8 | "\n",
9 | "- Generalise the code written for exercise 1.1.3 for finding which European countries have the largest population in 1957 and 2007 by creating a function that finds which country on a defined continent has the largest population for a given year. Provide default values for certain arguments."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "pop_1957 = 0\n",
19 | "largest_eu_country_1957 = ''\n",
20 | "\n",
21 | "pop_2007 = 0\n",
22 | "largest_eu_country_2007 = ''\n",
23 | "\n",
24 | "with open('../data/gapminder.csv') as f:\n",
25 | " for line in f:\n",
26 | " data = line.strip().split(',')\n",
27 | " if data[1] == \"Europe\":\n",
28 | " if data[2] == \"1957\" and int(data[4]) > pop_1957:\n",
29 | " pop_1957 = int(data[4])\n",
30 | " largest_eu_country_1957 = data[0]\n",
31 | " if data[2] == \"2007\" and int(data[4]) > pop_2007:\n",
32 | " pop_2007 = int(data[4])\n",
33 | " largest_eu_country_2007 = data[0]\n",
34 | " \n",
35 | "print(largest_eu_country_1957, pop_1957)\n",
36 | "print(largest_eu_country_2007, pop_2007)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "import statistics as stats\n",
46 | "import csv\n",
47 | "\n",
48 | "def largest_country_by_continent_and_year(gapminder_filepath, continent='Europe', year='1952'):\n",
49 | " \"\"\"\n",
50 | " Returns the largest country of the selected continent for a given year.\n",
51 | "\n",
52 | " gapminder_filepath --- gapminder file path with multi-continent and multi-year data\n",
53 | " continent --- continent for which data is extracted\n",
54 | " year --- year for which data is extracted\n",
55 | " \"\"\"\n",
56 | " pop = 0\n",
57 | " largest_country = ''\n",
58 | " with open(gapminder_filepath) as f:\n",
59 | " reader = csv.DictReader(f, delimiter = \",\")\n",
60 | " for data in reader: \n",
61 | " if data['continent'] == continent and data['year'] == year:\n",
62 | " if int(data['pop']) > pop:\n",
63 | " pop = int(data['pop'])\n",
64 | " largest_country = data['country']\n",
65 | " print(continent, 'largest country in', year)\n",
66 | " return largest_country, pop"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "import os \n",
76 | "largest_country = largest_country_by_continent_and_year(os.path.join('..', 'data', 'gapminder.csv'))\n",
77 | "print(largest_country)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "import os \n",
87 | "largest_country = largest_country_by_continent_and_year(os.path.join('..', 'data', 'gapminder.csv'), 'Africa')\n",
88 | "print(largest_country)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "import os \n",
98 | "largest_country = largest_country_by_continent_and_year(os.path.join('..', 'data', 'gapminder.csv'), 'Africa', '2007')\n",
99 | "print(largest_country)"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "import os \n",
109 | "largest_country = largest_country_by_continent_and_year(os.path.join('..', 'data', 'gapminder.csv'), 'Asia')\n",
110 | "print(largest_country)"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.6.4"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 2
135 | }
136 |
--------------------------------------------------------------------------------
/solutions/ex_13_3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 1.3.3\n",
8 | "\n",
9 | "- Create a module with the two functions written so far to analyse the Gapminder dataset. Import the module, and call these functions multiple times with different arguments.\n",
10 | "- Create a new function in this module that returns the average life expectancy on a given continent for a given year. Call this function with different arguments and compare the results."
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "Code to paste into the `gapminder.py` module:\n",
18 | "\n",
19 | "```\n",
20 | "import statistics as stats\n",
21 | "import csv\n",
22 | "\n",
23 | "def gdp_stats_by_continent_and_year(gapminder_filepath, continent='Europe', year='1952'):\n",
24 | " \"\"\"\n",
25 | " Returns a dictionary of the average, median and standard deviation of GDP per capita \n",
26 | " for all countries of the selected continent for a given year.\n",
27 | "\n",
28 | " gapminder_filepath --- gapminder file path with multi-continent and multi-year data\n",
29 | " continent --- continent for which data is extracted\n",
30 | " year --- year for which data is extracted\n",
31 | " \"\"\"\n",
32 | " gdppercap = []\n",
33 | " with open(gapminder_filepath) as f:\n",
34 | " reader = csv.DictReader(f, delimiter = \",\")\n",
35 | " for data in reader: \n",
36 | " if data['continent'] == continent and data['year'] == year:\n",
37 | " gdppercap.append(float(data['gdpPercap']))\n",
38 | " print(continent, 'GDP per Capita in', year)\n",
39 | " return {'mean': stats.mean(gdppercap), 'median': stats.median(gdppercap), 'stdev': stats.stdev(gdppercap)}\n",
40 | "\n",
41 | "\n",
42 | "def largest_country_by_continent_and_year(gapminder_filepath, continent='Europe', year='1952'):\n",
43 | " \"\"\"\n",
44 | " Returns the largest country of the selected continent for a given year.\n",
45 | "\n",
46 | " gapminder_filepath --- gapminder file path with multi-continent and multi-year data\n",
47 | " continent --- continent for which data is extracted\n",
48 | " year --- year for which data is extracted\n",
49 | " \"\"\"\n",
50 | " pop = 0\n",
51 | " largest_country = ''\n",
52 | " with open(gapminder_filepath) as f:\n",
53 | " reader = csv.DictReader(f, delimiter = \",\")\n",
54 | " for data in reader: \n",
55 | " if data['continent'] == continent and data['year'] == year:\n",
56 | " if int(data['pop']) > pop:\n",
57 | " pop = int(data['pop'])\n",
58 | " largest_country = data['country']\n",
59 | " print(continent, 'largest country in', year)\n",
60 | " return largest_country, pop\n",
61 | "```"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "import gapminder"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "help(gapminder)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "help(gapminder.gdp_stats_by_continent_and_year)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "import os\n",
98 | "gapminder_filepath = os.path.join('..', 'data', 'gapminder.csv')"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "largest_country = gapminder.largest_country_by_continent_and_year(gapminder_filepath, 'Asia')\n",
108 | "print(largest_country)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "gdp_stats = gapminder.gdp_stats_by_continent_and_year(gapminder_filepath, 'Asia')\n",
118 | "print(gdp_stats)"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "largest_country = gapminder.largest_country_by_continent_and_year(gapminder_filepath, 'Africa')\n",
128 | "print(largest_country)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "gdp_stats = gapminder.gdp_stats_by_continent_and_year(gapminder_filepath, 'Africa')\n",
138 | "print(gdp_stats)"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "New function to add into the module `gapminder.py`:\n",
146 | "\n",
147 | "```\n",
148 | "import csv\n",
149 | "import statistics as stats\n",
150 | "def avg_life_exp_by_continent_and_year(gapminder_filepath, continent='Europe', year='1952'):\n",
151 | " \"\"\"\n",
152 | " Returns the average life expectancy \n",
153 | " for all countries of the selected continent for a given year.\n",
154 | "\n",
155 | " gapminder_filepath --- gapminder file path with multi-continent and multi-year data\n",
156 | " continent --- continent for which data is extracted\n",
157 | " year --- year for which data is extracted\n",
158 | " \"\"\"\n",
159 | " life_exp = []\n",
160 | " with open(gapminder_filepath) as f:\n",
161 | " reader = csv.DictReader(f, delimiter = \",\")\n",
162 | " for data in reader: \n",
163 | " if data['continent'] == continent and data['year'] == year:\n",
164 | " life_exp.append(float(data['lifeExp']))\n",
165 | " print(continent, 'life expectancy', year)\n",
166 | " return stats.mean(life_exp)\n",
167 | "```"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "**BEWARE!** When using Jupyter Notebooks and modifying a module, you MUST restart the kernel of the notebook to have these changes taken into account."
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "import gapminder"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "import os\n",
193 | "gapminder_filepath = os.path.join('..', 'data', 'gapminder.csv')"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "avg_life_exp = gapminder.avg_life_exp_by_continent_and_year(gapminder_filepath, 'Africa')\n",
203 | "print(avg_life_exp)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "avg_life_exp = gapminder.avg_life_exp_by_continent_and_year(gapminder_filepath, 'Africa', '2007')\n",
213 | "print(avg_life_exp)"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "avg_life_exp = gapminder.avg_life_exp_by_continent_and_year(gapminder_filepath, 'Europe')\n",
223 | "print(avg_life_exp)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "avg_life_exp = gapminder.avg_life_exp_by_continent_and_year(gapminder_filepath, 'Europe', '2007')\n",
233 | "print(avg_life_exp)"
234 | ]
235 | }
236 | ],
237 | "metadata": {
238 | "kernelspec": {
239 | "display_name": "Python 3",
240 | "language": "python",
241 | "name": "python3"
242 | },
243 | "language_info": {
244 | "codemirror_mode": {
245 | "name": "ipython",
246 | "version": 3
247 | },
248 | "file_extension": ".py",
249 | "mimetype": "text/x-python",
250 | "name": "python",
251 | "nbconvert_exporter": "python",
252 | "pygments_lexer": "ipython3",
253 | "version": "3.6.4"
254 | }
255 | },
256 | "nbformat": 4,
257 | "nbformat_minor": 2
258 | }
259 |
--------------------------------------------------------------------------------
/solutions/ex_21_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 2.1.1\n",
8 | "\n",
9 | "- Read the data in `gapminder_gdp_americas.csv` (which should be in the same directory as `gapminder_gdp_oceania.csv`) into a variable called `americas_data` and display its summary statistics.\n",
10 | "- As well as the `read_csv()` function for reading data from a file, Pandas provides a `to_csv()` function to write dataframes to files. Applying what you’ve learned about reading from files, write one of your dataframes to a file called `processed.csv`. You can use help to get information on how to use `to_csv`."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import pandas"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "americas_data = pandas.read_csv('../data/gapminder_gdp_americas.csv', index_col='country')"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "americas_data.head()"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "americas_data.describe()"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "help(americas_data.to_csv)"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "americas_data.to_csv('processed.csv')"
65 | ]
66 | }
67 | ],
68 | "metadata": {
69 | "kernelspec": {
70 | "display_name": "Python 3",
71 | "language": "python",
72 | "name": "python3"
73 | },
74 | "language_info": {
75 | "codemirror_mode": {
76 | "name": "ipython",
77 | "version": 3
78 | },
79 | "file_extension": ".py",
80 | "mimetype": "text/x-python",
81 | "name": "python",
82 | "nbconvert_exporter": "python",
83 | "pygments_lexer": "ipython3",
84 | "version": "3.6.4"
85 | }
86 | },
87 | "nbformat": 4,
88 | "nbformat_minor": 2
89 | }
90 |
--------------------------------------------------------------------------------
/solutions/ex_22_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 2.2.1\n",
8 | "\n",
9 | "Re-use the GapMinder dataset to plot, in Jupyter using Matplotlib, the life expectancy against GDP per capita from the world data for 1952, 1977 and 2007 using a scatter plot. Add a title to your graph as well as a legend.\n",
10 | "\n",
11 | "Find the country with the highest GDP per Capita for 1952, 1977 and 2007.\n",
12 | "\n",
13 | "Re-write the function `gdp_stats_by_continent_and_year()` written yesterday using Pandas."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "%matplotlib inline\n",
23 | "\n",
24 | "import os\n",
25 | "import pandas\n",
26 | "import matplotlib.pyplot as plt\n",
27 | "import seaborn as sns\n",
28 | "\n",
29 | "sns.set_style('darkgrid')\n",
30 | "sns.set_context('talk')\n",
31 | "\n",
32 | "gapminder_data = pandas.read_csv(os.path.join('..', 'data', 'gapminder.csv'))\n",
33 | "#print(gapminder_data.year.unique())\n",
34 | "gapminder_1952 = gapminder_data[gapminder_data.year == 1952]\n",
35 | "gapminder_1977 = gapminder_data[gapminder_data.year == 1977]\n",
36 | "gapminder_2007 = gapminder_data[gapminder_data.year == 2007]\n",
37 | "#print(gapminder_1957.lifeExp)\n",
38 | "\n",
39 | "# scatter plots\n",
40 | "plt.xlabel('Life expectancy')\n",
41 | "plt.ylabel('GDP per capita')\n",
42 | "plt.title('GapMinder world data over 50 years')\n",
43 | "plt.scatter(gapminder_1952.lifeExp, gapminder_1952.gdpPercap, alpha=0.25, label='1952')\n",
44 | "plt.scatter(gapminder_1977.lifeExp, gapminder_1977.gdpPercap, alpha=0.25, label='1977')\n",
45 | "plt.scatter(gapminder_2007.lifeExp, gapminder_2007.gdpPercap, alpha=0.25, label='2007')\n",
46 | "plt.legend()\n",
47 | "plt.show()\n",
48 | "\n",
49 | "# find the country with the highest GDP per Capita\n",
50 | "print(gapminder_1952[gapminder_1952.gdpPercap==gapminder_1952.gdpPercap.max()])\n",
51 | "print(gapminder_1977[gapminder_1977.gdpPercap==gapminder_1977.gdpPercap.max()])\n",
52 | "print(gapminder_2007[gapminder_2007.gdpPercap==gapminder_2007.gdpPercap.max()])"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "collapsed": true
60 | },
61 | "outputs": [],
62 | "source": [
63 | "import os\n",
64 | "import pandas\n",
65 | "\n",
66 | "def gdp_stats_by_continent_and_year(continent='Europe', year=1952, gapminder_filepath=os.path.join('..', 'data', 'gapminder.csv')):\n",
67 | " \"\"\"\n",
68 | " Returns a dictionary of the average, median and standard deviation of GDP per capita \n",
69 | " for all countries of the selected continent for a given year.\n",
70 | "\n",
71 | " gapminder_filepath --- gapminder file path with multi-continent and multi-year data\n",
72 | " continent --- continent for which data is extracted\n",
73 | " year --- year for which data is extracted\n",
74 | " \"\"\"\n",
75 | " gapminder_data = pandas.read_csv(gapminder_filepath)\n",
76 | " filtered_data = gapminder_data[(gapminder_data['year'] == year) & (gapminder_data['continent'] == continent)]\n",
77 | " print(continent, 'GDP per Capita in', year)\n",
78 | " return {'mean': filtered_data['gdpPercap'].mean(), 'median': filtered_data['gdpPercap'].median(), 'stdev': filtered_data['gdpPercap'].std()}"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {
85 | "collapsed": true
86 | },
87 | "outputs": [],
88 | "source": [
89 | "print(gdp_stats_by_continent_and_year())"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "collapsed": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "print(gdp_stats_by_continent_and_year('Americas'))"
101 | ]
102 | }
103 | ],
104 | "metadata": {
105 | "kernelspec": {
106 | "display_name": "Python 3",
107 | "language": "python",
108 | "name": "python3"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 3
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython3",
120 | "version": "3.6.4"
121 | }
122 | },
123 | "nbformat": 4,
124 | "nbformat_minor": 2
125 | }
126 |
--------------------------------------------------------------------------------
/solutions/ex_23_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Solution to exercise 2.3.1\n",
8 | "\n",
9 | "- Retrieve a FASTA file named `data/sample.fa` using BioPython and answer the following questions:\n",
10 | " - How many sequences are in the file?\n",
11 | " - What are the IDs and the lengths of the longest and the shortest sequences?\n",
12 | " - Select sequences longer than 500bp. What is the average length of these sequences?\n",
13 | " - Calculate and print the percentage of GC in each of the sequences.\n",
14 | " - Write the newly created sequences into a FASTA file named `long_sequences.fa` "
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from Bio import SeqIO\n",
24 | "\n",
25 | "# read the FASTA file named data/sample.fa\n",
26 | "seq_records = list(SeqIO.parse('../data/sample.fa', 'fasta'))\n",
27 | "\n",
28 | "# find the number of sequences present in the file\n",
29 | "num_seq = len(seq_records)\n",
30 | "print('Total number of sequences:', num_seq)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "# find IDs and lengths of the longest and the shortest sequences\n",
40 | "\n",
41 | "# Create a Pandas dataframe for storing the Seq objects, their IDs and their sequences\n",
42 | "import pandas\n",
43 | "\n",
44 | "seq_ids = []\n",
45 | "seq_seqs = []\n",
46 | "seq_objs = []\n",
47 | "\n",
48 | "for seq in seq_records:\n",
49 | " seq_ids.append(seq.id)\n",
50 | " seq_seqs.append(str(seq.seq))\n",
51 | " seq_objs.append(seq)\n",
52 | "\n",
53 | "seq_df = pandas.DataFrame({\"id\": seq_ids, \"seq\": seq_seqs, 'seqobj': seq_objs})\n",
54 | "\n",
55 | "# Calculate the length of each sequence\n",
56 | "seq_df['len'] = seq_df['seq'].apply(len)\n",
57 | "\n",
58 | "# Find shortest and longest sequence ids\n",
59 | "shortest = seq_df.sort_values(\"len\", ascending=True).iloc[0]\n",
60 | "longest = seq_df.sort_values(\"len\", ascending=False).iloc[0]\n",
61 | "print('Longest sequence is', longest['id'], 'with length', longest['len'], 'bp')\n",
62 | "print('Shortest sequence is', shortest['id'], 'with length', shortest['len'], 'bp')"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "print(seq_df.head())"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# Calculate the average length of sequences longer than 500bp\n",
81 | "# Calculate and print the percentage of GC contents\n",
82 | "\n",
83 | "from Bio.SeqUtils import GC\n",
84 | "\n",
85 | "# Calculate GC content \n",
86 | "seq_df['gc'] = seq_df['seq'].apply(GC)\n",
87 | "\n",
88 | "# Filter sequences longer than 500bp\n",
89 | "long_seq_df = seq_df[seq_df['len'] > 500]\n",
90 | "\n",
91 | "print('Average length for sequences longer than 500bp is {}'.format(long_seq_df['len'].mean()))\n",
92 | "print(long_seq_df[['id', 'gc']])"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "# Write the sequences stored as Seq objects in the long_seq_df dataframe to a file in FASTA format\n",
102 | "SeqIO.write(long_seq_df['seqobj'], 'long_sequences.fa', 'fasta')"
103 | ]
104 | }
105 | ],
106 | "metadata": {
107 | "kernelspec": {
108 | "display_name": "Python 3",
109 | "language": "python",
110 | "name": "python3"
111 | },
112 | "language_info": {
113 | "codemirror_mode": {
114 | "name": "ipython",
115 | "version": 3
116 | },
117 | "file_extension": ".py",
118 | "mimetype": "text/x-python",
119 | "name": "python",
120 | "nbconvert_exporter": "python",
121 | "pygments_lexer": "ipython3",
122 | "version": "3.6.4"
123 | }
124 | },
125 | "nbformat": 4,
126 | "nbformat_minor": 2
127 | }
128 |
--------------------------------------------------------------------------------
/solutions/gapminder.py:
--------------------------------------------------------------------------------
1 | import statistics as stats
2 | import csv
3 |
def gdp_stats_by_continent_and_year(gapminder_filepath, continent='Europe', year='1952'):
    """
    Returns a dictionary of the average, median and standard deviation of GDP per capita
    for all countries of the selected continent for a given year.

    gapminder_filepath --- gapminder file path with multi-continent and multi-year data
    continent --- continent for which data is extracted
    year --- year for which data is extracted, given either as a string ('1952')
             or as an integer (1952)

    The returned dictionary has the keys 'mean', 'median' and 'stdev'.
    Raises statistics.StatisticsError when no row matches the selection
    (statistics.stdev also raises it when only a single row matches).
    """
    # csv.DictReader yields every field as a string, so the requested year is
    # normalised to a string once; an integer year would otherwise never match.
    year_key = str(year)
    # newline='' is what the csv module documentation recommends for reader input.
    with open(gapminder_filepath, newline='') as f:
        reader = csv.DictReader(f, delimiter=",")
        gdppercap = [
            float(data['gdpPercap'])
            for data in reader
            if data['continent'] == continent and data['year'] == year_key
        ]
    print(continent, 'GDP per Capita in', year)
    if not gdppercap:
        # Same exception type statistics.mean would raise, with a message that
        # names the selection that came back empty.
        raise stats.StatisticsError(
            'no gapminder data found for continent {!r} and year {!r}'.format(continent, year))
    return {
        'mean': stats.mean(gdppercap),
        'median': stats.median(gdppercap),
        'stdev': stats.stdev(gdppercap),
    }
21 |
22 |
def largest_country_by_continent_and_year(gapminder_filepath, continent='Europe', year='1952'):
    """
    Returns the largest country of the selected continent for a given year,
    as a tuple (country name, population).

    gapminder_filepath --- gapminder file path with multi-continent and multi-year data
    continent --- continent for which data is extracted
    year --- year for which data is extracted, given either as a string ('1952')
             or as an integer (1952)

    When no row matches the selection, ('', 0) is returned.
    """
    # csv.DictReader yields every field as a string, so the requested year is
    # normalised to a string once; an integer year would otherwise never match.
    year_key = str(year)
    pop = 0
    largest_country = ''
    # newline='' is what the csv module documentation recommends for reader input.
    with open(gapminder_filepath, newline='') as f:
        reader = csv.DictReader(f, delimiter=",")
        for data in reader:
            if data['continent'] == continent and data['year'] == year_key:
                # Parse the population once per row rather than twice.
                country_pop = int(data['pop'])
                # Strict comparison: on a tie the first country read is kept.
                if country_pop > pop:
                    pop = country_pop
                    largest_country = data['country']
    print(continent, 'largest country in', year)
    return largest_country, pop
42 |
43 |
def avg_life_exp_by_continent_and_year(gapminder_filepath, continent='Europe', year='1952'):
    """
    Returns the average life expectancy
    for all countries of the selected continent for a given year.

    gapminder_filepath --- gapminder file path with multi-continent and multi-year data
    continent --- continent for which data is extracted
    year --- year for which data is extracted, given either as a string ('1952')
             or as an integer (1952)

    Raises statistics.StatisticsError when no row matches the selection.
    """
    # csv.DictReader yields every field as a string, so the requested year is
    # normalised to a string once; an integer year would otherwise never match.
    year_key = str(year)
    # newline='' is what the csv module documentation recommends for reader input.
    with open(gapminder_filepath, newline='') as f:
        reader = csv.DictReader(f, delimiter=",")
        life_exp = [
            float(data['lifeExp'])
            for data in reader
            if data['continent'] == continent and data['year'] == year_key
        ]
    print(continent, 'life expectancy', year)
    if not life_exp:
        # Same exception type statistics.mean would raise, with a message that
        # names the selection that came back empty.
        raise stats.StatisticsError(
            'no gapminder data found for continent {!r} and year {!r}'.format(continent, year))
    return stats.mean(life_exp)
61 |
--------------------------------------------------------------------------------