├── .gitignore
├── LICENSE
├── README.md
├── _solved
│   ├── 00-jupyter_introduction.ipynb
│   ├── case1_bike_count.ipynb
│   ├── case2_observations.ipynb
│   ├── case2_observations_analysis.ipynb
│   ├── case2_observations_processing.ipynb
│   ├── case3_bacterial_resistance_lab_experiment.ipynb
│   ├── case4_air_quality_analysis.ipynb
│   ├── case4_air_quality_processing.ipynb
│   ├── data
│   ├── pandas_01_data_structures.ipynb
│   ├── pandas_02_basic_operations.ipynb
│   ├── pandas_03a_selecting_data.ipynb
│   ├── pandas_03b_indexing.ipynb
│   ├── pandas_04_time_series_data.ipynb
│   ├── pandas_05_groupby_operations.ipynb
│   ├── pandas_06_data_cleaning.ipynb
│   ├── pandas_07_missing_values.ipynb
│   ├── pandas_08_reshaping_data.ipynb
│   ├── pandas_09_combining_datasets.ipynb
│   ├── python_intro
│   │   ├── 00-jupyterlab.ipynb
│   │   ├── 01-variables.ipynb
│   │   ├── 02-functions-use.ipynb
│   │   ├── 03-containers.ipynb
│   │   ├── 04-control-flow.ipynb
│   │   └── 05-functions-write.ipynb
│   ├── python_recap
│   │   ├── 01-basic.ipynb
│   │   ├── 02-control_flow.ipynb
│   │   ├── 03-functions.ipynb
│   │   ├── 04-reusing_code.ipynb
│   │   ├── 05-numpy.ipynb
│   │   ├── data
│   │   │   ├── bogota_part_dataset.csv
│   │   │   ├── out1.txt
│   │   │   ├── out2.txt
│   │   │   ├── out3.txt
│   │   │   ├── out4.txt
│   │   │   └── values.txt
│   │   └── python_rehearsal.ipynb
│   ├── spreaddiagram.py
│   ├── visualization_01_matplotlib.ipynb
│   ├── visualization_02_plotnine.ipynb
│   ├── visualization_02_seaborn.ipynb
│   ├── visualization_03_landscape.ipynb
│   └── workflow_example_evaluation.ipynb
├── check_environment.py
├── convert_notebooks.sh
├── docs
│   ├── _config.yml
│   ├── contributing.md
│   ├── index.md
│   ├── setup.md
│   ├── slides.html
│   └── static
│       ├── img
│       │   ├── JakeVdP-ecosystem1.svg
│       │   ├── JakeVdP-ecosystem2.svg
│       │   ├── JakeVdP-ecosystem3.svg
│       │   ├── JakeVdP-ecosystem4.svg
│       │   ├── JakeVdP-ecosystem5.svg
│       │   ├── datacleaning1.jpg
│       │   ├── datacleaning2.jpg
│       │   ├── dataframe.png
│       │   ├── doctoralschoolsprofiel_hq_rgb_web.png
│       │   ├── download-button.png
│       │   ├── environment_save.png
│       │   ├── environment_save.svg
│       │   ├── icon_github.svg
│       │   ├── icon_twitter.svg
│       │   ├── ipython.png
│       │   ├── issuetracker.png
│       │   ├── logo_flanders+richtingmorgen.png
│       │   ├── navigator_notebook.png
│       │   ├── navigator_notebook.svg
│       │   ├── navigator_terminal.png
│       │   ├── notebook.png
│       │   ├── startup.png
│       │   ├── tidy_data_paper.png
│       │   ├── tidy_data_scheme.png
│       │   └── work_stijn_1.png
│       ├── remark-latest.min.js
│       └── slides.css
├── environment.yml
├── img
│   ├── bacteriophage.jpeg
│   ├── bike_count_illustration.png
│   ├── change_kernel.png
│   ├── doctoralschoolsprofiel_hq_rgb_web.png
│   ├── enterbutton.png
│   ├── heatmap.png
│   ├── keya.png
│   ├── keyb.png
│   ├── keyescape.png
│   ├── logo_flanders+richtingmorgen.png
│   ├── matplotlib_fundamentals.png
│   ├── matplotlib_fundamentals.svg
│   ├── matplotlib_oo.png
│   ├── pandas.svg
│   ├── pandas
│   │   ├── 01_table_dataframe1.svg
│   │   ├── pivot_excel.png
│   │   ├── schema-concat0.svg
│   │   ├── schema-concat1.svg
│   │   ├── schema-dataframe.svg
│   │   ├── schema-stack.svg
│   │   └── splitApplyCombine.png
│   ├── plot_overview.png
│   ├── python-function.svg
│   ├── python-sticky-note-variables-01.svg
│   ├── python-sticky-note-variables-02.svg
│   ├── python-sticky-note-variables-03.svg
│   ├── seaborn_overview_modules.png
│   ├── shift-tab.png
│   ├── shift_button.png
│   ├── shiftenter.jpg
│   ├── stack.png
│   ├── tabbutton.jpg
│   ├── tidy_data_scheme.png
│   └── toomuch.jpg
├── nbconvert_config.py
└── notebooks
    ├── 00-jupyter_introduction.ipynb
    ├── _solutions
    │   ├── case1_bike_count1.py
    │   ├── case1_bike_count10.py
    │   ├── case1_bike_count11.py
    │   ├── case1_bike_count12.py
    │   ├── case1_bike_count13.py
    │   ├── case1_bike_count14.py
    │   ├── case1_bike_count15.py
    │   ├── case1_bike_count16.py
    │   ├── case1_bike_count17.py
    │   ├── case1_bike_count18.py
    │   ├── case1_bike_count19.py
    │   ├── case1_bike_count2.py
    │   ├── case1_bike_count20.py
    │   ├── case1_bike_count21.py
    │   ├── case1_bike_count22.py
    │   ├── case1_bike_count23.py
    │   ├── case1_bike_count24.py
    │   ├── case1_bike_count25.py
    │   ├── case1_bike_count26.py
    │   ├── case1_bike_count27.py
    │   ├── case1_bike_count28.py
    │   ├── case1_bike_count3.py
    │   ├── case1_bike_count4.py
    │   ├── case1_bike_count5.py
    │   ├── case1_bike_count6.py
    │   ├── case1_bike_count7.py
    │   ├── case1_bike_count8.py
    │   ├── case1_bike_count9.py
    │   ├── case2_observations1.py
    │   ├── case2_observations10.py
    │   ├── case2_observations11.py
    │   ├── case2_observations12.py
    │   ├── case2_observations13.py
    │   ├── case2_observations14.py
    │   ├── case2_observations15.py
    │   ├── case2_observations16.py
    │   ├── case2_observations17.py
    │   ├── case2_observations18.py
    │   ├── case2_observations19.py
    │   ├── case2_observations2.py
    │   ├── case2_observations20.py
    │   ├── case2_observations21.py
    │   ├── case2_observations22.py
    │   ├── case2_observations23.py
    │   ├── case2_observations24.py
    │   ├── case2_observations25.py
    │   ├── case2_observations26.py
    │   ├── case2_observations27.py
    │   ├── case2_observations28.py
    │   ├── case2_observations29.py
    │   ├── case2_observations3.py
    │   ├── case2_observations30.py
    │   ├── case2_observations31.py
    │   ├── case2_observations32.py
    │   ├── case2_observations33.py
    │   ├── case2_observations34.py
    │   ├── case2_observations35.py
    │   ├── case2_observations36.py
    │   ├── case2_observations37.py
    │   ├── case2_observations38.py
    │   ├── case2_observations39.py
    │   ├── case2_observations4.py
    │   ├── case2_observations40.py
    │   ├── case2_observations41.py
    │   ├── case2_observations42.py
    │   ├── case2_observations43.py
    │   ├── case2_observations44.py
    │   ├── case2_observations45.py
    │   ├── case2_observations46.py
    │   ├── case2_observations47.py
    │   ├── case2_observations48.py
    │   ├── case2_observations49.py
    │   ├── case2_observations5.py
    │   ├── case2_observations50.py
    │   ├── case2_observations51.py
    │   ├── case2_observations6.py
    │   ├── case2_observations7.py
    │   ├── case2_observations8.py
    │   ├── case2_observations9.py
    │   ├── case3_bacterial_resistance_lab_experiment1.py
    │   ├── case3_bacterial_resistance_lab_experiment10.py
    │   ├── case3_bacterial_resistance_lab_experiment11.py
    │   ├── case3_bacterial_resistance_lab_experiment12.py
    │   ├── case3_bacterial_resistance_lab_experiment13.py
    │   ├── case3_bacterial_resistance_lab_experiment2.py
    │   ├── case3_bacterial_resistance_lab_experiment3.py
    │   ├── case3_bacterial_resistance_lab_experiment4.py
    │   ├── case3_bacterial_resistance_lab_experiment5.py
    │   ├── case3_bacterial_resistance_lab_experiment6.py
    │   ├── case3_bacterial_resistance_lab_experiment7.py
    │   ├── case3_bacterial_resistance_lab_experiment8.py
    │   ├── case3_bacterial_resistance_lab_experiment9.py
    │   ├── case4_air_quality_analysis1.py
    │   ├── case4_air_quality_analysis10.py
    │   ├── case4_air_quality_analysis11.py
    │   ├── case4_air_quality_analysis12.py
    │   ├── case4_air_quality_analysis13.py
    │   ├── case4_air_quality_analysis14.py
    │   ├── case4_air_quality_analysis15.py
    │   ├── case4_air_quality_analysis16.py
    │   ├── case4_air_quality_analysis17.py
    │   ├── case4_air_quality_analysis18.py
    │   ├── case4_air_quality_analysis19.py
    │   ├── case4_air_quality_analysis2.py
    │   ├── case4_air_quality_analysis20.py
    │   ├── case4_air_quality_analysis21.py
    │   ├── case4_air_quality_analysis22.py
    │   ├── case4_air_quality_analysis23.py
    │   ├── case4_air_quality_analysis24.py
    │   ├── case4_air_quality_analysis25.py
    │   ├── case4_air_quality_analysis26.py
    │   ├── case4_air_quality_analysis27.py
    │   ├── case4_air_quality_analysis28.py
    │   ├── case4_air_quality_analysis29.py
    │   ├── case4_air_quality_analysis3.py
    │   ├── case4_air_quality_analysis30.py
    │   ├── case4_air_quality_analysis31.py
    │   ├── case4_air_quality_analysis32.py
    │   ├── case4_air_quality_analysis33.py
    │   ├── case4_air_quality_analysis34.py
    │   ├── case4_air_quality_analysis35.py
    │   ├── case4_air_quality_analysis36.py
    │   ├── case4_air_quality_analysis37.py
    │   ├── case4_air_quality_analysis38.py
    │   ├── case4_air_quality_analysis39.py
    │   ├── case4_air_quality_analysis4.py
    │   ├── case4_air_quality_analysis40.py
    │   ├── case4_air_quality_analysis5.py
    │   ├── case4_air_quality_analysis6.py
    │   ├── case4_air_quality_analysis7.py
    │   ├── case4_air_quality_analysis8.py
    │   ├── case4_air_quality_analysis9.py
    │   ├── case4_air_quality_processing1.py
    │   ├── case4_air_quality_processing10.py
    │   ├── case4_air_quality_processing11.py
    │   ├── case4_air_quality_processing12.py
    │   ├── case4_air_quality_processing13.py
    │   ├── case4_air_quality_processing2.py
    │   ├── case4_air_quality_processing3.py
    │   ├── case4_air_quality_processing4.py
    │   ├── case4_air_quality_processing5.py
    │   ├── case4_air_quality_processing6.py
    │   ├── case4_air_quality_processing7.py
    │   ├── case4_air_quality_processing8.py
    │   ├── case4_air_quality_processing9.py
    │   ├── pandas_01_data_structures1.py
    │   ├── pandas_01_data_structures2.py
    │   ├── pandas_01_data_structures3.py
    │   ├── pandas_01_data_structures4.py
    │   ├── pandas_01_data_structures5.py
    │   ├── pandas_01_data_structures6.py
    │   ├── pandas_02_basic_operations1.py
    │   ├── pandas_02_basic_operations10.py
    │   ├── pandas_02_basic_operations2.py
    │   ├── pandas_02_basic_operations3.py
    │   ├── pandas_02_basic_operations4.py
    │   ├── pandas_02_basic_operations5.py
    │   ├── pandas_02_basic_operations6.py
    │   ├── pandas_02_basic_operations7.py
    │   ├── pandas_02_basic_operations8.py
    │   ├── pandas_02_basic_operations9.py
    │   ├── pandas_03a_selecting_data1.py
    │   ├── pandas_03a_selecting_data10.py
    │   ├── pandas_03a_selecting_data11.py
    │   ├── pandas_03a_selecting_data12.py
    │   ├── pandas_03a_selecting_data13.py
    │   ├── pandas_03a_selecting_data14.py
    │   ├── pandas_03a_selecting_data15.py
    │   ├── pandas_03a_selecting_data16.py
    │   ├── pandas_03a_selecting_data17.py
    │   ├── pandas_03a_selecting_data18.py
    │   ├── pandas_03a_selecting_data19.py
    │   ├── pandas_03a_selecting_data2.py
    │   ├── pandas_03a_selecting_data20.py
    │   ├── pandas_03a_selecting_data21.py
    │   ├── pandas_03a_selecting_data22.py
    │   ├── pandas_03a_selecting_data23.py
    │   ├── pandas_03a_selecting_data3.py
    │   ├── pandas_03a_selecting_data4.py
    │   ├── pandas_03a_selecting_data5.py
    │   ├── pandas_03a_selecting_data6.py
    │   ├── pandas_03a_selecting_data7.py
    │   ├── pandas_03a_selecting_data8.py
    │   ├── pandas_03a_selecting_data9.py
    │   ├── pandas_03b_indexing1.py
    │   ├── pandas_03b_indexing2.py
    │   ├── pandas_03b_indexing3.py
    │   ├── pandas_03b_indexing4.py
    │   ├── pandas_03b_indexing5.py
    │   ├── pandas_03b_indexing6.py
    │   ├── pandas_03b_indexing7.py
    │   ├── pandas_04_time_series_data1.py
    │   ├── pandas_04_time_series_data2.py
    │   ├── pandas_04_time_series_data3.py
    │   ├── pandas_04_time_series_data4.py
    │   ├── pandas_04_time_series_data5.py
    │   ├── pandas_04_time_series_data6.py
    │   ├── pandas_04_time_series_data7.py
    │   ├── pandas_04_time_series_data8.py
    │   ├── pandas_04_time_series_data9.py
    │   ├── pandas_05_groupby_operations1.py
    │   ├── pandas_05_groupby_operations10.py
    │   ├── pandas_05_groupby_operations11.py
    │   ├── pandas_05_groupby_operations12.py
    │   ├── pandas_05_groupby_operations13.py
    │   ├── pandas_05_groupby_operations14.py
    │   ├── pandas_05_groupby_operations15.py
    │   ├── pandas_05_groupby_operations16.py
    │   ├── pandas_05_groupby_operations17.py
    │   ├── pandas_05_groupby_operations18.py
    │   ├── pandas_05_groupby_operations19.py
    │   ├── pandas_05_groupby_operations2.py
    │   ├── pandas_05_groupby_operations20.py
    │   ├── pandas_05_groupby_operations21.py
    │   ├── pandas_05_groupby_operations22.py
    │   ├── pandas_05_groupby_operations23.py
    │   ├── pandas_05_groupby_operations24.py
    │   ├── pandas_05_groupby_operations25.py
    │   ├── pandas_05_groupby_operations26.py
    │   ├── pandas_05_groupby_operations27.py
    │   ├── pandas_05_groupby_operations28.py
    │   ├── pandas_05_groupby_operations29.py
    │   ├── pandas_05_groupby_operations3.py
    │   ├── pandas_05_groupby_operations30.py
    │   ├── pandas_05_groupby_operations31.py
    │   ├── pandas_05_groupby_operations4.py
    │   ├── pandas_05_groupby_operations5.py
    │   ├── pandas_05_groupby_operations6.py
    │   ├── pandas_05_groupby_operations7.py
    │   ├── pandas_05_groupby_operations8.py
    │   ├── pandas_05_groupby_operations9.py
    │   ├── pandas_06_data_cleaning1.py
    │   ├── pandas_06_data_cleaning10.py
    │   ├── pandas_06_data_cleaning11.py
    │   ├── pandas_06_data_cleaning12.py
    │   ├── pandas_06_data_cleaning13.py
    │   ├── pandas_06_data_cleaning14.py
    │   ├── pandas_06_data_cleaning15.py
    │   ├── pandas_06_data_cleaning2.py
    │   ├── pandas_06_data_cleaning3.py
    │   ├── pandas_06_data_cleaning4.py
    │   ├── pandas_06_data_cleaning5.py
    │   ├── pandas_06_data_cleaning6.py
    │   ├── pandas_06_data_cleaning7.py
    │   ├── pandas_06_data_cleaning8.py
    │   ├── pandas_06_data_cleaning9.py
    │   ├── pandas_08_reshaping_data1.py
    │   ├── pandas_08_reshaping_data10.py
    │   ├── pandas_08_reshaping_data11.py
    │   ├── pandas_08_reshaping_data12.py
    │   ├── pandas_08_reshaping_data13.py
    │   ├── pandas_08_reshaping_data14.py
    │   ├── pandas_08_reshaping_data15.py
    │   ├── pandas_08_reshaping_data16.py
    │   ├── pandas_08_reshaping_data17.py
    │   ├── pandas_08_reshaping_data18.py
    │   ├── pandas_08_reshaping_data19.py
    │   ├── pandas_08_reshaping_data2.py
    │   ├── pandas_08_reshaping_data20.py
    │   ├── pandas_08_reshaping_data3.py
    │   ├── pandas_08_reshaping_data4.py
    │   ├── pandas_08_reshaping_data5.py
    │   ├── pandas_08_reshaping_data6.py
    │   ├── pandas_08_reshaping_data7.py
    │   ├── pandas_08_reshaping_data8.py
    │   ├── pandas_08_reshaping_data9.py
    │   ├── pandas_09_combining_datasets1.py
    │   ├── pandas_09_combining_datasets2.py
    │   ├── pandas_09_combining_datasets3.py
    │   ├── pandas_09_combining_datasets4.py
    │   ├── pandas_09_combining_datasets5.py
    │   ├── visualization_01_matplotlib1.py
    │   ├── visualization_01_matplotlib2.py
    │   ├── visualization_01_matplotlib3.py
    │   ├── visualization_01_matplotlib4.py
    │   ├── visualization_01_matplotlib5.py
    │   ├── visualization_01_matplotlib6.py
    │   ├── visualization_02_seaborn1.py
    │   ├── visualization_02_seaborn10.py
    │   ├── visualization_02_seaborn11.py
    │   ├── visualization_02_seaborn12.py
    │   ├── visualization_02_seaborn13.py
    │   ├── visualization_02_seaborn14.py
    │   ├── visualization_02_seaborn15.py
    │   ├── visualization_02_seaborn16.py
    │   ├── visualization_02_seaborn17.py
    │   ├── visualization_02_seaborn18.py
    │   ├── visualization_02_seaborn19.py
    │   ├── visualization_02_seaborn2.py
    │   ├── visualization_02_seaborn20.py
    │   ├── visualization_02_seaborn21.py
    │   ├── visualization_02_seaborn22.py
    │   ├── visualization_02_seaborn3.py
    │   ├── visualization_02_seaborn4.py
    │   ├── visualization_02_seaborn5.py
    │   ├── visualization_02_seaborn6.py
    │   ├── visualization_02_seaborn7.py
    │   ├── visualization_02_seaborn8.py
    │   └── visualization_02_seaborn9.py
    ├── case1_bike_count.ipynb
    ├── case2_observations.ipynb
    ├── case3_bacterial_resistance_lab_experiment.ipynb
    ├── case4_air_quality_analysis.ipynb
    ├── case4_air_quality_processing.ipynb
    ├── data
    │   ├── BETN0290000800100hour.1-1-1990.31-12-2012
    │   ├── BETR8010000800100hour.1-1-1990.31-12-2012
    │   ├── Dryad_Arias_Hall_v3.xlsx
    │   ├── FR040120000800100hour.1-1-1999.31-12-2012
    │   ├── FR040370000800100hour.1-1-1999.31-12-2012
    │   ├── TF_ACCIDENTS_VICTIMS_2020.zip
    │   ├── TF_VAT_NACE_SQ_2019.zip
    │   ├── airbase_data.csv
    │   ├── daily_min_temperature_2020.csv
    │   ├── data-preprocessing.ipynb
    │   ├── data-preprocessing.md
    │   ├── fietstellingencoupure.csv
    │   ├── fietstelpaal-coupure-links-2022-gent.zip
    │   ├── fietstelpaal-coupure-links-2023-gent.zip
    │   ├── fietstelpaal-coupure-links-gent.zip
    │   ├── load_casualties.py
    │   ├── observations.csv
    │   ├── plot_location.xlsx
    │   ├── species.csv
    │   ├── species_names.csv
    │   ├── statbel_statistical_sectors_2019.shp.zip
    │   ├── survey_data_completed.csv
    │   ├── surveys.csv
    │   ├── titanic.csv
    │   ├── verbruiksgegevens-per-maand.xlsx
    │   └── vmm_flowdata.csv
    ├── pandas_01_data_structures.ipynb
    ├── pandas_02_basic_operations.ipynb
    ├── pandas_03a_selecting_data.ipynb
    ├── pandas_03b_indexing.ipynb
    ├── pandas_04_time_series_data.ipynb
    ├── pandas_05_groupby_operations.ipynb
    ├── pandas_06_data_cleaning.ipynb
    ├── pandas_07_missing_values.ipynb
    ├── pandas_08_reshaping_data.ipynb
    ├── pandas_09_combining_datasets.ipynb
    ├── python_intro
    │   ├── 00-jupyterlab.ipynb
    │   ├── 01-variables.ipynb
    │   ├── 02-functions-use.ipynb
    │   ├── 03-containers.ipynb
    │   ├── 04-control-flow.ipynb
    │   ├── 05-functions-write.ipynb
    │   └── _solutions
    │       ├── 00-jupyterlab1.py
    │       ├── 00-jupyterlab2.py
    │       ├── 01-variables1.py
    │       ├── 01-variables2.py
    │       ├── 01-variables3.py
    │       ├── 01-variables4.py
    │       ├── 01-variables5.py
    │       ├── 01-variables6.py
    │       ├── 01-variables7.py
    │       ├── 01-variables8.py
    │       ├── 02-functions-use1.py
    │       ├── 02-functions-use2.py
    │       ├── 02-functions-use3.py
    │       ├── 02-functions-use4.py
    │       ├── 02-functions-use5.py
    │       ├── 02-functions-use6.py
    │       ├── 02-functions-use7.py
    │       ├── 03-containers1.py
    │       ├── 03-containers2.py
    │       ├── 03-containers3.py
    │       ├── 03-containers4.py
    │       ├── 03-containers5.py
    │       ├── 03-containers6.py
    │       ├── 03-containers7.py
    │       ├── 03-containers8.py
    │       ├── 03-containers9.py
    │       ├── 04-control-flow1.py
    │       ├── 04-control-flow2.py
    │       ├── 04-control-flow3.py
    │       ├── 04-control-flow4.py
    │       ├── 04-control-flow5.py
    │       ├── 05-functions-write1.py
    │       ├── 05-functions-write2.py
    │       └── 05-functions-write3.py
    ├── python_recap
    │   ├── 00-jupyter_introduction.ipynb
    │   ├── 01-basic.ipynb
    │   ├── 02-control_flow.ipynb
    │   ├── 03-functions.ipynb
    │   ├── 04-reusing_code.ipynb
    │   ├── 05-numpy.ipynb
    │   ├── _solutions
    │   │   ├── 01-basic24.py
    │   │   ├── 01-basic25.py
    │   │   ├── 01-basic28.py
    │   │   ├── 01-basic47.py
    │   │   ├── 01-basic49.py
    │   │   ├── 01-basic58.py
    │   │   ├── 02-control_flow15.py
    │   │   ├── 02-control_flow16.py
    │   │   ├── 02-control_flow24.py
    │   │   ├── 03-functions19.py
    │   │   ├── 03-functions27.py
    │   │   ├── 05-numpy109.py
    │   │   ├── 05-numpy137.py
    │   │   ├── 05-numpy34.py
    │   │   ├── 05-numpy35.py
    │   │   ├── 05-numpy36.py
    │   │   ├── 05-numpy37.py
    │   │   ├── 05-numpy58.py
    │   │   ├── 05-numpy73.py
    │   │   ├── 05-numpy75.py
    │   │   ├── 05-numpy77.py
    │   │   ├── python_rehearsal1.py
    │   │   ├── python_rehearsal10.py
    │   │   ├── python_rehearsal11.py
    │   │   ├── python_rehearsal12.py
    │   │   ├── python_rehearsal13.py
    │   │   ├── python_rehearsal2.py
    │   │   ├── python_rehearsal3.py
    │   │   ├── python_rehearsal4.py
    │   │   ├── python_rehearsal5.py
    │   │   ├── python_rehearsal6.py
    │   │   ├── python_rehearsal7.py
    │   │   ├── python_rehearsal8.py
    │   │   └── python_rehearsal9.py
    │   ├── data
    │   │   ├── bogota_part_dataset.csv
    │   │   ├── out1.txt
    │   │   ├── out2.txt
    │   │   ├── out3.txt
    │   │   ├── out4.txt
    │   │   └── values.txt
    │   └── python_rehearsal.ipynb
    ├── visualization_01_matplotlib.ipynb
    ├── visualization_02_seaborn.ipynb
    └── visualization_03_landscape.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | notebooks/data/0284676-200613084148143.zip
92 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2016, Joris Van den Bossche
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data manipulation, analysis and visualisation in Python
2 |
3 | ## Introduction
4 |
5 | This course is intended for researchers who have at least basic programming skills in Python and who want to enhance their general data manipulation and analysis skills.
6 |
7 | The course does not aim to provide a course in statistics or machine learning. It aims to provide researchers the means to effectively tackle commonly encountered data handling tasks in order to increase the overall efficiency of the research.
8 |
9 | The course has been developed as a specialist course for the Doctoral schools of Ghent University, but can be taught to others upon request (and the material is freely available to re-use).
10 |
11 |
12 | ## Getting started
13 |
14 | The course uses Python 3 and some data analysis packages such as Pandas, Numpy and Matplotlib. To install the required libraries, we highly recommend Anaconda or miniconda ([https://www.anaconda.com/download/](https://www.anaconda.com/download/)) or another Python distribution that includes the scientific libraries (this recommendation applies to all platforms, so for Windows, Linux and Mac).
15 |
16 | For detailed instructions to get started on your local machine, see the [setup instructions](./docs/setup.md).
17 |
18 | In case you do not want to install everything and just want to try out the course material, use the environment set up by Binder [](https://mybinder.org/v2/gh/jorisvandenbossche/DS-python-data-analysis/main?urlpath=lab/) and open the notebooks right away.
19 |
20 |
21 | ## Contributing
22 |
23 | Found a typo or have a suggestion? See [how to contribute](./docs/contributing.md).
24 |
25 |
26 | ## Meta
27 | Authors: Joris Van den Bossche, Stijn Van Hoey
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
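
For quick reference, the setup described in `docs/setup.md` boils down to the following commands (a condensed sketch of the steps documented there; the environment name `DS-python` comes from `environment.yml`):

```
git clone https://github.com/jorisvandenbossche/DS-python-data-analysis.git
cd DS-python-data-analysis
conda env create -f environment.yml
conda activate DS-python
python check_environment.py
```
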
/_solved/data:
--------------------------------------------------------------------------------
1 | ../notebooks/data/
--------------------------------------------------------------------------------
/_solved/pandas_07_missing_values.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8bd0774d",
6 | "metadata": {},
7 | "source": [
8 | "
07 - Pandas: Working with missing data
\n",
9 | "\n",
10 | "\n",
11 | "> *© 2025, Joris Van den Bossche and Stijn Van Hoey (, ). Licensed under [CC BY 4.0 Creative Commons](http://creativecommons.org/licenses/by/4.0/)*\n",
12 | "\n",
13 | "---"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "id": "fad2705f",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "id": "6cf9e666",
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "df = pd.DataFrame({'A': [1, 2, np.nan],\n",
35 | " 'B': [4, np.nan, np.nan],\n",
36 | " 'C': [7, 8, 9]})\n",
37 | "df"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "id": "9204ffad",
43 | "metadata": {},
44 | "source": [
45 | "## Missing values in Pandas"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "20ebca57",
51 | "metadata": {},
52 | "source": [
53 | "For numerical data, the \"NaN\" (Not-A-Number) floating point value is used as missing value indicator:"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "id": "17a6454f",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "df.loc[2, 'A']"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "id": "35dc8450",
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "np.nan"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "id": "b116e307",
79 | "metadata": {},
80 | "source": [
81 | "\n",
82 | "\n",
83 | "**NOTE**: because NaN is a float value, it is currently not possible to have integer columns with missing values. Notice how the columns in the example above were casted to float dtype.\n",
84 | "\n",
85 | "
"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "id": "89150b7e",
91 | "metadata": {},
92 | "source": [
93 | "### Missing values are skipped by default in *reductions*"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "id": "1e2b48d5",
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "df['A'].mean()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "id": "96daf776",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "df['A'].mean(skipna=False)"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "id": "604e4841",
119 | "metadata": {},
120 | "source": [
121 | "### ... but propagated in *element-wise arithmetic*"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "92901db0",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "df['A'] + 3"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "id": "cf8a72a6",
137 | "metadata": {},
138 | "source": [
139 | "## Checking missing values"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "id": "5b50553a",
145 | "metadata": {},
146 | "source": [
147 | "Checking for a missing value cannot be done with an equality operation (`==`) because NaN is not equal to iself:"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "id": "61a4ebe9",
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "df['A'] == np.nan"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "1acc9e71",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "np.nan == np.nan"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "b4439546",
173 | "metadata": {},
174 | "source": [
175 | "Therefore, dedicated methods are available: `isna()` and `notna()`"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "id": "3c7d6670",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "df['A'].isna()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "4b95b7c2",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "df['A'].notna()"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "id": "683cccc8",
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "df['A'].isna().sum()"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "id": "c023dd7d",
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "df.isna().sum()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "id": "82b582da",
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "df"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "id": "a8488b86",
231 | "metadata": {},
232 | "source": [
233 | "## Dropping missing values"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "id": "e1440709",
239 | "metadata": {},
240 | "source": [
241 | "Dropping missing values can be done with `isna()`/`notna()` and boolean indexing (eg `df[df['A'].notna()]`), but pandas also provides some convenient helper functions for this:"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "id": "788d650e",
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "df.dropna()"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "id": "c694bb08",
257 | "metadata": {},
258 | "source": [
259 | "By default it drop rows if there is a NaN in any of the columns. To limit this to we subset of the columns, use the `subset` keyword:"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "id": "5bb3578c",
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "df.dropna(subset=['A', 'C'])"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "id": "00036b6f",
275 | "metadata": {},
276 | "source": [
277 | "## Filling missing values"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "id": "0e64082f",
283 | "metadata": {},
284 | "source": [
285 | "Filling missing values with a scalar:"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "id": "94f40e9a",
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "df.fillna(0)"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "id": "0a73ff4c",
301 | "metadata": {},
302 | "source": [
303 | "Further, more advanced filling techniques are available in the ``interpolate()`` method."
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "id": "7b57edf1",
309 | "metadata": {},
310 | "source": [
311 | "\n",
312 | "\n",
313 | "**REMEMBER**: \n",
314 | "\n",
315 | "* Missing value indicator: `np.nan` (`NaN`)\n",
316 | "* Reductions: skipped by default\n",
317 | "* Mathematical operations (eg `+`): propagate by default\n",
318 | "* Specific functions:\n",
319 | " * `isna()`, `notna()`\n",
320 | " * `dropna()`\n",
321 | " * `fillna()`, `interpolate()`\n",
322 | "\n",
323 | "
"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "id": "e1f5bf9a",
330 | "metadata": {},
331 | "outputs": [],
332 | "source": []
333 | }
334 | ],
335 | "metadata": {
336 | "jupytext": {
337 | "formats": "ipynb,md:myst"
338 | },
339 | "kernelspec": {
340 | "display_name": "Python 3 (ipykernel)",
341 | "language": "python",
342 | "name": "python3"
343 | },
344 | "language_info": {
345 | "codemirror_mode": {
346 | "name": "ipython",
347 | "version": 3
348 | },
349 | "file_extension": ".py",
350 | "mimetype": "text/x-python",
351 | "name": "python",
352 | "nbconvert_exporter": "python",
353 | "pygments_lexer": "ipython3",
354 | "version": "3.12.8"
355 | },
356 | "widgets": {
357 | "application/vnd.jupyter.widget-state+json": {
358 | "state": {},
359 | "version_major": 2,
360 | "version_minor": 0
361 | }
362 | }
363 | },
364 | "nbformat": 4,
365 | "nbformat_minor": 5
366 | }
367 |
--------------------------------------------------------------------------------
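
Condensed to a plain script, the operations this notebook demonstrates look as follows (a minimal sketch using the notebook's own toy DataFrame; all calls are standard pandas):

```python
import numpy as np
import pandas as pd

# the toy DataFrame from the notebook; np.nan marks the missing values
df = pd.DataFrame({'A': [1, 2, np.nan],
                   'B': [4, np.nan, np.nan],
                   'C': [7, 8, 9]})

df['A'].mean()                 # reductions skip NaN by default -> 1.5
df['A'] + 3                    # element-wise operations propagate NaN
df['A'].isna()                 # detect missing values (== np.nan does not work)
df.isna().sum()                # number of missing values per column
df.dropna(subset=['A', 'C'])   # drop rows with NaN in a subset of columns
df.fillna(0)                   # fill missing values with a scalar
df['B'].interpolate()          # or estimate them from neighbouring values
```
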
/_solved/python_recap/data/bogota_part_dataset.csv:
--------------------------------------------------------------------------------
1 | DIA,SST AM,SSV AM,SSV PM,SSF PM
2 | Unidad,mg/l,mg/l,mg/l,mg/l
3 | ,,,,
4 | 1,198,141,131,38
5 | 2,274,200,125,35
6 | 3,156,119,274,120
7 | 4,382,266,272,105
8 | 5,494,342,202,76
9 | 6,259,182,205,67
10 | 7,247,185,232,77
11 | 8,164,125,112,33
12 | 9,367,265,82,30
13 | 10,123,90,91,26
14 | 11,132,96,130,46
15 | 12,97,66,110,33
16 | 13,160,104,181,83
17 | 14,137,100,122,41
18 | 15,172,123,151,56
19 | 16,192,138,168,78
20 | 17,176,106,94,36
21 | 18,192,132,111,43
22 | 19,152,99,112,37
23 | 20,255,179,181,67
24 | 21,188,134,220,94
25 | 22,215,153,149,58
26 | 23,221,157,147,60
27 | 24,284,199,201,93
28 | 25,134,84,133,65
29 | 26,196,120,132,47
30 | 27,144,88,114,41
31 | 28,193,143,128,45
32 |
--------------------------------------------------------------------------------
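
Note the layout of this file: a header row, then a units row (`Unidad,mg/l,...`) and an empty row before the actual data. A sketch of how such a file could be read with pandas (the relative path is an assumption; `skiprows` drops the units and empty rows):

```python
import pandas as pd

# skip the units row (line 1) and the empty row (line 2); line 0 stays the header
df = pd.read_csv("_solved/python_recap/data/bogota_part_dataset.csv",
                 skiprows=[1, 2])
print(df.columns.tolist())  # ['DIA', 'SST AM', 'SSV AM', 'SSV PM', 'SSF PM']
```
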
/_solved/python_recap/data/out1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/_solved/python_recap/data/out1.txt
--------------------------------------------------------------------------------
/_solved/python_recap/data/out2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/_solved/python_recap/data/out2.txt
--------------------------------------------------------------------------------
/_solved/python_recap/data/out3.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/_solved/python_recap/data/out3.txt
--------------------------------------------------------------------------------
/_solved/python_recap/data/out4.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/_solved/python_recap/data/out4.txt
--------------------------------------------------------------------------------
/_solved/python_recap/data/values.txt:
--------------------------------------------------------------------------------
1 | 0,09400 3,37968
2 | 0,28820 0,83214
3 | 0,06823 0,57102
4 | 0,65576 0,59619
5 | -1,23714 0,03561
--------------------------------------------------------------------------------
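
This file is whitespace-separated and uses decimal commas, so a default `read_csv` call would mangle it. One way to parse it (a sketch; the path and the absence of a header row are assumptions based on the content shown above):

```python
import pandas as pd

values = pd.read_csv("_solved/python_recap/data/values.txt",
                     sep=r"\s+", decimal=",", header=None)
# values.dtypes now shows two float64 columns
```
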
/_solved/spreaddiagram.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 |
4 | @author: Stijnvh
5 | """
6 |
7 | import sys
8 | import datetime
9 |
10 | import numpy as np
11 | from scipy import stats
12 | from scipy.stats import linregress
13 |
14 | import pandas as pd
15 | from pandas.tseries.offsets import DateOffset
16 |
17 | import pylab as p
18 | import matplotlib as mpl
19 | mpl.rcParams['mathtext.default'] = 'regular'
20 | import matplotlib.pyplot as plt
21 | import matplotlib.gridspec as gridspec
22 | from matplotlib.patches import Rectangle
23 | from matplotlib.ticker import MaxNLocator
24 |
25 | ##-----------------------------------------------------------------------------
26 | ## Calculating objective functions
27 | ##-----------------------------------------------------------------------------
28 |
29 | def root_mean_square_error(observed, modelled):
30 |     '''
31 |     Root Mean Square Error (RMSE)
32 |
33 |     Parameters
34 |     -----------
35 |     observed : np.ndarray or pd.DataFrame
36 |         observed/measured values of the variable
37 |     modelled : np.ndarray or pd.DataFrame
38 |         simulated values of the variable
39 |
40 |     Notes
41 |     -------
42 |     The root mean square error is an absolute criterion that is often used.
43 |     It indicates the overall agreement between predicted and observed data.
44 |     The square allows avoiding
45 |     error compensation and emphasises larger errors. The root provides
46 |     a criterion in actual units. Consequently, this quality criterion
47 |     can be compared to the MAE to provide information on the prominence
48 |     of outliers in the dataset.
49 |
50 |     Notes
51 |     -------
52 |     * range: [0, inf]
53 |     * optimum: 0
54 |     '''
55 |     residuals = observed - modelled
56 |     return np.sqrt((residuals**2).mean())
57 |
58 |
59 | def bias(observed, modelled):
60 |     """
61 |     Bias E[obs-mod]
62 |
63 |     Parameters
64 |     -----------
65 |     observed : np.ndarray or pd.DataFrame
66 |         observed/measured values of the variable
67 |     modelled : np.ndarray or pd.DataFrame
68 |         simulated values of the variable
69 |
70 |     Notes
71 |     -------
72 |     * range: [-inf, inf]
73 |     * optimum: 0
74 |     """
75 |     residuals = observed - modelled
76 |     return np.mean(residuals)
77 |
78 | ##-----------------------------------------------------------------------------
79 | ## MODEL CALIBRATION EVALUATION PLOTS - SPREAD DIAGRAMS
80 | ##-----------------------------------------------------------------------------
81 |
82 | def spread_diagram(axs, obs, mod, infobox = True, *args, **kwargs):
83 |     '''
84 |     plot a scatter plot comparing the simulated and observed datasets in a
85 |     scatter plot with some extra information about the fit included.
86 |
87 |     Parameters
88 |     -----------
89 |     axs : axes.AxesSubplot object
90 |         a subplot instance where the graph will be located,
91 |         this supports the use of different subplots
92 |     obs : ndarray
93 |         1D array of the observed data
94 |     mod : ndarray
95 |         1D array of the modelled output
96 |     infobox : bool True|False
97 |         defines if an infobox with the regression info is added or not
98 |     *args, **kwargs : args
99 |         arguments passed to the matplotlib scatter command
100 |
101 |     Returns
102 |     --------
103 |     axs
104 |     '''
105 |     p.rc('mathtext', default = 'regular')
106 |
107 |     axs.scatter(obs, mod, *args, **kwargs)
108 |     axs.set_aspect('equal')
109 |
110 |     if isinstance(obs, np.ndarray):
111 |         getmax = min(obs.max(), mod.max())*0.9
112 |         getmin = max(obs.min(), mod.min())*1.1
113 |     else:
114 |         getmax = min(obs.max().values, mod.max().values)*0.9
115 |         getmin = max(obs.min().values, mod.min().values)*1.1
116 |         obs = obs.values
117 |         mod = mod.values
118 |
119 |     axs.plot([getmin, getmax], [getmin, getmax], 'k--', linewidth = 0.5)
120 |
121 |     slope, intercept, r_value, p_value, std_err = stats.linregress(obs, mod)
122 |
123 |     forplot = np.arange(getmin, getmax, 0.01)
124 |     axs.plot(forplot, slope*forplot + intercept, '-', color = 'grey',
125 |              linewidth = 0.5)
126 |     axs.set_xlim(left = getmin, right = getmax)
127 |     axs.set_ylim(bottom = getmin, top = getmax)
128 |
129 |     rmse = root_mean_square_error(obs, mod)
130 |
131 |     # for infobox
132 |     if infobox == True:
133 |         patch = Rectangle((0., 0.65), 0.35, 0.35, facecolor = 'white',
134 |                           edgecolor = 'k', transform = axs.transAxes)
135 |         axs.add_patch(patch)
136 |         axs.set_axisbelow(True)
137 |
138 |         textinfo = ({'transform' : axs.transAxes,
139 |                      'verticalalignment' : 'center',
140 |                      'horizontalalignment' : 'left',
141 |                      'fontsize' : 12})
142 |
143 |         axs.text(0.05, 0.95, r'$\bar{x}\ $', textinfo)
144 |         axs.text(0.05, 0.90, r'$\bar{y}\ $', textinfo)
145 |         axs.text(0.05, 0.85, r'$rico\ $', textinfo)
146 |         axs.text(0.05, 0.8, r'$intc.\ $', textinfo)
147 |         axs.text(0.05, 0.75, r'$R^2\ $', textinfo)
148 |         axs.text(0.05, 0.70, r'$RMSE\ $', textinfo)
149 |
150 |         axs.text(0.2, 0.95, r': %.2f'%obs.mean(), textinfo)
151 |         axs.text(0.2, 0.90, r': %.2f'%mod.mean(), textinfo)
152 |         axs.text(0.2, 0.85, r': %.2f'%slope, textinfo)
153 |         axs.text(0.2, 0.8, r': %.2f'%intercept, textinfo)
154 |         axs.text(0.2, 0.75, r': %.2f'%r_value, textinfo)
155 |         axs.text(0.2, 0.70, r': %.2f'%rmse, textinfo)
156 |
157 |     return axs
158 |
159 |
160 | def main(argv=None):
161 |     print(argv[0])
162 |
163 |     # loading data from a file
164 |     data = pd.read_csv(argv[1], parse_dates=True, index_col=0).dropna()
165 |
166 |     # using custom plot function
167 |
168 |     formatfig = argv[2]
169 |     fig, ax = plt.subplots()
170 |     spread_diagram(ax, data.iloc[:,0].values,
171 |                    data.iloc[:,1].values, infobox = True)
172 |     fig.savefig("{}_evaluation.{}".format(datetime.date.today().strftime("%Y%m%d"), formatfig))
173 |
174 |
175 | if __name__ == "__main__":
176 |     sys.exit(main(sys.argv))
177 |
178 |
--------------------------------------------------------------------------------
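
A minimal usage sketch for `spread_diagram`, assuming `spreaddiagram.py` is importable and using synthetic data in place of a real model comparison (the data and the extra scatter keywords are illustrative, not part of the course material):

```python
import numpy as np
import matplotlib.pyplot as plt
from spreaddiagram import spread_diagram

rng = np.random.default_rng(0)
obs = rng.normal(10, 2, 200)        # synthetic "observed" series
mod = obs + rng.normal(0, 1, 200)   # synthetic "modelled" series with noise

fig, ax = plt.subplots()
spread_diagram(ax, obs, mod, infobox=True, s=10, alpha=0.5)
ax.set_xlabel("observed")
ax.set_ylabel("modelled")
fig.savefig("spread_example.png")
```
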
/check_environment.py:
--------------------------------------------------------------------------------
1 | # This script is adapted from Andreas Mueller:
2 | # https://github.com/amueller/scipy-2018-sklearn/blob/master/check_env.ipynb
3 | # and glemaitre: https://github.com/glemaitre/pyparis-2018-sklearn/blob/master/check_environment.py
4 |
5 | from __future__ import print_function
6 | import sys
7 |
8 | # packaging is not in the stdlib, but should be available as dependency of
9 | # some other package (eg jupyterlab, matplotlib, ..)
10 | from packaging import version
11 |
12 | try:
13 |     import curses
14 |     curses.setupterm()
15 |     assert curses.tigetnum("colors") > 2
16 |     OK = "\x1b[1;%dm[ OK ]\x1b[0m" % (30 + curses.COLOR_GREEN)
17 |     FAIL = "\x1b[1;%dm[FAIL]\x1b[0m" % (30 + curses.COLOR_RED)
18 | except:
19 |     OK = '[ OK ]'
20 |     FAIL = '[FAIL]'
21 |
22 | try:
23 |     import importlib
24 | except ImportError:
25 |     print(FAIL, "Python version 3.4 is required,"
26 |           " but %s is installed." % sys.version)
27 |
28 |
29 | def import_version(pkg, min_ver, fail_msg=""):
30 |     mod = None
31 |     try:
32 |         mod = importlib.import_module(pkg)
33 |
34 |         if pkg in {'PIL'}:
35 |             ver = mod.VERSION
36 |         elif pkg in {'xlrd'}:
37 |             ver = mod.__VERSION__
38 |         else:
39 |             ver = mod.__version__
40 |         if version.parse(ver) < version.parse(min_ver):
41 |             print(FAIL, "%s version %s or higher required, but %s installed."
42 |                   % (pkg, min_ver, ver))
43 |         else:
44 |             print(OK, '%s version %s' % (pkg, ver))
45 |     except ImportError:
46 |         print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
47 |     return mod
48 |
49 |
50 | # first check the python version
51 | print('Using python in', sys.prefix)
52 | print(sys.version)
53 | pyversion = version.parse(sys.version.split(" ")[0])
54 | if pyversion >= version.parse("3"):
55 |     if pyversion < version.parse("3.8"):
56 |         print(FAIL, "Python version 3.8 is required,"
57 |               " but %s is installed." % sys.version)
58 | else:
59 |     print(FAIL, "Python 3 is required, but %s is installed." % sys.version)
60 |
61 | print()
62 | requirements = {'numpy': "2", 'matplotlib': "3",
63 |                 'pandas': "2", 'jupyterlab': "3",
64 |                 'pyproj': "2", 'requests': "2.32",
65 |                 'seaborn': "0.13"}
66 |
67 | # now the dependencies
68 | for lib, required_version in list(requirements.items()):
69 |     import_version(lib, required_version)
70 |
71 | # mplleaflet has no option to derive __version__
72 | try:
73 |     import mplleaflet
74 |     print(OK, '%s can be loaded' % ('mplleaflet'))
75 | except:
76 |     print(FAIL, '%s can not be loaded.' % ('mplleaflet'))
--------------------------------------------------------------------------------
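
The version check at the heart of the script, in isolation: `packaging.version.parse` gives proper version ordering where plain string comparison would fail.

```python
from packaging import version

assert version.parse("2.1.0") >= version.parse("2")   # pandas 2.1.0 satisfies the "2" requirement
assert version.parse("0.13") > version.parse("0.9")   # as strings, "0.13" < "0.9" -- versions compare correctly
```
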
/convert_notebooks.sh:
--------------------------------------------------------------------------------
1 | # run this from the top-level directory
2 | # it creates there a notebooks/ and _solved/solutions/ dir
3 | # that get automatically copied to the correct places
4 |
5 |
6 | declare -a arr=(
7 | #"00-jupyter_introduction.ipynb"
8 | #"01-basic.ipynb"
9 | #"02-control_flow.ipynb"
10 | #"03-functions.ipynb"
11 | #"04-reusing_code.ipynb"
12 | #"05-numpy.ipynb"
13 | #"python_rehearsal"
14 | "00-jupyter_introduction.ipynb"
15 | "pandas_01_data_structures.ipynb"
16 | "pandas_02_basic_operations.ipynb"
17 | "pandas_03a_selecting_data.ipynb"
18 | "pandas_03b_indexing.ipynb"
19 | "pandas_04_time_series_data.ipynb"
20 | "pandas_05_groupby_operations.ipynb"
21 | "pandas_06_data_cleaning.ipynb"
22 | "pandas_07_missing_values.ipynb"
23 | "pandas_08_reshaping_data.ipynb"
24 | "pandas_09_combining_datasets.ipynb"
25 | "visualization_01_matplotlib.ipynb"
26 | "visualization_02_seaborn.ipynb"
27 | "visualization_03_landscape.ipynb"
28 | "case1_bike_count.ipynb"
29 | "case2_observations.ipynb"
30 | "case3_bacterial_resistance_lab_experiment"
31 | "case4_air_quality_processing.ipynb"
32 | "case4_air_quality_analysis.ipynb"
33 | )
34 |
35 | cd _solved
36 |
37 | mkdir ./notebooks
38 |
39 | echo "- Converting notebooks"
40 |
41 | for i in "${arr[@]}"
42 | do
43 | echo "--" "$i"
44 | jupyter nbconvert --to=notebook --config ../nbconvert_config.py --output "notebooks/$i" "$i"
45 | done
46 |
47 | echo "- Copying converted notebooks and solutions"
48 | cp -r notebooks/. ../notebooks
49 | cp -r _solutions/. ../notebooks/_solutions
50 |
51 | rm -r notebooks/
52 | rm -r _solutions/
53 |
54 | cd ..
55 |
56 |
57 | declare -a arr=(
58 | "00-jupyterlab.ipynb"
59 | "01-variables.ipynb"
60 | "02-functions-use.ipynb"
61 | "03-containers.ipynb"
62 | "04-control-flow.ipynb"
63 | "05-functions-write.ipynb"
64 | )
65 |
66 | cd _solved/python_intro
67 |
68 | mkdir ./notebooks
69 |
70 | echo "- Converting notebooks"
71 |
72 | for i in "${arr[@]}"
73 | do
74 | echo "--" "$i"
75 | jupyter nbconvert --to=notebook --config ../../nbconvert_config.py --output "notebooks/$i" "$i"
76 | done
77 |
78 |
79 |
80 | echo "- Copying converted notebooks and solutions"
81 | cp -r notebooks/. ../../notebooks/python_intro
82 | cp -r _solutions/. ../../notebooks/python_intro/_solutions
83 |
84 | rm -r notebooks/
85 | rm -r _solutions/
86 |
87 | cd ../..
88 |
89 |
90 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | title: Data manipulation, analysis and visualisation in Python
2 | logo:
3 | description: Specialist course Doctoral schools of Ghent University
4 | show_downloads: true
5 | theme: jekyll-theme-minimal
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | ---
4 |
5 | # Contributing guide
6 |
7 | First of all, thanks for considering contributing to the course! 👍
8 |
9 | ## How you can contribute
10 |
11 | There are several ways you can contribute to this course.
12 |
13 | ### Share the love ❤️
14 |
15 | Think this course is useful? Let others discover it, by telling them in person, via Twitter or a blog post.
16 |
17 | ### Ask a question ⁉️
18 |
19 | Trying out the material and got stuck? Post your question as an [issue on GitHub](https://github.com/jorisvandenbossche/course-python-data/issues). While we cannot offer user support, we'll try to do our best to address it, as questions often lead to the discovery of bugs.
20 |
21 | Want to ask a question in private? Contact the course maintainer by [email](jorisvandenbossche@gmail.com).
22 |
23 | ### Propose an idea 💡
24 |
25 | Have an idea to improve the course? Take a look at the [issue list](https://github.com/jorisvandenbossche/course-python-data/issues) to see if it isn't included or suggested yet. If not, suggest your idea as an [issue on GitHub](https://github.com/jorisvandenbossche/course-python-data/issues/new).
26 |
27 | ### Report a bug 🐛
28 |
29 | Using the course and discovered a bug or a typo? That's annoying! Don't let others have the same experience and report it as an [issue on GitHub](https://github.com/jorisvandenbossche/course-python-data/issues/new)
30 | so we can fix it. A good bug report makes it easier for us to do so, so please include:
31 |
32 | * Your operating system name and version (e.g. Mac OS 10.13.6).
33 | * Any details about your local setup that might be helpful in troubleshooting.
34 | * Detailed steps to reproduce the bug.
35 |
36 | ### Contribute code 📝
37 |
38 | Care to fix issues or typos? Awesome! 👏
39 |
40 | Some notes to take into account:
41 |
42 | - The course material is developed in the [course-python-data](https://github.com/jorisvandenbossche/course-python-data) repository. When updating course material, edit the notebooks in the [course-python-data](https://github.com/jorisvandenbossche/course-python-data) repository, the other ones (the ones used in the tutorial) are generated automatically.
43 | - the exercises are cleared using the `nbtutor` notebook extension:
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Data manipulation, analysis and visualisation in Python
2 |
3 | ## Introduction
4 |
5 | The handling of data is a recurring task for data analysts. Reading in experimental data, checking its properties,
6 | and creating visualisations are crucial steps in the research process. Hence, increasing the efficiency in this process is beneficial for professionals
7 | handling data. Spreadsheet-based software lacks the ability to properly support this process, due to the lack of automation and repeatability.
8 | The usage of a high-level scripting language such as Python is ideal for these tasks.
9 |
10 | This course trains participants to use Python effectively to do these tasks. The course focuses on data manipulation and cleaning of tabular data,
11 | explorative analysis and visualisation using important packages such as Pandas, Matplotlib and Seaborn.
12 |
13 | The course does not cover statistics, data mining, machine learning, or predictive modelling. It aims to provide participants the means to effectively
14 | tackle commonly encountered data handling tasks in order to increase the overall efficiency. These skills are both useful for data cleaning as well as
15 | feature engineering.
16 |
17 | The course has been developed as a course for the Specialist course Doctoral schools of Ghent University, but can be taught to others upon request.
18 |
19 | ## Course info
20 |
21 | ### Aim & scope
22 |
23 | This course is intended for researchers who have at least basic programming skills. A basic (scientific) programming course that is part of
24 | the regular curriculum should suffice. For those who have experience in another programming language (e.g. Matlab, R, ...), following a Python
25 | tutorial prior to the course is advised.
26 |
27 | The course is intended for professionals who wish to enhance their general data manipulation and visualization skills in Python, with a specific
28 | focus on tabular data. The course is NOT intended to be a course on statistics or machine learning.
29 |
30 | ### Program
31 |
32 | After setting up the programming environment with the required packages using the conda package manager and an introduction of the Jupyter
33 | notebook environment, the data analysis package Pandas and the plotting packages Matplotlib and Seaborn are introduced. Advanced usage of Pandas
34 | for different data cleaning and manipulation tasks is taught and the acquired skills will immediately be brought into practice to handle real-world
35 | data sets. Applications include time series handling, categorical data, merging data, tidy data,...
36 |
37 | The course closes with a discussion on the scientific Python ecosystem and the visualisation landscape, teaching
38 | participants to create interactive charts.
39 |
40 | ## Getting started
41 |
42 | The course uses Python 3 and some data analysis packages such as Pandas, Seaborn, Numpy and Matplotlib. To install the required libraries,
43 | we recommend Anaconda or miniconda ([https://www.anaconda.com/download/](https://www.anaconda.com/download/)) or another Python distribution that
44 | includes the scientific libraries (this recommendation applies to all platforms, so for Windows, Linux and Mac).
45 |
46 | For detailed instructions to get started on your local machine, see the [setup instructions](./setup.html).
47 |
48 | In case you do not want to install everything and just want to try out the course material, use the environment setup by
49 | Binder [](https://mybinder.org/v2/gh/jorisvandenbossche/DS-python-data-analysis/HEAD) and open the notebooks
50 | right away (inside the `notebooks` directory).
51 |
52 | ## Slides
53 |
54 | For the course slides, click [here](https://jorisvandenbossche.github.io/DS-python-data-analysis/slides.html).
55 |
56 | ## Contributing
57 |
58 | Found a typo or have a suggestion? See [how to contribute](./contributing.html).
59 |
60 | ## Meta
61 |
62 | Authors: Joris Van den Bossche, Stijn Van Hoey
63 |
64 | With the support of the Flemish Government.
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/docs/setup.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | ---
4 |
5 | # Course setup
6 |
7 | To get started, you should have the following elements setup:
8 |
9 | 1. Download the course material to your computer
10 | 2. Install Python and the required Python packages using `conda`
11 | 3. Test your configuration and installation
12 | 4. Start Jupyter lab
13 |
14 | In the following sections, more details are provided for each of these steps. When the first three are done, you are ready to start coding!
15 |
16 | ## 1. Getting the course materials
17 |
18 | ### Option 1: You are already a git user
19 |
20 | As the course has been set up as a [git](https://git-scm.com/) repository managed on [Github](https://github.com/jorisvandenbossche/DS-python-data-analysis),
21 | you can clone the entire course to your local machine. Use the command line to clone the repository and go into the course folder:
22 |
23 | ```
24 | git clone https://github.com/jorisvandenbossche/DS-python-data-analysis.git
25 | cd DS-python-data-analysis
26 | ```
27 |
28 | In case you would prefer using Github Desktop,
29 | see [this tutorial](https://help.github.com/desktop/guides/contributing-to-projects/cloning-a-repository-from-github-to-github-desktop/).
30 |
31 | ### Option 2: You are not a git user
32 |
33 | To download the repository to your local machine as a zip-file, click the `download ZIP` on the
34 | repository page (green button "Code"):
35 |
36 | 
37 |
38 | After the download, unzip it in the location you prefer within your user account (e.g. `My Documents`, not `C:\`). Watch out for a nested 'DS-python-data-analysis/DS-python-data-analysis' folder structure after unzipping and move the inner DS-python-data-analysis folder to your preferred location.
39 |
40 | __Note:__ Make sure you know where you stored the course material, e.g. `C:/Users/yourusername/Documents/DS-python-data-analysis`.
41 |
42 | ## 2. Install Python and the required Python packages using `conda`
43 |
44 | For scientific and data analysis, we recommend using `conda`, a command line tool for package and environment management.
45 | `conda` allows us to install a Python distribution with the scientific libraries we will use in this course (this recommendation applies to all platforms, so for Windows, Linux and Mac).
46 |
47 | ### 2.1 Install `conda`
48 |
49 | #### Option 1: I do not have `conda` installed
50 |
51 | We recommend using the Miniforge installer provided by the conda-forge community.
52 |
53 | Follow the instructions on that page, i.e. first download the appropriate installer (depending on your operating system), and then run that installer.
54 |
55 | On Windows, this will mean double-clicking the downloaded `.exe` file, and following the instructions. During installation, choose the options (click checkbox):
56 |
57 | - '_Register Miniforge3 as my default Python 3.12_' (in case this returns an error about an existing Python 3.12 installation, remove the existing Python installation using the [Windows Control Panel](https://support.microsoft.com/en-us/windows/uninstall-or-remove-apps-and-programs-in-windows-4b55f974-2cc6-2d2b-d092-5905080eaf98)).
58 | - '_Clear the package cache upon completion_'.
59 |
60 | On MacOS or Linux, you have to open a terminal, and run `bash Miniforge3-$(uname)-$(uname -m).sh`
61 |
62 | #### Option 2: I already have `conda`, Anaconda or Miniconda installed
63 |
64 | When you already have an installation of `conda` or Anaconda, you have to make sure you are working with a recent version. If you installed it only a
65 | few months ago, this step is probably not needed, otherwise follow the next steps:
66 |
67 | 1. Open a terminal window (on Windows, use the dedicated "Anaconda Prompt" or "Miniforge Prompt", via Start Menu)
68 | 2. Run `conda update conda`, by typing that command, hit the ENTER-button
69 | (make sure you have an internet connection), and respond with *Yes* by typing `y`.
70 | 3. Run `conda config --add channels conda-forge`, by typing that command, hit the ENTER-button
71 | 4. Run `conda config --set channel_priority strict`, by typing that command, hit the ENTER-button
72 |
73 | If you are using Anaconda on Windows, replace each time "Miniforge Prompt" by "Anaconda Prompt" in the following sections.
74 |
75 | ### 2.2 Setup after `conda` installation
76 |
77 | Now we will use `conda` to install the Python packages we are going to use
78 | throughout this course.
79 | As a good practice, we will create a new _conda environment_ to work with.
80 |
81 | The packages used in the course are listed in
82 | an [`environment.yml` file](https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/main/environment.yml). The file looks as follows:
83 |
84 | ```
85 | name: DS-python
86 | channels:
87 | - conda-forge
88 | dependencies:
89 | - python=3.12
90 | - geopandas
91 | - ...
92 | ```
93 |
94 | The file specifies:
95 | - `name`: the name used for the environment
96 | - `channels`: where to download the packages from
97 | - `dependencies`: the packages to install
98 |
99 | The environment.yml file for this course is included in the course material you
100 | downloaded.
101 |
102 | Now we can create the environment:
103 |
104 | 1. Open a terminal window (on Windows use "Miniforge Prompt", opened via Start Menu > 'Miniforge Prompt').
105 | 2. Navigate to the directory where you downloaded the course materials (that directory should contain an `environment.yml` file; double-check in your file explorer):
106 |
107 | ```
108 | cd FOLDER_PATH_TO_COURSE_MATERIAL
109 | ```
110 | (Make sure to press ENTER to run the command.)
111 |
112 | 3. Create the environment by typing the following command and pressing ENTER (make sure you have an internet connection):
113 |
114 | ```
115 | conda env create -f environment.yml
116 | ```
117 |
118 | __!__ `FOLDER_PATH_TO_COURSE_MATERIAL` should be replaced by the path to the folder containing the downloaded course materials (e.g. in the example it is `C:/Users/yourusername/Documents/DS-python-data-analysis`)
119 |
120 | __!__ You can safely ignore the warning `FutureWarning: 'remote_definition'...`.
121 |
122 | Respond with *Yes* by typing `y` when asked. Output will be printed, and if no error occurs, the environment is configured with all packages installed.
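
To double-check that the environment was created, you can (optionally) list all conda environments; the new `DS-python` environment should appear in the output:

```
conda env list
```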
123 |
124 | When finished, keep the terminal window (or "Miniforge Prompt") open (or reopen it). Execute the following commands to check your installation:
125 |
126 | ```
127 | conda activate DS-python
128 | ipython
129 | ```
130 |
131 | Within the terminal, a Python session is started in which you can write Python! Type the following commands:
132 |
133 | ```
134 | import pandas
135 | import matplotlib
136 | ```
137 |
138 | If no message is returned, you're all set! If a message (most likely an error) is returned, contact the instructors and copy-paste the message.
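
Optionally, you can also print the installed versions from within the same Python session (useful to include when contacting the instructors); `__version__` is a standard attribute of both packages:

```
print(pandas.__version__)
print(matplotlib.__version__)
```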
139 |
140 | To get out of the Python session, type:
141 |
142 | ```
143 | quit
144 | ```
145 |
146 | ## 3. Test your configuration
147 |
148 | To check that your packages are properly installed, open the terminal (or "Miniforge Prompt") again (see above) and navigate to the course directory:
149 |
150 | ```
151 | cd FOLDER_PATH_TO_COURSE_MATERIAL
152 | ```
153 |
154 | With `FOLDER_PATH_TO_COURSE_MATERIAL` replaced by the path to the folder with the downloaded
155 | course material (e.g. in the example it is `C:/Users/yourusername/Documents/DS-python-data-analysis`).
156 |
157 | Activate the newly created conda environment:
158 |
159 | ```
160 | conda activate DS-python
161 | ```
162 |
163 | Then, run the `check_environment.py` script:
164 |
165 | ```
166 | python check_environment.py
167 | ```
168 |
169 | When all checkmarks are ok, you're ready to go!
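
A final tip: if the `environment.yml` file is updated at some later point (e.g. an extra package is added), you do not have to recreate the environment from scratch. A minimal sketch, run from the course material folder with the updated file in place:

```
conda env update -f environment.yml --prune
```

The `--prune` option additionally removes packages that are no longer listed in the file.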
170 |
171 |
172 | ## 4. (_at the start of each course day_) Starting Jupyter Lab
173 |
174 | Each of the course modules is set up as a [Jupyter notebook](http://jupyter.org/), an interactive environment to write and run code. It is no problem if you have never used Jupyter notebooks before: an introduction to notebooks is part of the course.
175 |
176 |
177 | * In the terminal (or "Miniforge Prompt"), navigate to the `DS-python-data-analysis` directory (downloaded or cloned in the previous section)
178 |
179 | ```
180 | cd FOLDER_PATH_TO_COURSE_MATERIAL
181 | ```
182 |
183 | * Ensure that the correct environment is activated.
184 |
185 | ```
186 | conda activate DS-python
187 | ```
188 |
189 | * Start Jupyter Lab by typing
190 |
191 | ```
192 | jupyter lab
193 | ```
194 |
195 | This will automatically open a browser window.
196 |
197 | ## Next?
198 |
199 | Navigate to the course directory (if not already there) and open the `notebooks` folder to access the individual notebooks containing the course material.
--------------------------------------------------------------------------------
/docs/static/img/datacleaning1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/datacleaning1.jpg
--------------------------------------------------------------------------------
/docs/static/img/datacleaning2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/datacleaning2.jpg
--------------------------------------------------------------------------------
/docs/static/img/dataframe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/dataframe.png
--------------------------------------------------------------------------------
/docs/static/img/doctoralschoolsprofiel_hq_rgb_web.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/doctoralschoolsprofiel_hq_rgb_web.png
--------------------------------------------------------------------------------
/docs/static/img/download-button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/download-button.png
--------------------------------------------------------------------------------
/docs/static/img/environment_save.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/environment_save.png
--------------------------------------------------------------------------------
/docs/static/img/icon_github.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/static/img/icon_twitter.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/static/img/ipython.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/ipython.png
--------------------------------------------------------------------------------
/docs/static/img/issuetracker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/issuetracker.png
--------------------------------------------------------------------------------
/docs/static/img/logo_flanders+richtingmorgen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/logo_flanders+richtingmorgen.png
--------------------------------------------------------------------------------
/docs/static/img/navigator_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/navigator_notebook.png
--------------------------------------------------------------------------------
/docs/static/img/navigator_terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/navigator_terminal.png
--------------------------------------------------------------------------------
/docs/static/img/notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/notebook.png
--------------------------------------------------------------------------------
/docs/static/img/startup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/startup.png
--------------------------------------------------------------------------------
/docs/static/img/tidy_data_paper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/tidy_data_paper.png
--------------------------------------------------------------------------------
/docs/static/img/tidy_data_scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/tidy_data_scheme.png
--------------------------------------------------------------------------------
/docs/static/img/work_stijn_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/work_stijn_1.png
--------------------------------------------------------------------------------
/docs/static/slides.css:
--------------------------------------------------------------------------------
1 | body {
2 | font-family: -apple-system, BlinkMacSystemFont, avenir next, avenir, segoe ui, helvetica neue, helvetica, Cantarell, Ubuntu, roboto, noto, arial, sans-serif;
3 | font-weight: normal;
4 | }
5 |
6 | h1, h2, h3, h4, h5, h6 {
7 | font-weight: 300;
8 | margin-top: 5px;
9 | margin-bottom: 10px;
10 | }
11 | h1 {
12 | margin-top: 0.5em;
13 | }
14 | h2 {
15 | font-size: 140%;
16 | line-height: 150%;
17 | }
18 | h3 {
19 | font-size: 120%;
20 | line-height: 140%;
21 | }
22 | h2, h3, h4, h5, h6 {
23 | font-weight: normal;
24 | }
25 |
26 | li {
27 | font-size: 120%;
28 | line-height: 130%;
29 | }
30 |
31 | p {
32 | font-size: 100%;
33 | line-height: 120%;
34 | }
35 |
36 | table {
37 | margin: 0 auto 0.8em;
38 | border-collapse: collapse;
39 | }
40 | td, th {
41 | border: 1px solid #ddd;
42 | padding: 0.3em 0.5em;
43 | }
44 |
45 | .bgheader h1 {
46 | background-color: rgba(0, 0, 0, 0.9);
47 | opacity: 50%;
48 | padding: 0.5em;
49 | color: white;
50 | border-radius: .5em;
51 | }
52 |
53 | .section_background {
54 | background-color:#c2c444;
55 | color: #fff;
56 | font-weight: normal;
57 | }
58 |
59 | .middlebelowheader {
60 | /* This fixed size height was found to work well with the slide
61 | scaling mechanism of remark.js:
62 | */
63 | height: 500px;
64 | display: table-cell;
65 | vertical-align: middle;
66 | }
67 |
68 | .hidden {
69 | visibility: hidden;
70 | }
71 |
72 | .small {
73 | font-size: 90%;
74 | }
75 |
76 | a:visited {
77 | color: #356196;
78 | }
79 |
80 | a:link {
81 | color: #356196;
82 | }
83 |
84 | .footnote {
85 | color: #808080;
86 | background-color: rgba(255, 255, 255, 0.9);
87 | font-size: 60%;
88 | position: absolute;
89 | bottom: 30px;
90 | left: 20px;
91 | text-align: left;
92 | line-height: 100%;
93 | padding: 5px;
94 | }
95 |
96 | .remark-slide-content {
97 | background-size: contain;
98 | }
99 |
100 | .emphasize {
101 | color: rgba(100, 100, 100, 0.95);
102 | font-size: 150%;
103 | line-height: 120%;
104 | }
105 |
106 | .widthlimit {
107 | width: 600px;
108 | font-size: 200%;
109 | }
110 |
111 | .fadetext {
112 | opacity: 0.3;
113 | }
114 |
115 | blockquote {
116 | padding: 0px 20px;
117 | font-weight: lighter;
118 | border-left: 5px solid #eee;
119 | }
120 |
121 | code {
122 | font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace;
123 | background-color: #f5f5f5;
124 | border: 1px solid #ccc;
125 | border-radius: 4px;
126 | font-size: 0.95em;
127 | }
128 |
129 | .remark-code {
130 | font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace;
131 | display: block;
132 | padding: 9.5px;
133 | margin: 0 0 10px;
134 | font-size: 13px;
135 | line-height: 1.42857143;
136 | color: #333;
137 | word-break: break-all;
138 | word-wrap: break-word;
139 | background-color: #f5f5f5;
140 | border: 1px solid #ccc;
141 | border-radius: 4px;
142 | }
143 |
144 |
145 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: DS-python
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - python=3.12
6 | - ipython
7 | - jupyter
8 | - jupyterlab>=3
9 | - numpy
10 | - pandas=2.2
11 | - matplotlib>3
12 | - mplleaflet
13 | - ipympl
14 | - seaborn
15 | - plotnine
16 | - pyproj
17 | - requests
18 | - openpyxl
19 | - geopandas
20 | - pyarrow
21 |
--------------------------------------------------------------------------------
/img/bacteriophage.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/bacteriophage.jpeg
--------------------------------------------------------------------------------
/img/bike_count_illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/bike_count_illustration.png
--------------------------------------------------------------------------------
/img/change_kernel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/change_kernel.png
--------------------------------------------------------------------------------
/img/doctoralschoolsprofiel_hq_rgb_web.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/doctoralschoolsprofiel_hq_rgb_web.png
--------------------------------------------------------------------------------
/img/enterbutton.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/enterbutton.png
--------------------------------------------------------------------------------
/img/heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/heatmap.png
--------------------------------------------------------------------------------
/img/keya.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/keya.png
--------------------------------------------------------------------------------
/img/keyb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/keyb.png
--------------------------------------------------------------------------------
/img/keyescape.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/keyescape.png
--------------------------------------------------------------------------------
/img/logo_flanders+richtingmorgen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/logo_flanders+richtingmorgen.png
--------------------------------------------------------------------------------
/img/matplotlib_fundamentals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/matplotlib_fundamentals.png
--------------------------------------------------------------------------------
/img/matplotlib_oo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/matplotlib_oo.png
--------------------------------------------------------------------------------
/img/pandas.svg:
--------------------------------------------------------------------------------
1 | (SVG: pandas logo)
--------------------------------------------------------------------------------
/img/pandas/pivot_excel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/pandas/pivot_excel.png
--------------------------------------------------------------------------------
/img/pandas/splitApplyCombine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/pandas/splitApplyCombine.png
--------------------------------------------------------------------------------
/img/plot_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/plot_overview.png
--------------------------------------------------------------------------------
/img/python-function.svg:
--------------------------------------------------------------------------------
1 | (SVG figure: anatomy of a Python function — `def fahr_to_celsius(temp): return ((temp - 32) * (5/9))` — with labels for the def statement, name, parameter names, body, return statement and return value)
--------------------------------------------------------------------------------
/img/python-sticky-note-variables-01.svg:
--------------------------------------------------------------------------------
1 | (SVG figure: a variable as a sticky note — the value 65.0 labelled with the name weight_kg)
--------------------------------------------------------------------------------
/img/python-sticky-note-variables-02.svg:
--------------------------------------------------------------------------------
1 | (SVG figure: two sticky notes — the value 65.0 labelled weight_kg and the value 143.0 labelled weight_lb)
--------------------------------------------------------------------------------
/img/python-sticky-note-variables-03.svg:
--------------------------------------------------------------------------------
1 | (SVG figure: weight_kg updated to 100.0 while weight_lb still shows 143.0)
--------------------------------------------------------------------------------
/img/seaborn_overview_modules.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/seaborn_overview_modules.png
--------------------------------------------------------------------------------
/img/shift-tab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/shift-tab.png
--------------------------------------------------------------------------------
/img/shift_button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/shift_button.png
--------------------------------------------------------------------------------
/img/shiftenter.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/shiftenter.jpg
--------------------------------------------------------------------------------
/img/stack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/stack.png
--------------------------------------------------------------------------------
/img/tabbutton.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/tabbutton.jpg
--------------------------------------------------------------------------------
/img/tidy_data_scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/tidy_data_scheme.png
--------------------------------------------------------------------------------
/img/toomuch.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/toomuch.jpg
--------------------------------------------------------------------------------
/nbconvert_config.py:
--------------------------------------------------------------------------------
1 | c.Exporter.preprocessors = ['nbtutor.ClearExercisePreprocessor', 'nbconvert.preprocessors.ClearOutputPreprocessor']
2 |
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count1.py:
--------------------------------------------------------------------------------
1 | df = pd.read_csv("data/fietstelpaal-coupure-links-2022-gent.zip", sep=';')
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count10.py:
--------------------------------------------------------------------------------
1 | def process_bike_count_data(df):
2 | """Process the provided dataframe: parse datetimes and rename columns.
3 |
4 | Parameters
5 | ----------
6 | df : pandas.DataFrame
7 | DataFrame as read from the raw `fietstellingen`,
8 | containing the 'Datum', 'Uur5Minuten',
9 | 'Ordening', 'Totaal', 'Tegenrichting', 'Hoofdrichting' columns.
10 |
11 | Returns
12 | -------
13 | df2 : pandas.DataFrame
14 | DataFrame with the datetime info as index and the
15 | `direction_centre` and `direction_mariakerke` columns
16 | with the counts.
17 | """
18 | timestamps = pd.to_datetime(df["Ordening"], format="%Y-%m-%dT%H:%M:%S%z", utc=True)
19 | df2 = df.drop(columns=['Datum', 'Uur5Minuten', 'Ordening', 'Code'])
20 | df2["timestamp"] = timestamps
21 | df2 = df2.set_index("timestamp")
22 | df2 = df2.rename(columns={'Tegenrichting': 'direction_centre',
23 | 'Hoofdrichting': 'direction_mariakerke',
24 | 'Totaal': 'total',
25 | 'Locatie': 'location'
26 | })
27 | return df2
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count11.py:
--------------------------------------------------------------------------------
1 | df_both = df.sum(axis=1)
2 | df_both
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count12.py:
--------------------------------------------------------------------------------
1 | df_quiet = df_both[df_both < 5]
2 | df_quiet
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count13.py:
--------------------------------------------------------------------------------
1 | df[(df['direction_centre'] < 3) | (df['direction_mariakerke'] < 3)]
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count14.py:
--------------------------------------------------------------------------------
1 | df.mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count15.py:
--------------------------------------------------------------------------------
1 | df.resample('h').sum().mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count16.py:
--------------------------------------------------------------------------------
1 | df['direction_centre'].nlargest(10)
2 | # alternative:
3 | # df['direction_centre'].sort_values(ascending=False).head(10)
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count17.py:
--------------------------------------------------------------------------------
1 | df_both = df.sum(axis=1)
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count18.py:
--------------------------------------------------------------------------------
1 | df_daily = df_both.resample('D').sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count19.py:
--------------------------------------------------------------------------------
1 | df_daily.max()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count2.py:
--------------------------------------------------------------------------------
1 | df.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count20.py:
--------------------------------------------------------------------------------
1 | df_daily.nlargest(10)
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count21.py:
--------------------------------------------------------------------------------
1 | df_monthly = df.resample('ME').sum()
2 | df_monthly.plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count22.py:
--------------------------------------------------------------------------------
1 | df_hourly = df.resample('h').sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count23.py:
--------------------------------------------------------------------------------
1 | df_hourly.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count24.py:
--------------------------------------------------------------------------------
1 | df_hourly['2023-01-01':'2023-01-21'].plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count25.py:
--------------------------------------------------------------------------------
1 | newyear = df["2022-12-31 12:00:00": "2023-01-01 12:00:00"]
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count26.py:
--------------------------------------------------------------------------------
1 | newyear.plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count27.py:
--------------------------------------------------------------------------------
1 | # smooth the counts with a 10-point centred rolling mean, then plot
2 | newyear.rolling(10, center=True).mean().plot(linewidth=2)
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count28.py:
--------------------------------------------------------------------------------
1 | # A more in-detail plotting version of the graph.
2 | fig, ax = plt.subplots()
3 | newyear.plot(ax=ax, color=['LightGreen', 'LightBlue'], legend=False, rot=0)
4 | newyear.rolling(10, center=True).mean().plot(linewidth=2, ax=ax, color=['DarkGreen', 'DarkBlue'], rot=0)
5 |
6 | ax.set_xlabel('')
7 | ax.set_ylabel('Cyclists count')
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count3.py:
--------------------------------------------------------------------------------
1 | df.tail()
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count4.py:
--------------------------------------------------------------------------------
1 | len(df)
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count5.py:
--------------------------------------------------------------------------------
1 | df.dtypes
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count6.py:
--------------------------------------------------------------------------------
1 | df["timestamp"] = pd.to_datetime(df["Ordening"], format="%Y-%m-%dT%H:%M:%S%z", utc=True)
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count7.py:
--------------------------------------------------------------------------------
1 | df = df.set_index("timestamp")
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count8.py:
--------------------------------------------------------------------------------
1 | df2022 = df.drop(columns=['Datum', 'Uur5Minuten', 'Ordening', 'Code'])
--------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count9.py:
--------------------------------------------------------------------------------
1 | df2022 = df2022.rename(columns={'Tegenrichting': 'direction_centre',
2 | 'Hoofdrichting': 'direction_mariakerke',
3 | 'Totaal': 'total',
4 | 'Locatie': 'location'})
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations1.py:
--------------------------------------------------------------------------------
1 | observations = pd.read_csv("data/observations.csv", index_col="occurrenceID")
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations10.py:
--------------------------------------------------------------------------------
1 | observations.duplicated().sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations11.py:
--------------------------------------------------------------------------------
1 | duplicate_observations = observations[observations.duplicated(keep=False)]
2 | duplicate_observations.sort_values(["eventDate", "verbatimLocality"]).head(9)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations12.py:
--------------------------------------------------------------------------------
1 | observations_unique = observations.drop_duplicates()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations13.py:
--------------------------------------------------------------------------------
1 | len(observations_unique)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations14.py:
--------------------------------------------------------------------------------
1 | len(observations_unique.dropna())
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations15.py:
--------------------------------------------------------------------------------
1 | len(observations_unique.dropna(subset=['species_ID']))
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations16.py:
--------------------------------------------------------------------------------
1 | observations_with_ID = observations_unique.dropna(subset=['species_ID'])
2 | observations_with_ID.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations17.py:
--------------------------------------------------------------------------------
1 | # observations that lack a species ID but do have the sex registered
2 | mask = observations['species_ID'].isna() & observations['sex'].notna()
3 | not_identified = observations[mask]
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations18.py:
--------------------------------------------------------------------------------
1 | not_identified.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations19.py:
--------------------------------------------------------------------------------
1 | observations.groupby("name").size().nlargest(8)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations2.py:
--------------------------------------------------------------------------------
1 | observations.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations20.py:
--------------------------------------------------------------------------------
1 | observations['name'].value_counts().iloc[:8]
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations21.py:
--------------------------------------------------------------------------------
1 | n_species_per_plot = observations.groupby(["verbatimLocality"])["name"].nunique()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations22.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots(figsize=(6, 6))
2 | n_species_per_plot.plot(kind="barh", ax=ax)
3 | ax.set_ylabel("Plot number");
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations23.py:
--------------------------------------------------------------------------------
1 | ## Alternative option to calculate the species per plot:
2 | ## inspired on the pivot table we already had:
3 | #species_per_plot = observations.reset_index().pivot_table(
4 | # index="species_ID", columns="verbatimLocality", values="occurrenceID", aggfunc='count')
5 | #n_species_per_plot = species_per_plot.count()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations24.py:
--------------------------------------------------------------------------------
1 | n_plots_per_species = observations.groupby(["name"])["verbatimLocality"].nunique().sort_values()
2 |
3 | fig, ax = plt.subplots(figsize=(10, 8))
4 | n_plots_per_species.plot(kind="barh", ax=ax)
5 | ax.set_xlabel("Number of plots");
6 | ax.set_ylabel("");
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations25.py:
--------------------------------------------------------------------------------
1 | n_plot_sex = observations.groupby(["sex", "verbatimLocality"]).size().rename("count").reset_index()
2 | n_plot_sex.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations26.py:
--------------------------------------------------------------------------------
1 | pivoted = n_plot_sex.pivot(columns="sex", index="verbatimLocality", values="count")
2 | pivoted.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations27.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=observations, x="verbatimLocality",
2 | hue="sex", kind="count", height=3, aspect=3)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations28.py:
--------------------------------------------------------------------------------
1 | heatmap_prep = observations.pivot_table(index='year', columns='month',
2 | values="species_ID", aggfunc='count')
3 | fig, ax = plt.subplots(figsize=(10, 8))
4 | ax = sns.heatmap(heatmap_prep, cmap='Reds')
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations29.py:
--------------------------------------------------------------------------------
1 | survey_data = pd.merge(observations_data, species_names, how="left",
2 | left_on="species_ID", right_on="ID")
3 | survey_data
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations3.py:
--------------------------------------------------------------------------------
1 | observations.info()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations30.py:
--------------------------------------------------------------------------------
1 | non_rodent_species = survey_data[survey_data['taxa'].isin(['Rabbit', 'Bird', 'Reptile'])]
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations31.py:
--------------------------------------------------------------------------------
1 | len(non_rodent_species)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations32.py:
--------------------------------------------------------------------------------
1 | r_species = survey_data[survey_data['name'].str.lower().str.startswith('r')]
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations33.py:
--------------------------------------------------------------------------------
1 | len(r_species)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations34.py:
--------------------------------------------------------------------------------
1 | non_bird_species = survey_data[survey_data['taxa'] != 'Bird']
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations35.py:
--------------------------------------------------------------------------------
1 | birds_85_89 = survey_data[(survey_data["eventDate"] >= "1985-01-01")
2 | & (survey_data["eventDate"] <= "1989-12-31 23:59")
3 | & (survey_data['taxa'] == 'Bird')]
4 | birds_85_89.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations36.py:
--------------------------------------------------------------------------------
1 | # alternative solution
2 | birds_85_89 = survey_data[(survey_data["eventDate"].dt.year >= 1985)
3 | & (survey_data["eventDate"].dt.year <= 1989)
4 | & (survey_data['taxa'] == 'Bird')]
5 | birds_85_89.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations37.py:
--------------------------------------------------------------------------------
1 | # Multiple lines
2 | obs_with_weight = survey_data.dropna(subset=["weight"])
3 | median_weight = obs_with_weight.groupby(['name'])["weight"].median()
4 | median_weight.sort_values(ascending=False)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations38.py:
--------------------------------------------------------------------------------
1 | # Single line statement
2 | survey_data.dropna(subset=["weight"]).groupby(['name'])["weight"].median().sort_values(ascending=False)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations39.py:
--------------------------------------------------------------------------------
1 | species_per_plot = survey_data.reset_index().pivot_table(index="name",
2 | columns="verbatimLocality",
3 | values="ID",
4 | aggfunc='count')
5 | species_per_plot.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations4.py:
--------------------------------------------------------------------------------
1 | observations["eventDate"] = pd.to_datetime(observations[["year", "month", "day"]])
2 | observations
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations40.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots(figsize=(8,8))
2 | sns.heatmap(species_per_plot, ax=ax, cmap='Greens')
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations41.py:
--------------------------------------------------------------------------------
1 | survey_data.resample('YE', on='eventDate').size().plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations42.py:
--------------------------------------------------------------------------------
1 | merriami = survey_data[survey_data["name"] == "Dipodomys merriami"]
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations43.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots()
2 | merriami.groupby(merriami['eventDate'].dt.month).size().plot(kind="barh", ax=ax)
3 | ax.set_xlabel("number of occurrences")
4 | ax.set_ylabel("Month of the year")
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations44.py:
--------------------------------------------------------------------------------
1 | subsetspecies = survey_data[survey_data["name"].isin(['Dipodomys merriami', 'Dipodomys ordii',
2 | 'Reithrodontomys megalotis', 'Chaetodipus baileyi'])]
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations45.py:
--------------------------------------------------------------------------------
1 | month_evolution = subsetspecies.groupby("name").resample('ME', on='eventDate').size()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations46.py:
--------------------------------------------------------------------------------
1 | species_evolution = month_evolution.unstack(level=0)
2 | axs = species_evolution.plot(subplots=True, figsize=(14, 8), sharey=True)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations47.py:
--------------------------------------------------------------------------------
1 | sns.relplot(data=month_evolution, x='eventDate', y="counts",
2 | row="name", kind="line", hue="name", height=2, aspect=5)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations48.py:
--------------------------------------------------------------------------------
1 | year_evolution = survey_data.groupby("taxa").resample('YE', on='eventDate').size()
2 | year_evolution.name = "counts"
3 | year_evolution = year_evolution.reset_index()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations49.py:
--------------------------------------------------------------------------------
1 | year_evolution.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations5.py:
--------------------------------------------------------------------------------
1 | observations["datasetName"] = "Ecological Archives E090-118-D1."
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations50.py:
--------------------------------------------------------------------------------
1 | sns.relplot(data=year_evolution, x='eventDate', y="counts",
2 | col="taxa", col_wrap=2, kind="line", height=2, aspect=5,
3 | facet_kws={"sharey": False})
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations51.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots()
2 | survey_data.groupby(survey_data["eventDate"].dt.weekday).size().plot(kind='barh', color='#66b266', ax=ax)
3 |
4 | import calendar
5 | xticks = ax.set_yticklabels(calendar.day_name)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations6.py:
--------------------------------------------------------------------------------
1 | sex_dict = {"M": "male",
2 | "F": "female",
3 | "R": "male",
4 | "P": "female",
5 | "Z": np.nan}
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations7.py:
--------------------------------------------------------------------------------
1 | observations['sex'] = observations['verbatimSex'].replace(sex_dict)
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations8.py:
--------------------------------------------------------------------------------
1 | observations["sex"].unique()
--------------------------------------------------------------------------------
/notebooks/_solutions/case2_observations9.py:
--------------------------------------------------------------------------------
1 | observations['species_ID'].isna().sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment1.py:
--------------------------------------------------------------------------------
1 | tidy_experiment = main_experiment.melt(id_vars=['Bacterial_genotype', 'Phage_t', 'experiment_ID'],
2 | value_vars=['OD_0h', 'OD_20h', 'OD_72h'],
3 | var_name='experiment_time_h',
4 | value_name='optical_density', )
5 | tidy_experiment
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment10.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=falcor, kind="point",
2 | x='Bacterial_genotype',
3 | y='log10 Mc',
4 | row="Phage",
5 | linestyle="none",
6 | errorbar=None,
7 | row_order=["Lambda", "T4", "T7"],
8 | order=['WT', 'MUT', 'D87G', 'S83L', 'D516G', 'S512F', 'K43N', 'K88R', 'RSF1010', 'RP4'],
9 | aspect=3, height=3,
10 | color="black")
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment11.py:
--------------------------------------------------------------------------------
1 | falcor["Bacterial_genotype"] = falcor["Bacterial_genotype"].replace({'WT(2)': 'WT',
2 | 'MUT(2)': 'MUT'})
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment12.py:
--------------------------------------------------------------------------------
1 | def errorbar(x, y, low, high, **kws):
2 | """Utility function to link falcor data representation with the errorbar representation"""
3 | plt.errorbar(x, y, (y - low, high - y), capsize=3, fmt="o", color="black", ms=4)
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment13.py:
--------------------------------------------------------------------------------
1 | sns.set_style("ticks")
2 | g = sns.FacetGrid(data=falcor, row="Phage", aspect=3, height=3)
3 | g.map(errorbar,
4 | "Bacterial_genotype", "log10 Mc",
5 | "log10 LBc", "log10 UBc")
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment2.py:
--------------------------------------------------------------------------------
1 | sns.set_style("white")
2 | histplot = sns.displot(data=tidy_experiment, x="optical_density",
3 | color='grey', edgecolor='white')
4 |
5 | histplot.fig.suptitle("Optical density distribution")
6 | histplot.axes[0][0].set_ylabel("Frequency");
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment3.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=tidy_experiment, x="experiment_time_h",
2 | y="optical_density", kind="violin")
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment4.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=tidy_experiment, x="experiment_time_h", y="optical_density",
2 | col="Phage_t", col_wrap=2, kind="violin")
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment5.py:
--------------------------------------------------------------------------------
1 | pd.pivot_table(tidy_experiment, values='optical_density',
2 | index='Bacterial_genotype',
3 | columns='experiment_time_h',
4 | aggfunc='mean')
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment6.py:
--------------------------------------------------------------------------------
1 | # advanced/optional solution
2 | tidy_experiment.groupby(['Bacterial_genotype', 'experiment_time_h'])['optical_density'].mean().unstack()
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment7.py:
--------------------------------------------------------------------------------
1 | density_mean = (tidy_experiment
2 | .groupby(['Bacterial_genotype','Phage_t', 'experiment_time_h'])['optical_density']
3 | .mean().reset_index())
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment8.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=density_mean, kind="bar",
2 | x='Bacterial_genotype',
3 | y='optical_density',
4 | hue='Phage_t',
5 | row="experiment_time_h",
6 | sharey=False,
7 | aspect=3, height=3,
8 | palette="colorblind")
--------------------------------------------------------------------------------
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment9.py:
--------------------------------------------------------------------------------
1 | falcor["Bacterial_genotype"] = falcor["Bacterial_genotype"].replace({'WT(2)': 'WT',
2 | 'MUT(2)': 'MUT'})
3 | falcor.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis1.py:
--------------------------------------------------------------------------------
1 | data_tidy = data.reset_index().melt(id_vars=["datetime"], var_name='station', value_name='no2')
2 | data_tidy.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis10.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots()
2 |
3 | data['1999':].resample('YE').mean().plot(ax=ax)
4 | data['1999':].mean(axis=1).resample('YE').mean().plot(color='k',
5 | linestyle='--',
6 | linewidth=4,
7 | ax=ax,
8 | label='Overall mean')
9 | ax.legend(loc='center', ncol=3,
10 | bbox_to_anchor=(0.5, 1.06))
11 | ax.set_ylabel("NO$_2$ concentration (µg/m³)");
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis11.py:
--------------------------------------------------------------------------------
1 | # add a column to the dataframe that indicates the month (integer value of 1 to 12):
2 | data['month'] = data.index.month
3 |
4 | # now, we can calculate the mean of each month over the different years:
5 | data.groupby('month').mean()
6 |
7 | # plot the typical monthly profile of the different stations:
8 | data.groupby('month').mean().plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis12.py:
--------------------------------------------------------------------------------
1 | # Resample wise
2 | df2011 = data.loc['2011']
3 | df2011[['BETN029', 'BETR801']].resample('W').quantile(0.95).plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis13.py:
--------------------------------------------------------------------------------
1 | # Groupby wise
2 | # Note the different x-axis labels
3 | df2011.groupby(df2011.index.isocalendar().week)[['BETN029', 'BETR801']].quantile(0.95).plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis14.py:
--------------------------------------------------------------------------------
1 | data.groupby(data.index.hour).mean().plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis15.py:
--------------------------------------------------------------------------------
1 | data['weekend'] = data.index.dayofweek.isin([5, 6])
2 | data['weekend'] = data['weekend'].replace({True: 'weekend', False: 'weekday'})
3 | data['hour'] = data.index.hour
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis16.py:
--------------------------------------------------------------------------------
1 | data_weekend = data.groupby(['weekend', 'hour']).mean()
2 | data_weekend.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis17.py:
--------------------------------------------------------------------------------
1 | # using unstack and pandas plotting
2 | data_weekend_BETR801 = data_weekend['BETR801'].unstack(level=0)
3 | data_weekend_BETR801.plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis18.py:
--------------------------------------------------------------------------------
1 | # using a tidy dataset and seaborn
2 | data_weekend_BETR801_tidy = data_weekend['BETR801'].reset_index()
3 |
4 | sns.lineplot(data=data_weekend_BETR801_tidy, x="hour", y="BETR801", hue="weekend")
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis19.py:
--------------------------------------------------------------------------------
1 | # tidy dataset that still includes all stations
2 |
3 | data_weekend_tidy = pd.melt(data_weekend.reset_index(), id_vars=['weekend', 'hour'],
4 | var_name='station', value_name='no2')
5 | data_weekend_tidy.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis2.py:
--------------------------------------------------------------------------------
1 | data_tidy['no2'].isna().sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis20.py:
--------------------------------------------------------------------------------
1 | # when multiple factors remain, it becomes useful to convert to a tidy dataset and use seaborn
2 | sns.relplot(data=data_weekend_tidy, x="hour", y="no2", kind="line",
3 | hue="weekend", col="station", col_wrap=2)
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis21.py:
--------------------------------------------------------------------------------
1 | data[['BETR801', 'BETN029', 'FR04037', 'FR04012']].corr()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis22.py:
--------------------------------------------------------------------------------
1 | # boolean DataFrame: True where the hourly concentration exceeds 200 µg/m³
2 | exceedances = data > 200
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis23.py:
--------------------------------------------------------------------------------
1 | # group by year and count exceedances (sum of boolean)
2 | exceedances = exceedances.groupby(exceedances.index.year).sum()
--------------------------------------------------------------------------------
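
Counting exceedances as a "sum of booleans" works because True counts as 1 in a sum. A self-contained toy version of the same pattern:

    import pandas as pd

    s = pd.Series([150, 250, 90, 300],
                  index=pd.to_datetime(['2011-06-01', '2011-06-02',
                                        '2012-06-01', '2012-06-02']))
    exceed = s > 200
    print(exceed.groupby(exceed.index.year).sum())  # 2011 -> 1, 2012 -> 1
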
/notebooks/_solutions/case4_air_quality_analysis24.py:
--------------------------------------------------------------------------------
1 | # Make a barplot of the yearly number of exceedances
2 | ax = exceedances.loc[2005:].plot(kind='bar')
3 | ax.axhline(18, color='k', linestyle='--')  # at most 18 exceedances are allowed per year
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis25.py:
--------------------------------------------------------------------------------
1 | FR_station = data['FR04012'] # select the specific data series
2 | FR_station = FR_station[(FR_station.notnull()) & (FR_station != 0.0)]  # exclude the NaN and zero values
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis26.py:
--------------------------------------------------------------------------------
1 | FR_sorted = FR_station.sort_values(ascending=True)
2 | FR_scaled = (FR_sorted - FR_sorted.min())/(FR_sorted.max() - FR_sorted.min())
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis27.py:
--------------------------------------------------------------------------------
1 | fig, axfr = plt.subplots()
2 | FR_scaled.plot(use_index=False, ax=axfr)  # alternative version: FR_scaled.reset_index(drop=True).plot(use_index=False)
3 | axfr.set_ylabel('FR04012')
4 | # optional addition, just in case you need this
5 | axfr.axvline(x=FR_scaled.searchsorted(0.3), color='0.6', linestyle='--', linewidth=3)
--------------------------------------------------------------------------------
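
`searchsorted(0.3)` returns the integer position at which 0.3 would be inserted to keep the sorted values in order; because the plot above ignores the index, that position is directly usable as an x-coordinate. A toy illustration:

    import pandas as pd

    s = pd.Series([0.1, 0.25, 0.4, 0.9])  # already sorted
    print(s.searchsorted(0.3))  # 2
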
/notebooks/_solutions/case4_air_quality_analysis28.py:
--------------------------------------------------------------------------------
1 | # Mixing and matching matplotlib and pandas
2 | fig, (ax1, ax2) = plt.subplots(1, 2,
3 | sharex=True,
4 | sharey=True)
5 |
6 | data.loc['2009', ['BETN029', 'BETR801']].plot(kind='hist', subplots=True,
7 | bins=30, legend=False,
8 | ax=(ax1, ax2))
9 | ax1.set_title('BETN029')
10 | ax2.set_title('BETR801')
11 | # Remark: the width of the bins is calculated over the x data range for both plots together
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis29.py:
--------------------------------------------------------------------------------
1 | # A more step-by-step approach (equally valid)
2 | fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True)
3 | data.loc['2009', 'BETN029'].plot(kind='hist', bins=30, ax=ax1)
4 | ax1.set_title('BETN029')
5 | data.loc['2009', 'BETR801'].plot(kind='hist', bins=30, ax=ax2)
6 | ax2.set_title('BETR801')
7 | # Remark: the width of the bins is calculated over the x data range for each plot individually
--------------------------------------------------------------------------------
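
If you want the two variants above to produce identical bins, one option is to compute the bin edges explicitly over the combined data range and pass them to both calls. A sketch assuming the same `data` DataFrame and the matplotlib/numpy imports from the notebook:

    import numpy as np

    subset = data.loc['2009', ['BETN029', 'BETR801']]
    bins = np.linspace(subset.min().min(), subset.max().max(), 31)  # 30 shared bins
    fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True)
    subset['BETN029'].plot(kind='hist', bins=bins, ax=ax1, title='BETN029')
    subset['BETR801'].plot(kind='hist', bins=bins, ax=ax2, title='BETR801')
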
/notebooks/_solutions/case4_air_quality_analysis3.py:
--------------------------------------------------------------------------------
1 | data_tidy = data_tidy.dropna()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis30.py:
--------------------------------------------------------------------------------
1 | subset = data.loc['2009-01'].copy()
2 | subset["dayofweek"] = subset.index.dayofweek
3 | subset = subset[subset['dayofweek'].isin([0, 6])]
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis31.py:
--------------------------------------------------------------------------------
1 | subset["dayofweek"] = subset["dayofweek"].replace(to_replace={0:"Monday", 6:"Sunday"})
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis32.py:
--------------------------------------------------------------------------------
1 | sns.set_style("whitegrid")
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis33.py:
--------------------------------------------------------------------------------
1 | sns.lmplot(
2 | data=subset, x="BETN029", y="FR04037", hue="dayofweek"
3 | )
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis34.py:
--------------------------------------------------------------------------------
1 | # daily maximum of the 8-hour rolling mean, compared to the 100 µg/m³ threshold
2 | exceedances = data.rolling(8).mean().resample('D').max() > 100
--------------------------------------------------------------------------------
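
A reminder of how the rolling window behaves: it returns NaN until a full window of observations is available. Toy example with a window of 3 (the solution uses 8 hourly values):

    import pandas as pd

    s = pd.Series(range(6))
    print(s.rolling(3).mean().tolist())  # [nan, nan, 1.0, 2.0, 3.0, 4.0]
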
/notebooks/_solutions/case4_air_quality_analysis35.py:
--------------------------------------------------------------------------------
1 | exceedances = exceedances.groupby(exceedances.index.year).sum()
2 | ax = exceedances.plot(kind='bar')
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis36.py:
--------------------------------------------------------------------------------
1 | data_daily = data.resample('D').mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis37.py:
--------------------------------------------------------------------------------
1 | # add a dayofweek column
2 | data_daily['dayofweek'] = data_daily.index.dayofweek
3 | data_daily.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis38.py:
--------------------------------------------------------------------------------
1 | # seaborn
2 | sns.boxplot(data=data_daily, x='dayofweek', y='BETR801', color="grey")
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis39.py:
--------------------------------------------------------------------------------
1 | # when using pandas to plot, each boxplot should be a separate column;
2 | # therefore, pivot the table so that the weekdays become the columns
3 | data_daily['week'] = data_daily.index.isocalendar().week
4 | data_pivoted = data_daily.pivot_table(columns='dayofweek', index='week',
5 | values='BETR801')
6 | data_pivoted.head()
7 | data_pivoted.boxplot();
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis4.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots()
2 | data.loc['2009':, 'FR04037'].resample('ME').mean().plot(ax=ax, label='mean')
3 | data.loc['2009':, 'FR04037'].resample('ME').median().plot(ax=ax, label='median')
4 | ax.legend(ncol=2)
5 | ax.set_title("FR04037");
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis40.py:
--------------------------------------------------------------------------------
1 | # An alternative method using `groupby` and `unstack`
2 | data_daily.groupby(['dayofweek', 'week'])['BETR801'].mean().unstack(level=0).boxplot();
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis5.py:
--------------------------------------------------------------------------------
1 | data.loc['2009':, 'FR04037'].resample('ME').agg(['mean', 'median']).plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis6.py:
--------------------------------------------------------------------------------
1 | # with wide dataframe
2 | fig, ax = plt.subplots()
3 | sns.violinplot(data=data['2011-01': '2011-08'], color="C0", ax=ax)
4 | ax.set_ylabel("NO$_2$ concentration (µg/m³)")
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis7.py:
--------------------------------------------------------------------------------
1 | # with tidy dataframe
2 | data_tidy_subset = data_tidy[(data_tidy['datetime'] >= "2011-01") & (data_tidy['datetime'] < "2011-09")]
3 |
4 | fig, ax = plt.subplots()
5 | sns.violinplot(data=data_tidy_subset, x="station", y="no2", color="C0", ax=ax)
6 | ax.set_ylabel("NO$_2$ concentration (µg/m³)")
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis8.py:
--------------------------------------------------------------------------------
1 | # with figure-level function
2 | sns.catplot(data=data_tidy_subset, x="station", y="no2", kind="violin")
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_analysis9.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots()
2 | data['2012':].mean().plot(kind='bar', ax=ax, rot=0, color='C0')
3 | ax.set_ylabel("NO$_2$ concentration (µg/m³)")
4 | ax.axhline(y=40., color='darkorange')
5 | ax.text(0.3, 0.48, 'Yearly limit is 40 µg/m³',
6 | horizontalalignment='left', fontsize=13,
7 | transform=ax.transAxes, color='darkorange');
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing1.py:
--------------------------------------------------------------------------------
1 | data = pd.read_csv("data/BETR8010000800100hour.1-1-1990.31-12-2012",
2 | sep='\t', header=None, names=column_names, na_values=[-999, -9999])
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing10.py:
--------------------------------------------------------------------------------
1 | def read_airbase_file(filename, station):
2 | """
3 | Read hourly AirBase data files.
4 |
5 | Parameters
6 | ----------
7 | filename : string
8 | Path to the data file.
9 | station : string
10 | Name of the station.
11 |
12 | Returns
13 | -------
14 | DataFrame
15 | Processed dataframe.
16 | """
17 |
18 | # construct the column names
19 | hours = ["{:02d}".format(i) for i in range(24)]
20 | flags = ['flag' + str(i) for i in range(24)]
21 | colnames = ['date'] + [item for pair in zip(hours, flags) for item in pair]
22 |
23 | # read the actual data
24 | data = pd.read_csv(filename, sep='\t', header=None, na_values=[-999, -9999], names=colnames)
25 |
26 | # drop the 'flag' columns
27 | data = data.drop([col for col in data.columns if 'flag' in col], axis=1)
28 |
29 | # reshape
30 | data_stacked = pd.melt(data, id_vars=['date'], var_name='hour')
31 |
32 | # parse to datetime and remove redundant columns
33 | data_stacked.index = pd.to_datetime(data_stacked['date'] + data_stacked['hour'], format="%Y-%m-%d%H")
34 | data_stacked = data_stacked.drop(['date', 'hour'], axis=1)
35 | data_stacked = data_stacked.rename(columns={'value': station})
36 |
37 | return data_stacked
--------------------------------------------------------------------------------
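
The column-name construction in the function interleaves the hour and flag labels. The zip-and-flatten idiom in isolation, on two short toy lists:

    hours = ['00', '01']
    flags = ['flag0', 'flag1']
    print([item for pair in zip(hours, flags) for item in pair])
    # ['00', 'flag0', '01', 'flag1']
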
/notebooks/_solutions/case4_air_quality_processing11.py:
--------------------------------------------------------------------------------
1 | data_folder = Path("./data")
2 | data_files = list(data_folder.glob("*0008001*"))
3 | data_files
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing12.py:
--------------------------------------------------------------------------------
1 | dfs = []
2 |
3 | for filename in data_files:
4 | station = filename.name[:7]
5 | df = read_airbase_file(filename, station)
6 | dfs.append(df)
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing13.py:
--------------------------------------------------------------------------------
1 | combined_data = pd.concat(dfs, axis=1)
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing2.py:
--------------------------------------------------------------------------------
1 | data.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing3.py:
--------------------------------------------------------------------------------
1 | data = data.drop(flag_columns, axis=1)
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing4.py:
--------------------------------------------------------------------------------
1 | data_stacked = pd.melt(data, id_vars=['date'], var_name='hour')
2 | data_stacked.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing5.py:
--------------------------------------------------------------------------------
1 | # We use stack to reshape the data, moving the hours (the column labels) into a column.
2 | # But we don't want to move the 'date' column label, so we first set it as the index.
3 | # You can check the difference with "data.stack()"
4 | data_stacked = data.set_index('date').stack()
5 | data_stacked.head()
--------------------------------------------------------------------------------
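
What `set_index('date').stack()` does, on a tiny made-up frame:

    import pandas as pd

    df = pd.DataFrame({'date': ['1990-01-01', '1990-01-02'],
                       '00': [56, 54], '01': [53, 55]})
    print(df.set_index('date').stack())
    # date
    # 1990-01-01  00    56
    #             01    53
    # 1990-01-02  00    54
    #             01    55
    # dtype: int64
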
/notebooks/_solutions/case4_air_quality_processing6.py:
--------------------------------------------------------------------------------
1 | # We reset the index to have the date and hours available as columns
2 | data_stacked = data_stacked.reset_index()
3 | data_stacked = data_stacked.rename(columns={'level_1': 'hour'})
4 | data_stacked.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing7.py:
--------------------------------------------------------------------------------
1 | # Now we combine the dates and the hours into a datetime, and set this as the index
2 | data_stacked.index = pd.to_datetime(data_stacked['date'] + data_stacked['hour'], format="%Y-%m-%d%H")
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing8.py:
--------------------------------------------------------------------------------
1 | # Drop the original date and hour columns
2 | data_stacked = data_stacked.drop(['date', 'hour'], axis=1)
3 | data_stacked.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/case4_air_quality_processing9.py:
--------------------------------------------------------------------------------
1 | # rename the remaining column to the name of the measurement station
2 | # (this is 0 or 'value' depending on which method was used)
3 | data_stacked = data_stacked.rename(columns={0: 'BETR801'})
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_01_data_structures1.py:
--------------------------------------------------------------------------------
1 | df = pd.read_csv("data/titanic.csv")
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_01_data_structures2.py:
--------------------------------------------------------------------------------
1 | df.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_01_data_structures3.py:
--------------------------------------------------------------------------------
1 | len(df)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_01_data_structures4.py:
--------------------------------------------------------------------------------
1 | df['Age']
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_01_data_structures5.py:
--------------------------------------------------------------------------------
1 | df['Fare'].plot.box() # or .plot(kind='box')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_01_data_structures6.py:
--------------------------------------------------------------------------------
1 | df.sort_values(by='Age', ascending=False)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations1.py:
--------------------------------------------------------------------------------
1 | df['Age'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations10.py:
--------------------------------------------------------------------------------
1 | np.log(df['Fare'])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations2.py:
--------------------------------------------------------------------------------
1 | df['Age'].plot.hist()  # try e.g. bins=30, log=True
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations3.py:
--------------------------------------------------------------------------------
1 | df['Survived'].sum() / len(df['Survived'])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations4.py:
--------------------------------------------------------------------------------
1 | df['Survived'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations5.py:
--------------------------------------------------------------------------------
1 | df['Fare'].max()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations6.py:
--------------------------------------------------------------------------------
1 | df['Fare'].median()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations7.py:
--------------------------------------------------------------------------------
1 | df['Fare'].quantile(0.75)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations8.py:
--------------------------------------------------------------------------------
1 | df['Fare'] / df['Fare'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_02_basic_operations9.py:
--------------------------------------------------------------------------------
1 | df['Fare_scaled'] = df['Fare'] / df['Fare'].mean()
2 | df.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data1.py:
--------------------------------------------------------------------------------
1 | males = df[df['Sex'] == 'male']
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data10.py:
--------------------------------------------------------------------------------
1 | df[df['Surname'].str.len() > 15]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data11.py:
--------------------------------------------------------------------------------
1 | len(titles)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data12.py:
--------------------------------------------------------------------------------
1 | titles.sort_values('year').head(2)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data13.py:
--------------------------------------------------------------------------------
1 | titles.nsmallest(2, columns="year")
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data14.py:
--------------------------------------------------------------------------------
1 | len(titles[titles['title'] == 'Hamlet'])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data15.py:
--------------------------------------------------------------------------------
1 | titles[titles['title'] == 'Treasure Island'].sort_values('year')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data16.py:
--------------------------------------------------------------------------------
1 | len(titles[(titles['year'] >= 1950) & (titles['year'] <= 1959)])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data17.py:
--------------------------------------------------------------------------------
1 | len(titles[titles['year'] // 10 == 195])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data18.py:
--------------------------------------------------------------------------------
1 | inception = cast[cast['title'] == 'Inception']
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data19.py:
--------------------------------------------------------------------------------
1 | len(inception[inception['n'].isna()])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data2.py:
--------------------------------------------------------------------------------
1 | males['Age'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data20.py:
--------------------------------------------------------------------------------
1 | inception['n'].isna().sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data21.py:
--------------------------------------------------------------------------------
1 | len(inception[inception['n'].notna()])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data22.py:
--------------------------------------------------------------------------------
1 | titanic = cast[(cast['title'] == 'Titanic') & (cast['year'] == 1997)]
2 | titanic = titanic[titanic['n'].notna()]
3 | titanic.sort_values('n')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data23.py:
--------------------------------------------------------------------------------
1 | brad = cast[cast['name'] == 'Brad Pitt']
2 | brad = brad[brad['year'] // 10 == 199]
3 | brad = brad[brad['n'] == 2]
4 | brad.sort_values('year')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data3.py:
--------------------------------------------------------------------------------
1 | df[df['Sex'] == 'female']['Age'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data4.py:
--------------------------------------------------------------------------------
1 | len(df[df['Age'] > 70])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data5.py:
--------------------------------------------------------------------------------
1 | (df['Age'] > 70).sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data6.py:
--------------------------------------------------------------------------------
1 | df[(df['Age'] > 30) & (df['Age'] <= 40)]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data7.py:
--------------------------------------------------------------------------------
1 | name.split(",")[0]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data8.py:
--------------------------------------------------------------------------------
1 | df['Surname'] = df['Name'].str.split(",").str.get(0)
2 | df['Surname']
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03a_selecting_data9.py:
--------------------------------------------------------------------------------
1 | df[df['Surname'].str.startswith('Williams')]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03b_indexing1.py:
--------------------------------------------------------------------------------
1 | countries['density'] = countries['population'] * 1000000 / countries['area']
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03b_indexing2.py:
--------------------------------------------------------------------------------
1 | countries.loc[countries['density'] > 300, ['capital', 'population']]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03b_indexing3.py:
--------------------------------------------------------------------------------
1 | countries['density_ratio'] = countries['density'] / countries['density'].mean()
2 | countries
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03b_indexing4.py:
--------------------------------------------------------------------------------
1 | countries.loc['United Kingdom', 'capital'] = 'Cambridge'
2 | countries
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03b_indexing5.py:
--------------------------------------------------------------------------------
1 | countries[(countries['density'] > 100) & (countries['density'] < 300)]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03b_indexing6.py:
--------------------------------------------------------------------------------
1 | df.loc[df['Sex'] == 'male', 'Age'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_03b_indexing7.py:
--------------------------------------------------------------------------------
1 | df.loc[df['Sex'] == 'female', 'Age'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data1.py:
--------------------------------------------------------------------------------
1 | data['2012':]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data2.py:
--------------------------------------------------------------------------------
1 | data[data.index.month == 1]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data3.py:
--------------------------------------------------------------------------------
1 | data[data.index.month.isin([4, 5, 6])]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data4.py:
--------------------------------------------------------------------------------
1 | data[(data.index.hour > 8) & (data.index.hour < 20)]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data5.py:
--------------------------------------------------------------------------------
1 | data.resample('ME').std().plot()  # or another frequency, e.g. 'YE' (yearly)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data6.py:
--------------------------------------------------------------------------------
1 | subset = data['2011':'2012']['L06_347']
2 | subset.resample('ME').agg(['mean', 'median']).plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data7.py:
--------------------------------------------------------------------------------
1 | daily = data['LS06_348'].resample('D').mean() # daily averages calculated
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data8.py:
--------------------------------------------------------------------------------
1 | daily.resample('MS').agg(['min', 'max']).plot() # monthly minimum and maximum values of these daily averages
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_04_time_series_data9.py:
--------------------------------------------------------------------------------
1 | data['2013':'2013'].mean().plot(kind='barh')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations1.py:
--------------------------------------------------------------------------------
1 | df.groupby('Sex')['Age'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations10.py:
--------------------------------------------------------------------------------
1 | titles['decade'] = titles['year'] // 10 * 10
2 | hamlet = titles[titles['title'].str.contains('Hamlet')]
3 | hamlet.groupby('decade').size().plot.bar(color="lightblue")
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations11.py:
--------------------------------------------------------------------------------
1 | cast1990 = cast[cast['year'] >= 1990]
2 | cast1990 = cast1990[cast1990['n'] == 1]
3 | cast1990.groupby('name').size().nlargest(10)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations12.py:
--------------------------------------------------------------------------------
1 | cast1990['name'].value_counts().head(10)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations13.py:
--------------------------------------------------------------------------------
1 | hamlets = titles[titles['title'].str.contains('Hamlet')]
2 | hamlets['title'].value_counts()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations14.py:
--------------------------------------------------------------------------------
1 | hamlets = titles[titles['title'].str.startswith('Hamlet')]
2 | hamlets['title'].value_counts()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations15.py:
--------------------------------------------------------------------------------
1 | title_longest = titles['title'].str.len().nlargest(10)
2 | title_longest
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations16.py:
--------------------------------------------------------------------------------
1 | pd.options.display.max_colwidth = 210
2 | titles.loc[title_longest.index]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations17.py:
--------------------------------------------------------------------------------
1 | cast1950 = cast[cast['year'] // 10 == 195]
2 | cast1950 = cast1950[cast1950['n'] == 1]
3 | cast1950.groupby(['year', 'type']).size()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations18.py:
--------------------------------------------------------------------------------
1 | cast.character.value_counts().head(11)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations19.py:
--------------------------------------------------------------------------------
1 | cast[cast.name == 'Brad Pitt'].year.value_counts().sort_index().plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations2.py:
--------------------------------------------------------------------------------
1 | # df['Survived'].sum() / len(df['Survived'])
2 | df['Survived'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations20.py:
--------------------------------------------------------------------------------
1 | titles[titles['title'].str.startswith('The Life')]['title'].value_counts().head(10)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations21.py:
--------------------------------------------------------------------------------
1 | cast[cast.year == 2010].name.value_counts().head(10)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations22.py:
--------------------------------------------------------------------------------
1 | pink = cast[cast['title'] == 'The Pink Panther']
2 | pink.groupby(['year'])[['n']].max()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations23.py:
--------------------------------------------------------------------------------
1 | oz = cast[cast['name'] == 'Frank Oz']
2 | oz_roles = oz.groupby(['year', 'title']).size()
3 | oz_roles[oz_roles > 1]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations24.py:
--------------------------------------------------------------------------------
1 | oz = cast[cast['name'] == 'Frank Oz']
2 | oz_roles = oz.groupby(['character']).size()
3 | oz_roles[oz_roles > 1].sort_values()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations25.py:
--------------------------------------------------------------------------------
1 | cast['n_total'] = cast.groupby(['title', 'year'])['n'].transform('size') # transform will return an element for each row, so the size value is given to the whole group
2 | cast.head()
--------------------------------------------------------------------------------
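
The aggregation/transform contrast from the comment, on a minimal made-up frame: `size()` returns one row per group, while `transform('size')` broadcasts that number back to every original row:

    import pandas as pd

    df = pd.DataFrame({'title': ['A', 'A', 'B'], 'n': [1, 2, 1]})
    print(df.groupby('title')['n'].size())             # A -> 2, B -> 1
    print(df.groupby('title')['n'].transform('size'))  # 2, 2, 1 (aligned with df's rows)
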
/notebooks/_solutions/pandas_05_groupby_operations26.py:
--------------------------------------------------------------------------------
1 | leading = cast[cast['n'] == 1]
2 | sums_decade = leading.groupby([cast['year'] // 10 * 10, 'type']).size()
3 | sums_decade
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations27.py:
--------------------------------------------------------------------------------
1 | # alternative: sums_decade.groupby(level='year').transform(lambda x: x / x.sum())
2 | ratios_decade = sums_decade / sums_decade.groupby(level='year').transform('sum')
3 | ratios_decade
--------------------------------------------------------------------------------
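
The same normalize-within-a-level pattern on a minimal MultiIndex Series with made-up numbers:

    import pandas as pd

    s = pd.Series([30, 10, 20, 20],
                  index=pd.MultiIndex.from_product(
                      [[1990, 2000], ['actor', 'actress']], names=['year', 'type']))
    print(s / s.groupby(level='year').transform('sum'))
    # 1990/actor 0.75, 1990/actress 0.25, 2000/actor 0.50, 2000/actress 0.50
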
/notebooks/_solutions/pandas_05_groupby_operations28.py:
--------------------------------------------------------------------------------
1 | ratios_decade[:, 'actor'].plot()
2 | ratios_decade[:, 'actress'].plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations29.py:
--------------------------------------------------------------------------------
1 | t = titles
2 | t.year.value_counts().head(3)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations3.py:
--------------------------------------------------------------------------------
1 | df25 = df[df['Age'] < 25]
2 | df25['Survived'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations30.py:
--------------------------------------------------------------------------------
1 | cast1950 = cast[cast['year'] // 10 == 195]
2 | cast1950 = cast1950[cast1950['n'] == 1]
3 | cast1950['type'].value_counts()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations31.py:
--------------------------------------------------------------------------------
1 | cast2000 = cast[cast['year'] // 10 == 200]
2 | cast2000 = cast2000[cast2000['n'] == 1]
3 | cast2000['type'].value_counts()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations4.py:
--------------------------------------------------------------------------------
1 | df.groupby('Sex')['Survived'].mean()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations5.py:
--------------------------------------------------------------------------------
1 | df.groupby('Pclass')['Survived'].mean().plot.bar() #and what if you would compare the total number of survivors?
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations6.py:
--------------------------------------------------------------------------------
1 | df.groupby('AgeClass', observed=False)['Fare'].mean().plot.bar(rot=0)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations7.py:
--------------------------------------------------------------------------------
1 | titles['decade'] = titles['year'] // 10 * 10
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations8.py:
--------------------------------------------------------------------------------
1 | titles.groupby('decade').size().plot.bar(color='green')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_05_groupby_operations9.py:
--------------------------------------------------------------------------------
1 | titles['decade'] = titles['year'] // 10 * 10
2 | hamlet = titles[titles['title'] == 'Hamlet']
3 | hamlet.groupby('decade').size().plot.bar(color="orange")
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning1.py:
--------------------------------------------------------------------------------
1 | casualties_raw["TX_SEX_DESCR_NL"].unique()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning10.py:
--------------------------------------------------------------------------------
1 | casualties["datetime"] = pd.to_datetime(casualties["datetime"])
2 | casualties["datetime"]
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning11.py:
--------------------------------------------------------------------------------
1 | casualties["week_day"] = pd.Categorical(casualties["DAY_OF_WEEK"],
2 | categories=["Monday", "Tuesday", "Wednesday", "Thursday",
3 | "Friday", "Saturday", "Sunday"],
4 | ordered=True)
--------------------------------------------------------------------------------
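
Why the ordered categorical is worth the effort: sorting (and therefore plot ordering) follows the declared weekday order instead of the alphabetical one. A quick toy check:

    import pandas as pd

    days = pd.Series(pd.Categorical(
        ['Sunday', 'Monday', 'Friday'],
        categories=["Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday"],
        ordered=True))
    print(days.sort_values().tolist())  # ['Monday', 'Friday', 'Sunday']
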
/notebooks/_solutions/pandas_06_data_cleaning12.py:
--------------------------------------------------------------------------------
1 | casualties["week_day"].dtype
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning13.py:
--------------------------------------------------------------------------------
1 | casualties["AGE_CLS"] = casualties["AGE_CLS"].str.replace(" tot ", " - ").str.removesuffix(" jaar").str.strip()
2 | casualties["AGE_CLS"] = casualties["AGE_CLS"].replace({"Onbekend": None, "75 jaar en meer": ">75", "": None})
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning14.py:
--------------------------------------------------------------------------------
1 | unique_combinations = ["DT_DAY", "DT_HOUR", "CD_MUNTY_REFNIS", "BUILD_UP_AREA", "LIGHT_COND", "ROAD_TYPE"]
2 | casualties.drop_duplicates(subset=unique_combinations).shape
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning15.py:
--------------------------------------------------------------------------------
1 | # alternative using `duplicated`
2 | (~casualties.duplicated(subset=unique_combinations)).sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning2.py:
--------------------------------------------------------------------------------
1 | gender_mapping = {"Vrouwelijk": "female", "Mannelijk": "male", "Onbekend": None}
2 | casualties_raw["TX_SEX_DESCR_NL"] = casualties_raw["TX_SEX_DESCR_NL"].replace(gender_mapping)
3 | casualties_raw["TX_SEX_DESCR_NL"].unique()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning3.py:
--------------------------------------------------------------------------------
1 | casualties_raw["DT_HOUR"].unique()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning4.py:
--------------------------------------------------------------------------------
1 | (casualties_raw["DT_HOUR"] == 99).sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning5.py:
--------------------------------------------------------------------------------
1 | casualties_raw["DT_HOUR"] = casualties_raw["DT_HOUR"].replace(99, 9)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning6.py:
--------------------------------------------------------------------------------
1 | casualties_nl = casualties_raw.drop(columns=column_names_with_fr)
2 | casualties_nl
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning7.py:
--------------------------------------------------------------------------------
1 | casualties = casualties_nl.rename(columns=clean_column_name)
2 | casualties.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning8.py:
--------------------------------------------------------------------------------
1 | casualties[["DT_DAY", "DT_HOUR"]].dtypes
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_06_data_cleaning9.py:
--------------------------------------------------------------------------------
1 | casualties["datetime"] = casualties["DT_DAY"] + " " + casualties["DT_HOUR"].astype(str)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data1.py:
--------------------------------------------------------------------------------
1 | df = pd.read_excel("data/verbruiksgegevens-per-maand.xlsx")
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data10.py:
--------------------------------------------------------------------------------
1 | df.pivot_table(index='Underaged', columns='Sex',
2 | values='Fare', aggfunc='median')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data11.py:
--------------------------------------------------------------------------------
1 | df_survival = df.groupby(["Pclass", "Sex"])["Survived"].mean().reset_index()
2 | df_survival
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data12.py:
--------------------------------------------------------------------------------
1 | df_survival.pivot(index="Pclass", columns="Sex", values="Survived")
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data13.py:
--------------------------------------------------------------------------------
1 | df.groupby(['Pclass', 'Sex'])['Survived'].mean().unstack()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data14.py:
--------------------------------------------------------------------------------
1 | grouped = cast.groupby(['year', 'type']).size()
2 | table = grouped.unstack('type')
3 | table.plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data15.py:
--------------------------------------------------------------------------------
1 | cast.pivot_table(index='year', columns='type', values="character", aggfunc='count').plot()
2 | # for the values column used by aggfunc, pick a column without NaN values so that all rows are effectively counted
3 | # -> at this stage: aha moment about the crosstab function(!)
--------------------------------------------------------------------------------
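
The "aha moment" in the comment: counting year/type combinations is exactly what `pd.crosstab` computes. A toy check:

    import pandas as pd

    df = pd.DataFrame({'year': [2000, 2000, 2001],
                       'type': ['actor', 'actress', 'actor']})
    print(pd.crosstab(index=df['year'], columns=df['type']))
    # type  actor  actress
    # year
    # 2000      1        1
    # 2001      1        0
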
/notebooks/_solutions/pandas_08_reshaping_data16.py:
--------------------------------------------------------------------------------
1 | pd.crosstab(index=cast['year'], columns=cast['type']).plot()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data17.py:
--------------------------------------------------------------------------------
1 | pd.crosstab(index=cast['year'], columns=cast['type']).plot.area()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data18.py:
--------------------------------------------------------------------------------
1 | grouped = cast.groupby(['year', 'type']).size()
2 | table = grouped.unstack('type').fillna(0)
3 | (table['actor'] / (table['actor'] + table['actress'])).plot(ylim=[0, 1])
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data19.py:
--------------------------------------------------------------------------------
1 | c = cast
2 | c = c[(c.character == 'Superman') | (c.character == 'Batman')]
3 | c = c.groupby(['year', 'character']).size()
4 | c = c.unstack()
5 | c = c.fillna(0)
6 | c.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data2.py:
--------------------------------------------------------------------------------
1 | df = df.drop(columns=["Regio"])
2 | df
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data20.py:
--------------------------------------------------------------------------------
1 | d = c.Superman - c.Batman
2 | print('Superman years:')
3 | print(len(d[d > 0.0]))
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data3.py:
--------------------------------------------------------------------------------
1 | df_tidy = pd.melt(df, id_vars=["Hoofdgemeente", "Energie", "SLP"], var_name="time", value_name="consumption")
2 | df_tidy
--------------------------------------------------------------------------------
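
What `melt` does to the wide monthly columns, on a one-row toy frame (hypothetical values):

    import pandas as pd

    df = pd.DataFrame({'Hoofdgemeente': ['Gent'], 'Energie': ['Elektriciteit'],
                       'SLP': ['S21'], '202101': [10], '202102': [12]})
    print(pd.melt(df, id_vars=['Hoofdgemeente', 'Energie', 'SLP'],
                  var_name='time', value_name='consumption'))
    # -> two rows: one per monthly column
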
/notebooks/_solutions/pandas_08_reshaping_data4.py:
--------------------------------------------------------------------------------
1 | df_tidy["time"] = pd.to_datetime(df_tidy["time"], format="%Y%m")
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data5.py:
--------------------------------------------------------------------------------
1 | df_overall = df_tidy.groupby(["time", "Energie"])[["consumption"]].sum() # or with .reset_index()
2 | df_overall.head()
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data6.py:
--------------------------------------------------------------------------------
1 | facet = sns.relplot(x="time", y="consumption", col="Energie",
2 | data=df_overall, kind="line")
3 | facet.set(ylim=(0, None))
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data7.py:
--------------------------------------------------------------------------------
1 | df.pivot_table(index='Pclass', columns='Sex',
2 | values='Survived', aggfunc='mean')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data8.py:
--------------------------------------------------------------------------------
1 | fig, ax1 = plt.subplots()
2 | (df.pivot_table(index='Pclass', columns='Sex',
3 | values='Survived', aggfunc='mean')
4 | .plot.bar(rot=0, ax=ax1)
5 | )
6 | ax1.set_ylabel('Survival ratio')
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_08_reshaping_data9.py:
--------------------------------------------------------------------------------
1 | df['Underaged'] = df['Age'] <= 18
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_09_combining_datasets1.py:
--------------------------------------------------------------------------------
1 | joined = pd.merge(df, df_legal_forms, on="CD_LGL_PSN_VAT", how="left")
2 | joined
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_09_combining_datasets2.py:
--------------------------------------------------------------------------------
1 | joined.groupby("TX_LGL_PSN_VAT_EN_LVL1")["MS_NUM_VAT"].sum().sort_values(ascending=False)
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_09_combining_datasets3.py:
--------------------------------------------------------------------------------
1 | df_muni = pd.read_sql("SELECT * FROM TD_MUNTY_REFNIS", con)
2 | df_muni
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_09_combining_datasets4.py:
--------------------------------------------------------------------------------
1 | joined = pd.merge(df, df_muni[["CD_REFNIS", "TX_PROV_DESCR_EN"]], on="CD_REFNIS", how="left")
2 | joined
--------------------------------------------------------------------------------
/notebooks/_solutions/pandas_09_combining_datasets5.py:
--------------------------------------------------------------------------------
1 | joined.groupby("TX_PROV_DESCR_EN")["MS_NUM_VAT"].sum()
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_01_matplotlib1.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots(figsize=(12, 4))
2 |
3 | ax.plot(data, color='darkgrey')
4 | ax.set_xlabel('days since start');
5 | ax.set_ylabel('measured value');
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_01_matplotlib2.py:
--------------------------------------------------------------------------------
1 | dates = pd.date_range("2021-01-01", periods=100, freq="D")
2 |
3 | fig, ax = plt.subplots(figsize=(12, 4))
4 |
5 | ax.plot(dates, data, color='darkgrey')
6 | ax.axhspan(ymin=-5, ymax=5, color='green', alpha=0.2)
7 |
8 | ax.set_xlabel('days since start');
9 | ax.set_ylabel('measured value');
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_01_matplotlib3.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots(figsize=(12, 4))
2 |
3 | ax.bar(dates[-10:], data[-10:], color='darkgrey')
4 | ax.bar(dates[-6], data[-6], color='orange')
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_01_matplotlib4.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots()
2 | flowdata.mean().plot.bar(ylabel="mean discharge", ax=ax)
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_01_matplotlib5.py:
--------------------------------------------------------------------------------
1 | fig, (ax0, ax1) = plt.subplots(1, 2, constrained_layout=True)
2 |
3 | flowdata.min().plot.bar(ylabel="min discharge", ax=ax0)
4 | flowdata.max().plot.bar(ylabel="max discharge", ax=ax1)
5 |
6 | fig.suptitle(f"Minimal and maximal discharge from {flowdata.index[0]:%Y-%m-%d} till {flowdata.index[-1]:%Y-%m-%d}");
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_01_matplotlib6.py:
--------------------------------------------------------------------------------
1 | alarm_level = 20
2 | max_datetime, max_value = flowdata["LS06_347"].idxmax(), flowdata["LS06_347"].max()
3 |
4 | fig, ax = plt.subplots(figsize=(18, 4))
5 | flowdata["LS06_347"].plot(ax=ax)
6 |
7 | ax.axhline(y=alarm_level, color='red', linestyle='-', alpha=0.8)
8 | ax.annotate('Alarm level', xy=(flowdata.index[0], alarm_level),
9 | xycoords="data", xytext=(10, 10), textcoords="offset points",
10 | color="red", fontsize=12)
11 | ax.annotate(f"Flood event on {max_datetime:%Y-%m-%d}",
12 | xy=(max_datetime, max_value), xycoords='data',
13 | xytext=(-30, -30), textcoords='offset points',
14 | arrowprops=dict(facecolor='black', shrink=0.05),
15 | horizontalalignment='right', verticalalignment='bottom',
16 | fontsize=12)
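17 |
18 | # note: xytext=(-30, -30) with textcoords='offset points' places the flood label
19 | # 30 points left of and below the annotated peak; the arrow points back at the peak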
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn1.py:
--------------------------------------------------------------------------------
1 | sns.displot(data=titanic, x="Age", row="Sex", aspect=3, height=2)
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn10.py:
--------------------------------------------------------------------------------
1 | # filter the data to 2019-2021 and the selected road user types
2 | compare_dead_30 = casualties.set_index("datetime")["2019":"2021"]
3 | compare_dead_30 = compare_dead_30[compare_dead_30["road_user_type"].isin(
4 | ["Bicycle", "Passenger car", "Pedestrian", "Motorbike"])]
5 |
6 | # Sum the victims and the deaths within 30 days for each year/road-user type combination
7 | compare_dead_30 = compare_dead_30.groupby(
8 | ["road_user_type", compare_dead_30.index.year])[["n_dead_30days", "n_victims"]].sum().reset_index()
9 |
10 | # create a new column with the percentage of deaths
11 | compare_dead_30["dead_prop"] = compare_dead_30["n_dead_30days"] / compare_dead_30["n_victims"] * 100
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn11.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=compare_dead_30,
2 | x="dead_prop",
3 | y="road_user_type",
4 | kind="bar",
5 | hue="datetime"
6 | )
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn12.py:
--------------------------------------------------------------------------------
1 | monthly_victim_counts = casualties.resample("ME", on="datetime")[
2 | ["n_victims_ok", "n_slightly_injured", "n_seriously_injured", "n_dead_30days"]
3 | ].sum()
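4 | # "ME" = month-end frequency: the victim counts are summed per calendar month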
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn13.py:
--------------------------------------------------------------------------------
1 | sns.relplot(
2 | data=monthly_victim_counts,
3 | kind="line",
4 | palette="colorblind",
5 | height=3, aspect=4,
6 | )
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn14.py:
--------------------------------------------------------------------------------
1 | # Optional solution with tidy data representation (providing x and y)
2 | monthly_victim_counts_melt = monthly_victim_counts.reset_index().melt(
3 | id_vars="datetime", var_name="victim_type", value_name="count"
4 | )
5 |
6 | sns.relplot(
7 | data=monthly_victim_counts_melt,
8 | x="datetime",
9 | y="count",
10 | hue="victim_type",
11 | kind="line",
12 | palette="colorblind",
13 | height=3, aspect=4,
14 | )
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn15.py:
--------------------------------------------------------------------------------
1 | # Pandas area plot
2 | monthly_victim_counts.plot.area(colormap='Reds', figsize=(15, 5))
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn16.py:
--------------------------------------------------------------------------------
1 | # Using Pandas
2 | daily_total_counts_2020 = casualties.set_index("datetime")["2020":"2021"].resample("D")["n_victims"].sum()
3 | daily_total_counts_2020.plot.line(figsize=(12, 3))
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn17.py:
--------------------------------------------------------------------------------
1 | # Using Seaborn
2 | sns.relplot(data=daily_total_counts_2020,
3 | kind="line",
4 | aspect=4, height=3)
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn18.py:
--------------------------------------------------------------------------------
1 | # weekly proportion of victims that died within 30 days, for each light condition
2 | weekly_victim_dead_lc = (
3 | casualties
4 | .groupby("light_conditions")
5 | .resample("W", on="datetime")[["datetime", "n_victims", "n_dead_30days"]]
6 | .sum()
7 | .reset_index()
8 | )
9 | weekly_victim_dead_lc["dead_prop"] = weekly_victim_dead_lc["n_dead_30days"] / weekly_victim_dead_lc["n_victims"] * 100
10 |
11 | # .. and the same for each road type
12 | weekly_victim_dead_rt = (
13 | casualties
14 | .groupby("road_type")
15 | .resample("W", on="datetime")[["datetime", "n_victims", "n_dead_30days"]]
16 | .sum()
17 | .reset_index()
18 | )
19 | weekly_victim_dead_rt["dead_prop"] = weekly_victim_dead_rt["n_dead_30days"] / weekly_victim_dead_rt["n_victims"] * 100
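20 |
21 | # note: a week in which the summed n_victims is 0 would give 0/0 = NaN for dead_prop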
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn19.py:
--------------------------------------------------------------------------------
1 | fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(15, 5))
2 |
3 | sns.ecdfplot(data=weekly_victim_dead_lc, x="dead_prop", hue="light_conditions", ax=ax0)
4 | sns.ecdfplot(data=weekly_victim_dead_rt, x="dead_prop", hue="road_type", ax=ax1)
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn2.py:
--------------------------------------------------------------------------------
1 | # Figure based
2 | sns.catplot(data=titanic, x="Pclass", y="Age",
3 | hue="Sex", split=True,
4 | palette="Set2", kind="violin")
5 | sns.despine(left=True)
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn20.py:
--------------------------------------------------------------------------------
1 | daily_min_temp_2020 = pd.read_csv("./data/daily_min_temperature_2020.csv",
2 | parse_dates=["datetime"])
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn21.py:
--------------------------------------------------------------------------------
1 | daily_with_temp = daily_total_counts_2020.reset_index().merge(daily_min_temp_2020, on="datetime")
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn22.py:
--------------------------------------------------------------------------------
1 | g = sns.jointplot(
2 | data=daily_with_temp, x="air_temperature", y="n_victims", kind="reg"
3 | )
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn3.py:
--------------------------------------------------------------------------------
1 | # Axes based
2 | sns.violinplot(data=titanic, x="Pclass", y="Age",
3 | hue="Sex", split=True,
4 | palette="Set2")
5 | sns.despine(left=True)
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn4.py:
--------------------------------------------------------------------------------
1 | victims_hour_of_day = casualties.groupby(casualties["datetime"].dt.hour)["n_victims"].sum().reset_index()
2 | victims_hour_of_day = victims_hour_of_day.rename(
3 | columns={"datetime": "Hour of the day", "n_victims": "Number of victims"}
4 | )
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn5.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=victims_hour_of_day,
2 | x="Hour of the day",
3 | y="Number of victims",
4 | kind="bar",
5 | aspect=4,
6 | height=3,
7 | )
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn6.py:
--------------------------------------------------------------------------------
1 | victims_gender_hour_of_day = casualties.groupby([casualties["datetime"].dt.hour, "gender"],
2 | dropna=False)["n_victims"].sum().reset_index()
3 | victims_gender_hour_of_day.head()
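4 | # dropna=False keeps accidents with missing gender as a NaN group (shown as "unknown" in the next plot)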
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn7.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=victims_gender_hour_of_day.fillna("unknown"),
2 | x="datetime",
3 | y="n_victims",
4 | row="gender",
5 | kind="bar",
6 | aspect=4,
7 | height=3)
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn8.py:
--------------------------------------------------------------------------------
1 | casualties_motorway_trucks = casualties[
2 | (casualties["road_type"] == "Motorway")
3 | & casualties["road_user_type"].isin(["Light truck", "Truck"])
4 | ]
--------------------------------------------------------------------------------
/notebooks/_solutions/visualization_02_seaborn9.py:
--------------------------------------------------------------------------------
1 | sns.catplot(data=casualties_motorway_trucks,
2 | x="week_day",
3 | y="n_victims",
4 | estimator=np.sum,
5 | errorbar=None,
6 | kind="bar",
7 | color="#900C3F",
8 | height=3,
9 | aspect=4)
--------------------------------------------------------------------------------
/notebooks/data/Dryad_Arias_Hall_v3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/Dryad_Arias_Hall_v3.xlsx
--------------------------------------------------------------------------------
/notebooks/data/TF_ACCIDENTS_VICTIMS_2020.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/TF_ACCIDENTS_VICTIMS_2020.zip
--------------------------------------------------------------------------------
/notebooks/data/TF_VAT_NACE_SQ_2019.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/TF_VAT_NACE_SQ_2019.zip
--------------------------------------------------------------------------------
/notebooks/data/daily_min_temperature_2020.csv:
--------------------------------------------------------------------------------
1 | datetime,air_temperature
2 | 2020-01-01,0.43
3 | 2020-01-02,2.44
4 | 2020-01-03,4.46
5 | 2020-01-04,1.56
6 | 2020-01-05,5.99
7 | 2020-01-06,2.2
8 | 2020-01-07,2.54
9 | 2020-01-08,8.54
10 | 2020-01-09,11.02
11 | 2020-01-10,5.74
12 | 2020-01-11,3.39
13 | 2020-01-12,5.49
14 | 2020-01-13,5.67
15 | 2020-01-14,9.77
16 | 2020-01-15,2.97
17 | 2020-01-16,0.48
18 | 2020-01-17,4.51
19 | 2020-01-18,-0.65
20 | 2020-01-19,-0.98
21 | 2020-01-20,-1.61
22 | 2020-01-21,-1.49
23 | 2020-01-22,-0.7
24 | 2020-01-23,2.49
25 | 2020-01-24,0.74
26 | 2020-01-25,0.48
27 | 2020-01-26,1.75
28 | 2020-01-27,6.18
29 | 2020-01-28,3.62
30 | 2020-01-29,1.96
31 | 2020-01-30,3.85
32 | 2020-01-31,10.38
33 | 2020-02-01,7.87
34 | 2020-02-02,7.12
35 | 2020-02-03,6.36
36 | 2020-02-04,2.12
37 | 2020-02-05,-0.12
38 | 2020-02-06,-1.02
39 | 2020-02-07,-2.5
40 | 2020-02-08,6.44
41 | 2020-02-09,6.86
42 | 2020-02-10,4.01
43 | 2020-02-11,4.76
44 | 2020-02-12,3.05
45 | 2020-02-13,2.46
46 | 2020-02-14,6.68
47 | 2020-02-15,7.93
48 | 2020-02-16,8.12
49 | 2020-02-17,6.78
50 | 2020-02-18,5.1
51 | 2020-02-19,4.32
52 | 2020-02-20,4.44
53 | 2020-02-21,2.33
54 | 2020-02-22,6.0
55 | 2020-02-23,4.43
56 | 2020-02-24,4.08
57 | 2020-02-25,3.34
58 | 2020-02-26,1.42
59 | 2020-02-27,1.78
60 | 2020-02-28,-0.51
61 | 2020-02-29,4.71
62 | 2020-03-01,4.94
63 | 2020-03-02,2.84
64 | 2020-03-03,1.44
65 | 2020-03-04,2.14
66 | 2020-03-05,5.9
67 | 2020-03-06,5.0
68 | 2020-03-07,0.05
69 | 2020-03-08,6.96
70 | 2020-03-09,5.58
71 | 2020-03-10,7.0
72 | 2020-03-11,11.95
73 | 2020-03-12,6.18
74 | 2020-03-13,4.11
75 | 2020-03-14,3.5
76 | 2020-03-15,7.3
77 | 2020-03-16,3.64
78 | 2020-03-17,0.79
79 | 2020-03-18,2.43
80 | 2020-03-19,2.6
81 | 2020-03-20,5.07
82 | 2020-03-21,4.04
83 | 2020-03-22,1.4
84 | 2020-03-23,-0.63
85 | 2020-03-24,-3.49
86 | 2020-03-25,-3.38
87 | 2020-03-26,0.77
88 | 2020-03-27,0.95
89 | 2020-03-28,3.03
90 | 2020-03-29,1.34
91 | 2020-03-30,-2.33
92 | 2020-03-31,0.91
93 | 2020-04-01,-2.17
94 | 2020-04-02,-1.53
95 | 2020-04-03,2.86
96 | 2020-04-04,1.38
97 | 2020-04-05,1.49
98 | 2020-04-06,5.41
99 | 2020-04-07,4.11
100 | 2020-04-08,6.19
101 | 2020-04-09,6.48
102 | 2020-04-10,7.9
103 | 2020-04-11,4.95
104 | 2020-04-12,4.38
105 | 2020-04-13,5.02
106 | 2020-04-14,1.26
107 | 2020-04-15,0.05
108 | 2020-04-16,1.73
109 | 2020-04-17,8.49
110 | 2020-04-18,8.02
111 | 2020-04-19,6.89
112 | 2020-04-20,7.53
113 | 2020-04-21,8.36
114 | 2020-04-22,8.81
115 | 2020-04-23,4.71
116 | 2020-04-24,5.13
117 | 2020-04-25,4.59
118 | 2020-04-26,2.59
119 | 2020-04-27,4.48
120 | 2020-04-28,9.3
121 | 2020-04-29,9.8
122 | 2020-04-30,7.45
123 | 2020-05-01,7.71
124 | 2020-05-02,4.59
125 | 2020-05-03,2.95
126 | 2020-05-04,8.91
127 | 2020-05-05,6.41
128 | 2020-05-06,3.76
129 | 2020-05-07,3.86
130 | 2020-05-08,6.58
131 | 2020-05-09,8.23
132 | 2020-05-10,8.15
133 | 2020-05-11,3.64
134 | 2020-05-12,1.0
135 | 2020-05-13,2.34
136 | 2020-05-14,2.67
137 | 2020-05-15,3.77
138 | 2020-05-16,2.35
139 | 2020-05-17,2.68
140 | 2020-05-18,6.7
141 | 2020-05-19,6.27
142 | 2020-05-20,9.09
143 | 2020-05-21,10.09
144 | 2020-05-22,13.38
145 | 2020-05-23,10.85
146 | 2020-05-24,8.93
147 | 2020-05-25,9.62
148 | 2020-05-26,7.48
149 | 2020-05-27,9.04
150 | 2020-05-28,9.74
151 | 2020-05-29,8.09
152 | 2020-05-30,7.58
153 | 2020-05-31,8.77
154 | 2020-06-01,8.55
155 | 2020-06-02,9.55
156 | 2020-06-03,9.74
157 | 2020-06-04,10.83
158 | 2020-06-05,7.53
159 | 2020-06-06,6.64
160 | 2020-06-07,9.69
161 | 2020-06-08,11.91
162 | 2020-06-09,8.93
163 | 2020-06-10,8.89
164 | 2020-06-11,12.26
165 | 2020-06-12,10.62
166 | 2020-06-13,12.58
167 | 2020-06-14,12.6
168 | 2020-06-15,12.54
169 | 2020-06-16,12.24
170 | 2020-06-17,12.74
171 | 2020-06-18,12.68
172 | 2020-06-19,10.87
173 | 2020-06-20,9.03
174 | 2020-06-21,10.86
175 | 2020-06-22,10.47
176 | 2020-06-23,10.46
177 | 2020-06-24,13.11
178 | 2020-06-25,12.98
179 | 2020-06-26,15.35
180 | 2020-06-27,16.32
181 | 2020-06-28,10.37
182 | 2020-06-29,10.3
183 | 2020-06-30,11.56
184 | 2020-07-01,12.53
185 | 2020-07-02,12.19
186 | 2020-07-03,12.1
187 | 2020-07-04,14.35
188 | 2020-07-05,13.28
189 | 2020-07-06,12.81
190 | 2020-07-07,12.45
191 | 2020-07-08,13.38
192 | 2020-07-09,16.62
193 | 2020-07-10,10.35
194 | 2020-07-11,7.88
195 | 2020-07-12,8.1
196 | 2020-07-13,9.01
197 | 2020-07-14,12.68
198 | 2020-07-15,13.44
199 | 2020-07-16,14.88
200 | 2020-07-17,14.68
201 | 2020-07-18,12.76
202 | 2020-07-19,10.75
203 | 2020-07-20,10.45
204 | 2020-07-21,7.13
205 | 2020-07-22,7.48
206 | 2020-07-23,8.1
207 | 2020-07-24,13.23
208 | 2020-07-25,13.17
209 | 2020-07-26,12.95
210 | 2020-07-27,12.72
211 | 2020-07-28,12.23
212 | 2020-07-29,9.63
213 | 2020-07-30,10.01
214 | 2020-07-31,12.45
215 | 2020-08-01,14.11
216 | 2020-08-02,11.21
217 | 2020-08-03,10.46
218 | 2020-08-04,9.32
219 | 2020-08-05,10.22
220 | 2020-08-06,11.78
221 | 2020-08-07,14.0
222 | 2020-08-08,15.37
223 | 2020-08-09,18.68
224 | 2020-08-10,17.67
225 | 2020-08-11,18.89
226 | 2020-08-12,18.24
227 | 2020-08-13,19.24
228 | 2020-08-14,17.63
229 | 2020-08-15,17.13
230 | 2020-08-16,16.72
231 | 2020-08-17,14.84
232 | 2020-08-18,13.13
233 | 2020-08-19,13.8
234 | 2020-08-20,18.6
235 | 2020-08-21,15.48
236 | 2020-08-22,14.58
237 | 2020-08-23,13.97
238 | 2020-08-24,13.59
239 | 2020-08-25,12.52
240 | 2020-08-26,12.75
241 | 2020-08-27,10.48
242 | 2020-08-28,11.24
243 | 2020-08-29,10.88
244 | 2020-08-30,11.9
245 | 2020-08-31,10.64
246 | 2020-09-01,9.97
247 | 2020-09-02,8.09
248 | 2020-09-03,11.29
249 | 2020-09-04,15.12
250 | 2020-09-05,9.97
251 | 2020-09-06,7.23
252 | 2020-09-07,6.5
253 | 2020-09-08,12.13
254 | 2020-09-09,16.28
255 | 2020-09-10,11.05
256 | 2020-09-11,8.01
257 | 2020-09-12,7.76
258 | 2020-09-13,8.81
259 | 2020-09-14,9.65
260 | 2020-09-15,13.1
261 | 2020-09-16,13.19
262 | 2020-09-17,9.85
263 | 2020-09-18,7.81
264 | 2020-09-19,10.44
265 | 2020-09-20,8.93
266 | 2020-09-21,5.91
267 | 2020-09-22,8.78
268 | 2020-09-23,9.33
269 | 2020-09-24,10.49
270 | 2020-09-25,7.6
271 | 2020-09-26,9.18
272 | 2020-09-27,12.46
273 | 2020-09-28,12.34
274 | 2020-09-29,9.44
275 | 2020-09-30,12.81
276 | 2020-10-01,7.77
277 | 2020-10-02,7.82
278 | 2020-10-03,8.4
279 | 2020-10-04,9.69
280 | 2020-10-05,10.25
281 | 2020-10-06,11.48
282 | 2020-10-07,8.65
283 | 2020-10-08,11.45
284 | 2020-10-09,8.85
285 | 2020-10-10,4.89
286 | 2020-10-11,5.34
287 | 2020-10-12,6.87
288 | 2020-10-13,7.39
289 | 2020-10-14,5.02
290 | 2020-10-15,6.51
291 | 2020-10-16,4.69
292 | 2020-10-17,3.22
293 | 2020-10-18,7.22
294 | 2020-10-19,6.04
295 | 2020-10-20,6.35
296 | 2020-10-21,12.66
297 | 2020-10-22,9.57
298 | 2020-10-23,8.81
299 | 2020-10-24,12.22
300 | 2020-10-25,8.5
301 | 2020-10-26,8.47
302 | 2020-10-27,7.15
303 | 2020-10-28,9.25
304 | 2020-10-29,7.81
305 | 2020-10-30,13.26
306 | 2020-10-31,9.79
307 | 2020-11-01,10.63
308 | 2020-11-02,9.54
309 | 2020-11-03,4.74
310 | 2020-11-04,0.88
311 | 2020-11-05,-0.47
312 | 2020-11-06,-0.63
313 | 2020-11-07,1.27
314 | 2020-11-08,4.14
315 | 2020-11-09,5.77
316 | 2020-11-10,6.68
317 | 2020-11-11,7.35
318 | 2020-11-12,4.86
319 | 2020-11-13,4.14
320 | 2020-11-14,10.41
321 | 2020-11-15,9.3
322 | 2020-11-16,8.01
323 | 2020-11-17,10.97
324 | 2020-11-18,8.56
325 | 2020-11-19,4.32
326 | 2020-11-20,0.63
327 | 2020-11-21,4.6
328 | 2020-11-22,8.24
329 | 2020-11-23,4.49
330 | 2020-11-24,4.25
331 | 2020-11-25,3.99
332 | 2020-11-26,5.37
333 | 2020-11-27,6.85
334 | 2020-11-28,1.85
335 | 2020-11-29,1.39
336 | 2020-11-30,1.22
337 | 2020-12-01,7.01
338 | 2020-12-02,6.25
339 | 2020-12-03,5.9
340 | 2020-12-04,-0.15
341 | 2020-12-05,-0.11
342 | 2020-12-06,-1.7
343 | 2020-12-07,0.78
344 | 2020-12-08,-0.68
345 | 2020-12-09,0.45
346 | 2020-12-10,0.71
347 | 2020-12-11,1.66
348 | 2020-12-12,7.8
349 | 2020-12-13,4.4
350 | 2020-12-14,8.25
351 | 2020-12-15,4.28
352 | 2020-12-16,2.56
353 | 2020-12-17,5.21
354 | 2020-12-18,5.79
355 | 2020-12-19,6.59
356 | 2020-12-20,4.84
357 | 2020-12-21,3.56
358 | 2020-12-22,10.91
359 | 2020-12-23,9.62
360 | 2020-12-24,3.46
361 | 2020-12-25,-0.13
362 | 2020-12-26,0.99
363 | 2020-12-27,3.94
364 | 2020-12-28,-0.49
365 | 2020-12-29,-0.39
366 | 2020-12-30,-0.94
367 | 2020-12-31,3.42
368 |
--------------------------------------------------------------------------------
/notebooks/data/data-preprocessing.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.13
7 | jupytext_version: 1.13.6
8 | kernelspec:
9 | display_name: Python 3 (ipykernel)
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ## Simplified Statistical Sectors
15 |
16 | https://statbel.fgov.be/nl/open-data/statistische-sectoren-2019
17 |
18 | ```{code-cell} ipython3
19 | import geopandas
20 | ```
21 |
22 | ```{code-cell} ipython3
23 | df = geopandas.read_file("/home/joris/Downloads/sh_statbel_statistical_sectors_20190101.shp.zip")
24 | ```
25 |
26 | ```{code-cell} ipython3
27 | df = df.dissolve("CNIS5_2019").reset_index()
28 | ```
29 |
30 | ```{code-cell} ipython3
31 | import topojson as tp
32 | topo = tp.Topology(df, prequantize=True)
33 | res = topo.toposimplify(1000).to_gdf()
34 | ```
35 |
36 | ```{code-cell} ipython3
37 | res.plot()
38 | ```
39 |
40 | ```{code-cell} ipython3
41 | res.crs = df.crs
42 | ```
43 |
44 | ```{code-cell} ipython3
45 | res[["CNIS5_2019", "T_MUN_NL", "geometry"]].to_file("statbel_statistical_sectors_2019.shp")
46 | ```
47 |
48 | ```{code-cell} ipython3
49 |
50 | ```
51 |
--------------------------------------------------------------------------------
/notebooks/data/fietstelpaal-coupure-links-2022-gent.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/fietstelpaal-coupure-links-2022-gent.zip
--------------------------------------------------------------------------------
/notebooks/data/fietstelpaal-coupure-links-2023-gent.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/fietstelpaal-coupure-links-2023-gent.zip
--------------------------------------------------------------------------------
/notebooks/data/fietstelpaal-coupure-links-gent.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/fietstelpaal-coupure-links-gent.zip
--------------------------------------------------------------------------------
/notebooks/data/load_casualties.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import urllib
3 | import logging
4 | from tempfile import gettempdir
5 | from pathlib import Path
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 |
11 | def clean_casualties_data(casualties_raw):
12 | """Convert raw casualties data to english and restructured format"""
13 | casualties = (
14 | casualties_raw
15 | .drop(columns=[col for col in casualties_raw.columns
16 | if col.endswith("_FR")])
17 | .drop(columns=[col for col in casualties_raw.columns
18 | if col.startswith("CD_") and not col.endswith("_REFNIS")])
19 | .rename(columns={name: name.removeprefix("TX_").removesuffix("_DESCR_NL")
20 | for name in casualties_raw.columns})
21 | .replace("Onbekend", None)
22 | )
23 | casualties["gender"] = casualties["SEX"].replace(
24 | {"Vrouwelijk": "female", "Mannelijk": "male"}
25 | )
26 |
27 | casualties["DT_HOUR"] = casualties["DT_HOUR"].replace(99, 0)
28 | casualties["datetime"] = pd.to_datetime(
29 | casualties["DT_DAY"] + " " + casualties["DT_HOUR"].astype(str) + ":00"
30 | )
31 |
32 | casualties["age"] = casualties["AGE_CLS"].str.replace(
33 | " tot ", " - ").str.removesuffix("jaar").str.strip()
34 | casualties["age"] = casualties["age"].replace(
35 | {"": None, "75 jaar en meer": ">75", ' ': None})
36 |
37 | casualties["DAY_OF_WEEK"] = casualties["DAY_OF_WEEK"].replace({
38 | "maandag": "Monday", "dinsdag": "Tuesday", "woensdag": "Wednesday",
39 | "donderdag": "Thursday", "vrijdag": "Friday", "zaterdag": "Saturday",
40 | "zondag": "Sunday"})
41 | casualties["week_day"] = pd.Categorical(
42 | casualties["DAY_OF_WEEK"],
43 | categories=["Monday", "Tuesday", "Wednesday",
44 | "Thursday", "Friday", "Saturday", "Sunday"],
45 | ordered=True
46 | )
47 |
48 | casualties["victim_type"] = casualties["VICT_TYPE"].replace({
49 | "Bestuurder": "Driver", "Bromfietser": "Moped driver",
50 | "Passagier": "Passenger", "Motorfietser": 'Motorcyclist',
51 | "Fietser": "Cyclist", "Voetganger": "Pedestrian",
52 | "Autres victimes": None})
53 |
54 | casualties["build_up_area"] = casualties["BUILD_UP_AREA"].replace({
55 | "Binnen bebouwde kom": "Inside built-up area",
56 | "Buiten bebouwde kom": "Outside built-up area",
57 | " ": None})
58 |
59 | casualties["ROAD_USR_TYPE"] = casualties["ROAD_USR_TYPE"].replace({
60 | 'Personenauto': 'Passenger car',
61 | 'Auto voor dubbel gebruik': 'Dual-purpose vehicle',
62 | 'Lichte vrachtauto': 'Light truck',
63 | 'Bromfiets': 'Moped',
64 | 'Bromfiets A (tweewielige)': 'Moped',
65 | 'Bromfiets B (tweewielige)': 'Moped',
66 | 'Bromfiets met 3 of 4 wielen': 'Moped',
67 | 'Motorfiets': 'Motorbike',
68 | 'Motorfiets meer dan 400 cc': 'Motorbike',
69 | 'Motorfiets niet meer dan 400 cc': 'Motorbike',
70 | 'Fiets': 'Bicycle',
71 | 'Elektrische fiets': 'Electric bicycle',
72 | 'Fiets met elektrische hulpmotor (<=250W en <=25km/u)': 'Electric bicycle',
73 | 'Gemotoriseerde fiets (<=1000W en <=25km/u)': 'Electric bicycle',
74 | 'Speed pedelec (<= 4000W en <=45km/u)': 'Speed pedelec',
75 | 'Gemotoriseerd voortbewegingstoestel (<=18km/u)': 'Electric bicycle',
76 | 'Trekker + aanhangwagen': 'Trailer',
77 | 'Trekker alleen': 'Trailer',
78 | 'Vrachtwagen': 'Truck',
79 | 'Ruiter': 'Horse rider',
80 | 'Bespannen voertuig': 'Horse rider',
81 | 'Andere voetganger': 'Pedestrian',
82 | 'Gehandicapte in rolstoel': 'Disabled person in a wheelchair',
83 | 'Voetganger die zijn (brom)fiets duwt': 'Pedestrian',
84 | 'Trolleybus, Tram': 'Tram',
85 | 'Minibus': 'Van',
86 | 'Autobus': 'Bus',
87 | 'Autocar': 'Bus',
88 | 'Autobus/Autocar': 'Bus',
89 | 'Kampeerwagen': 'Campervan',
90 | 'Landbouwtractor': 'Tractor',
91 | 'Andere weggebruiker': None,
92 | 'Niet ingevuld': None,
93 | np.nan: None
94 | })
95 |
96 | casualties["LIGHT_COND"] = casualties["LIGHT_COND"].replace(
97 | {'Bij klaarlichte dag': 'In broad daylight',
98 | 'Nacht, ontstoken openbare verlichting': 'Night, public lighting lit',
99 | 'Dageraad - schemering': 'Dawn',
100 | 'Nacht, openbare verlichting aanwezig, maar niet ontstoken': 'Night, no public lighting',
101 | 'Nacht, geen openbare verlichting': 'Night, no public lighting',
102 | ' ': None
103 | })
104 |
105 | casualties["ROAD_TYPE"] = casualties["ROAD_TYPE"].replace({
106 | 'Gemeenteweg': 'Municipal road',
107 | 'Gewestweg': 'Regional road',
108 | 'Autosnelweg': 'Motorway'
109 | })
110 |
111 | casualties["RGN"] = casualties["RGN"].replace({
112 | 'Vlaams Gewest': 'Flemish Region',
113 | 'Brussels Hoofdstedelijk Gewest': 'Brussels-Capital Region',
114 | 'Waals Gewest': 'Walloon Region'
115 | })
116 | casualties["CD_RGN_REFNIS"] = casualties["CD_RGN_REFNIS"].replace(
117 | {'02000': 2000, '03000': 3000, '04000': 4000, ' ': None}
118 | )
119 |
120 | casualties = casualties.replace(" ", None)
121 | casualties = casualties.rename(columns={
122 | "MS_VICT": "n_victims",
123 | "MS_VIC_OK": "n_victims_ok",
124 | "MS_SLY_INJ": "n_slightly_injured",
125 | "MS_SERLY_INJ": "n_seriously_injured",
126 | "MS_DEAD_30_DAYS": "n_dead_30days",
127 | "ROAD_USR_TYPE": "road_user_type",
128 | "LIGHT_COND": "light_conditions",
129 | "ROAD_TYPE": "road_type",
130 | "RGN": "region",
131 | "CD_RGN_REFNIS": "refnis_region",
132 | "CD_MUNTY_REFNIS": "refnis_municipality",
133 | "MUNTY": "municipality"
134 | })
135 | casualties_clean = casualties.drop(
136 | columns=[
137 | "DT_DAY", "DT_HOUR", "DAY_OF_WEEK", "SEX", "VICT_TYPE",
138 | "BUILD_UP_AREA", "AGE_CLS", "CD_PROV_REFNIS", "PROV",
139 | "CD_DSTR_REFNIS", "ADM_DSTR"]
140 | )
141 |
142 | return casualties_clean
143 |
144 |
145 | def main(start_year=2005, end_year=2020,
146 | processed_file_name="casualties.csv"):
147 | """Download casualties data, run cleaning function, concat and save as CSV
148 |
149 | Parameters
150 | ----------
151 | start_year : int, default 2005
152 | Start year to download data from.
153 | end_year : int, default 2020
154 | Last year to download data for.
155 | processed_file_name : str
156 | File name of the concatenated clean data set.
157 | """
158 | download_folder = Path(gettempdir()) / "casualties"
159 | download_folder.mkdir(exist_ok=True)
160 |
161 | logger.info("Start processing causalties Belgium open data from {start_year} till {end_year}.")
162 | casualties_all = []
163 | for year in range(start_year, end_year+1):
164 | logger.info(f"Handling year {year}")
165 | file_name = download_folder / f"TF_ACCIDENTS_VICTIMS_{year}_.zip"
166 | if not file_name.exists():
167 | logger.info(f"Download year {year}.")
168 | urllib.request.urlretrieve(
169 | f"https://statbel.fgov.be/sites/default/files/files/opendata/Verkeersslachtoffers/TF_ACCIDENTS_VICTIMS_{year}.zip",
170 | file_name)
171 | casualties = pd.read_csv(file_name, compression='zip',
172 | sep="|", low_memory=False)
173 | try:
174 | casualties_clean = clean_casualties_data(casualties)
175 | casualties_all.append(casualties_clean)
176 | except Exception:
177 | logger.error(f"Data processing of year {year} failed")
178 | logger.info("All casualties raw data set donwloads ready.")
179 |
180 | logger.info("Combining individual years to single DataFrame.")
181 | casualties_all = pd.concat(casualties_all).sort_values("datetime")
182 |
183 | if 'n_victims_ok' in casualties_all.columns:
184 | casualties = casualties_all[["datetime", "week_day",
185 | "n_victims", "n_victims_ok", "n_slightly_injured",
186 | "n_seriously_injured", "n_dead_30days",
187 | "road_user_type", "victim_type", "gender", "age",
188 | "road_type", "build_up_area", "light_conditions",
189 | "refnis_municipality", "municipality",
190 | "refnis_region", "region"
191 | ]]
192 | else:
193 | casualties = casualties_all[["datetime", "week_day",
194 | "n_victims", "n_slightly_injured",
195 | "n_seriously_injured", "n_dead_30days",
196 | "road_user_type", "victim_type", "gender", "age",
197 | "road_type", "build_up_area", "light_conditions",
198 | "refnis_municipality", "municipality",
199 | "refnis_region", "region"
200 | ]]
201 |
202 | logger.info("Writing combined casualties data file to disk.")
203 | casualties.to_csv(Path("./data") / processed_file_name, index=False)
204 |
205 | logger.info("Combined casualties data file ready.")
206 |
207 |
208 | if __name__ == "__main__":
209 |
210 | logger = logging.getLogger(__name__)
211 |
212 | parser = argparse.ArgumentParser(
213 | description='Collect and prepare casualties open data Belgium.'
214 | )
215 | parser.add_argument('start_year', metavar='start-year', type=int, default=2015,
216 | help='First year to download casualties data.')
217 | parser.add_argument('end_year', metavar='end-year', type=int, default=2020,
218 | help='Last year to download casualties data.')
219 |
220 | args = parser.parse_args()
221 |
222 | print("Start casualties data preparation...")
223 | main(args.start_year, args.end_year)
224 | print("...done!")
--------------------------------------------------------------------------------
/notebooks/data/plot_location.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/plot_location.xlsx
--------------------------------------------------------------------------------
/notebooks/data/species.csv:
--------------------------------------------------------------------------------
1 | species_id;genus;species;taxa
2 | AB;Amphispiza;bilineata;Bird
3 | AH;Ammospermophilus;harrisi;Rodent-not censused
4 | AS;Ammodramus;savannarum;Bird
5 | BA;Baiomys;taylori;Rodent
6 | CB;Campylorhynchus;brunneicapillus;Bird
7 | CM;Calamospiza;melanocorys;Bird
8 | CQ;Callipepla;squamata;Bird
9 | CS;Crotalus;scutalatus;Reptile
10 | CT;Cnemidophorus;tigris;Reptile
11 | CU;Cnemidophorus;uniparens;Reptile
12 | CV;Crotalus;viridis;Reptile
13 | DM;Dipodomys;merriami;Rodent
14 | DO;Dipodomys;ordii;Rodent
15 | DS;Dipodomys;spectabilis;Rodent
16 | DX;Dipodomys;sp.;Rodent
17 | EO;Eumeces;obsoletus;Reptile
18 | GS;Gambelia;silus;Reptile
19 | NE;Neotoma;albigula;Rodent
20 | NX;Neotoma;sp.;Rodent
21 | OL;Onychomys;leucogaster;Rodent
22 | OT;Onychomys;torridus;Rodent
23 | OX;Onychomys;sp.;Rodent
24 | PB;Chaetodipus;baileyi;Rodent
25 | PC;Pipilo;chlorurus;Bird
26 | PE;Peromyscus;eremicus;Rodent
27 | PF;Perognathus;flavus;Rodent
28 | PG;Pooecetes;gramineus;Bird
29 | PH;Perognathus;hispidus;Rodent
30 | PI;Chaetodipus;intermedius;Rodent
31 | PL;Peromyscus;leucopus;Rodent
32 | PM;Peromyscus;maniculatus;Rodent
33 | PP;Chaetodipus;penicillatus;Rodent
34 | PU;Pipilo;fuscus;Bird
35 | PX;Chaetodipus;sp.;Rodent
36 | RF;Reithrodontomys;fulvescens;Rodent
37 | RM;Reithrodontomys;megalotis;Rodent
38 | RO;Reithrodontomys;montanus;Rodent
39 | RX;Reithrodontomys;sp.;Rodent
40 | SA;Sylvilagus;audubonii;Rabbit
41 | SB;Spizella;breweri;Bird
42 | SC;Sceloporus;clarki;Reptile
43 | SF;Sigmodon;fulviventer;Rodent
44 | SH;Sigmodon;hispidus;Rodent
45 | SO;Sigmodon;ochrognathus;Rodent
46 | SS;Spermophilus;spilosoma;Rodent-not censused
47 | ST;Spermophilus;tereticaudus;Rodent-not censused
48 | SU;Sceloporus;undulatus;Reptile
49 | SX;Sigmodon;sp.;Rodent
50 | UL;Lizard;sp.;Reptile
51 | UP;Pipilo;sp.;Bird
52 | UR;Rodent;sp.;Rodent
53 | US;Sparrow;sp.;Bird
54 | XX;;;Zero Trapping Success
55 | ZL;Zonotrichia;leucophrys;Bird
56 | ZM;Zenaida;macroura;Bird
57 |
--------------------------------------------------------------------------------
/notebooks/data/species_names.csv:
--------------------------------------------------------------------------------
1 | class,kingdom,order,phylum,scientificName,ID,taxa
2 | Mammalia,Animalia,Rodentia,Chordata,"Dipodomys merriami Mearns, 1890",2439521,Rodent
3 | Mammalia,Animalia,Rodentia,Chordata,"Perognathus flavus Baird, 1855",2439566,Rodent
4 | Mammalia,Animalia,Rodentia,Chordata,"Peromyscus eremicus (Baird, 1857)",2437981,Rodent
5 | Mammalia,Animalia,Rodentia,Chordata,"Sigmodon hispidus Say & Ord, 1825",2438147,Rodent
6 | Mammalia,Animalia,Rodentia,Chordata,"Dipodomys spectabilis Merriam, 1890",2439531,Rodent
7 | Mammalia,Animalia,Rodentia,Chordata,"Chaetodipus penicillatus (Woodhouse, 1852)",2439591,Rodent
8 | Mammalia,Animalia,Rodentia,Chordata,"Onychomys torridus (Coues, 1874)",2438516,Rodent
9 | Mammalia,Animalia,Rodentia,Chordata,"Dipodomys ordii Woodhouse, 1853",2439541,Rodent
10 | Mammalia,Animalia,Rodentia,Chordata,"Spermophilus spilosoma Bennett, 1833",2437300,Rodent-not censused
11 | Mammalia,Animalia,Rodentia,Chordata,"Onychomys leucogaster (Wied-Neuwied, 1841)",2438517,Rodent
12 | Mammalia,Animalia,Rodentia,Chordata,"Reithrodontomys megalotis (Baird, 1857)",2437874,Rodent
13 | Mammalia,Animalia,Lagomorpha,Chordata,"Sylvilagus audubonii (Baird, 1858)",2436910,Rabbit
14 | Mammalia,Animalia,Rodentia,Chordata,"Peromyscus maniculatus (Wagner, 1845)",2437967,Rodent
15 | Mammalia,Animalia,Rodentia,Chordata,"Ammospermophilus harrisii (Audubon & Bachman, 1854)",2437568,Rodent-not censused
16 | Aves,Animalia,Passeriformes,Chordata,"Amphispiza bilineata (Cassin, 1850)",2491757,Bird
17 | Aves,Animalia,Passeriformes,Chordata,"Campylorhynchus brunneicapillus (Lafresnaye, 1835)",5231474,Bird
18 | Aves,Animalia,Passeriformes,Chordata,"Calamospiza melanocorys Stejneger, 1885",2491893,Bird
19 | Aves,Animalia,Galliformes,Chordata,"Callipepla squamata (Vigors, 1830)",5228075,Bird
20 | Mammalia,Animalia,Rodentia,Chordata,"Reithrodontomys fulvescens J.A.Allen, 1894",2437864,Rodent
21 | Aves,Animalia,Passeriformes,Chordata,"Pipilo chlorurus (Audubon, 1839)",2491276,Bird
22 | Aves,Animalia,Passeriformes,Chordata,"Pooecetes gramineus (J.F.Gmelin, 1789)",2491728,Bird
23 | Mammalia,Animalia,Rodentia,Chordata,"Perognathus hispidus Baird, 1858",2439584,Rodent
24 | Aves,Animalia,Passeriformes,Chordata,"Pipilo fuscus Swainson, 1827",2491244,Bird
25 | Reptilia,Animalia,Squamata,Chordata,"Crotalus viridis Rafinesque, 1818",8945077,Reptile
26 | Aves,Animalia,Passeriformes,Chordata,"Zonotrichia leucophrys (J.R.Forster, 1772)",5231132,Bird
27 | Reptilia,Animalia,Squamata,Chordata,"Sceloporus clarkii Baird & Girard, 1852",2451192,Reptile
28 | Mammalia,Animalia,Rodentia,Chordata,"Baiomys taylori (Thomas, 1887)",2438866,Rodent
29 | Mammalia,Animalia,Rodentia,Chordata,"Sigmodon fulviventer J.A.Allen, 1889",2438153,Rodent
30 | Mammalia,Animalia,Rodentia,Chordata,"Reithrodontomys montanus (Baird, 1855)",2437866,Rodent
31 | Aves,Animalia,Passeriformes,Chordata,"Ammodramus savannarum (J.F.Gmelin, 1789)",2491123,Bird
32 | Mammalia,Animalia,Rodentia,Chordata,"Sigmodon ochrognathus Bailey, 1902",2438156,Rodent
33 | Mammalia,Animalia,Rodentia,Chordata,"Chaetodipus intermedius (Merriam, 1889)",2439589,Rodent
34 | Mammalia,Animalia,Rodentia,Chordata,"Spermophilus tereticaudus Baird, 1858",2437325,Rodent-not censused
35 | Reptilia,Animalia,Squamata,Chordata,"Cnemidophorus uniparens Wright & Lowe, 1965",5227544,Reptile
36 | Reptilia,Animalia,Squamata,Chordata,"Sceloporus undulatus (Bosc & Daudin, 1801)",2451347,Reptile
37 | Mammalia,Animalia,Rodentia,Chordata,"Chaetodipus baileyi (Merriam, 1894)",2439581,Rodent
38 | Mammalia,Animalia,Rodentia,Chordata,"Peromyscus leucopus (Rafinesque, 1818)",2438019,Rodent
39 | Reptilia,Animalia,Squamata,Chordata,"Cnemidophorus tigris Grismer, 1999",8071886,Reptile
40 |
--------------------------------------------------------------------------------
/notebooks/data/statbel_statistical_sectors_2019.shp.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/statbel_statistical_sectors_2019.shp.zip
--------------------------------------------------------------------------------
/notebooks/data/verbruiksgegevens-per-maand.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/verbruiksgegevens-per-maand.xlsx
--------------------------------------------------------------------------------
/notebooks/pandas_07_missing_values.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8bd0774d",
6 | "metadata": {},
7 | "source": [
8 | "07 - Pandas: Working with missing data
\n",
9 | "\n",
10 | "\n",
11 | "> *© 2025, Joris Van den Bossche and Stijn Van Hoey (, ). Licensed under [CC BY 4.0 Creative Commons](http://creativecommons.org/licenses/by/4.0/)*\n",
12 | "\n",
13 | "---"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "id": "fad2705f",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "id": "6cf9e666",
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "df = pd.DataFrame({'A': [1, 2, np.nan],\n",
35 | " 'B': [4, np.nan, np.nan],\n",
36 | " 'C': [7, 8, 9]})\n",
37 | "df"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "id": "9204ffad",
43 | "metadata": {},
44 | "source": [
45 | "## Missing values in Pandas"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "20ebca57",
51 | "metadata": {},
52 | "source": [
53 | "For numerical data, the \"NaN\" (Not-A-Number) floating point value is used as missing value indicator:"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "id": "17a6454f",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "df.loc[2, 'A']"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "id": "35dc8450",
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "np.nan"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "id": "b116e307",
79 | "metadata": {},
80 | "source": [
81 | "\n",
82 | "\n",
83 | "**NOTE**: because NaN is a float value, it is currently not possible to have integer columns with missing values. Notice how the columns in the example above were casted to float dtype.\n",
84 | "\n",
85 | "
"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "id": "89150b7e",
91 | "metadata": {},
92 | "source": [
93 | "### Missing values are skipped by default in *reductions*"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "id": "1e2b48d5",
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "df['A'].mean()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "id": "96daf776",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "df['A'].mean(skipna=False)"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "id": "604e4841",
119 | "metadata": {},
120 | "source": [
121 | "### ... but propagated in *element-wise arithmetic*"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "92901db0",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "df['A'] + 3"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "id": "cf8a72a6",
137 | "metadata": {},
138 | "source": [
139 | "## Checking missing values"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "id": "5b50553a",
145 | "metadata": {},
146 | "source": [
147 | "Checking for a missing value cannot be done with an equality operation (`==`) because NaN is not equal to iself:"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "id": "61a4ebe9",
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "df['A'] == np.nan"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "1acc9e71",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "np.nan == np.nan"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "b4439546",
173 | "metadata": {},
174 | "source": [
175 | "Therefore, dedicated methods are available: `isna()` and `notna()`"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "id": "3c7d6670",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "df['A'].isna()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "4b95b7c2",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "df['A'].notna()"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "id": "683cccc8",
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "df['A'].isna().sum()"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "id": "c023dd7d",
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "df.isna().sum()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "id": "82b582da",
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "df"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "id": "a8488b86",
231 | "metadata": {},
232 | "source": [
233 | "## Dropping missing values"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "id": "e1440709",
239 | "metadata": {},
240 | "source": [
241 | "Dropping missing values can be done with `isna()`/`notna()` and boolean indexing (eg `df[df['A'].notna()]`), but pandas also provides some convenient helper functions for this:"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "id": "788d650e",
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "df.dropna()"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "id": "c694bb08",
257 | "metadata": {},
258 | "source": [
259 | "By default it drop rows if there is a NaN in any of the columns. To limit this to we subset of the columns, use the `subset` keyword:"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "id": "5bb3578c",
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "df.dropna(subset=['A', 'C'])"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "id": "00036b6f",
275 | "metadata": {},
276 | "source": [
277 | "## Filling missing values"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "id": "0e64082f",
283 | "metadata": {},
284 | "source": [
285 | "Filling missing values with a scalar:"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "id": "94f40e9a",
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "df.fillna(0)"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "id": "0a73ff4c",
301 | "metadata": {},
302 | "source": [
303 | "Further, more advanced filling techniques are available in the ``interpolate()`` method."
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "id": "7b57edf1",
309 | "metadata": {},
310 | "source": [
311 | "\n",
312 | "\n",
313 | "**REMEMBER**: \n",
314 | "\n",
315 | "* Missing value indicator: `np.nan` (`NaN`)\n",
316 | "* Reductions: skipped by default\n",
317 | "* Mathematical operations (eg `+`): propagate by default\n",
318 | "* Specific functions:\n",
319 | " * `isna()`, `notna()`\n",
320 | " * `dropna()`\n",
321 | " * `fillna()`, `interpolate()`\n",
322 | "\n",
323 | "
"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "id": "e1f5bf9a",
330 | "metadata": {},
331 | "outputs": [],
332 | "source": []
333 | }
334 | ],
335 | "metadata": {
336 | "jupytext": {
337 | "formats": "ipynb,md:myst"
338 | },
339 | "kernelspec": {
340 | "display_name": "Python 3 (ipykernel)",
341 | "language": "python",
342 | "name": "python3"
343 | },
344 | "language_info": {
345 | "codemirror_mode": {
346 | "name": "ipython",
347 | "version": 3
348 | },
349 | "file_extension": ".py",
350 | "mimetype": "text/x-python",
351 | "name": "python",
352 | "nbconvert_exporter": "python",
353 | "pygments_lexer": "ipython3",
354 | "version": "3.12.8"
355 | },
356 | "widgets": {
357 | "application/vnd.jupyter.widget-state+json": {
358 | "state": {},
359 | "version_major": 2,
360 | "version_minor": 0
361 | }
362 | }
363 | },
364 | "nbformat": 4,
365 | "nbformat_minor": 5
366 | }
367 |
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/00-jupyterlab1.py:
--------------------------------------------------------------------------------
1 | # Jupyter returns the output of the last calculation.
2 | 7 * 3
3 | 2 + 1
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/00-jupyterlab2.py:
--------------------------------------------------------------------------------
1 | x = 6 * 7 + 12
2 | print(x)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/01-variables1.py:
--------------------------------------------------------------------------------
1 | weight_kg = 65
2 | weight_g = weight_kg * 1000
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/01-variables2.py:
--------------------------------------------------------------------------------
1 | initial = "left"
2 | position = initial
3 | initial = "right"
4 | position
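5 | # position is still "left": the assignment copied the value, so reassigning initial afterwards has no effect on position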
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/01-variables3.py:
--------------------------------------------------------------------------------
1 | pressure, weight = 1010, 60.5
2 | print(weight) # prints 60.5
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/01-variables4.py:
--------------------------------------------------------------------------------
1 | # Variables must be created before they are used.
2 | # print(pressure_p)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/01-variables5.py:
--------------------------------------------------------------------------------
1 | type(21.55)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/01-variables6.py:
--------------------------------------------------------------------------------
1 | type(3.25 + 4)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/01-variables7.py:
--------------------------------------------------------------------------------
1 | first = 1.0
2 | second = "1"
3 | third = "1.1"
4 | print(first + float(second)) # 2.0
5 | print(first + int(float(third))) # 2.0
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/01-variables8.py:
--------------------------------------------------------------------------------
1 | int(float("3.0"))
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/02-functions-use1.py:
--------------------------------------------------------------------------------
1 | math.floor(1.7)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/02-functions-use2.py:
--------------------------------------------------------------------------------
1 | experiment_label = "Lab1_C_2"
2 | experiment_label.endswith("2")
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/02-functions-use3.py:
--------------------------------------------------------------------------------
1 | import random
2 | #help(random)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/02-functions-use4.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | random.randint(1, 6)
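4 | # randint includes both endpoints, so this draws a whole number from 1 to 6 (one die roll)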
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/02-functions-use5.py:
--------------------------------------------------------------------------------
1 | # alternative using randrange
2 | random.randrange(1, 7)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/02-functions-use6.py:
--------------------------------------------------------------------------------
1 | print("""
2 | Order of operations:
3 | - 1.1 * radiance = 1.1
4 | - 1.1 - 0.5 = 0.6
5 | - min(radiance, 0.6) = 0.6
6 | - 2.0 + 0.6 = 2.6
7 | - max(2.1, 2.6) = 2.6
8 |
9 | At the end, result = 2.6
10 | """)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/02-functions-use7.py:
--------------------------------------------------------------------------------
1 | pressure_hPa = 1010
2 | height = 2500
3 |
4 | pressure_hPa * math.exp(-gravit_acc * molar_mass_earth * height/(gas_constant * standard_temperature))
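5 |
6 | # assumes the physical constants defined earlier in the notebook, e.g.
7 | # gravit_acc = 9.81, molar_mass_earth = 0.02896, gas_constant = 8.3144598,
8 | # standard_temperature = 288.15 (the same values as in the barometric_formula solution)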
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers1.py:
--------------------------------------------------------------------------------
1 | pressures_hPa[2] = 1111
2 | pressures_hPa
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers2.py:
--------------------------------------------------------------------------------
1 | pressures_hPa.insert(4, 1212)
2 | pressures_hPa
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers3.py:
--------------------------------------------------------------------------------
1 | pressures_hPa[-3:]
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers4.py:
--------------------------------------------------------------------------------
1 | pressures_hPa[1::2]
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers5.py:
--------------------------------------------------------------------------------
1 | # Returns a sorted copy of the list (the original list `pressures_hPa` remains unchanged)
2 | print(sorted(pressures_hPa))
3 | # The list method `sort` sorts the list in-place and returns None
4 | print(pressures_hPa.sort())
5 | print(pressures_hPa)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers6.py:
--------------------------------------------------------------------------------
1 | a_third_list = ['red', 'blue', 'green', 'black', 'white']
2 | a_third_list_reversed = a_third_list.copy()
3 | a_third_list_reversed.reverse()
4 | a_concatenated_list = a_third_list + a_third_list_reversed
5 | a_concatenated_list
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers7.py:
--------------------------------------------------------------------------------
1 | my_spell = "abracadabra"
2 | my_spell.upper()
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers8.py:
--------------------------------------------------------------------------------
1 | my_spell = "abracadabra"
2 | my_spell[1::2]
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/03-containers9.py:
--------------------------------------------------------------------------------
1 | report_location = "Nete"
2 | f"The measured dissolved oxygen in {report_location} on March 18th 2024 was {water_quality[report_location]} mg/l."
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/04-control-flow1.py:
--------------------------------------------------------------------------------
1 | # using the 'accumulator pattern' to check the number of counts
2 | acc = 0
3 | for letter in 'oxygen':
4 | acc += 1 # in-place addition: count one for each letter
5 | print(acc)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/04-control-flow2.py:
--------------------------------------------------------------------------------
1 | a = 0.43
2 | r = 1.35
3 | for conductivity in conductivities:
4 | print(a + conductivity * r)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/04-control-flow3.py:
--------------------------------------------------------------------------------
1 | indices = []
2 | for j, pressure in enumerate(pressures_hPa):
3 | if pressure < 1000:
4 | indices.append(j)
5 | indices
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/04-control-flow4.py:
--------------------------------------------------------------------------------
1 | for location, do in water_quality.items():
2 | if (do > 20) or (do < 5):
3 | print(f"Alert: Poor conditions measured at {location} with DO concentration of {do} mg/l.")
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/04-control-flow5.py:
--------------------------------------------------------------------------------
1 | for file_name in file_names:
2 | if file_name.startswith("sigma"):
3 | print(f"Processing file {file_name} with sigma pipeline.")
4 | elif file_name.startswith("ava"):
5 | print(f"Processing file {file_name} with avalanche pipeline.")
6 | else:
7 | print(f"Unrecognized file {file_name}")
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/05-functions-write1.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | def barometric_formula(pressure_sea_level, height=2500):
4 | """Apply barometric formula
5 |
6 | Apply the barometric formula to calculate the air pressure on a given height
7 |
8 | Parameters
9 | ----------
10 | pressure_sea_level : float
11 | pressure, measured as sea level (hPa)
12 | height : float
13 | height above sea level (m)
14 |
15 | Notes
16 | ------
17 | see https://www.math24.net/barometric-formula/ or
18 | https://en.wikipedia.org/wiki/Atmospheric_pressure
19 | """
20 | standard_temperature = 288.15
21 | gas_constant = 8.3144598
22 | gravit_acc = 9.81
23 | molar_mass_earth = 0.02896
24 |
25 | pressure_altitude = pressure_sea_level * math.exp(-gravit_acc * molar_mass_earth* height/(gas_constant*standard_temperature))
26 | return pressure_altitude
27 |
28 | barometric_formula(1010), barometric_formula(1010, 2750)
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/05-functions-write2.py:
--------------------------------------------------------------------------------
1 | pressures_hPa_1200 = [barometric_formula(pressure, 1200) for pressure in pressures_hPa]
2 | pressures_hPa_1200
--------------------------------------------------------------------------------
/notebooks/python_intro/_solutions/05-functions-write3.py:
--------------------------------------------------------------------------------
1 | pressures_hPa_1200 = []
2 | for pressure in pressures_hPa:
3 |     pressures_hPa_1200.append(barometric_formula(pressure, 1200))
4 | pressures_hPa_1200
--------------------------------------------------------------------------------
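The explicit loop above and the comprehension in 05-functions-write2.py build the same list; assuming pressures_hPa and barometric_formula are defined as earlier, the equivalence can be checked directly:

    assert pressures_hPa_1200 == [barometric_formula(p, 1200) for p in pressures_hPa]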
/notebooks/python_recap/_solutions/01-basic24.py:
--------------------------------------------------------------------------------
1 | a_third_list.count?
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/01-basic25.py:
--------------------------------------------------------------------------------
1 | a_third_list.index?
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/01-basic28.py:
--------------------------------------------------------------------------------
1 | a_third_list[::-1]
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/01-basic47.py:
--------------------------------------------------------------------------------
1 | [el for el in dir(list) if not el.startswith('_')]
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/01-basic49.py:
--------------------------------------------------------------------------------
1 | # split into words and get the length of each word
2 | [len(word) for word in sentence.split()]
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/01-basic58.py:
--------------------------------------------------------------------------------
1 | str_key = []
2 | for key in hourly_wage.keys():
3 |     str_key.append(str(key))
4 | str_key
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/02-control_flow15.py:
--------------------------------------------------------------------------------
1 | # print the name of the company matching a given value between 1 and 5
2 | for k in dd:
3 |     if dd[k] == value:
4 |         print(k.upper())
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/02-control_flow16.py:
--------------------------------------------------------------------------------
1 | if 'antea' in dd.keys():
2 |     print('already in dictionary')
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/02-control_flow24.py:
--------------------------------------------------------------------------------
1 | sentence = "hello world! 123"
2 | d = {"DIGITS": 0, "LETTERS": 0}
3 | for char in sentence:
4 |     if char.isdigit():
5 |         d["DIGITS"] += 1
6 |     elif char.isalpha():
7 |         d["LETTERS"] += 1
8 |     else:
9 |         pass
10 | print("LETTERS", d["LETTERS"])
11 | print("DIGITS", d["DIGITS"])
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/03-functions19.py:
--------------------------------------------------------------------------------
1 | def check_for_key(checkdict, key):
2 |     """
3 |     Check the presence of key in dictionary checkdict and raise an
4 |     exception if the key is already used in the dictionary
5 |
6 |     """
7 |     if key in checkdict.keys():
8 |         raise Exception('Key already used in this dictionary')
--------------------------------------------------------------------------------
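A hedged usage sketch of check_for_key; the wages dictionary is invented for illustration:

    wages = {"bob": 25.0}
    try:
        check_for_key(wages, "bob")
    except Exception as err:
        print(err)  # Key already used in this dictionary
    check_for_key(wages, "alice")  # passes silently, "alice" is not yet a key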
/notebooks/python_recap/_solutions/03-functions27.py:
--------------------------------------------------------------------------------
1 | class Employee:
2 |
3 |     def __init__(self, name, wage=60.):
4 |         """
5 |         Employee class to store the hours worked per project and the related earnings
6 |         """
7 |         self.name = name
8 |         self.wage = wage
9 |         self.projects = {}
10 |
11 |     def new_project(self, projectname):
12 |         """Register a new project with zero hours worked
13 |         """
14 |         if projectname in self.projects:
15 |             raise Exception(f"project already exists for {self.name}")
16 |         else:
17 |             self.projects[projectname] = 0.
18 |
19 |
20 |     def worked(self, hours, projectname):
21 |         """Add worked hours on a project
22 |         """
23 |         try:
24 |             hours = float(hours)
25 |         except (TypeError, ValueError):
26 |             raise Exception("Hours not convertible to float!")
27 |
28 |         if projectname not in self.projects:
29 |             raise Exception(f"project does not exist for {self.name}")
30 |
31 |         self.projects[projectname] += hours
32 |
33 |     def calc_earnings(self):
34 |         """
35 |         Calculate the total earnings over all projects
36 |         """
37 |         total_hours = 0
38 |         for val in self.projects.values():
39 |             total_hours += val
40 |
41 |         return total_hours * self.wage
42 |
43 |     def info(self):
44 |         """
45 |         Print the hours worked per project
46 |         """
47 |         for proj, hour in self.projects.items():
48 |             print(hour, 'worked on project', proj)
--------------------------------------------------------------------------------
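A short usage sketch of the Employee class above; the name, project, and numbers are hypothetical:

    bert = Employee("bert", wage=50.)
    bert.new_project("vmm")
    bert.worked(10, "vmm")
    bert.info()                  # 10.0 worked on project vmm
    print(bert.calc_earnings())  # 500.0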
/notebooks/python_recap/_solutions/05-numpy109.py:
--------------------------------------------------------------------------------
1 | # rescale Z to the range [0, 1] (min-max normalization)
2 | (Z - Z.min()) / (Z.max() - Z.min())
--------------------------------------------------------------------------------
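A self-contained sketch of the min-max rescaling above; Z here is a made-up array:

    import numpy as np

    Z = np.array([2., 4., 6., 10.])
    print((Z - Z.min()) / (Z.max() - Z.min()))  # [0.   0.25 0.5  1.  ]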
/notebooks/python_recap/_solutions/05-numpy137.py:
--------------------------------------------------------------------------------
1 | x, y = b_data[:,3], b_data[:,4]
2 | t = np.polyfit(x, y, 4)  # fit a 4th degree polynomial; t holds the coefficients, highest degree first
3 | t
4 | x.sort()
5 | plt.plot(x, y, 'o')
6 | plt.plot(x, t[0]*x**4 + t[1]*x**3 + t[2]*x**2 + t[3]*x + t[4], '-')
--------------------------------------------------------------------------------
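Rather than spelling out the polynomial by hand as on line 6 above, numpy's np.polyval evaluates the fitted coefficients (highest degree first, as returned by np.polyfit); a sketch under the same variables:

    plt.plot(x, np.polyval(t, x), '-')  # same curve as the manual t[0]*x**4 + ... expression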
/notebooks/python_recap/_solutions/05-numpy34.py:
--------------------------------------------------------------------------------
1 | np.arange(10, 50, 1)
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy35.py:
--------------------------------------------------------------------------------
1 | np.identity(3)
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy36.py:
--------------------------------------------------------------------------------
1 | np.eye(3)
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy37.py:
--------------------------------------------------------------------------------
1 | np.random.random((3, 3, 3))
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy58.py:
--------------------------------------------------------------------------------
1 | vec = np.zeros(10)
2 | vec[4] = 1.
3 | vec
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy73.py:
--------------------------------------------------------------------------------
1 | # swap the first two rows of A using fancy indexing
2 | A[[0, 1]] = A[[1, 0]]
3 | A
--------------------------------------------------------------------------------
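A minimal demonstration of the fancy-indexing row swap; A here is a made-up 3x3 array:

    import numpy as np

    A = np.arange(9).reshape(3, 3)
    A[[0, 1]] = A[[1, 0]]  # the right-hand side is a copy, so this swaps in one assignment
    print(A)  # rows are now [3 4 5], [0 1 2], [6 7 8]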
/notebooks/python_recap/_solutions/05-numpy75.py:
--------------------------------------------------------------------------------
1 | AR[AR % 2 == 0] = 0.
2 | AR
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy77.py:
--------------------------------------------------------------------------------
1 | AR[1::2] = 0
2 | AR
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal1.py:
--------------------------------------------------------------------------------
1 | height = 2500
2 | pressure_hPa * math.exp(-gravit_acc * molar_mass_earth * height / (gas_constant * standard_temperature))
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal10.py:
--------------------------------------------------------------------------------
1 | np.sqrt(AR2[AR2 > np.percentile(AR2, 75)])
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal11.py:
--------------------------------------------------------------------------------
1 | AR3[np.isclose(AR3, -99)] = np.nan
2 | AR3
--------------------------------------------------------------------------------
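np.isclose is used above instead of AR3 == -99 because a float sentinel may not be stored exactly; a sketch with invented data:

    import numpy as np

    AR3 = np.array([1.2, -99.0, 3.4, -98.99999999])
    AR3[np.isclose(AR3, -99)] = np.nan
    print(AR3)  # [1.2 nan 3.4 nan]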
/notebooks/python_recap/_solutions/python_rehearsal12.py:
--------------------------------------------------------------------------------
1 | [location.lower() for location in locations]
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal13.py:
--------------------------------------------------------------------------------
1 | [location.lower() for location in locations]
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal2.py:
--------------------------------------------------------------------------------
1 | def barometric_formula(pressure_sea_level, height=2500):
2 |     """Apply barometric formula
3 |
4 |     Apply the barometric formula to calculate the air pressure at a given height
5 |
6 |     Parameters
7 |     ----------
8 |     pressure_sea_level : float
9 |         pressure, measured at sea level (hPa)
10 |     height : float
11 |         height above sea level (m)
12 |
13 |     Notes
14 |     -----
15 |     see https://www.math24.net/barometric-formula/ or
16 |     https://en.wikipedia.org/wiki/Atmospheric_pressure
17 |     """
18 |     standard_temperature = 288.15
19 |     gas_constant = 8.3144598
20 |     gravit_acc = 9.81
21 |     molar_mass_earth = 0.02896
22 |
23 |     pressure_altitude = pressure_sea_level * math.exp(-gravit_acc * molar_mass_earth * height / (gas_constant * standard_temperature))
24 |     return pressure_altitude
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal3.py:
--------------------------------------------------------------------------------
1 | def barometric_formula(pressure_sea_level, height=2500):
2 |     """Apply barometric formula
3 |
4 |     Apply the barometric formula to calculate the air pressure at a given height
5 |
6 |     Parameters
7 |     ----------
8 |     pressure_sea_level : float
9 |         pressure, measured at sea level (hPa)
10 |     height : float
11 |         height above sea level (m)
12 |
13 |     Notes
14 |     -----
15 |     see https://www.math24.net/barometric-formula/ or
16 |     https://en.wikipedia.org/wiki/Atmospheric_pressure
17 |     """
18 |     if height > 11000:
19 |         raise Exception("Barometric formula only valid for heights lower than 11000m above sea level")
20 |
21 |     standard_temperature = 288.15
22 |     gas_constant = 8.3144598
23 |     gravit_acc = 9.81
24 |     molar_mass_earth = 0.02896
25 |
26 |     pressure_altitude = pressure_sea_level * math.exp(-gravit_acc * molar_mass_earth * height / (gas_constant * standard_temperature))
27 |     return pressure_altitude
--------------------------------------------------------------------------------
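A quick check of the guard clause added above; the values are illustrative:

    try:
        barometric_formula(1013, height=12000)
    except Exception as err:
        print(err)  # Barometric formula only valid for heights lower than 11000m above sea level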
/notebooks/python_recap/_solutions/python_rehearsal4.py:
--------------------------------------------------------------------------------
1 | for pressure in pressures_hPa:
2 |     print(barometric_formula(pressure, 3000))
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal5.py:
--------------------------------------------------------------------------------
1 | pressures_hPa_adjusted = [barometric_formula(pressure, 3000) for pressure in pressures_hPa]
2 | pressures_hPa_adjusted
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal6.py:
--------------------------------------------------------------------------------
1 | np_pressures_hPa * math.exp(-gravit_acc * molar_mass_earth * height / (gas_constant * standard_temperature))
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal7.py:
--------------------------------------------------------------------------------
1 | sum(AR > 10)
--------------------------------------------------------------------------------
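sum(AR > 10) works because the comparison yields a boolean array in which True counts as 1; a self-contained sketch:

    import numpy as np

    AR = np.array([4, 12, 25, 7])
    print(sum(AR > 10))     # 2
    print((AR > 10).sum())  # same count, staying within numpy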
/notebooks/python_recap/_solutions/python_rehearsal8.py:
--------------------------------------------------------------------------------
1 | AR[AR % 2 == 0] = 0
2 | AR
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal9.py:
--------------------------------------------------------------------------------
1 | AR[1::2] = 30
2 | AR
--------------------------------------------------------------------------------
/notebooks/python_recap/data/bogota_part_dataset.csv:
--------------------------------------------------------------------------------
1 | DIA,SST AM,SSV AM,SSV PM,SSF PM
2 | Unidad,mg/l,mg/l,mg/l,mg/l
3 | ,,,,
4 | 1,198,141,131,38
5 | 2,274,200,125,35
6 | 3,156,119,274,120
7 | 4,382,266,272,105
8 | 5,494,342,202,76
9 | 6,259,182,205,67
10 | 7,247,185,232,77
11 | 8,164,125,112,33
12 | 9,367,265,82,30
13 | 10,123,90,91,26
14 | 11,132,96,130,46
15 | 12,97,66,110,33
16 | 13,160,104,181,83
17 | 14,137,100,122,41
18 | 15,172,123,151,56
19 | 16,192,138,168,78
20 | 17,176,106,94,36
21 | 18,192,132,111,43
22 | 19,152,99,112,37
23 | 20,255,179,181,67
24 | 21,188,134,220,94
25 | 22,215,153,149,58
26 | 23,221,157,147,60
27 | 24,284,199,201,93
28 | 25,134,84,133,65
29 | 26,196,120,132,47
30 | 27,144,88,114,41
31 | 28,193,143,128,45
32 |
--------------------------------------------------------------------------------
/notebooks/python_recap/data/out1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/python_recap/data/out1.txt
--------------------------------------------------------------------------------
/notebooks/python_recap/data/out2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/python_recap/data/out2.txt
--------------------------------------------------------------------------------
/notebooks/python_recap/data/out3.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/python_recap/data/out3.txt
--------------------------------------------------------------------------------
/notebooks/python_recap/data/out4.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/python_recap/data/out4.txt
--------------------------------------------------------------------------------
/notebooks/python_recap/data/values.txt:
--------------------------------------------------------------------------------
1 | 0,09400 3,37968
2 | 0,28820 0,83214
3 | 0,06823 0,57102
4 | 0,65576 0,59619
5 | -1,23714 0,03561
--------------------------------------------------------------------------------