├── .gitignore ├── LICENSE ├── README.md ├── _solved ├── 00-jupyter_introduction.ipynb ├── case1_bike_count.ipynb ├── case2_observations.ipynb ├── case2_observations_analysis.ipynb ├── case2_observations_processing.ipynb ├── case3_bacterial_resistance_lab_experiment.ipynb ├── case4_air_quality_analysis.ipynb ├── case4_air_quality_processing.ipynb ├── data ├── pandas_01_data_structures.ipynb ├── pandas_02_basic_operations.ipynb ├── pandas_03a_selecting_data.ipynb ├── pandas_03b_indexing.ipynb ├── pandas_04_time_series_data.ipynb ├── pandas_05_groupby_operations.ipynb ├── pandas_06_data_cleaning.ipynb ├── pandas_07_missing_values.ipynb ├── pandas_08_reshaping_data.ipynb ├── pandas_09_combining_datasets.ipynb ├── python_intro │ ├── 00-jupyterlab.ipynb │ ├── 01-variables.ipynb │ ├── 02-functions-use.ipynb │ ├── 03-containers.ipynb │ ├── 04-control-flow.ipynb │ └── 05-functions-write.ipynb ├── python_recap │ ├── 01-basic.ipynb │ ├── 02-control_flow.ipynb │ ├── 03-functions.ipynb │ ├── 04-reusing_code.ipynb │ ├── 05-numpy.ipynb │ ├── data │ │ ├── bogota_part_dataset.csv │ │ ├── out1.txt │ │ ├── out2.txt │ │ ├── out3.txt │ │ ├── out4.txt │ │ └── values.txt │ └── python_rehearsal.ipynb ├── spreaddiagram.py ├── visualization_01_matplotlib.ipynb ├── visualization_02_plotnine.ipynb ├── visualization_02_seaborn.ipynb ├── visualization_03_landscape.ipynb └── workflow_example_evaluation.ipynb ├── check_environment.py ├── convert_notebooks.sh ├── docs ├── _config.yml ├── contributing.md ├── index.md ├── setup.md ├── slides.html └── static │ ├── img │ ├── JakeVdP-ecosystem1.svg │ ├── JakeVdP-ecosystem2.svg │ ├── JakeVdP-ecosystem3.svg │ ├── JakeVdP-ecosystem4.svg │ ├── JakeVdP-ecosystem5.svg │ ├── datacleaning1.jpg │ ├── datacleaning2.jpg │ ├── dataframe.png │ ├── doctoralschoolsprofiel_hq_rgb_web.png │ ├── download-button.png │ ├── environment_save.png │ ├── environment_save.svg │ ├── icon_github.svg │ ├── icon_twitter.svg │ ├── ipython.png │ ├── issuetracker.png │ ├── logo_flanders+richtingmorgen.png │ ├── navigator_notebook.png │ ├── navigator_notebook.svg │ ├── navigator_terminal.png │ ├── notebook.png │ ├── startup.png │ ├── tidy_data_paper.png │ ├── tidy_data_scheme.png │ └── work_stijn_1.png │ ├── remark-latest.min.js │ └── slides.css ├── environment.yml ├── img ├── bacteriophage.jpeg ├── bike_count_illustration.png ├── change_kernel.png ├── doctoralschoolsprofiel_hq_rgb_web.png ├── enterbutton.png ├── heatmap.png ├── keya.png ├── keyb.png ├── keyescape.png ├── logo_flanders+richtingmorgen.png ├── matplotlib_fundamentals.png ├── matplotlib_fundamentals.svg ├── matplotlib_oo.png ├── pandas.svg ├── pandas │ ├── 01_table_dataframe1.svg │ ├── pivot_excel.png │ ├── schema-concat0.svg │ ├── schema-concat1.svg │ ├── schema-dataframe.svg │ ├── schema-stack.svg │ └── splitApplyCombine.png ├── plot_overview.png ├── python-function.svg ├── python-sticky-note-variables-01.svg ├── python-sticky-note-variables-02.svg ├── python-sticky-note-variables-03.svg ├── seaborn_overview_modules.png ├── shift-tab.png ├── shift_button.png ├── shiftenter.jpg ├── stack.png ├── tabbutton.jpg ├── tidy_data_scheme.png └── toomuch.jpg ├── nbconvert_config.py └── notebooks ├── 00-jupyter_introduction.ipynb ├── _solutions ├── case1_bike_count1.py ├── case1_bike_count10.py ├── case1_bike_count11.py ├── case1_bike_count12.py ├── case1_bike_count13.py ├── case1_bike_count14.py ├── case1_bike_count15.py ├── case1_bike_count16.py ├── case1_bike_count17.py ├── case1_bike_count18.py ├── case1_bike_count19.py ├── case1_bike_count2.py ├── 
case1_bike_count20.py ├── case1_bike_count21.py ├── case1_bike_count22.py ├── case1_bike_count23.py ├── case1_bike_count24.py ├── case1_bike_count25.py ├── case1_bike_count26.py ├── case1_bike_count27.py ├── case1_bike_count28.py ├── case1_bike_count3.py ├── case1_bike_count4.py ├── case1_bike_count5.py ├── case1_bike_count6.py ├── case1_bike_count7.py ├── case1_bike_count8.py ├── case1_bike_count9.py ├── case2_observations1.py ├── case2_observations10.py ├── case2_observations11.py ├── case2_observations12.py ├── case2_observations13.py ├── case2_observations14.py ├── case2_observations15.py ├── case2_observations16.py ├── case2_observations17.py ├── case2_observations18.py ├── case2_observations19.py ├── case2_observations2.py ├── case2_observations20.py ├── case2_observations21.py ├── case2_observations22.py ├── case2_observations23.py ├── case2_observations24.py ├── case2_observations25.py ├── case2_observations26.py ├── case2_observations27.py ├── case2_observations28.py ├── case2_observations29.py ├── case2_observations3.py ├── case2_observations30.py ├── case2_observations31.py ├── case2_observations32.py ├── case2_observations33.py ├── case2_observations34.py ├── case2_observations35.py ├── case2_observations36.py ├── case2_observations37.py ├── case2_observations38.py ├── case2_observations39.py ├── case2_observations4.py ├── case2_observations40.py ├── case2_observations41.py ├── case2_observations42.py ├── case2_observations43.py ├── case2_observations44.py ├── case2_observations45.py ├── case2_observations46.py ├── case2_observations47.py ├── case2_observations48.py ├── case2_observations49.py ├── case2_observations5.py ├── case2_observations50.py ├── case2_observations51.py ├── case2_observations6.py ├── case2_observations7.py ├── case2_observations8.py ├── case2_observations9.py ├── case3_bacterial_resistance_lab_experiment1.py ├── case3_bacterial_resistance_lab_experiment10.py ├── case3_bacterial_resistance_lab_experiment11.py ├── case3_bacterial_resistance_lab_experiment12.py ├── case3_bacterial_resistance_lab_experiment13.py ├── case3_bacterial_resistance_lab_experiment2.py ├── case3_bacterial_resistance_lab_experiment3.py ├── case3_bacterial_resistance_lab_experiment4.py ├── case3_bacterial_resistance_lab_experiment5.py ├── case3_bacterial_resistance_lab_experiment6.py ├── case3_bacterial_resistance_lab_experiment7.py ├── case3_bacterial_resistance_lab_experiment8.py ├── case3_bacterial_resistance_lab_experiment9.py ├── case4_air_quality_analysis1.py ├── case4_air_quality_analysis10.py ├── case4_air_quality_analysis11.py ├── case4_air_quality_analysis12.py ├── case4_air_quality_analysis13.py ├── case4_air_quality_analysis14.py ├── case4_air_quality_analysis15.py ├── case4_air_quality_analysis16.py ├── case4_air_quality_analysis17.py ├── case4_air_quality_analysis18.py ├── case4_air_quality_analysis19.py ├── case4_air_quality_analysis2.py ├── case4_air_quality_analysis20.py ├── case4_air_quality_analysis21.py ├── case4_air_quality_analysis22.py ├── case4_air_quality_analysis23.py ├── case4_air_quality_analysis24.py ├── case4_air_quality_analysis25.py ├── case4_air_quality_analysis26.py ├── case4_air_quality_analysis27.py ├── case4_air_quality_analysis28.py ├── case4_air_quality_analysis29.py ├── case4_air_quality_analysis3.py ├── case4_air_quality_analysis30.py ├── case4_air_quality_analysis31.py ├── case4_air_quality_analysis32.py ├── case4_air_quality_analysis33.py ├── case4_air_quality_analysis34.py ├── case4_air_quality_analysis35.py ├── case4_air_quality_analysis36.py 
├── case4_air_quality_analysis37.py ├── case4_air_quality_analysis38.py ├── case4_air_quality_analysis39.py ├── case4_air_quality_analysis4.py ├── case4_air_quality_analysis40.py ├── case4_air_quality_analysis5.py ├── case4_air_quality_analysis6.py ├── case4_air_quality_analysis7.py ├── case4_air_quality_analysis8.py ├── case4_air_quality_analysis9.py ├── case4_air_quality_processing1.py ├── case4_air_quality_processing10.py ├── case4_air_quality_processing11.py ├── case4_air_quality_processing12.py ├── case4_air_quality_processing13.py ├── case4_air_quality_processing2.py ├── case4_air_quality_processing3.py ├── case4_air_quality_processing4.py ├── case4_air_quality_processing5.py ├── case4_air_quality_processing6.py ├── case4_air_quality_processing7.py ├── case4_air_quality_processing8.py ├── case4_air_quality_processing9.py ├── pandas_01_data_structures1.py ├── pandas_01_data_structures2.py ├── pandas_01_data_structures3.py ├── pandas_01_data_structures4.py ├── pandas_01_data_structures5.py ├── pandas_01_data_structures6.py ├── pandas_02_basic_operations1.py ├── pandas_02_basic_operations10.py ├── pandas_02_basic_operations2.py ├── pandas_02_basic_operations3.py ├── pandas_02_basic_operations4.py ├── pandas_02_basic_operations5.py ├── pandas_02_basic_operations6.py ├── pandas_02_basic_operations7.py ├── pandas_02_basic_operations8.py ├── pandas_02_basic_operations9.py ├── pandas_03a_selecting_data1.py ├── pandas_03a_selecting_data10.py ├── pandas_03a_selecting_data11.py ├── pandas_03a_selecting_data12.py ├── pandas_03a_selecting_data13.py ├── pandas_03a_selecting_data14.py ├── pandas_03a_selecting_data15.py ├── pandas_03a_selecting_data16.py ├── pandas_03a_selecting_data17.py ├── pandas_03a_selecting_data18.py ├── pandas_03a_selecting_data19.py ├── pandas_03a_selecting_data2.py ├── pandas_03a_selecting_data20.py ├── pandas_03a_selecting_data21.py ├── pandas_03a_selecting_data22.py ├── pandas_03a_selecting_data23.py ├── pandas_03a_selecting_data3.py ├── pandas_03a_selecting_data4.py ├── pandas_03a_selecting_data5.py ├── pandas_03a_selecting_data6.py ├── pandas_03a_selecting_data7.py ├── pandas_03a_selecting_data8.py ├── pandas_03a_selecting_data9.py ├── pandas_03b_indexing1.py ├── pandas_03b_indexing2.py ├── pandas_03b_indexing3.py ├── pandas_03b_indexing4.py ├── pandas_03b_indexing5.py ├── pandas_03b_indexing6.py ├── pandas_03b_indexing7.py ├── pandas_04_time_series_data1.py ├── pandas_04_time_series_data2.py ├── pandas_04_time_series_data3.py ├── pandas_04_time_series_data4.py ├── pandas_04_time_series_data5.py ├── pandas_04_time_series_data6.py ├── pandas_04_time_series_data7.py ├── pandas_04_time_series_data8.py ├── pandas_04_time_series_data9.py ├── pandas_05_groupby_operations1.py ├── pandas_05_groupby_operations10.py ├── pandas_05_groupby_operations11.py ├── pandas_05_groupby_operations12.py ├── pandas_05_groupby_operations13.py ├── pandas_05_groupby_operations14.py ├── pandas_05_groupby_operations15.py ├── pandas_05_groupby_operations16.py ├── pandas_05_groupby_operations17.py ├── pandas_05_groupby_operations18.py ├── pandas_05_groupby_operations19.py ├── pandas_05_groupby_operations2.py ├── pandas_05_groupby_operations20.py ├── pandas_05_groupby_operations21.py ├── pandas_05_groupby_operations22.py ├── pandas_05_groupby_operations23.py ├── pandas_05_groupby_operations24.py ├── pandas_05_groupby_operations25.py ├── pandas_05_groupby_operations26.py ├── pandas_05_groupby_operations27.py ├── pandas_05_groupby_operations28.py ├── pandas_05_groupby_operations29.py ├── 
pandas_05_groupby_operations3.py ├── pandas_05_groupby_operations30.py ├── pandas_05_groupby_operations31.py ├── pandas_05_groupby_operations4.py ├── pandas_05_groupby_operations5.py ├── pandas_05_groupby_operations6.py ├── pandas_05_groupby_operations7.py ├── pandas_05_groupby_operations8.py ├── pandas_05_groupby_operations9.py ├── pandas_06_data_cleaning1.py ├── pandas_06_data_cleaning10.py ├── pandas_06_data_cleaning11.py ├── pandas_06_data_cleaning12.py ├── pandas_06_data_cleaning13.py ├── pandas_06_data_cleaning14.py ├── pandas_06_data_cleaning15.py ├── pandas_06_data_cleaning2.py ├── pandas_06_data_cleaning3.py ├── pandas_06_data_cleaning4.py ├── pandas_06_data_cleaning5.py ├── pandas_06_data_cleaning6.py ├── pandas_06_data_cleaning7.py ├── pandas_06_data_cleaning8.py ├── pandas_06_data_cleaning9.py ├── pandas_08_reshaping_data1.py ├── pandas_08_reshaping_data10.py ├── pandas_08_reshaping_data11.py ├── pandas_08_reshaping_data12.py ├── pandas_08_reshaping_data13.py ├── pandas_08_reshaping_data14.py ├── pandas_08_reshaping_data15.py ├── pandas_08_reshaping_data16.py ├── pandas_08_reshaping_data17.py ├── pandas_08_reshaping_data18.py ├── pandas_08_reshaping_data19.py ├── pandas_08_reshaping_data2.py ├── pandas_08_reshaping_data20.py ├── pandas_08_reshaping_data3.py ├── pandas_08_reshaping_data4.py ├── pandas_08_reshaping_data5.py ├── pandas_08_reshaping_data6.py ├── pandas_08_reshaping_data7.py ├── pandas_08_reshaping_data8.py ├── pandas_08_reshaping_data9.py ├── pandas_09_combining_datasets1.py ├── pandas_09_combining_datasets2.py ├── pandas_09_combining_datasets3.py ├── pandas_09_combining_datasets4.py ├── pandas_09_combining_datasets5.py ├── visualization_01_matplotlib1.py ├── visualization_01_matplotlib2.py ├── visualization_01_matplotlib3.py ├── visualization_01_matplotlib4.py ├── visualization_01_matplotlib5.py ├── visualization_01_matplotlib6.py ├── visualization_02_seaborn1.py ├── visualization_02_seaborn10.py ├── visualization_02_seaborn11.py ├── visualization_02_seaborn12.py ├── visualization_02_seaborn13.py ├── visualization_02_seaborn14.py ├── visualization_02_seaborn15.py ├── visualization_02_seaborn16.py ├── visualization_02_seaborn17.py ├── visualization_02_seaborn18.py ├── visualization_02_seaborn19.py ├── visualization_02_seaborn2.py ├── visualization_02_seaborn20.py ├── visualization_02_seaborn21.py ├── visualization_02_seaborn22.py ├── visualization_02_seaborn3.py ├── visualization_02_seaborn4.py ├── visualization_02_seaborn5.py ├── visualization_02_seaborn6.py ├── visualization_02_seaborn7.py ├── visualization_02_seaborn8.py └── visualization_02_seaborn9.py ├── case1_bike_count.ipynb ├── case2_observations.ipynb ├── case3_bacterial_resistance_lab_experiment.ipynb ├── case4_air_quality_analysis.ipynb ├── case4_air_quality_processing.ipynb ├── data ├── BETN0290000800100hour.1-1-1990.31-12-2012 ├── BETR8010000800100hour.1-1-1990.31-12-2012 ├── Dryad_Arias_Hall_v3.xlsx ├── FR040120000800100hour.1-1-1999.31-12-2012 ├── FR040370000800100hour.1-1-1999.31-12-2012 ├── TF_ACCIDENTS_VICTIMS_2020.zip ├── TF_VAT_NACE_SQ_2019.zip ├── airbase_data.csv ├── daily_min_temperature_2020.csv ├── data-preprocessing.ipynb ├── data-preprocessing.md ├── fietstellingencoupure.csv ├── fietstelpaal-coupure-links-2022-gent.zip ├── fietstelpaal-coupure-links-2023-gent.zip ├── fietstelpaal-coupure-links-gent.zip ├── load_casualties.py ├── observations.csv ├── plot_location.xlsx ├── species.csv ├── species_names.csv ├── statbel_statistical_sectors_2019.shp.zip ├── survey_data_completed.csv ├── 
surveys.csv ├── titanic.csv ├── verbruiksgegevens-per-maand.xlsx └── vmm_flowdata.csv ├── pandas_01_data_structures.ipynb ├── pandas_02_basic_operations.ipynb ├── pandas_03a_selecting_data.ipynb ├── pandas_03b_indexing.ipynb ├── pandas_04_time_series_data.ipynb ├── pandas_05_groupby_operations.ipynb ├── pandas_06_data_cleaning.ipynb ├── pandas_07_missing_values.ipynb ├── pandas_08_reshaping_data.ipynb ├── pandas_09_combining_datasets.ipynb ├── python_intro ├── 00-jupyterlab.ipynb ├── 01-variables.ipynb ├── 02-functions-use.ipynb ├── 03-containers.ipynb ├── 04-control-flow.ipynb ├── 05-functions-write.ipynb └── _solutions │ ├── 00-jupyterlab1.py │ ├── 00-jupyterlab2.py │ ├── 01-variables1.py │ ├── 01-variables2.py │ ├── 01-variables3.py │ ├── 01-variables4.py │ ├── 01-variables5.py │ ├── 01-variables6.py │ ├── 01-variables7.py │ ├── 01-variables8.py │ ├── 02-functions-use1.py │ ├── 02-functions-use2.py │ ├── 02-functions-use3.py │ ├── 02-functions-use4.py │ ├── 02-functions-use5.py │ ├── 02-functions-use6.py │ ├── 02-functions-use7.py │ ├── 03-containers1.py │ ├── 03-containers2.py │ ├── 03-containers3.py │ ├── 03-containers4.py │ ├── 03-containers5.py │ ├── 03-containers6.py │ ├── 03-containers7.py │ ├── 03-containers8.py │ ├── 03-containers9.py │ ├── 04-control-flow1.py │ ├── 04-control-flow2.py │ ├── 04-control-flow3.py │ ├── 04-control-flow4.py │ ├── 04-control-flow5.py │ ├── 05-functions-write1.py │ ├── 05-functions-write2.py │ └── 05-functions-write3.py ├── python_recap ├── 00-jupyter_introduction.ipynb ├── 01-basic.ipynb ├── 02-control_flow.ipynb ├── 03-functions.ipynb ├── 04-reusing_code.ipynb ├── 05-numpy.ipynb ├── _solutions │ ├── 01-basic24.py │ ├── 01-basic25.py │ ├── 01-basic28.py │ ├── 01-basic47.py │ ├── 01-basic49.py │ ├── 01-basic58.py │ ├── 02-control_flow15.py │ ├── 02-control_flow16.py │ ├── 02-control_flow24.py │ ├── 03-functions19.py │ ├── 03-functions27.py │ ├── 05-numpy109.py │ ├── 05-numpy137.py │ ├── 05-numpy34.py │ ├── 05-numpy35.py │ ├── 05-numpy36.py │ ├── 05-numpy37.py │ ├── 05-numpy58.py │ ├── 05-numpy73.py │ ├── 05-numpy75.py │ ├── 05-numpy77.py │ ├── python_rehearsal1.py │ ├── python_rehearsal10.py │ ├── python_rehearsal11.py │ ├── python_rehearsal12.py │ ├── python_rehearsal13.py │ ├── python_rehearsal2.py │ ├── python_rehearsal3.py │ ├── python_rehearsal4.py │ ├── python_rehearsal5.py │ ├── python_rehearsal6.py │ ├── python_rehearsal7.py │ ├── python_rehearsal8.py │ └── python_rehearsal9.py ├── data │ ├── bogota_part_dataset.csv │ ├── out1.txt │ ├── out2.txt │ ├── out3.txt │ ├── out4.txt │ └── values.txt └── python_rehearsal.ipynb ├── visualization_01_matplotlib.ipynb ├── visualization_02_seaborn.ipynb └── visualization_03_landscape.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | notebooks/data/0284676-200613084148143.zip 92 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2016, Joris Van den Bossche 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data manipulation, analysis and visualisation in Python 2 | 3 | ## Introduction 4 | 5 | This course is intended for researchers that have at least basic programming skills in Python. It targets researchers that want to enhance their general data manipulation and analysis skills in Python. 6 | 7 | The course does not aim to provide a course in statistics or machine learning. It aims to provide researchers the means to effectively tackle commonly encountered data handling tasks in order to increase the overall efficiency of the research. 
8 | 9 | The course has been developed as a specialist course for the Doctoral schools of Ghent University, but can be taught to others upon request (and the material is freely available to re-use). 10 | 11 | 12 | ## Getting started 13 | 14 | The course uses Python 3 and some data analysis packages such as Pandas, Numpy and Matplotlib. To install the required libraries, we highly recommend Anaconda or miniconda (https://www.anaconda.com/download/) or another Python distribution that includes the scientific libraries (this recommendation applies to all platforms, so for Windows, Linux and Mac alike). 15 | 16 | For detailed instructions to get started on your local machine, see the [setup instructions](./docs/setup.md). 17 | 18 | In case you do not want to install everything and just want to try out the course material, use the environment set up by Binder [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/jorisvandenbossche/DS-python-data-analysis/main?urlpath=lab/) and open the notebooks right away. 19 | 20 | 21 | ## Contributing 22 | 23 | Found a typo or have a suggestion? See [how to contribute](./docs/contributing.md). 24 | 25 | 26 | ## Meta 27 | Authors: Joris Van den Bossche, Stijn Van Hoey 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /_solved/data: -------------------------------------------------------------------------------- 1 | ../notebooks/data/ -------------------------------------------------------------------------------- /_solved/pandas_07_missing_values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8bd0774d", 6 | "metadata": {}, 7 | "source": [ 8 | "
07 - Pandas: Working with missing data
\n", 9 | "\n", 10 | "\n", 11 | "> *© 2025, Joris Van den Bossche and Stijn Van Hoey (, ). Licensed under [CC BY 4.0 Creative Commons](http://creativecommons.org/licenses/by/4.0/)*\n", 12 | "\n", 13 | "---" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "fad2705f", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "6cf9e666", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "df = pd.DataFrame({'A': [1, 2, np.nan],\n", 35 | " 'B': [4, np.nan, np.nan],\n", 36 | " 'C': [7, 8, 9]})\n", 37 | "df" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "9204ffad", 43 | "metadata": {}, 44 | "source": [ 45 | "## Missing values in Pandas" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "20ebca57", 51 | "metadata": {}, 52 | "source": [ 53 | "For numerical data, the \"NaN\" (Not-A-Number) floating point value is used as missing value indicator:" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "17a6454f", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "df.loc[2, 'A']" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "35dc8450", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "np.nan" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "b116e307", 79 | "metadata": {}, 80 | "source": [ 81 | "
\n", 82 | "\n", 83 | "**NOTE**: because NaN is a float value, it is currently not possible to have integer columns with missing values. Notice how the columns in the example above were casted to float dtype.\n", 84 | "\n", 85 | "
" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "89150b7e", 91 | "metadata": {}, 92 | "source": [ 93 | "### Missing values are skipped by default in *reductions*" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "1e2b48d5", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "df['A'].mean()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "96daf776", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "df['A'].mean(skipna=False)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "604e4841", 119 | "metadata": {}, 120 | "source": [ 121 | "### ... but propagated in *element-wise arithmetic*" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "92901db0", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "df['A'] + 3" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "cf8a72a6", 137 | "metadata": {}, 138 | "source": [ 139 | "## Checking missing values" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "id": "5b50553a", 145 | "metadata": {}, 146 | "source": [ 147 | "Checking for a missing value cannot be done with an equality operation (`==`) because NaN is not equal to iself:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "id": "61a4ebe9", 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "df['A'] == np.nan" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "1acc9e71", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "np.nan == np.nan" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "b4439546", 173 | "metadata": {}, 174 | "source": [ 175 | "Therefore, dedicated methods are available: `isna()` and `notna()`" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "3c7d6670", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "df['A'].isna()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "4b95b7c2", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "df['A'].notna()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "683cccc8", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df['A'].isna().sum()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "c023dd7d", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "df.isna().sum()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "82b582da", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "df" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "id": "a8488b86", 231 | "metadata": {}, 232 | "source": [ 233 | "## Dropping missing values" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "e1440709", 239 | "metadata": {}, 240 | "source": [ 241 | "Dropping missing values can be done with `isna()`/`notna()` and boolean indexing (eg `df[df['A'].notna()]`), but pandas also provides some convenient helper functions for this:" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "788d650e", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "df.dropna()" 252 | ] 253 | }, 254 | { 255 | 
"cell_type": "markdown", 256 | "id": "c694bb08", 257 | "metadata": {}, 258 | "source": [ 259 | "By default it drop rows if there is a NaN in any of the columns. To limit this to we subset of the columns, use the `subset` keyword:" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "5bb3578c", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "df.dropna(subset=['A', 'C'])" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "00036b6f", 275 | "metadata": {}, 276 | "source": [ 277 | "## Filling missing values" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "id": "0e64082f", 283 | "metadata": {}, 284 | "source": [ 285 | "Filling missing values with a scalar:" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "94f40e9a", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "df.fillna(0)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "id": "0a73ff4c", 301 | "metadata": {}, 302 | "source": [ 303 | "Further, more advanced filling techniques are available in the ``interpolate()`` method." 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "7b57edf1", 309 | "metadata": {}, 310 | "source": [ 311 | "
\n", 312 | "\n", 313 | "**REMEMBER**:
\n", 314 | "\n", 315 | "* Missing value indicator: `np.nan` (`NaN`)\n", 316 | "* Reductions: skipped by default\n", 317 | "* Mathematical operations (eg `+`): propagate by default\n", 318 | "* Specific functions:\n", 319 | " * `isna()`, `notna()`\n", 320 | " * `dropna()`\n", 321 | " * `fillna()`, `interpolate()`\n", 322 | "\n", 323 | "
" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "id": "e1f5bf9a", 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [] 333 | } 334 | ], 335 | "metadata": { 336 | "jupytext": { 337 | "formats": "ipynb,md:myst" 338 | }, 339 | "kernelspec": { 340 | "display_name": "Python 3 (ipykernel)", 341 | "language": "python", 342 | "name": "python3" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.12.8" 355 | }, 356 | "widgets": { 357 | "application/vnd.jupyter.widget-state+json": { 358 | "state": {}, 359 | "version_major": 2, 360 | "version_minor": 0 361 | } 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 5 366 | } 367 | -------------------------------------------------------------------------------- /_solved/python_recap/data/bogota_part_dataset.csv: -------------------------------------------------------------------------------- 1 | DIA,SST AM,SSV AM,SSV PM,SSF PM 2 | Unidad,mg/l,mg/l,mg/l,mg/l 3 | ,,,, 4 | 1,198,141,131,38 5 | 2,274,200,125,35 6 | 3,156,119,274,120 7 | 4,382,266,272,105 8 | 5,494,342,202,76 9 | 6,259,182,205,67 10 | 7,247,185,232,77 11 | 8,164,125,112,33 12 | 9,367,265,82,30 13 | 10,123,90,91,26 14 | 11,132,96,130,46 15 | 12,97,66,110,33 16 | 13,160,104,181,83 17 | 14,137,100,122,41 18 | 15,172,123,151,56 19 | 16,192,138,168,78 20 | 17,176,106,94,36 21 | 18,192,132,111,43 22 | 19,152,99,112,37 23 | 20,255,179,181,67 24 | 21,188,134,220,94 25 | 22,215,153,149,58 26 | 23,221,157,147,60 27 | 24,284,199,201,93 28 | 25,134,84,133,65 29 | 26,196,120,132,47 30 | 27,144,88,114,41 31 | 28,193,143,128,45 32 | -------------------------------------------------------------------------------- /_solved/python_recap/data/out1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/_solved/python_recap/data/out1.txt -------------------------------------------------------------------------------- /_solved/python_recap/data/out2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/_solved/python_recap/data/out2.txt -------------------------------------------------------------------------------- /_solved/python_recap/data/out3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/_solved/python_recap/data/out3.txt -------------------------------------------------------------------------------- /_solved/python_recap/data/out4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/_solved/python_recap/data/out4.txt -------------------------------------------------------------------------------- /_solved/python_recap/data/values.txt: -------------------------------------------------------------------------------- 1 | 0,09400 3,37968 2 | 0,28820 0,83214 3 | 0,06823 0,57102 4 | 0,65576 0,59619 5 | -1,23714 
0,03561 -------------------------------------------------------------------------------- /_solved/spreaddiagram.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | @author: Stijnvh 5 | """ 6 | 7 | import sys 8 | import datetime 9 | 10 | import numpy as np 11 | from scipy import stats 12 | from scipy.stats import linregress 13 | 14 | import pandas as pd 15 | from pandas.tseries.offsets import DateOffset 16 | 17 | import pylab as p 18 | import matplotlib as mpl 19 | mpl.rcParams['mathtext.default'] = 'regular' 20 | import matplotlib.pyplot as plt 21 | import matplotlib.gridspec as gridspec 22 | from matplotlib.patches import Rectangle 23 | from matplotlib.ticker import MaxNLocator 24 | 25 | ##----------------------------------------------------------------------------- 26 | ## Calculating objective functions 27 | ##----------------------------------------------------------------------------- 28 | 29 | def root_mean_square_error(observed, modelled): 30 | ''' 31 | Root Mean Square Error (RMSE) 32 | 33 | Parameters 34 | ----------- 35 | observed : np.ndarray or pd.DataFrame 36 | observed/measured values of the variable 37 | modelled : np.ndarray or pd.DataFrame 38 | simulated values of the variable 39 | 40 | Notes 41 | ------- 42 | The root mean square error is an absolute criterion that is often used. 43 | It indicates the overall agreement between predicted and observed data. 44 | The square allows avoiding 45 | error compensation and emphasises larger errors. The root provides 46 | a criterion in actual units. Consequently, this quality criterion 47 | can be compared to the MAE to provide information on the prominence 48 | of outliers in the dataset. 49 | 50 | Notes 51 | ------- 52 | * range: [0, inf] 53 | * optimum: 0 54 | ''' 55 | residuals = observed - modelled 56 | return np.sqrt((residuals**2).mean()) 57 | 58 | 59 | def bias(observed, modelled): 60 | """ 61 | Bias E[obs-mod] 62 | 63 | Parameters 64 | ----------- 65 | observed : np.ndarray or pd.DataFrame 66 | observed/measured values of the variable 67 | modelled : np.ndarray or pd.DataFrame 68 | simulated values of the variable 69 | 70 | Notes 71 | ------- 72 | * range: [-inf, inf] 73 | * optimum: 0 74 | """ 75 | residuals = observed - modelled 76 | return np.mean(residuals) 77 | 78 | ##----------------------------------------------------------------------------- 79 | ## MODEL CALIBRATION EVALUATION PLOTS - SPREAD DIAGRAMS 80 | ##----------------------------------------------------------------------------- 81 | 82 | def spread_diagram(axs, obs, mod, infobox = True, *args, **kwargs): 83 | ''' 84 | plot a scatter diagram comparing the simulated and observed datasets, 85 | with some extra information about the fit included.
86 | 87 | Parameters 88 | ----------- 89 | axs : axes.AxesSubplot object 90 | a subplot instance where the graph will be located, 91 | this supports the use of different subplots 92 | obs : ndarray 93 | 1D array of the observed data 94 | mod : ndarray 95 | 1D array of the modelled output 96 | infobox : bool True|False 97 | defines whether an infobox with the regression info is added 98 | *args, **kwargs : args 99 | arguments passed to the matplotlib scatter command 100 | 101 | Returns 102 | -------- 103 | axs 104 | ''' 105 | p.rc('mathtext', default = 'regular') 106 | 107 | axs.scatter(obs,mod, *args, **kwargs) 108 | axs.set_aspect('equal') 109 | 110 | if isinstance(obs, np.ndarray): 111 | getmax = min(obs.max(), mod.max())*0.9 112 | getmin = max(obs.min(), mod.min())*1.1 113 | else: 114 | getmax = min(obs.max().values, mod.max().values)*0.9 115 | getmin = max(obs.min().values, mod.min().values)*1.1 116 | obs = obs.values 117 | mod = mod.values 118 | 119 | axs.plot([getmin, getmax], [getmin, getmax],'k--', linewidth = 0.5) 120 | 121 | slope, intercept, r_value, p_value, std_err = stats.linregress(obs, mod) 122 | 123 | forplot = np.arange(getmin, getmax, 0.01) 124 | axs.plot(forplot, slope*forplot + intercept, '-', color = 'grey', 125 | linewidth = 0.5) 126 | axs.set_xlim(left = getmin, right = getmax) 127 | axs.set_ylim(bottom = getmin, top = getmax) 128 | 129 | rmse = root_mean_square_error(obs, mod) 130 | 131 | #for infobox 132 | if infobox == True: 133 | patch = Rectangle((0., 0.65), 0.35, 0.35, facecolor = 'white', 134 | edgecolor = 'k', transform = axs.transAxes) 135 | axs.add_patch(patch) 136 | axs.set_axisbelow(True) 137 | 138 | textinfo = ({'transform' : axs.transAxes, 139 | 'verticalalignment' : 'center', 140 | 'horizontalalignment' : 'left', 141 | 'fontsize' : 12}) 142 | 143 | axs.text(0.05, 0.95, r'$\bar{x}\ $', textinfo) 144 | axs.text(0.05, 0.90, r'$\bar{y}\ $', textinfo) 145 | axs.text(0.05, 0.85, r'$rico\ $', textinfo) 146 | axs.text(0.05, 0.8, r'$intc.\ $', textinfo) 147 | axs.text(0.05, 0.75, r'$R^2\ $', textinfo) 148 | axs.text(0.05, 0.70, r'$RMSE\ $', textinfo) 149 | 150 | axs.text(0.2, 0.95, r': %.2f'%obs.mean(), textinfo) 151 | axs.text(0.2, 0.90, r': %.2f'%mod.mean(), textinfo) 152 | axs.text(0.2, 0.85, r': %.2f'%slope, textinfo) 153 | axs.text(0.2, 0.8, r': %.2f'%intercept, textinfo) 154 | axs.text(0.2, 0.75, r': %.2f'%r_value, textinfo) 155 | axs.text(0.2, 0.70, r': %.2f'%rmse, textinfo) 156 | 157 | return axs 158 | 159 | 160 | def main(argv=None): 161 | print(argv[0]) 162 | 163 | # loading data from a file 164 | data = pd.read_csv(argv[1], parse_dates=True, index_col=0).dropna() 165 | 166 | # using custom plot function 167 | 168 | formatfig = argv[2] 169 | fig, ax = plt.subplots() 170 | spread_diagram(ax, data.iloc[:,0].values, 171 | data.iloc[:,1].values, infobox = True) 172 | fig.savefig("{}_evaluation.{}".format(datetime.date.today().strftime("%Y%m%d"), formatfig)) 173 | 174 | 175 | if __name__ == "__main__": 176 | sys.exit(main(sys.argv)) 177 | 178 | -------------------------------------------------------------------------------- /check_environment.py: -------------------------------------------------------------------------------- 1 | # This script is adapted from Andreas Mueller: 2 | # https://github.com/amueller/scipy-2018-sklearn/blob/master/check_env.ipynb 3 | # and glemaitre: https://github.com/glemaitre/pyparis-2018-sklearn/blob/master/check_environment.py 4 | 5 | from __future__ import print_function 6 | import sys 7 | 8 | # packaging is not in the
stdlib, but should be available as a dependency of 9 | # some other package (eg jupyterlab, matplotlib, ..) 10 | from packaging import version 11 | 12 | try: 13 | import curses 14 | curses.setupterm() 15 | assert curses.tigetnum("colors") > 2 16 | OK = "\x1b[1;%dm[ OK ]\x1b[0m" % (30 + curses.COLOR_GREEN) 17 | FAIL = "\x1b[1;%dm[FAIL]\x1b[0m" % (30 + curses.COLOR_RED) 18 | except: 19 | OK = '[ OK ]' 20 | FAIL = '[FAIL]' 21 | 22 | try: 23 | import importlib 24 | except ImportError: 25 | print(FAIL, "Python version 3.4 is required," 26 | " but %s is installed." % sys.version) 27 | 28 | 29 | def import_version(pkg, min_ver, fail_msg=""): 30 | mod = None 31 | try: 32 | mod = importlib.import_module(pkg) 33 | 34 | if pkg in {'PIL'}: 35 | ver = mod.VERSION 36 | elif pkg in {'xlrd'}: 37 | ver = mod.__VERSION__ 38 | else: 39 | ver = mod.__version__ 40 | if version.parse(ver) < version.parse(min_ver): 41 | # use the local function argument, not the loop variable from module scope 42 | print(FAIL, "%s version %s or higher required, but %s installed." 43 | % (pkg, min_ver, ver)) 44 | else: 45 | print(OK, '%s version %s' % (pkg, ver)) 46 | except ImportError: 47 | print(FAIL, '%s not installed. %s' % (pkg, fail_msg)) 48 | return mod 49 | 50 | 51 | # first check the python version 52 | print('Using python in', sys.prefix) 53 | print(sys.version) 54 | pyversion = version.parse(sys.version.split(" ")[0]) 55 | if pyversion >= version.parse("3"): 56 | if pyversion < version.parse("3.8"): 57 | print(FAIL, "Python version 3.8 is required," 58 | " but %s is installed." % sys.version) 59 | else: 60 | print(FAIL, "Python 3 is required, but %s is installed." % sys.version) 61 | 62 | print() 63 | requirements = {'numpy': "2", 'matplotlib': "3", 64 | 'pandas': "2", 'jupyterlab': "3", 65 | 'pyproj': "2", 'requests': "2.32", 66 | 'seaborn': "0.13"} 67 | 68 | # now the dependencies 69 | for lib, required_version in list(requirements.items()): 70 | import_version(lib, required_version) 71 | 72 | # mplleaflet has no option to derive __version__ 73 | try: 74 | import mplleaflet 75 | print(OK, '%s can be loaded' % ('mplleaflet')) 76 | except: 77 | print(FAIL, '%s cannot be loaded.'
% ('mplleaflet')) 78 | -------------------------------------------------------------------------------- /convert_notebooks.sh: -------------------------------------------------------------------------------- 1 | # run this from the top-level directory 2 | # it creates a notebooks/ and _solved/solutions/ dir there 3 | # that get automatically copied to the correct places 4 | 5 | 6 | declare -a arr=( 7 | #"00-jupyter_introduction.ipynb" 8 | #"01-basic.ipynb" 9 | #"02-control_flow.ipynb" 10 | #"03-functions.ipynb" 11 | #"04-reusing_code.ipynb" 12 | #"05-numpy.ipynb" 13 | #"python_rehearsal" 14 | "00-jupyter_introduction.ipynb" 15 | "pandas_01_data_structures.ipynb" 16 | "pandas_02_basic_operations.ipynb" 17 | "pandas_03a_selecting_data.ipynb" 18 | "pandas_03b_indexing.ipynb" 19 | "pandas_04_time_series_data.ipynb" 20 | "pandas_05_groupby_operations.ipynb" 21 | "pandas_06_data_cleaning.ipynb" 22 | "pandas_07_missing_values.ipynb" 23 | "pandas_08_reshaping_data.ipynb" 24 | "pandas_09_combining_datasets.ipynb" 25 | "visualization_01_matplotlib.ipynb" 26 | "visualization_02_seaborn.ipynb" 27 | "visualization_03_landscape.ipynb" 28 | "case1_bike_count.ipynb" 29 | "case2_observations.ipynb" 30 | "case3_bacterial_resistance_lab_experiment.ipynb" 31 | "case4_air_quality_processing.ipynb" 32 | "case4_air_quality_analysis.ipynb" 33 | ) 34 | 35 | cd _solved 36 | 37 | mkdir ./notebooks 38 | 39 | echo "- Converting notebooks" 40 | 41 | for i in "${arr[@]}" 42 | do 43 | echo "--" "$i" 44 | jupyter nbconvert --to=notebook --config ../nbconvert_config.py --output "notebooks/$i" "$i" 45 | done 46 | 47 | echo "- Copying converted notebooks and solutions" 48 | cp -r notebooks/. ../notebooks 49 | cp -r _solutions/. ../notebooks/_solutions 50 | 51 | rm -r notebooks/ 52 | rm -r _solutions/ 53 | 54 | cd .. 55 | 56 | 57 | declare -a arr=( 58 | "00-jupyterlab.ipynb" 59 | "01-variables.ipynb" 60 | "02-functions-use.ipynb" 61 | "03-containers.ipynb" 62 | "04-control-flow.ipynb" 63 | "05-functions-write.ipynb" 64 | ) 65 | 66 | cd _solved/python_intro 67 | 68 | mkdir ./notebooks 69 | 70 | echo "- Converting notebooks" 71 | 72 | for i in "${arr[@]}" 73 | do 74 | echo "--" "$i" 75 | jupyter nbconvert --to=notebook --config ../../nbconvert_config.py --output "notebooks/$i" "$i" 76 | done 77 | 78 | 79 | 80 | echo "- Copying converted notebooks and solutions" 81 | cp -r notebooks/. ../../notebooks/python_intro 82 | cp -r _solutions/. ../../notebooks/python_intro/_solutions 83 | 84 | rm -r notebooks/ 85 | rm -r _solutions/ 86 | 87 | cd ../.. 88 | 89 | 90 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | title: Data manipulation, analysis and visualisation in Python 2 | logo: 3 | description: Specialist course Doctoral schools of Ghent University 4 | show_downloads: true 5 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | # Contributing guide 6 | 7 | First of all, thanks for considering contributing to the course! 👍 8 | 9 | ## How you can contribute 10 | 11 | There are several ways you can contribute to this course. 12 | 13 | ### Share the love ❤️ 14 | 15 | Think this course is useful? Let others discover it by telling them in person, via Twitter or a blog post.
16 | 17 | ### Ask a question ⁉️ 18 | 19 | Trying out the material and got stuck? Post your question as an [issue on GitHub](https://github.com/jorisvandenbossche/course-python-data/issues). While we cannot offer user support, we'll do our best to address it, as questions often lead to the discovery of bugs. 20 | 21 | Want to ask a question in private? Contact the course maintainer by [email](jorisvandenbossche@gmail.com). 22 | 23 | ### Propose an idea 💡 24 | 25 | Have an idea to improve the course? Take a look at the [issue list](https://github.com/jorisvandenbossche/course-python-data/issues) to see if it isn't included or suggested yet. If not, suggest your idea as an [issue on GitHub](https://github.com/jorisvandenbossche/course-python-data/issues/new). 26 | 27 | ### Report a bug 🐛 28 | 29 | Using the course and discovered a bug or a typo? That's annoying! Don't let others have the same experience and report it as an [issue on GitHub](https://github.com/jorisvandenbossche/course-python-data/issues/new) 30 | so we can fix it. A good bug report makes it easier for us to do so, so please include: 31 | 32 | * Your operating system name and version (e.g. Mac OS 10.13.6). 33 | * Any details about your local setup that might be helpful in troubleshooting. 34 | * Detailed steps to reproduce the bug. 35 | 36 | ### Contribute code 📝 37 | 38 | Care to fix issues or typos? Awesome! 👏 39 | 40 | Some notes to take into account: 41 | 42 | - The course material is developed in the [course-python-data](https://github.com/jorisvandenbossche/course-python-data) repository. When updating course material, edit the notebooks in the [course-python-data](https://github.com/jorisvandenbossche/course-python-data) repository; the other ones (the ones used in the tutorial) are generated automatically. 43 | - The exercises are cleared using the `nbtutor` notebook extension: 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Data manipulation, analysis and visualisation in Python 2 | 3 | ## Introduction 4 | 5 | The handling of data is a recurring task for data analysts. Reading in experimental data, checking its properties, 6 | and creating visualisations are crucial steps in the research process. Hence, increasing the efficiency in this process is beneficial for professionals 7 | handling data. Spreadsheet-based software lacks the ability to properly support this process, due to the lack of automation and repeatability. 8 | The usage of a high-level scripting language such as Python is ideal for these tasks. 9 | 10 | This course trains participants to use Python effectively to do these tasks. The course focuses on data manipulation and cleaning of tabular data, 11 | explorative analysis and visualisation using important packages such as Pandas, Matplotlib and Seaborn. 12 | 13 | The course does not cover statistics, data mining, machine learning, or predictive modelling. It aims to provide participants the means to effectively 14 | tackle commonly encountered data handling tasks in order to increase the overall efficiency.
These skills are useful both for data cleaning and for 15 | feature engineering. 16 | 17 | The course has been developed as a specialist course for the Doctoral schools of Ghent University, but can be taught to others upon request. 18 | 19 | ## Course info 20 | 21 | ### Aim & scope 22 | 23 | This course is intended for researchers who have at least basic programming skills. A basic (scientific) programming course that is part of 24 | the regular curriculum should suffice. For those who have experience in another programming language (e.g. Matlab, R, ...), following a Python 25 | tutorial prior to the course is advised. 26 | 27 | The course is intended for professionals who wish to enhance their general data manipulation and visualization skills in Python, with a specific 28 | focus on tabular data. The course is NOT intended to be a course on statistics or machine learning. 29 | 30 | ### Program 31 | 32 | After setting up the programming environment with the required packages using the conda package manager and an introduction to the Jupyter 33 | notebook environment, the data analysis package Pandas and the plotting packages Matplotlib and Seaborn are introduced. Advanced usage of Pandas 34 | for different data cleaning and manipulation tasks is taught and the acquired skills will immediately be brought into practice to handle real-world 35 | data sets. Applications include time series handling, categorical data, merging data, tidy data,... 36 | 37 | The course closes with a discussion on the scientific Python ecosystem and the visualisation landscape, teaching 38 | participants to create interactive charts. 39 | 40 | ## Getting started 41 | 42 | The course uses Python 3 and some data analysis packages such as Pandas, Seaborn, Numpy and Matplotlib. To install the required libraries, 43 | we recommend Anaconda or miniconda ([https://www.anaconda.com/download/](https://www.anaconda.com/download/)) or another Python distribution that 44 | includes the scientific libraries (this recommendation applies to all platforms, so for Windows, Linux and Mac). 45 | 46 | For detailed instructions to get started on your local machine, see the [setup instructions](./setup.html). 47 | 48 | In case you do not want to install everything and just want to try out the course material, use the environment set up by 49 | Binder [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/jorisvandenbossche/DS-python-data-analysis/HEAD) and open the notebooks 50 | right away (inside the `notebooks` directory). 51 | 52 | ## Slides 53 | 54 | For the course slides, click [here](https://jorisvandenbossche.github.io/DS-python-data-analysis/slides.html). 55 | 56 | ## Contributing 57 | 58 | Found a typo or have a suggestion? See [how to contribute](./contributing.html). 59 | 60 | ## Meta 61 | 62 | Authors: Joris Van den Bossche, Stijn Van Hoey 63 | 64 | With the support of the Flemish Government. 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | # Course setup 6 | 7 | To get started, you should have the following elements set up: 8 | 9 | 1. Download the course material to your computer 10 | 2. Install Python and the required Python packages using `conda` 11 | 3. Test your configuration and installation 12 | 4.
Start Jupyter lab 13 | 14 | In the following sections, more details are provided for each of these steps. When the first three are done, you are ready to start coding! 15 | 16 | ## 1. Getting the course materials 17 | 18 | ### Option 1: You are already a git user 19 | 20 | As the course has been set up as a [git](https://git-scm.com/) repository managed on [Github](https://github.com/jorisvandenbossche/DS-python-data-analysis), 21 | you can clone the entire course to your local machine. Use the command line to clone the repository and go into the course folder: 22 | 23 | ``` 24 | git clone https://github.com/jorisvandenbossche/DS-python-data-analysis.git 25 | cd DS-python-data-analysis 26 | ``` 27 | 28 | In case you would prefer using Github Desktop, 29 | see [this tutorial](https://help.github.com/desktop/guides/contributing-to-projects/cloning-a-repository-from-github-to-github-desktop/). 30 | 31 | ### Option 2: You are not a git user 32 | 33 | To download the repository to your local machine as a zip-file, click the `download ZIP` button on the 34 | repository page (green button "Code"): 35 | 36 | ![Download button](./static/img/download-button.png) 37 | 38 | After the download, unzip it at the location you prefer within your user account (e.g. `My Documents`, not `C:\`). Watch out for a nested 'DS-python-data-analysis/DS-python-data-analysis' folder structure after unzipping and move the inner DS-python-data-analysis folder to your preferred location. 39 | 40 | __Note:__ Make sure you know where you stored the course material, e.g. `C:/Users/yourusername/Documents/DS-python-data-analysis`. 41 | 42 | ## 2. Install Python and the required Python packages using `conda` 43 | 44 | For scientific and data analysis, we recommend using `conda`, a command line tool for package and environment management (). 45 | `conda` allows us to install a Python distribution with the scientific libraries we will use in this course (this recommendation applies to all platforms, so for both Windows, Linux and Mac). 46 | 47 | ### 2.1 Install `conda` 48 | 49 | #### Option 1: I do not have `conda` installed 50 | 51 | We recommend using the installer provided by the conda-forge community: . 52 | 53 | Follow the instructions on that page, i.e. first download the appropriate installer (depending on your operating system), and then run that installer. 54 | 55 | On Windows, this will mean double-clicking the downloaded `.exe` file, and following the instructions. During installation, choose the options (click checkbox): 56 | 57 | - '_Register Miniforge3 as my default Python 3.12_' (in case this returns an error about an existing Python 3.12 installation, remove the existing Python installation using [windows Control Panel](https://support.microsoft.com/en-us/windows/uninstall-or-remove-apps-and-programs-in-windows-4b55f974-2cc6-2d2b-d092-5905080eaf98)). 58 | - '_Clear the package cache upon completion_'. 59 | 60 | On MacOS or Linux, you have to open a terminal, and run `bash Miniforge3-$(uname)-$(uname -m).sh` 61 | 62 | #### Option 2: I already have `conda`, Anaconda or Miniconda installed 63 | 64 | When you already have an installation of `conda` or Anaconda, you have to make sure you are working with a recent version. If you installed it only a 65 | few months ago, this step is probably not needed, otherwise follow the next steps: 66 | 67 | 1. Open a terminal window (on Windows, use the dedicated "Anaconda Prompt" or "Miniforge Prompt", via Start Menu) 68 | 2.
Run `conda update conda`, by typing that command, hit the ENTER-button 69 | (make sure you have an internet connection), and respond with *Yes* by typing `y`. 70 | 3. Run `conda config --add channels conda-forge`, by typing that command, hit the ENTER-button 71 | 4. Run `conda config --set channel_priority strict`, by typing that command, hit the ENTER-button 72 | 73 | If you are using Anaconda on Windows, replace each time "Miniforge Prompt" by "Anaconda Prompt" in the following sections. 74 | 75 | ### 2.2 Setup after `conda` installation 76 | 77 | Now we will use `conda` to install the Python packages we are going to use 78 | throughout this course. 79 | As a good practice, we will create a new _conda environment_ to work with. 80 | 81 | The packages used in the course are listed in 82 | an [`environment.yml` file](https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/main/environment.yml). The file looks as follows: 83 | 84 | ``` 85 | name: DS-python 86 | channels: 87 | - conda-forge 88 | dependencies: 89 | - python=3.12 90 | - geopandas 91 | - ... 92 | ``` 93 | 94 | The file contains information on: 95 | - `name` is the name used for the environment 96 | - `channels` to define where to download the packages from 97 | - `dependencies` contains each of the packages 98 | 99 | The environment.yml file for this course is included in the course material you 100 | downloaded. 101 | 102 | Now we can create the environment: 103 | 104 | 1. Open the terminal window (on Windows use "Miniforge Prompt", open it via Start Menu > 'Miniforge Prompt') 105 | 2. Navigate to the directory where you downloaded the course materials (that directory should contain an `environment.yml` file; double-check in your file explorer): 106 | 107 | ``` 108 | cd FOLDER_PATH_TO_COURSE_MATERIAL 109 | ``` 110 | (Make sure to hit the ENTER-button to run the command) 111 | 112 | 3. Create the environment by typing the following command + hitting the ENTER-button (make sure you have an internet connection): 113 | 114 | ``` 115 | conda env create -f environment.yml 116 | ``` 117 | 118 | __!__ `FOLDER_PATH_TO_COURSE_MATERIAL` should be replaced by the path to the folder containing the downloaded course materials (e.g. in the example it is `C:/Users/yourusername/Documents/DS-python-data-analysis`) 119 | 120 | __!__ You can safely ignore the warning `FutureWarning: 'remote_definition'...`. 121 | 122 | Respond with *Yes* by typing `y` when asked. Output will be printed and if no error occurs, you should have the environment configured with all packages installed. 123 | 124 | When finished, keep the terminal window (or "Miniforge Prompt") open (or reopen it). Execute the following commands to check your installation: 125 | 126 | ``` 127 | conda activate DS-python 128 | ipython 129 | ``` 130 | 131 | Within the terminal, a Python session will be started in which you can start writing Python! Type the following commands: 132 | 133 | ``` 134 | import pandas 135 | import matplotlib 136 | ``` 137 | 138 | If no message is returned, you're all set! If a message (probably an error) is returned, contact the instructors and copy-paste the returned message. 139 | 140 | To get out of the Python session, type: 141 | 142 | ``` 143 | quit 144 | ``` 145 | 146 | ## 3.
To get out of the Python session, type:

```
quit
```

## 3. Test your configuration

To check that your packages are properly installed, open the terminal (or "Miniforge Prompt") again (see above) and navigate to the course directory:

```
cd FOLDER_PATH_TO_COURSE_MATERIAL
```

with `FOLDER_PATH_TO_COURSE_MATERIAL` replaced by the path to the folder with the downloaded course material (e.g. in the example it is `C:/Users/yourusername/Documents/DS-python-data-analysis`).

Activate the newly created conda environment:

```
conda activate DS-python
```

Then, run the `check_environment.py` script:

```
python check_environment.py
```

When all checkmarks are ok, you're ready to go!

## 4. (_start of day during course_) Starting Jupyter Lab

Each of the course modules is set up as a [Jupyter notebook](http://jupyter.org/), an interactive environment to write and run code. It is no problem if you have never used Jupyter notebooks before, as an introduction to notebooks is part of the course.

* In the terminal (or "Miniforge Prompt"), navigate to the `DS-python-data-analysis` directory (downloaded or cloned in the previous section):

```
cd FOLDER_PATH_TO_COURSE_MATERIAL
```

* Ensure that the correct environment is activated:

```
conda activate DS-python
```

* Start a Jupyter Lab server by typing:

```
jupyter lab
```

This will open a browser window automatically.

## Next?

Navigate to the course directory (if not already there) and choose the `notebooks` folder to access the individual notebooks containing the course material.
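At the end of a working session, you can stop the Jupyter Lab server by pressing `Ctrl+C` in the terminal that runs it (and confirming), and, optionally, deactivate the course environment again:

```
conda deactivate
```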
-------------------------------------------------------------------------------- /docs/static/img/datacleaning1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/datacleaning1.jpg -------------------------------------------------------------------------------- /docs/static/img/datacleaning2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/datacleaning2.jpg -------------------------------------------------------------------------------- /docs/static/img/dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/dataframe.png -------------------------------------------------------------------------------- /docs/static/img/doctoralschoolsprofiel_hq_rgb_web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/doctoralschoolsprofiel_hq_rgb_web.png -------------------------------------------------------------------------------- /docs/static/img/download-button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/download-button.png -------------------------------------------------------------------------------- /docs/static/img/environment_save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/environment_save.png -------------------------------------------------------------------------------- /docs/static/img/icon_github.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/static/img/icon_twitter.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/static/img/ipython.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/ipython.png -------------------------------------------------------------------------------- /docs/static/img/issuetracker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/issuetracker.png -------------------------------------------------------------------------------- /docs/static/img/logo_flanders+richtingmorgen.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/logo_flanders+richtingmorgen.png -------------------------------------------------------------------------------- /docs/static/img/navigator_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/navigator_notebook.png -------------------------------------------------------------------------------- /docs/static/img/navigator_terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/navigator_terminal.png -------------------------------------------------------------------------------- /docs/static/img/notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/notebook.png -------------------------------------------------------------------------------- /docs/static/img/startup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/startup.png -------------------------------------------------------------------------------- /docs/static/img/tidy_data_paper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/tidy_data_paper.png -------------------------------------------------------------------------------- /docs/static/img/tidy_data_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/tidy_data_scheme.png -------------------------------------------------------------------------------- /docs/static/img/work_stijn_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/docs/static/img/work_stijn_1.png -------------------------------------------------------------------------------- /docs/static/slides.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: -apple-system, BlinkMacSystemFont, avenir next, avenir, segoe ui, helvetica neue, helvetica, Cantarell, Ubuntu, roboto, noto, arial, sans-serif; 3 | font-weight: normal; 4 | } 5 | 6 | h1, h2, h3, h4, h5, h6 { 7 | font-weight: 300; 8 | margin-top: 5px; 9 | margin-bottom: 10px; 10 | } 11 | h1 { 12 | margin-top: 0.5em; 13 | } 14 | h2 { 15 | font-size: 140%; 16 | line-height: 150%; 17 | } 18 | h3 { 19 | font-size: 120%; 20 | line-height: 140%; 21 | } 22 | h2, h3, h4, h5, h6 { 23 | font-weight: normal; 24 | } 25 | 26 | li { 27 | font-size: 120%; 28 | line-height: 130%; 29 | } 30 | 31 | p { 32 | font-size: 100%; 33 | line-height: 120%; 34 | } 35 | 36 | table { 37 | margin: 0 auto 0.8em; 38 | border-collapse: 
collapse; 39 | } 40 | td, th { 41 | border: 1px solid #ddd; 42 | padding: 0.3em 0.5em; 43 | } 44 | 45 | .bgheader h1 { 46 | background-color: rgba(0, 0, 0, 0.9); 47 | opacity: 50%; 48 | padding: 0.5em; 49 | color: white; 50 | border-radius: .5em; 51 | } 52 | 53 | .section_background { 54 | background-color:#c2c444; 55 | color: #fff; 56 | font-weight: normal; 57 | } 58 | 59 | .middlebelowheader { 60 | /* This fixed size height was found to work well with the slide 61 | scaling mechanism of remark.js: 62 | */ 63 | height: 500px; 64 | display: table-cell; 65 | vertical-align: middle; 66 | } 67 | 68 | .hidden { 69 | visibility: hidden; 70 | } 71 | 72 | .small { 73 | font-size: 90%; 74 | } 75 | 76 | a:visited { 77 | color: #356196; 78 | } 79 | 80 | a:link { 81 | color: #356196; 82 | } 83 | 84 | .footnote { 85 | color: #808080; 86 | background-color: rgba(255, 255, 255, 0.9); 87 | font-size: 60%; 88 | position: absolute; 89 | bottom: 30px; 90 | left: 20px; 91 | text-align: left; 92 | line-height: 100%; 93 | padding: 5px; 94 | } 95 | 96 | .remark-slide-content { 97 | background-size: contain; 98 | } 99 | 100 | .emphasize { 101 | color: rgba(100, 100, 100, 0.95); 102 | font-size: 150%; 103 | line-height: 120%; 104 | } 105 | 106 | .widthlimit { 107 | width: 600px; 108 | font-size: 200%; 109 | } 110 | 111 | .fadetext { 112 | opacity: 0.3; 113 | } 114 | 115 | blockquote { 116 | padding: 0px 20px; 117 | font-weight: lighter; 118 | border-left: 5px solid #eee; 119 | } 120 | 121 | code { 122 | font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; 123 | background-color: #f5f5f5; 124 | border: 1px solid #ccc; 125 | border-radius: 4px; 126 | font-size: 0.95em; 127 | } 128 | 129 | .remark-code { 130 | font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace; 131 | display: block; 132 | padding: 9.5px; 133 | margin: 0 0 10px; 134 | font-size: 13px; 135 | line-height: 1.42857143; 136 | color: #333; 137 | word-break: break-all; 138 | word-wrap: break-word; 139 | background-color: #f5f5f5; 140 | border: 1px solid #ccc; 141 | border-radius: 4px; 142 | } 143 | 144 | 145 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: DS-python 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.12 6 | - ipython 7 | - jupyter 8 | - jupyterlab>=3 9 | - numpy 10 | - pandas=2.2 11 | - matplotlib>3 12 | - mplleaflet 13 | - ipympl 14 | - seaborn 15 | - plotnine 16 | - pyproj 17 | - requests 18 | - openpyxl 19 | - geopandas 20 | - pyarrow 21 | -------------------------------------------------------------------------------- /img/bacteriophage.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/bacteriophage.jpeg -------------------------------------------------------------------------------- /img/bike_count_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/bike_count_illustration.png -------------------------------------------------------------------------------- /img/change_kernel.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/change_kernel.png -------------------------------------------------------------------------------- /img/doctoralschoolsprofiel_hq_rgb_web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/doctoralschoolsprofiel_hq_rgb_web.png -------------------------------------------------------------------------------- /img/enterbutton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/enterbutton.png -------------------------------------------------------------------------------- /img/heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/heatmap.png -------------------------------------------------------------------------------- /img/keya.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/keya.png -------------------------------------------------------------------------------- /img/keyb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/keyb.png -------------------------------------------------------------------------------- /img/keyescape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/keyescape.png -------------------------------------------------------------------------------- /img/logo_flanders+richtingmorgen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/logo_flanders+richtingmorgen.png -------------------------------------------------------------------------------- /img/matplotlib_fundamentals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/matplotlib_fundamentals.png -------------------------------------------------------------------------------- /img/matplotlib_oo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/matplotlib_oo.png -------------------------------------------------------------------------------- /img/pandas.svg: -------------------------------------------------------------------------------- 1 | Artboard 63 -------------------------------------------------------------------------------- /img/pandas/pivot_excel.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/pandas/pivot_excel.png -------------------------------------------------------------------------------- /img/pandas/splitApplyCombine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/pandas/splitApplyCombine.png -------------------------------------------------------------------------------- /img/plot_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/plot_overview.png -------------------------------------------------------------------------------- /img/python-function.svg: -------------------------------------------------------------------------------- [SVG figure: annotated anatomy of a Python function, `def fahr_to_celsius(temp): return ((temp - 32) * (5/9))`, with labels "def statement", "name", "parameter names", "body", "return statement" and "return value"] -------------------------------------------------------------------------------- /img/python-sticky-note-variables-01.svg: -------------------------------------------------------------------------------- [SVG figure: sticky-note diagram of a variable, the value 65.0 labelled `weight_kg`] -------------------------------------------------------------------------------- /img/python-sticky-note-variables-02.svg: -------------------------------------------------------------------------------- [SVG figure: sticky-note diagram of two variables, 65.0 labelled `weight_kg` and 143.0 labelled `weight_lb`] -------------------------------------------------------------------------------- /img/python-sticky-note-variables-03.svg: -------------------------------------------------------------------------------- [SVG figure: sticky-note diagram of two variables, 100.0 labelled `weight_kg` and 143.0 labelled `weight_lb`] -------------------------------------------------------------------------------- /img/seaborn_overview_modules.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/seaborn_overview_modules.png -------------------------------------------------------------------------------- /img/shift-tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/shift-tab.png -------------------------------------------------------------------------------- /img/shift_button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/shift_button.png -------------------------------------------------------------------------------- /img/shiftenter.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/shiftenter.jpg -------------------------------------------------------------------------------- /img/stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/stack.png -------------------------------------------------------------------------------- /img/tabbutton.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/tabbutton.jpg -------------------------------------------------------------------------------- /img/tidy_data_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/tidy_data_scheme.png -------------------------------------------------------------------------------- /img/toomuch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/img/toomuch.jpg -------------------------------------------------------------------------------- /nbconvert_config.py: -------------------------------------------------------------------------------- 1 | c.Exporter.preprocessors = ['nbtutor.ClearExercisePreprocessor', 'nbconvert.preprocessors.ClearOutputPreprocessor'] 2 | -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count1.py: -------------------------------------------------------------------------------- 1 | df = pd.read_csv("data/fietstelpaal-coupure-links-2022-gent.zip", sep=';') -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count10.py: -------------------------------------------------------------------------------- 1 | def process_bike_count_data(df): 2 | """Process the provided dataframe: parse datetimes and rename columns. 3 | 4 | Parameters 5 | ---------- 6 | df : pandas.DataFrame 7 | DataFrame as read from the raw `fietstellingen`, 8 | containing the 'Datum', 'Uur5Minuten', 9 | 'Ordening', 'Totaal', 'Tegenrichting', 'Hoofdrichting' columns. 10 | 11 | Returns 12 | ------- 13 | df2 : pandas.DataFrame 14 | DataFrame with the datetime info as index and the 15 | `direction_centre` and `direction_mariakerke` columns 16 | with the counts. 
17 | """ 18 | timestamps = pd.to_datetime(df["Ordening"], format="%Y-%m-%dT%H:%M:%S%z", utc=True) 19 | df2 = df.drop(columns=['Datum', 'Uur5Minuten', 'Ordening', 'Code']) 20 | df2["timestamp"] = timestamps 21 | df2 = df2.set_index("timestamp") 22 | df2 = df2.rename(columns={'Tegenrichting': 'direction_centre', 23 | 'Hoofdrichting': 'direction_mariakerke', 24 | 'Totaal': 'total', 25 | 'Locatie': 'location' 26 | }) 27 | return df2 -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count11.py: -------------------------------------------------------------------------------- 1 | df_both = df.sum(axis=1) 2 | df_both -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count12.py: -------------------------------------------------------------------------------- 1 | df_quiet = df_both[df_both < 5] 2 | df_quiet -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count13.py: -------------------------------------------------------------------------------- 1 | df[(df['direction_centre'] < 3) | (df['direction_mariakerke'] < 3)] -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count14.py: -------------------------------------------------------------------------------- 1 | df.mean() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count15.py: -------------------------------------------------------------------------------- 1 | df.resample('h').sum().mean() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count16.py: -------------------------------------------------------------------------------- 1 | df['direction_centre'].nlargest(10) 2 | # alternative: 3 | # df['direction_centre'].sort_values(ascending=False).head(10) -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count17.py: -------------------------------------------------------------------------------- 1 | df_both = df.sum(axis=1) -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count18.py: -------------------------------------------------------------------------------- 1 | df_daily = df_both.resample('D').sum() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count19.py: -------------------------------------------------------------------------------- 1 | df_daily.max() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count2.py: -------------------------------------------------------------------------------- 1 | df.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count20.py: -------------------------------------------------------------------------------- 1 | df_daily.nlargest(10) -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count21.py: -------------------------------------------------------------------------------- 1 | df_monthly = df.resample('ME').sum() 2 | df_monthly.plot() -------------------------------------------------------------------------------- 
/notebooks/_solutions/case1_bike_count22.py: -------------------------------------------------------------------------------- 1 | df_hourly = df.resample('h').sum() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count23.py: -------------------------------------------------------------------------------- 1 | df_hourly.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count24.py: -------------------------------------------------------------------------------- 1 | df_hourly['2023-01-01':'2023-01-21'].plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count25.py: -------------------------------------------------------------------------------- 1 | newyear = df["2022-12-31 12:00:00": "2023-01-01 12:00:00"] -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count26.py: -------------------------------------------------------------------------------- 1 | newyear.plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count27.py: -------------------------------------------------------------------------------- 1 | newyear.rolling(10, center=True).mean().plot(linewidth=2) -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count28.py: -------------------------------------------------------------------------------- 1 | # A more detailed version of the plot. 2 | fig, ax = plt.subplots() 3 | newyear.plot(ax=ax, color=['LightGreen', 'LightBlue'], legend=False, rot=0) 4 | newyear.rolling(10, center=True).mean().plot(linewidth=2, ax=ax, color=['DarkGreen', 'DarkBlue'], rot=0) 5 | 6 | ax.set_xlabel('') 7 | ax.set_ylabel('Cyclists count') -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count3.py: -------------------------------------------------------------------------------- 1 | df.tail() -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count4.py: -------------------------------------------------------------------------------- 1 | len(df) -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count5.py: -------------------------------------------------------------------------------- 1 | df.dtypes -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count6.py: -------------------------------------------------------------------------------- 1 | df["timestamp"] = pd.to_datetime(df["Ordening"], format="%Y-%m-%dT%H:%M:%S%z", utc=True) -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count7.py: -------------------------------------------------------------------------------- 1 | df = df.set_index("timestamp") -------------------------------------------------------------------------------- /notebooks/_solutions/case1_bike_count8.py: -------------------------------------------------------------------------------- 1 | df2022 = df.drop(columns=['Datum', 'Uur5Minuten', 'Ordening', 'Code']) --------------------------------------------------------------------------------
/notebooks/_solutions/case1_bike_count9.py: -------------------------------------------------------------------------------- 1 | df2022 = df2022.rename(columns={'Tegenrichting': 'direction_centre', 2 | 'Hoofdrichting': 'direction_mariakerke', 3 | 'Totaal': 'total', 4 | 'Locatie': 'location'}) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations1.py: -------------------------------------------------------------------------------- 1 | observations = pd.read_csv("data/observations.csv", index_col="occurrenceID") -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations10.py: -------------------------------------------------------------------------------- 1 | observations.duplicated().sum() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations11.py: -------------------------------------------------------------------------------- 1 | duplicate_observations = observations[observations.duplicated(keep=False)] 2 | duplicate_observations.sort_values(["eventDate", "verbatimLocality"]).head(9) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations12.py: -------------------------------------------------------------------------------- 1 | observations_unique = observations.drop_duplicates() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations13.py: -------------------------------------------------------------------------------- 1 | len(observations_unique) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations14.py: -------------------------------------------------------------------------------- 1 | len(observations_unique.dropna()) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations15.py: -------------------------------------------------------------------------------- 1 | len(observations_unique.dropna(subset=['species_ID'])) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations16.py: -------------------------------------------------------------------------------- 1 | observations_with_ID = observations_unique.dropna(subset=['species_ID']) 2 | observations_with_ID.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations17.py: -------------------------------------------------------------------------------- 1 | mask = observations['species_ID'].isna() & observations['sex'].notna() 2 | not_identified = observations[mask] -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations18.py: -------------------------------------------------------------------------------- 1 | not_identified.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations19.py: -------------------------------------------------------------------------------- 1 | observations.groupby("name").size().nlargest(8) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations2.py: 
-------------------------------------------------------------------------------- 1 | observations.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations20.py: -------------------------------------------------------------------------------- 1 | observations['name'].value_counts().iloc[:8] -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations21.py: -------------------------------------------------------------------------------- 1 | n_species_per_plot = observations.groupby(["verbatimLocality"])["name"].nunique() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations22.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots(figsize=(6, 6)) 2 | n_species_per_plot.plot(kind="barh", ax=ax) 3 | ax.set_ylabel("Plot number"); -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations23.py: -------------------------------------------------------------------------------- 1 | ## Alternative option to calculate the species per plot: 2 | ## inspired by the pivot table we already had: 3 | #species_per_plot = observations.reset_index().pivot_table( 4 | # index="species_ID", columns="verbatimLocality", values="occurrenceID", aggfunc='count') 5 | #n_species_per_plot = species_per_plot.count() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations24.py: -------------------------------------------------------------------------------- 1 | n_plots_per_species = observations.groupby(["name"])["verbatimLocality"].nunique().sort_values() 2 | 3 | fig, ax = plt.subplots(figsize=(10, 8)) 4 | n_plots_per_species.plot(kind="barh", ax=ax) 5 | ax.set_xlabel("Number of plots"); 6 | ax.set_ylabel(""); -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations25.py: -------------------------------------------------------------------------------- 1 | n_plot_sex = observations.groupby(["sex", "verbatimLocality"]).size().rename("count").reset_index() 2 | n_plot_sex.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations26.py: -------------------------------------------------------------------------------- 1 | pivoted = n_plot_sex.pivot(columns="sex", index="verbatimLocality", values="count") 2 | pivoted.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations27.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=observations, x="verbatimLocality", 2 | hue="sex", kind="count", height=3, aspect=3) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations28.py: -------------------------------------------------------------------------------- 1 | heatmap_prep = observations.pivot_table(index='year', columns='month', 2 | values="species_ID", aggfunc='count') 3 | fig, ax = plt.subplots(figsize=(10, 8)) 4 | ax = sns.heatmap(heatmap_prep, cmap='Reds') -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations29.py:
-------------------------------------------------------------------------------- 1 | survey_data = pd.merge(observations_data, species_names, how="left", 2 | left_on="species_ID", right_on="ID") 3 | survey_data -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations3.py: -------------------------------------------------------------------------------- 1 | observations.info() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations30.py: -------------------------------------------------------------------------------- 1 | non_rodent_species = survey_data[survey_data['taxa'].isin(['Rabbit', 'Bird', 'Reptile'])] -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations31.py: -------------------------------------------------------------------------------- 1 | len(non_rodent_species) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations32.py: -------------------------------------------------------------------------------- 1 | r_species = survey_data[survey_data['name'].str.lower().str.startswith('r')] -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations33.py: -------------------------------------------------------------------------------- 1 | len(r_species) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations34.py: -------------------------------------------------------------------------------- 1 | non_bird_species = survey_data[survey_data['taxa'] != 'Bird'] -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations35.py: -------------------------------------------------------------------------------- 1 | birds_85_89 = survey_data[(survey_data["eventDate"] >= "1985-01-01") 2 | & (survey_data["eventDate"] <= "1989-12-31 23:59") 3 | & (survey_data['taxa'] == 'Bird')] 4 | birds_85_89.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations36.py: -------------------------------------------------------------------------------- 1 | # alternative solution 2 | birds_85_89 = survey_data[(survey_data["eventDate"].dt.year >= 1985) 3 | & (survey_data["eventDate"].dt.year <= 1989) 4 | & (survey_data['taxa'] == 'Bird')] 5 | birds_85_89.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations37.py: -------------------------------------------------------------------------------- 1 | # Multiple lines 2 | obs_with_weight = survey_data.dropna(subset=["weight"]) 3 | median_weight = obs_with_weight.groupby(['name'])["weight"].median() 4 | median_weight.sort_values(ascending=False) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations38.py: -------------------------------------------------------------------------------- 1 | # Single line statement 2 | survey_data.dropna(subset=["weight"]).groupby(['name'])["weight"].median().sort_values(ascending=False) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations39.py: 
-------------------------------------------------------------------------------- 1 | species_per_plot = survey_data.reset_index().pivot_table(index="name", 2 | columns="verbatimLocality", 3 | values="ID", 4 | aggfunc='count') 5 | species_per_plot.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations4.py: -------------------------------------------------------------------------------- 1 | observations["eventDate"] = pd.to_datetime(observations[["year", "month", "day"]]) 2 | observations -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations40.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots(figsize=(8,8)) 2 | sns.heatmap(species_per_plot, ax=ax, cmap='Greens') -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations41.py: -------------------------------------------------------------------------------- 1 | survey_data.resample('YE', on='eventDate').size().plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations42.py: -------------------------------------------------------------------------------- 1 | merriami = survey_data[survey_data["name"] == "Dipodomys merriami"] -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations43.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots() 2 | merriami.groupby(merriami['eventDate'].dt.month).size().plot(kind="barh", ax=ax) 3 | ax.set_xlabel("number of occurrences") 4 | ax.set_ylabel("Month of the year") -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations44.py: -------------------------------------------------------------------------------- 1 | subsetspecies = survey_data[survey_data["name"].isin(['Dipodomys merriami', 'Dipodomys ordii', 2 | 'Reithrodontomys megalotis', 'Chaetodipus baileyi'])] -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations45.py: -------------------------------------------------------------------------------- 1 | month_evolution = subsetspecies.groupby("name").resample('ME', on='eventDate').size() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations46.py: -------------------------------------------------------------------------------- 1 | species_evolution = month_evolution.unstack(level=0) 2 | axs = species_evolution.plot(subplots=True, figsize=(14, 8), sharey=True) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations47.py: -------------------------------------------------------------------------------- 1 | sns.relplot(data=month_evolution, x='eventDate', y="counts", 2 | row="name", kind="line", hue="name", height=2, aspect=5) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations48.py: -------------------------------------------------------------------------------- 1 | year_evolution = survey_data.groupby("taxa").resample('YE', on='eventDate').size() 2 | year_evolution.name = "counts" 3 | year_evolution = year_evolution.reset_index() 
-------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations49.py: -------------------------------------------------------------------------------- 1 | year_evolution.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations5.py: -------------------------------------------------------------------------------- 1 | observations["datasetName"] = "Ecological Archives E090-118-D1." -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations50.py: -------------------------------------------------------------------------------- 1 | sns.relplot(data=year_evolution, x='eventDate', y="counts", 2 | col="taxa", col_wrap=2, kind="line", height=2, aspect=5, 3 | facet_kws={"sharey": False}) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations51.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots() 2 | survey_data.groupby(survey_data["eventDate"].dt.weekday).size().plot(kind='barh', color='#66b266', ax=ax) 3 | 4 | import calendar 5 | xticks = ax.set_yticklabels(calendar.day_name) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations6.py: -------------------------------------------------------------------------------- 1 | sex_dict = {"M": "male", 2 | "F": "female", 3 | "R": "male", 4 | "P": "female", 5 | "Z": np.nan} -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations7.py: -------------------------------------------------------------------------------- 1 | observations['sex'] = observations['verbatimSex'].replace(sex_dict) -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations8.py: -------------------------------------------------------------------------------- 1 | observations["sex"].unique() -------------------------------------------------------------------------------- /notebooks/_solutions/case2_observations9.py: -------------------------------------------------------------------------------- 1 | observations['species_ID'].isna().sum() -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment1.py: -------------------------------------------------------------------------------- 1 | tidy_experiment = main_experiment.melt(id_vars=['Bacterial_genotype', 'Phage_t', 'experiment_ID'], 2 | value_vars=['OD_0h', 'OD_20h', 'OD_72h'], 3 | var_name='experiment_time_h', 4 | value_name='optical_density', ) 5 | tidy_experiment -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment10.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=falcor, kind="point", 2 | x='Bacterial_genotype', 3 | y='log10 Mc', 4 | row="Phage", 5 | linestyle="none", 6 | errorbar=None, 7 | row_order=["Lambda", "T4", "T7"], 8 | order=['WT', 'MUT', 'D87G', 'S83L', 'D516G', 'S512F', 'K43N', 'K88R', 'RSF1010', 'RP4'], 9 | aspect=3, height=3, 10 | color="black") -------------------------------------------------------------------------------- 
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment11.py: -------------------------------------------------------------------------------- 1 | falcor["Bacterial_genotype"] = falcor["Bacterial_genotype"].replace({'WT(2)': 'WT', 2 | 'MUT(2)': 'MUT'}) -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment12.py: -------------------------------------------------------------------------------- 1 | def errorbar(x, y, low, high, **kws): 2 | """Utility function to link falcor data representation with the errorbar representation""" 3 | plt.errorbar(x, y, (y - low, high - y), capsize=3, fmt="o", color="black", ms=4) -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment13.py: -------------------------------------------------------------------------------- 1 | sns.set_style("ticks") 2 | g = sns.FacetGrid(data=falcor, row="Phage", aspect=3, height=3) 3 | g.map(errorbar, 4 | "Bacterial_genotype", "log10 Mc", 5 | "log10 LBc", "log10 UBc") -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment2.py: -------------------------------------------------------------------------------- 1 | sns.set_style("white") 2 | histplot = sns.displot(data=tidy_experiment, x="optical_density", 3 | color='grey', edgecolor='white') 4 | 5 | histplot.fig.suptitle("Optical density distribution") 6 | histplot.axes[0][0].set_ylabel("Frequency"); -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment3.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=tidy_experiment, x="experiment_time_h", 2 | y="optical_density", kind="violin") -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment4.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=tidy_experiment, x="experiment_time_h", y="optical_density", 2 | col="Phage_t", col_wrap=2, kind="violin") -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment5.py: -------------------------------------------------------------------------------- 1 | pd.pivot_table(tidy_experiment, values='optical_density', 2 | index='Bacterial_genotype', 3 | columns='experiment_time_h', 4 | aggfunc='mean') -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment6.py: -------------------------------------------------------------------------------- 1 | # advanced/optional solution 2 | tidy_experiment.groupby(['Bacterial_genotype', 'experiment_time_h'])['optical_density'].mean().unstack() -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment7.py: -------------------------------------------------------------------------------- 1 | density_mean = (tidy_experiment 2 | .groupby(['Bacterial_genotype','Phage_t', 'experiment_time_h'])['optical_density'] 3 | .mean().reset_index()) -------------------------------------------------------------------------------- 
/notebooks/_solutions/case3_bacterial_resistance_lab_experiment8.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=density_mean, kind="bar", 2 | x='Bacterial_genotype', 3 | y='optical_density', 4 | hue='Phage_t', 5 | row="experiment_time_h", 6 | sharey=False, 7 | aspect=3, height=3, 8 | palette="colorblind") -------------------------------------------------------------------------------- /notebooks/_solutions/case3_bacterial_resistance_lab_experiment9.py: -------------------------------------------------------------------------------- 1 | falcor["Bacterial_genotype"] = falcor["Bacterial_genotype"].replace({'WT(2)': 'WT', 2 | 'MUT(2)': 'MUT'}) 3 | falcor.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis1.py: -------------------------------------------------------------------------------- 1 | data_tidy = data.reset_index().melt(id_vars=["datetime"], var_name='station', value_name='no2') 2 | data_tidy.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis10.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots() 2 | 3 | data['1999':].resample('YE').mean().plot(ax=ax) 4 | data['1999':].mean(axis=1).resample('YE').mean().plot(color='k', 5 | linestyle='--', 6 | linewidth=4, 7 | ax=ax, 8 | label='Overall mean') 9 | ax.legend(loc='center', ncol=3, 10 | bbox_to_anchor=(0.5, 1.06)) 11 | ax.set_ylabel("NO$_2$ concentration (µg/m³)"); -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis11.py: -------------------------------------------------------------------------------- 1 | # add a column to the dataframe that indicates the month (integer value of 1 to 12): 2 | data['month'] = data.index.month 3 | 4 | # now, we can calculate the mean of each month over the different years: 5 | data.groupby('month').mean() 6 | 7 | # plot the typical monthly profile of the different stations: 8 | data.groupby('month').mean().plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis12.py: -------------------------------------------------------------------------------- 1 | # Resample wise 2 | df2011 = data.loc['2011'] 3 | df2011[['BETN029', 'BETR801']].resample('W').quantile(0.95).plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis13.py: -------------------------------------------------------------------------------- 1 | # Groupby wise 2 | # Note the different x-axis labels 3 | df2011.groupby(df2011.index.isocalendar().week)[['BETN029', 'BETR801']].quantile(0.95).plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis14.py: -------------------------------------------------------------------------------- 1 | data.groupby(data.index.hour).mean().plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis15.py: -------------------------------------------------------------------------------- 1 | data['weekend'] = data.index.dayofweek.isin([5, 6]) 2 | data['weekend'] = data['weekend'].replace({True: 'weekend', False: 'weekday'}) 3 | data['hour'] = 
data.index.hour -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis16.py: -------------------------------------------------------------------------------- 1 | data_weekend = data.groupby(['weekend', 'hour']).mean() 2 | data_weekend.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis17.py: -------------------------------------------------------------------------------- 1 | # using unstack and pandas plotting 2 | data_weekend_BETR801 = data_weekend['BETR801'].unstack(level=0) 3 | data_weekend_BETR801.plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis18.py: -------------------------------------------------------------------------------- 1 | # using a tidy dataset and seaborn 2 | data_weekend_BETR801_tidy = data_weekend['BETR801'].reset_index() 3 | 4 | sns.lineplot(data=data_weekend_BETR801_tidy, x="hour", y="BETR801", hue="weekend") -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis19.py: -------------------------------------------------------------------------------- 1 | # tidy dataset that still includes all stations 2 | 3 | data_weekend_tidy = pd.melt(data_weekend.reset_index(), id_vars=['weekend', 'hour'], 4 | var_name='station', value_name='no2') 5 | data_weekend_tidy.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis2.py: -------------------------------------------------------------------------------- 1 | data_tidy['no2'].isna().sum() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis20.py: -------------------------------------------------------------------------------- 1 | # when still having multiple factors, it becomes useful to convert to tidy dataset and use seaborn 2 | sns.relplot(data=data_weekend_tidy, x="hour", y="no2", kind="line", 3 | hue="weekend", col="station", col_wrap=2) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis21.py: -------------------------------------------------------------------------------- 1 | data[['BETR801', 'BETN029', 'FR04037', 'FR04012']].corr() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis22.py: -------------------------------------------------------------------------------- 1 | exceedances = data > 200 -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis23.py: -------------------------------------------------------------------------------- 1 | # group by year and count exceedances (sum of boolean) 2 | exceedances = exceedances.groupby(exceedances.index.year).sum() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis24.py: -------------------------------------------------------------------------------- 1 | # Make a barplot of the yearly number of exceedances 2 | ax = exceedances.loc[2005:].plot(kind='bar') 3 | ax.axhline(18, color='k', linestyle='--') -------------------------------------------------------------------------------- 
/notebooks/_solutions/case4_air_quality_analysis25.py: -------------------------------------------------------------------------------- 1 | FR_station = data['FR04012'] # select the specific data series 2 | FR_station = FR_station[(FR_station.notnull()) & (FR_station != 0.0)] # exclude the NaN and zero values -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis26.py: -------------------------------------------------------------------------------- 1 | FR_sorted = FR_station.sort_values(ascending=True) 2 | FR_scaled = (FR_sorted - FR_sorted.min())/(FR_sorted.max() - FR_sorted.min()) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis27.py: -------------------------------------------------------------------------------- 1 | fig, axfr = plt.subplots() 2 | FR_scaled.plot(use_index=False, ax=axfr) # alternative version: FR_scaled.reset_index(drop=True).plot(use_index=False) 3 | axfr.set_ylabel('FR04012') 4 | # optional addition, just in case you need this 5 | axfr.axvline(x=FR_scaled.searchsorted(0.3), color='0.6', linestyle='--', linewidth=3) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis28.py: -------------------------------------------------------------------------------- 1 | # Mixing and matching matplotlib and pandas 2 | fig, (ax1, ax2) = plt.subplots(1, 2, 3 | sharex=True, 4 | sharey=True) 5 | 6 | data.loc['2009', ['BETN029', 'BETR801']].plot(kind='hist', subplots=True, 7 | bins=30, legend=False, 8 | ax=(ax1, ax2)) 9 | ax1.set_title('BETN029') 10 | ax2.set_title('BETR801') 11 | # Remark: the width of the bins is calculated over the x data range for both plots together -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis29.py: -------------------------------------------------------------------------------- 1 | # A more step-by-step approach (equally valid) 2 | fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True) 3 | data.loc['2009', 'BETN029'].plot(kind='hist', bins=30, ax=ax1) 4 | ax1.set_title('BETN029') 5 | data.loc['2009', 'BETR801'].plot(kind='hist', bins=30, ax=ax2) 6 | ax2.set_title('BETR801') 7 | # Remark: the width of the bins is calculated over the x data range for each plot individually -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis3.py: -------------------------------------------------------------------------------- 1 | data_tidy = data_tidy.dropna() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis30.py: -------------------------------------------------------------------------------- 1 | subset = data.loc['2009-01'].copy() 2 | subset["dayofweek"] = subset.index.dayofweek 3 | subset = subset[subset['dayofweek'].isin([0, 6])] -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis31.py: -------------------------------------------------------------------------------- 1 | subset["dayofweek"] = subset["dayofweek"].replace(to_replace={0: "Monday", 6: "Sunday"}) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis32.py:
-------------------------------------------------------------------------------- 1 | sns.set_style("whitegrid") -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis33.py: -------------------------------------------------------------------------------- 1 | sns.lmplot( 2 | data=subset, x="BETN029", y="FR04037", hue="dayofweek" 3 | ) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis34.py: -------------------------------------------------------------------------------- 1 | exceedances = data.rolling(8).mean().resample('D').max() > 100 -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis35.py: -------------------------------------------------------------------------------- 1 | exceedances = exceedances.groupby(exceedances.index.year).sum() 2 | ax = exceedances.plot(kind='bar') -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis36.py: -------------------------------------------------------------------------------- 1 | data_daily = data.resample('D').mean() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis37.py: -------------------------------------------------------------------------------- 1 | # add a dayofweek column 2 | data_daily['dayofweek'] = data_daily.index.dayofweek 3 | data_daily.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis38.py: -------------------------------------------------------------------------------- 1 | # seaborn 2 | sns.boxplot(data=data_daily, x='dayofweek', y='BETR801', color="grey") -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis39.py: -------------------------------------------------------------------------------- 1 | # when using pandas to plot, the different boxplots should be different columns 2 | # therefore, pivot the table so that the weekdays become the columns 3 | data_daily['week'] = data_daily.index.isocalendar().week 4 | data_pivoted = data_daily.pivot_table(columns='dayofweek', index='week', 5 | values='BETR801') 6 | data_pivoted.head() 7 | data_pivoted.boxplot(); -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis4.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots() 2 | data.loc['2009':, 'FR04037'].resample('ME').mean().plot(ax=ax, label='mean') 3 | data.loc['2009':, 'FR04037'].resample('ME').median().plot(ax=ax, label='median') 4 | ax.legend(ncol=2) 5 | ax.set_title("FR04037"); -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis40.py: -------------------------------------------------------------------------------- 1 | # An alternative method using `groupby` and `unstack` 2 | data_daily.groupby(['dayofweek', 'week'])['BETR801'].mean().unstack(level=0).boxplot(); -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis5.py: -------------------------------------------------------------------------------- 1 | 
data.loc['2009':, 'FR04037'].resample('ME').agg(['mean', 'median']).plot() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis6.py: -------------------------------------------------------------------------------- 1 | # with wide dataframe 2 | fig, ax = plt.subplots() 3 | sns.violinplot(data=data['2011-01': '2011-08'], color="C0", ax=ax) 4 | ax.set_ylabel("NO$_2$ concentration (µg/m³)") -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis7.py: -------------------------------------------------------------------------------- 1 | # with tidy dataframe 2 | data_tidy_subset = data_tidy[(data_tidy['datetime'] >= "2011-01") & (data_tidy['datetime'] < "2011-09")] 3 | 4 | fig, ax = plt.subplots() 5 | sns.violinplot(data=data_tidy_subset, x="station", y="no2", color="C0", ax=ax) 6 | ax.set_ylabel("NO$_2$ concentration (µg/m³)") -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis8.py: -------------------------------------------------------------------------------- 1 | # with figure-level function 2 | sns.catplot(data=data_tidy_subset, x="station", y="no2", kind="violin") -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_analysis9.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots() 2 | data['2012':].mean().plot(kind='bar', ax=ax, rot=0, color='C0') 3 | ax.set_ylabel("NO$_2$ concentration (µg/m³)") 4 | ax.axhline(y=40., color='darkorange') 5 | ax.text(0.3, 0.48, 'Yearly limit is 40 µg/m³', 6 | horizontalalignment='left', fontsize=13, 7 | transform=ax.transAxes, color='darkorange'); -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing1.py: -------------------------------------------------------------------------------- 1 | data = pd.read_csv("data/BETR8010000800100hour.1-1-1990.31-12-2012", 2 | sep='\t', header=None, names=column_names, na_values=[-999, -9999]) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing10.py: -------------------------------------------------------------------------------- 1 | def read_airbase_file(filename, station): 2 | """ 3 | Read hourly AirBase data files. 4 | 5 | Parameters 6 | ---------- 7 | filename : string 8 | Path to the data file. 9 | station : string 10 | Name of the station. 11 | 12 | Returns 13 | ------- 14 | DataFrame 15 | Processed dataframe. 
16 | """ 17 | 18 | # construct the column names 19 | hours = ["{:02d}".format(i) for i in range(24)] 20 | flags = ['flag' + str(i) for i in range(24)] 21 | colnames = ['date'] + [item for pair in zip(hours, flags) for item in pair] 22 | 23 | # read the actual data 24 | data = pd.read_csv(filename, sep='\t', header=None, na_values=[-999, -9999], names=colnames) 25 | 26 | # drop the 'flag' columns 27 | data = data.drop([col for col in data.columns if 'flag' in col], axis=1) 28 | 29 | # reshape 30 | data_stacked = pd.melt(data, id_vars=['date'], var_name='hour') 31 | 32 | # parse to datetime and remove redundant columns 33 | data_stacked.index = pd.to_datetime(data_stacked['date'] + data_stacked['hour'], format="%Y-%m-%d%H") 34 | data_stacked = data_stacked.drop(['date', 'hour'], axis=1) 35 | data_stacked = data_stacked.rename(columns={'value': station}) 36 | 37 | return data_stacked -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing11.py: -------------------------------------------------------------------------------- 1 | data_folder = Path("./data") 2 | data_files = list(data_folder.glob("*0008001*")) 3 | data_files -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing12.py: -------------------------------------------------------------------------------- 1 | dfs = [] 2 | 3 | for filename in data_files: 4 | station = filename.name[:7] 5 | df = read_airbase_file(filename, station) 6 | dfs.append(df) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing13.py: -------------------------------------------------------------------------------- 1 | combined_data = pd.concat(dfs, axis=1) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing2.py: -------------------------------------------------------------------------------- 1 | data.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing3.py: -------------------------------------------------------------------------------- 1 | data = data.drop(flag_columns, axis=1) -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing4.py: -------------------------------------------------------------------------------- 1 | data_stacked = pd.melt(data, id_vars=['date'], var_name='hour') 2 | data_stacked.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing5.py: -------------------------------------------------------------------------------- 1 | # we use stack to reshape the data to move the hours (the column labels) into a column. 2 | # But we don't want to move the 'date' column label, therefore we first set this as the index. 
3 | # You can check the difference with "data.stack()" 4 | data_stacked = data.set_index('date').stack() 5 | data_stacked.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing6.py: -------------------------------------------------------------------------------- 1 | # We reset the index to have the date and hours available as columns 2 | data_stacked = data_stacked.reset_index() 3 | data_stacked = data_stacked.rename(columns={'level_1': 'hour'}) 4 | data_stacked.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing7.py: -------------------------------------------------------------------------------- 1 | # Now we combine the dates and the hours into a datetime, and set this as the index 2 | data_stacked.index = pd.to_datetime(data_stacked['date'] + data_stacked['hour'], format="%Y-%m-%d%H") -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing8.py: -------------------------------------------------------------------------------- 1 | # Drop the original date and hour columns 2 | data_stacked = data_stacked.drop(['date', 'hour'], axis=1) 3 | data_stacked.head() -------------------------------------------------------------------------------- /notebooks/_solutions/case4_air_quality_processing9.py: -------------------------------------------------------------------------------- 1 | # rename the remaining column to the name of the measurement station 2 | # (this is 0 or 'value' depending on which method was used) 3 | data_stacked = data_stacked.rename(columns={0: 'BETR801'}) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_01_data_structures1.py: -------------------------------------------------------------------------------- 1 | df = pd.read_csv("data/titanic.csv") -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_01_data_structures2.py: -------------------------------------------------------------------------------- 1 | df.head() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_01_data_structures3.py: -------------------------------------------------------------------------------- 1 | len(df) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_01_data_structures4.py: -------------------------------------------------------------------------------- 1 | df['Age'] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_01_data_structures5.py: -------------------------------------------------------------------------------- 1 | df['Fare'].plot.box() # or .plot(kind='box') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_01_data_structures6.py: -------------------------------------------------------------------------------- 1 | df.sort_values(by='Age', ascending=False) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations1.py: -------------------------------------------------------------------------------- 1 | df['Age'].mean() --------------------------------------------------------------------------------
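A side note on the pandas_02 solutions that follow: pandas reductions such as mean(), median() and sum() skip missing values by default (skipna=True), so df['Age'].mean() is computed over the non-missing ages only. A minimal sketch (toy values, hypothetical):

    import pandas as pd

    age = pd.Series([22.0, None, 38.0])
    age.mean()  # 30.0 -- the missing value is skipped, not treated as zero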
/notebooks/_solutions/pandas_02_basic_operations10.py: -------------------------------------------------------------------------------- 1 | np.log(df['Fare']) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations2.py: -------------------------------------------------------------------------------- 1 | df['Age'].plot.hist() # try e.g. bins=30, log=True -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations3.py: -------------------------------------------------------------------------------- 1 | df['Survived'].sum() / len(df['Survived']) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations4.py: -------------------------------------------------------------------------------- 1 | df['Survived'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations5.py: -------------------------------------------------------------------------------- 1 | df['Fare'].max() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations6.py: -------------------------------------------------------------------------------- 1 | df['Fare'].median() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations7.py: -------------------------------------------------------------------------------- 1 | df['Fare'].quantile(0.75) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations8.py: -------------------------------------------------------------------------------- 1 | df['Fare'] / df['Fare'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_02_basic_operations9.py: -------------------------------------------------------------------------------- 1 | df['Fare_scaled'] = df['Fare'] / df['Fare'].mean() 2 | df.head() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data1.py: -------------------------------------------------------------------------------- 1 | males = df[df['Sex'] == 'male'] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data10.py: -------------------------------------------------------------------------------- 1 | df[df['Surname'].str.len() > 15] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data11.py: -------------------------------------------------------------------------------- 1 | len(titles) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data12.py: -------------------------------------------------------------------------------- 1 | titles.sort_values('year').head(2) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data13.py: -------------------------------------------------------------------------------- 1 | titles.nsmallest(2, columns="year") --------------------------------------------------------------------------------
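The last two solutions above are interchangeable: nsmallest(2, columns="year") returns the same rows as sort_values('year').head(2) (assuming no ties in 'year'), without sorting the full frame. A small sketch on toy data (hypothetical names and values):

    import pandas as pd

    t = pd.DataFrame({"title": ["a", "b", "c"], "year": [1999, 1894, 1923]})
    assert t.nsmallest(2, columns="year").equals(t.sort_values("year").head(2))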
/notebooks/_solutions/pandas_03a_selecting_data14.py: -------------------------------------------------------------------------------- 1 | len(titles[titles['title'] == 'Hamlet']) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data15.py: -------------------------------------------------------------------------------- 1 | titles[titles['title'] == 'Treasure Island'].sort_values('year') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data16.py: -------------------------------------------------------------------------------- 1 | len(titles[(titles['year'] >= 1950) & (titles['year'] <= 1959)]) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data17.py: -------------------------------------------------------------------------------- 1 | len(titles[titles['year'] // 10 == 195]) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data18.py: -------------------------------------------------------------------------------- 1 | inception = cast[cast['title'] == 'Inception'] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data19.py: -------------------------------------------------------------------------------- 1 | len(inception[inception['n'].isna()]) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data2.py: -------------------------------------------------------------------------------- 1 | males['Age'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data20.py: -------------------------------------------------------------------------------- 1 | inception['n'].isna().sum() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data21.py: -------------------------------------------------------------------------------- 1 | len(inception[inception['n'].notna()]) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data22.py: -------------------------------------------------------------------------------- 1 | titanic = cast[(cast['title'] == 'Titanic') & (cast['year'] == 1997)] 2 | titanic = titanic[titanic['n'].notna()] 3 | titanic.sort_values('n') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data23.py: -------------------------------------------------------------------------------- 1 | brad = cast[cast['name'] == 'Brad Pitt'] 2 | brad = brad[brad['year'] // 10 == 199] 3 | brad = brad[brad['n'] == 2] 4 | brad.sort_values('year') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data3.py: -------------------------------------------------------------------------------- 1 | df[df['Sex'] == 'female']['Age'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data4.py: -------------------------------------------------------------------------------- 1 | len(df[df['Age'] > 70]) 
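2 | # len() of the filtered frame counts the rows that satisfy the condition; 3 | # the next solution shows the equivalent boolean-sum idiom, which avoids building the subset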
-------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data5.py: -------------------------------------------------------------------------------- 1 | (df['Age'] > 70).sum() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data6.py: -------------------------------------------------------------------------------- 1 | df[(df['Age'] > 30) & (df['Age'] <= 40)] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data7.py: -------------------------------------------------------------------------------- 1 | name.split(",")[0] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data8.py: -------------------------------------------------------------------------------- 1 | df['Surname'] = df['Name'].str.split(",").str.get(0) 2 | df['Surname'] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03a_selecting_data9.py: -------------------------------------------------------------------------------- 1 | df[df['Surname'].str.startswith('Williams')] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03b_indexing1.py: -------------------------------------------------------------------------------- 1 | countries['density'] = countries['population']*1000000 / countries['area'] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03b_indexing2.py: -------------------------------------------------------------------------------- 1 | countries.loc[countries['density'] > 300, ['capital', 'population']] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03b_indexing3.py: -------------------------------------------------------------------------------- 1 | countries['density_ratio'] = countries['density'] / countries['density'].mean() 2 | countries -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03b_indexing4.py: -------------------------------------------------------------------------------- 1 | countries.loc['United Kingdom', 'capital'] = 'Cambridge' 2 | countries -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03b_indexing5.py: -------------------------------------------------------------------------------- 1 | countries[(countries['density'] > 100) & (countries['density'] < 300)] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03b_indexing6.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Sex'] == 'male', 'Age'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_03b_indexing7.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Sex'] == 'female', 'Age'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data1.py: -------------------------------------------------------------------------------- 1 | data['2012':] 
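2 | # partial string indexing: '2012' is matched against the DatetimeIndex, 3 | # so this selects everything from 2012-01-01 onwards (the start label is included)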
-------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data2.py: -------------------------------------------------------------------------------- 1 | data[data.index.month == 1] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data3.py: -------------------------------------------------------------------------------- 1 | data[data.index.month.isin([4, 5, 6])] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data4.py: -------------------------------------------------------------------------------- 1 | data[(data.index.hour > 8) & (data.index.hour < 20)] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data5.py: -------------------------------------------------------------------------------- 1 | data.resample('ME').std().plot() # or try another frequency, e.g. yearly with 'YE' -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data6.py: -------------------------------------------------------------------------------- 1 | subset = data['2011':'2012']['L06_347'] 2 | subset.resample('ME').agg(['mean', 'median']).plot() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data7.py: -------------------------------------------------------------------------------- 1 | daily = data['LS06_348'].resample('D').mean() # calculate the daily averages -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data8.py: -------------------------------------------------------------------------------- 1 | daily.resample('MS').agg(['min', 'max']).plot() # monthly minimum and maximum values of these daily averages -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_04_time_series_data9.py: -------------------------------------------------------------------------------- 1 | data['2013':'2013'].mean().plot(kind='barh') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations1.py: -------------------------------------------------------------------------------- 1 | df.groupby('Sex')['Age'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations10.py: -------------------------------------------------------------------------------- 1 | titles['decade'] = titles['year'] // 10 * 10 2 | hamlet = titles[titles['title'].str.contains('Hamlet')] 3 | hamlet.groupby('decade').size().plot.bar(color="lightblue") -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations11.py: -------------------------------------------------------------------------------- 1 | cast1990 = cast[cast['year'] >= 1990] 2 | cast1990 = cast1990[cast1990['n'] == 1] 3 | cast1990.groupby('name').size().nlargest(10) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations12.py: -------------------------------------------------------------------------------- 1 | cast1990['name'].value_counts().head(10)
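2 | # value_counts() is shorthand for a groupby().size() sorted in descending order, 3 | # so this reproduces the previous solution's top 10 in a single call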
-------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations13.py: -------------------------------------------------------------------------------- 1 | hamlets = titles[titles['title'].str.contains('Hamlet')] 2 | hamlets['title'].value_counts() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations14.py: -------------------------------------------------------------------------------- 1 | hamlets = titles[titles['title'].str.startswith('Hamlet')] 2 | hamlets['title'].value_counts() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations15.py: -------------------------------------------------------------------------------- 1 | title_longest = titles['title'].str.len().nlargest(10) 2 | title_longest -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations16.py: -------------------------------------------------------------------------------- 1 | pd.options.display.max_colwidth = 210 2 | titles.loc[title_longest.index] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations17.py: -------------------------------------------------------------------------------- 1 | cast1950 = cast[cast['year'] // 10 == 195] 2 | cast1950 = cast1950[cast1950['n'] == 1] 3 | cast1950.groupby(['year', 'type']).size() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations18.py: -------------------------------------------------------------------------------- 1 | cast.character.value_counts().head(11) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations19.py: -------------------------------------------------------------------------------- 1 | cast[cast.name == 'Brad Pitt'].year.value_counts().sort_index().plot() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations2.py: -------------------------------------------------------------------------------- 1 | # df['Survived'].sum() / len(df['Survived']) 2 | df['Survived'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations20.py: -------------------------------------------------------------------------------- 1 | titles[titles['title'].str.startswith('The Life')]['title'].value_counts().head(10) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations21.py: -------------------------------------------------------------------------------- 1 | cast[cast.year == 2010].name.value_counts().head(10) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations22.py: -------------------------------------------------------------------------------- 1 | pink = cast[cast['title'] == 'The Pink Panther'] 2 | pink.groupby(['year'])[['n']].max() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations23.py: -------------------------------------------------------------------------------- 1 | oz = 
cast[cast['name'] == 'Frank Oz'] 2 | oz_roles = oz.groupby(['year', 'title']).size() 3 | oz_roles[oz_roles > 1] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations24.py: -------------------------------------------------------------------------------- 1 | oz = cast[cast['name'] == 'Frank Oz'] 2 | oz_roles = oz.groupby(['character']).size() 3 | oz_roles[oz_roles > 1].sort_values() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations25.py: -------------------------------------------------------------------------------- 1 | cast['n_total'] = cast.groupby(['title', 'year'])['n'].transform('size') # transform will return an element for each row, so the size value is given to the whole group 2 | cast.head() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations26.py: -------------------------------------------------------------------------------- 1 | leading = cast[cast['n'] == 1] 2 | sums_decade = leading.groupby([cast['year'] // 10 * 10, 'type']).size() 3 | sums_decade -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations27.py: -------------------------------------------------------------------------------- 1 | #sums_decade.groupby(level='year').transform(lambda x: x / x.sum()) 2 | ratios_decade = sums_decade / sums_decade.groupby(level='year').transform('sum') 3 | ratios_decade -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations28.py: -------------------------------------------------------------------------------- 1 | ratios_decade[:, 'actor'].plot() 2 | ratios_decade[:, 'actress'].plot() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations29.py: -------------------------------------------------------------------------------- 1 | t = titles 2 | t.year.value_counts().head(3) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations3.py: -------------------------------------------------------------------------------- 1 | df25 = df[df['Age'] < 25] 2 | df25['Survived'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations30.py: -------------------------------------------------------------------------------- 1 | cast1950 = cast[cast['year'] // 10 == 195] 2 | cast1950 = cast1950[cast1950['n'] == 1] 3 | cast1950['type'].value_counts() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations31.py: -------------------------------------------------------------------------------- 1 | cast2000 = cast[cast['year'] // 10 == 200] 2 | cast2000 = cast2000[cast2000['n'] == 1] 3 | cast2000['type'].value_counts() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations4.py: -------------------------------------------------------------------------------- 1 | df.groupby('Sex')['Survived'].mean() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations5.py: 
-------------------------------------------------------------------------------- 1 | df.groupby('Pclass')['Survived'].mean().plot.bar() # and what if you compared the total number of survivors? -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations6.py: -------------------------------------------------------------------------------- 1 | df.groupby('AgeClass', observed=False)['Fare'].mean().plot.bar(rot=0) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations7.py: -------------------------------------------------------------------------------- 1 | titles['decade'] = titles['year'] // 10 * 10 -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations8.py: -------------------------------------------------------------------------------- 1 | titles.groupby('decade').size().plot.bar(color='green') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_05_groupby_operations9.py: -------------------------------------------------------------------------------- 1 | titles['decade'] = titles['year'] // 10 * 10 2 | hamlet = titles[titles['title'] == 'Hamlet'] 3 | hamlet.groupby('decade').size().plot.bar(color="orange") -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning1.py: -------------------------------------------------------------------------------- 1 | casualties_raw["TX_SEX_DESCR_NL"].unique() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning10.py: -------------------------------------------------------------------------------- 1 | casualties["datetime"] = pd.to_datetime(casualties["datetime"]) 2 | casualties["datetime"] -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning11.py: -------------------------------------------------------------------------------- 1 | casualties["week_day"] = pd.Categorical(casualties["DAY_OF_WEEK"], 2 | categories=["Monday", "Tuesday", "Wednesday", "Thursday", 3 | "Friday", "Saturday", "Sunday"], 4 | ordered=True) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning12.py: -------------------------------------------------------------------------------- 1 | casualties["week_day"].dtype -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning13.py: -------------------------------------------------------------------------------- 1 | casualties["AGE_CLS"] = casualties["AGE_CLS"].str.replace(" tot ", " - ").str.removesuffix(" jaar").str.strip() 2 | casualties["AGE_CLS"] = casualties["AGE_CLS"].replace({"Onbekend": None, "75 jaar en meer": ">75", "": None}) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning14.py: -------------------------------------------------------------------------------- 1 | unique_combinations = ["DT_DAY", "DT_HOUR", "CD_MUNTY_REFNIS", "BUILD_UP_AREA", "LIGHT_COND", "ROAD_TYPE"] 2 | casualties.drop_duplicates(subset=unique_combinations).shape --------------------------------------------------------------------------------
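The ordered categorical created in pandas_06_data_cleaning11 above makes sorting and comparisons follow calendar order rather than alphabetical order. A minimal sketch (toy values, hypothetical):

    import pandas as pd

    days = pd.Series(pd.Categorical(["Sunday", "Monday", "Friday"],
                                    categories=["Monday", "Tuesday", "Wednesday", "Thursday",
                                                "Friday", "Saturday", "Sunday"],
                                    ordered=True))
    days.sort_values()  # Monday, Friday, Sunday -- calendar order, not alphabetical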
/notebooks/_solutions/pandas_06_data_cleaning15.py: -------------------------------------------------------------------------------- 1 | # alternative using `duplicated` 2 | (~casualties.duplicated(subset=unique_combinations)).sum() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning2.py: -------------------------------------------------------------------------------- 1 | gender_mapping = {"Vrouwelijk": "female", "Mannelijk": "male", "Onbekend": None} 2 | casualties_raw["TX_SEX_DESCR_NL"] = casualties_raw["TX_SEX_DESCR_NL"].replace(gender_mapping) 3 | casualties_raw["TX_SEX_DESCR_NL"].unique() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning3.py: -------------------------------------------------------------------------------- 1 | casualties_raw["DT_HOUR"].unique() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning4.py: -------------------------------------------------------------------------------- 1 | (casualties_raw["DT_HOUR"] == 99).sum() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning5.py: -------------------------------------------------------------------------------- 1 | casualties_raw["DT_HOUR"] = casualties_raw["DT_HOUR"].replace(99, 9) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning6.py: -------------------------------------------------------------------------------- 1 | casualties_nl = casualties_raw.drop(columns=column_names_with_fr) 2 | casualties_nl -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning7.py: -------------------------------------------------------------------------------- 1 | casualties = casualties_nl.rename(columns=clean_column_name) 2 | casualties.head() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning8.py: -------------------------------------------------------------------------------- 1 | casualties[["DT_DAY", "DT_HOUR"]].dtypes -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_06_data_cleaning9.py: -------------------------------------------------------------------------------- 1 | casualties["datetime"] = casualties["DT_DAY"] + " " + casualties["DT_HOUR"].astype(str) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data1.py: -------------------------------------------------------------------------------- 1 | df = pd.read_excel("data/verbruiksgegevens-per-maand.xlsx") -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data10.py: -------------------------------------------------------------------------------- 1 | df.pivot_table(index='Underaged', columns='Sex', 2 | values='Fare', aggfunc='median') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data11.py: -------------------------------------------------------------------------------- 1 | df_survival = df.groupby(["Pclass", "Sex"])["Survived"].mean().reset_index() 2 | df_survival 
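3 | # groupby + reset_index yields a tidy (long) table; 4 | # the next solution pivots it into a wide Pclass x Sex table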
-------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data12.py: -------------------------------------------------------------------------------- 1 | df_survival.pivot(index="Pclass", columns="Sex", values="Survived") -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data13.py: -------------------------------------------------------------------------------- 1 | df.groupby(['Pclass', 'Sex'])['Survived'].mean().unstack() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data14.py: -------------------------------------------------------------------------------- 1 | grouped = cast.groupby(['year', 'type']).size() 2 | table = grouped.unstack('type') 3 | table.plot() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data15.py: -------------------------------------------------------------------------------- 1 | cast.pivot_table(index='year', columns='type', values="character", aggfunc='count').plot() 2 | # as the values column for the aggfunc, take a column without NaN values so that effectively all rows are counted 3 | # -> at this stage: an aha moment about the crosstab function(!) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data16.py: -------------------------------------------------------------------------------- 1 | pd.crosstab(index=cast['year'], columns=cast['type']).plot() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data17.py: -------------------------------------------------------------------------------- 1 | pd.crosstab(index=cast['year'], columns=cast['type']).plot.area() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data18.py: -------------------------------------------------------------------------------- 1 | grouped = cast.groupby(['year', 'type']).size() 2 | table = grouped.unstack('type').fillna(0) 3 | (table['actor'] / (table['actor'] + table['actress'])).plot(ylim=[0, 1]) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data19.py: -------------------------------------------------------------------------------- 1 | c = cast 2 | c = c[(c.character == 'Superman') | (c.character == 'Batman')] 3 | c = c.groupby(['year', 'character']).size() 4 | c = c.unstack() 5 | c = c.fillna(0) 6 | c.head() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data2.py: -------------------------------------------------------------------------------- 1 | df = df.drop(columns=["Regio"]) 2 | df -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data20.py: -------------------------------------------------------------------------------- 1 | d = c.Superman - c.Batman 2 | print('Superman years:') 3 | print(len(d[d > 0.0])) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data3.py: -------------------------------------------------------------------------------- 1 | df_tidy = pd.melt(df, 
id_vars=["Hoofdgemeente", "Energie", "SLP"], var_name="time", value_name="consumption") 2 | df_tidy -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data4.py: -------------------------------------------------------------------------------- 1 | df_tidy["time"] = pd.to_datetime(df_tidy["time"], format="%Y%m") -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data5.py: -------------------------------------------------------------------------------- 1 | df_overall = df_tidy.groupby(["time", "Energie"])[["consumption"]].sum() # or with .reset_index() 2 | df_overall.head() -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data6.py: -------------------------------------------------------------------------------- 1 | facet = sns.relplot(x="time", y="consumption", col="Energie", 2 | data=df_overall, kind="line") 3 | facet.set(ylim=(0, None)) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data7.py: -------------------------------------------------------------------------------- 1 | df.pivot_table(index='Pclass', columns='Sex', 2 | values='Survived', aggfunc='mean') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data8.py: -------------------------------------------------------------------------------- 1 | fig, ax1 = plt.subplots() 2 | (df.pivot_table(index='Pclass', columns='Sex', 3 | values='Survived', aggfunc='mean') 4 | .plot.bar(rot=0, ax=ax1) 5 | ) 6 | ax1.set_ylabel('Survival ratio') -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_08_reshaping_data9.py: -------------------------------------------------------------------------------- 1 | df['Underaged'] = df['Age'] <= 18 -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_09_combining_datasets1.py: -------------------------------------------------------------------------------- 1 | joined = pd.merge(df, df_legal_forms, on="CD_LGL_PSN_VAT", how="left") 2 | joined -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_09_combining_datasets2.py: -------------------------------------------------------------------------------- 1 | joined.groupby("TX_LGL_PSN_VAT_EN_LVL1")["MS_NUM_VAT"].sum().sort_values(ascending=False) -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_09_combining_datasets3.py: -------------------------------------------------------------------------------- 1 | df_muni = pd.read_sql("SELECT * FROM TD_MUNTY_REFNIS", con) 2 | df_muni -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_09_combining_datasets4.py: -------------------------------------------------------------------------------- 1 | joined = pd.merge(df, df_muni[["CD_REFNIS", "TX_PROV_DESCR_EN"]], on="CD_REFNIS", how="left") 2 | joined -------------------------------------------------------------------------------- /notebooks/_solutions/pandas_09_combining_datasets5.py: -------------------------------------------------------------------------------- 1 | joined.groupby("TX_PROV_DESCR_EN")["MS_NUM_VAT"].sum() 
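2 | # note: rows of df without a matching CD_REFNIS get NaN as province in the 3 | # left merge, and groupby drops NaN group keys by default, so they are not summed here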
-------------------------------------------------------------------------------- /notebooks/_solutions/visualization_01_matplotlib1.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots(figsize=(12, 4)) 2 | 3 | ax.plot(data, color='darkgrey') 4 | ax.set_xlabel('days since start'); 5 | ax.set_ylabel('measured value'); -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_01_matplotlib2.py: -------------------------------------------------------------------------------- 1 | dates = pd.date_range("2021-01-01", periods=100, freq="D") 2 | 3 | fig, ax = plt.subplots(figsize=(12, 4)) 4 | 5 | ax.plot(dates, data, color='darkgrey') 6 | ax.axhspan(ymin=-5, ymax=5, color='green', alpha=0.2) 7 | 8 | ax.set_xlabel('days since start'); 9 | ax.set_ylabel('measured value'); -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_01_matplotlib3.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots(figsize=(12, 4)) 2 | 3 | ax.bar(dates[-10:], data[-10:], color='darkgrey') 4 | ax.bar(dates[-6], data[-6], color='orange') -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_01_matplotlib4.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots() 2 | flowdata.mean().plot.bar(ylabel="mean discharge", ax=ax) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_01_matplotlib5.py: -------------------------------------------------------------------------------- 1 | fig, (ax0, ax1) = plt.subplots(1, 2, constrained_layout=True) 2 | 3 | flowdata.min().plot.bar(ylabel="min discharge", ax=ax0) 4 | flowdata.max().plot.bar(ylabel="max discharge", ax=ax1) 5 | 6 | fig.suptitle(f"Minimal and maximal discharge from {flowdata.index[0]:%Y-%m-%d} till {flowdata.index[-1]:%Y-%m-%d}"); -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_01_matplotlib6.py: -------------------------------------------------------------------------------- 1 | alarm_level = 20 2 | max_datetime, max_value = flowdata["LS06_347"].idxmax(), flowdata["LS06_347"].max() 3 | 4 | fig, ax = plt.subplots(figsize=(18, 4)) 5 | flowdata["LS06_347"].plot(ax=ax) 6 | 7 | ax.axhline(y=alarm_level, color='red', linestyle='-', alpha=0.8) 8 | ax.annotate('Alarm level', xy=(flowdata.index[0], alarm_level), 9 | xycoords="data", xytext=(10, 10), textcoords="offset points", 10 | color="red", fontsize=12) 11 | ax.annotate(f"Flood event on {max_datetime:%Y-%m-%d}", 12 | xy=(max_datetime, max_value), xycoords='data', 13 | xytext=(-30, -30), textcoords='offset points', 14 | arrowprops=dict(facecolor='black', shrink=0.05), 15 | horizontalalignment='right', verticalalignment='bottom', 16 | fontsize=12) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn1.py: -------------------------------------------------------------------------------- 1 | sns.displot(data=titanic, x="Age", row="Sex", aspect=3, height=2) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn10.py: -------------------------------------------------------------------------------- 1 | # filter 
the data 2 | compare_dead_30 = casualties.set_index("datetime")["2019":"2021"] 3 | compare_dead_30 = compare_dead_30[compare_dead_30["road_user_type"].isin( 4 | ["Bicycle", "Passenger car", "Pedestrian", "Motorbike"])] 5 | 6 | # Sum the victims and the deaths within 30 days for each year/road-user type combination 7 | compare_dead_30 = compare_dead_30.groupby( 8 | ["road_user_type", compare_dead_30.index.year])[["n_dead_30days", "n_victims"]].sum().reset_index() 9 | 10 | # create a new column with the percentage of deaths 11 | compare_dead_30["dead_prop"] = compare_dead_30["n_dead_30days"] / compare_dead_30["n_victims"] * 100 -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn11.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=compare_dead_30, 2 | x="dead_prop", 3 | y="road_user_type", 4 | kind="bar", 5 | hue="datetime" 6 | ) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn12.py: -------------------------------------------------------------------------------- 1 | monthly_victim_counts = casualties.resample("ME", on="datetime")[ 2 | ["n_victims_ok", "n_slightly_injured", "n_seriously_injured", "n_dead_30days"] 3 | ].sum() -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn13.py: -------------------------------------------------------------------------------- 1 | sns.relplot( 2 | data=monthly_victim_counts, 3 | kind="line", 4 | palette="colorblind", 5 | height=3, aspect=4, 6 | ) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn14.py: -------------------------------------------------------------------------------- 1 | # Optional solution with tidy data representation (providing x and y) 2 | monthly_victim_counts_melt = monthly_victim_counts.reset_index().melt( 3 | id_vars="datetime", var_name="victim_type", value_name="count" 4 | ) 5 | 6 | sns.relplot( 7 | data=monthly_victim_counts_melt, 8 | x="datetime", 9 | y="count", 10 | hue="victim_type", 11 | kind="line", 12 | palette="colorblind", 13 | height=3, aspect=4, 14 | ) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn15.py: -------------------------------------------------------------------------------- 1 | # Pandas area plot 2 | monthly_victim_counts.plot.area(colormap='Reds', figsize=(15, 5)) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn16.py: -------------------------------------------------------------------------------- 1 | # Using Pandas 2 | daily_total_counts_2020 = casualties.set_index("datetime")["2020":"2021"].resample("D")["n_victims"].sum() 3 | daily_total_counts_2020.plot.line(figsize=(12, 3)) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn17.py: -------------------------------------------------------------------------------- 1 | # Using Seaborn 2 | sns.relplot(data=daily_total_counts_2020, 3 | kind="line", 4 | aspect=4, height=3) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn18.py:
-------------------------------------------------------------------------------- 1 | # weekly proportion of deadly victims for each light condition 2 | weekly_victim_dead_lc = ( 3 | casualties 4 | .groupby("light_conditions") 5 | .resample("W", on="datetime")[["datetime", "n_victims", "n_dead_30days"]] 6 | .sum() 7 | .reset_index() 8 | ) 9 | weekly_victim_dead_lc["dead_prop"] = weekly_victim_dead_lc["n_dead_30days"] / weekly_victim_dead_lc["n_victims"] * 100 10 | 11 | # .. and the same for each road type 12 | weekly_victim_dead_rt = ( 13 | casualties 14 | .groupby("road_type") 15 | .resample("W", on="datetime")[["datetime", "n_victims", "n_dead_30days"]] 16 | .sum() 17 | .reset_index() 18 | ) 19 | weekly_victim_dead_rt["dead_prop"] = weekly_victim_dead_rt["n_dead_30days"] / weekly_victim_dead_rt["n_victims"] * 100 -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn19.py: -------------------------------------------------------------------------------- 1 | fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(15, 5)) 2 | 3 | sns.ecdfplot(data=weekly_victim_dead_lc, x="dead_prop", hue="light_conditions", ax=ax0) 4 | sns.ecdfplot(data=weekly_victim_dead_rt, x="dead_prop", hue="road_type", ax=ax1) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn2.py: -------------------------------------------------------------------------------- 1 | # Figure based 2 | sns.catplot(data=titanic, x="Pclass", y="Age", 3 | hue="Sex", split=True, 4 | palette="Set2", kind="violin") 5 | sns.despine(left=True) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn20.py: -------------------------------------------------------------------------------- 1 | daily_min_temp_2020 = pd.read_csv("./data/daily_min_temperature_2020.csv", 2 | parse_dates=["datetime"]) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn21.py: -------------------------------------------------------------------------------- 1 | daily_with_temp = daily_total_counts_2020.reset_index().merge(daily_min_temp_2020, on="datetime") -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn22.py: -------------------------------------------------------------------------------- 1 | g = sns.jointplot( 2 | data=daily_with_temp, x="air_temperature", y="n_victims", kind="reg" 3 | ) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn3.py: -------------------------------------------------------------------------------- 1 | # Axes based 2 | sns.violinplot(data=titanic, x="Pclass", y="Age", 3 | hue="Sex", split=True, 4 | palette="Set2") 5 | sns.despine(left=True) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn4.py: -------------------------------------------------------------------------------- 1 | victims_hour_of_day = casualties.groupby(casualties["datetime"].dt.hour)["n_victims"].sum().reset_index() 2 | victims_hour_of_day = victims_hour_of_day.rename( 3 | columns={"datetime": "Hour of the day", "n_victims": "Number of victims"} 4 | ) -------------------------------------------------------------------------------- 
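A minimal, self-contained sketch of the `.dt.hour` grouping pattern used in `visualization_02_seaborn4.py` above (synthetic data; the column names merely mirror the casualties dataset and the values are made up):

```python
import pandas as pd

# Synthetic stand-in for the casualties table (hypothetical values).
events = pd.DataFrame({
    "datetime": pd.date_range("2020-01-01", periods=48, freq="h"),
    "n_victims": [1] * 48,
})

# Grouping on the .dt.hour accessor folds all days onto a single 0-23 axis,
# turning a timestamp column into an hour-of-the-day profile.
victims_per_hour = events.groupby(events["datetime"].dt.hour)["n_victims"].sum()
print(victims_per_hour)  # 24 rows; each hour occurs twice over the two days, so each sum is 2
```

The grouped result then only needs a `reset_index()` and a column rename before it can be fed to `sns.catplot`, which is what the `seaborn5` solution below consumes.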
/notebooks/_solutions/visualization_02_seaborn5.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=victims_hour_of_day, 2 | x="Hour of the day", 3 | y="Number of victims", 4 | kind="bar", 5 | aspect=4, 6 | height=3, 7 | ) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn6.py: -------------------------------------------------------------------------------- 1 | victims_gender_hour_of_day = casualties.groupby([casualties["datetime"].dt.hour, "gender"], 2 | dropna=False)["n_victims"].sum().reset_index() 3 | victims_gender_hour_of_day.head() -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn7.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=victims_gender_hour_of_day.fillna("unknown"), 2 | x="datetime", 3 | y="n_victims", 4 | row="gender", 5 | kind="bar", 6 | aspect=4, 7 | height=3) -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn8.py: -------------------------------------------------------------------------------- 1 | casualties_motorway_trucks = casualties[ 2 | (casualties["road_type"] == "Motorway") 3 | & casualties["road_user_type"].isin(["Light truck", "Truck"]) 4 | ] -------------------------------------------------------------------------------- /notebooks/_solutions/visualization_02_seaborn9.py: -------------------------------------------------------------------------------- 1 | sns.catplot(data=casualties_motorway_trucks, 2 | x="week_day", 3 | y="n_victims", 4 | estimator=np.sum, 5 | errorbar=None, 6 | kind="bar", 7 | color="#900C3F", 8 | height=3, 9 | aspect=4) -------------------------------------------------------------------------------- /notebooks/data/Dryad_Arias_Hall_v3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/Dryad_Arias_Hall_v3.xlsx -------------------------------------------------------------------------------- /notebooks/data/TF_ACCIDENTS_VICTIMS_2020.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/TF_ACCIDENTS_VICTIMS_2020.zip -------------------------------------------------------------------------------- /notebooks/data/TF_VAT_NACE_SQ_2019.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/TF_VAT_NACE_SQ_2019.zip -------------------------------------------------------------------------------- /notebooks/data/daily_min_temperature_2020.csv: -------------------------------------------------------------------------------- 1 | datetime,air_temperature 2 | 2020-01-01,0.43 3 | 2020-01-02,2.44 4 | 2020-01-03,4.46 5 | 2020-01-04,1.56 6 | 2020-01-05,5.99 7 | 2020-01-06,2.2 8 | 2020-01-07,2.54 9 | 2020-01-08,8.54 10 | 2020-01-09,11.02 11 | 2020-01-10,5.74 12 | 2020-01-11,3.39 13 | 2020-01-12,5.49 14 | 2020-01-13,5.67 15 | 2020-01-14,9.77 16 | 2020-01-15,2.97 17 | 2020-01-16,0.48 18 | 2020-01-17,4.51 19 | 2020-01-18,-0.65 20 | 
2020-01-19,-0.98 21 | 2020-01-20,-1.61 22 | 2020-01-21,-1.49 23 | 2020-01-22,-0.7 24 | 2020-01-23,2.49 25 | 2020-01-24,0.74 26 | 2020-01-25,0.48 27 | 2020-01-26,1.75 28 | 2020-01-27,6.18 29 | 2020-01-28,3.62 30 | 2020-01-29,1.96 31 | 2020-01-30,3.85 32 | 2020-01-31,10.38 33 | 2020-02-01,7.87 34 | 2020-02-02,7.12 35 | 2020-02-03,6.36 36 | 2020-02-04,2.12 37 | 2020-02-05,-0.12 38 | 2020-02-06,-1.02 39 | 2020-02-07,-2.5 40 | 2020-02-08,6.44 41 | 2020-02-09,6.86 42 | 2020-02-10,4.01 43 | 2020-02-11,4.76 44 | 2020-02-12,3.05 45 | 2020-02-13,2.46 46 | 2020-02-14,6.68 47 | 2020-02-15,7.93 48 | 2020-02-16,8.12 49 | 2020-02-17,6.78 50 | 2020-02-18,5.1 51 | 2020-02-19,4.32 52 | 2020-02-20,4.44 53 | 2020-02-21,2.33 54 | 2020-02-22,6.0 55 | 2020-02-23,4.43 56 | 2020-02-24,4.08 57 | 2020-02-25,3.34 58 | 2020-02-26,1.42 59 | 2020-02-27,1.78 60 | 2020-02-28,-0.51 61 | 2020-02-29,4.71 62 | 2020-03-01,4.94 63 | 2020-03-02,2.84 64 | 2020-03-03,1.44 65 | 2020-03-04,2.14 66 | 2020-03-05,5.9 67 | 2020-03-06,5.0 68 | 2020-03-07,0.05 69 | 2020-03-08,6.96 70 | 2020-03-09,5.58 71 | 2020-03-10,7.0 72 | 2020-03-11,11.95 73 | 2020-03-12,6.18 74 | 2020-03-13,4.11 75 | 2020-03-14,3.5 76 | 2020-03-15,7.3 77 | 2020-03-16,3.64 78 | 2020-03-17,0.79 79 | 2020-03-18,2.43 80 | 2020-03-19,2.6 81 | 2020-03-20,5.07 82 | 2020-03-21,4.04 83 | 2020-03-22,1.4 84 | 2020-03-23,-0.63 85 | 2020-03-24,-3.49 86 | 2020-03-25,-3.38 87 | 2020-03-26,0.77 88 | 2020-03-27,0.95 89 | 2020-03-28,3.03 90 | 2020-03-29,1.34 91 | 2020-03-30,-2.33 92 | 2020-03-31,0.91 93 | 2020-04-01,-2.17 94 | 2020-04-02,-1.53 95 | 2020-04-03,2.86 96 | 2020-04-04,1.38 97 | 2020-04-05,1.49 98 | 2020-04-06,5.41 99 | 2020-04-07,4.11 100 | 2020-04-08,6.19 101 | 2020-04-09,6.48 102 | 2020-04-10,7.9 103 | 2020-04-11,4.95 104 | 2020-04-12,4.38 105 | 2020-04-13,5.02 106 | 2020-04-14,1.26 107 | 2020-04-15,0.05 108 | 2020-04-16,1.73 109 | 2020-04-17,8.49 110 | 2020-04-18,8.02 111 | 2020-04-19,6.89 112 | 2020-04-20,7.53 113 | 2020-04-21,8.36 114 | 2020-04-22,8.81 115 | 2020-04-23,4.71 116 | 2020-04-24,5.13 117 | 2020-04-25,4.59 118 | 2020-04-26,2.59 119 | 2020-04-27,4.48 120 | 2020-04-28,9.3 121 | 2020-04-29,9.8 122 | 2020-04-30,7.45 123 | 2020-05-01,7.71 124 | 2020-05-02,4.59 125 | 2020-05-03,2.95 126 | 2020-05-04,8.91 127 | 2020-05-05,6.41 128 | 2020-05-06,3.76 129 | 2020-05-07,3.86 130 | 2020-05-08,6.58 131 | 2020-05-09,8.23 132 | 2020-05-10,8.15 133 | 2020-05-11,3.64 134 | 2020-05-12,1.0 135 | 2020-05-13,2.34 136 | 2020-05-14,2.67 137 | 2020-05-15,3.77 138 | 2020-05-16,2.35 139 | 2020-05-17,2.68 140 | 2020-05-18,6.7 141 | 2020-05-19,6.27 142 | 2020-05-20,9.09 143 | 2020-05-21,10.09 144 | 2020-05-22,13.38 145 | 2020-05-23,10.85 146 | 2020-05-24,8.93 147 | 2020-05-25,9.62 148 | 2020-05-26,7.48 149 | 2020-05-27,9.04 150 | 2020-05-28,9.74 151 | 2020-05-29,8.09 152 | 2020-05-30,7.58 153 | 2020-05-31,8.77 154 | 2020-06-01,8.55 155 | 2020-06-02,9.55 156 | 2020-06-03,9.74 157 | 2020-06-04,10.83 158 | 2020-06-05,7.53 159 | 2020-06-06,6.64 160 | 2020-06-07,9.69 161 | 2020-06-08,11.91 162 | 2020-06-09,8.93 163 | 2020-06-10,8.89 164 | 2020-06-11,12.26 165 | 2020-06-12,10.62 166 | 2020-06-13,12.58 167 | 2020-06-14,12.6 168 | 2020-06-15,12.54 169 | 2020-06-16,12.24 170 | 2020-06-17,12.74 171 | 2020-06-18,12.68 172 | 2020-06-19,10.87 173 | 2020-06-20,9.03 174 | 2020-06-21,10.86 175 | 2020-06-22,10.47 176 | 2020-06-23,10.46 177 | 2020-06-24,13.11 178 | 2020-06-25,12.98 179 | 2020-06-26,15.35 180 | 2020-06-27,16.32 181 | 2020-06-28,10.37 182 | 2020-06-29,10.3 183 | 2020-06-30,11.56 184 | 
2020-07-01,12.53 185 | 2020-07-02,12.19 186 | 2020-07-03,12.1 187 | 2020-07-04,14.35 188 | 2020-07-05,13.28 189 | 2020-07-06,12.81 190 | 2020-07-07,12.45 191 | 2020-07-08,13.38 192 | 2020-07-09,16.62 193 | 2020-07-10,10.35 194 | 2020-07-11,7.88 195 | 2020-07-12,8.1 196 | 2020-07-13,9.01 197 | 2020-07-14,12.68 198 | 2020-07-15,13.44 199 | 2020-07-16,14.88 200 | 2020-07-17,14.68 201 | 2020-07-18,12.76 202 | 2020-07-19,10.75 203 | 2020-07-20,10.45 204 | 2020-07-21,7.13 205 | 2020-07-22,7.48 206 | 2020-07-23,8.1 207 | 2020-07-24,13.23 208 | 2020-07-25,13.17 209 | 2020-07-26,12.95 210 | 2020-07-27,12.72 211 | 2020-07-28,12.23 212 | 2020-07-29,9.63 213 | 2020-07-30,10.01 214 | 2020-07-31,12.45 215 | 2020-08-01,14.11 216 | 2020-08-02,11.21 217 | 2020-08-03,10.46 218 | 2020-08-04,9.32 219 | 2020-08-05,10.22 220 | 2020-08-06,11.78 221 | 2020-08-07,14.0 222 | 2020-08-08,15.37 223 | 2020-08-09,18.68 224 | 2020-08-10,17.67 225 | 2020-08-11,18.89 226 | 2020-08-12,18.24 227 | 2020-08-13,19.24 228 | 2020-08-14,17.63 229 | 2020-08-15,17.13 230 | 2020-08-16,16.72 231 | 2020-08-17,14.84 232 | 2020-08-18,13.13 233 | 2020-08-19,13.8 234 | 2020-08-20,18.6 235 | 2020-08-21,15.48 236 | 2020-08-22,14.58 237 | 2020-08-23,13.97 238 | 2020-08-24,13.59 239 | 2020-08-25,12.52 240 | 2020-08-26,12.75 241 | 2020-08-27,10.48 242 | 2020-08-28,11.24 243 | 2020-08-29,10.88 244 | 2020-08-30,11.9 245 | 2020-08-31,10.64 246 | 2020-09-01,9.97 247 | 2020-09-02,8.09 248 | 2020-09-03,11.29 249 | 2020-09-04,15.12 250 | 2020-09-05,9.97 251 | 2020-09-06,7.23 252 | 2020-09-07,6.5 253 | 2020-09-08,12.13 254 | 2020-09-09,16.28 255 | 2020-09-10,11.05 256 | 2020-09-11,8.01 257 | 2020-09-12,7.76 258 | 2020-09-13,8.81 259 | 2020-09-14,9.65 260 | 2020-09-15,13.1 261 | 2020-09-16,13.19 262 | 2020-09-17,9.85 263 | 2020-09-18,7.81 264 | 2020-09-19,10.44 265 | 2020-09-20,8.93 266 | 2020-09-21,5.91 267 | 2020-09-22,8.78 268 | 2020-09-23,9.33 269 | 2020-09-24,10.49 270 | 2020-09-25,7.6 271 | 2020-09-26,9.18 272 | 2020-09-27,12.46 273 | 2020-09-28,12.34 274 | 2020-09-29,9.44 275 | 2020-09-30,12.81 276 | 2020-10-01,7.77 277 | 2020-10-02,7.82 278 | 2020-10-03,8.4 279 | 2020-10-04,9.69 280 | 2020-10-05,10.25 281 | 2020-10-06,11.48 282 | 2020-10-07,8.65 283 | 2020-10-08,11.45 284 | 2020-10-09,8.85 285 | 2020-10-10,4.89 286 | 2020-10-11,5.34 287 | 2020-10-12,6.87 288 | 2020-10-13,7.39 289 | 2020-10-14,5.02 290 | 2020-10-15,6.51 291 | 2020-10-16,4.69 292 | 2020-10-17,3.22 293 | 2020-10-18,7.22 294 | 2020-10-19,6.04 295 | 2020-10-20,6.35 296 | 2020-10-21,12.66 297 | 2020-10-22,9.57 298 | 2020-10-23,8.81 299 | 2020-10-24,12.22 300 | 2020-10-25,8.5 301 | 2020-10-26,8.47 302 | 2020-10-27,7.15 303 | 2020-10-28,9.25 304 | 2020-10-29,7.81 305 | 2020-10-30,13.26 306 | 2020-10-31,9.79 307 | 2020-11-01,10.63 308 | 2020-11-02,9.54 309 | 2020-11-03,4.74 310 | 2020-11-04,0.88 311 | 2020-11-05,-0.47 312 | 2020-11-06,-0.63 313 | 2020-11-07,1.27 314 | 2020-11-08,4.14 315 | 2020-11-09,5.77 316 | 2020-11-10,6.68 317 | 2020-11-11,7.35 318 | 2020-11-12,4.86 319 | 2020-11-13,4.14 320 | 2020-11-14,10.41 321 | 2020-11-15,9.3 322 | 2020-11-16,8.01 323 | 2020-11-17,10.97 324 | 2020-11-18,8.56 325 | 2020-11-19,4.32 326 | 2020-11-20,0.63 327 | 2020-11-21,4.6 328 | 2020-11-22,8.24 329 | 2020-11-23,4.49 330 | 2020-11-24,4.25 331 | 2020-11-25,3.99 332 | 2020-11-26,5.37 333 | 2020-11-27,6.85 334 | 2020-11-28,1.85 335 | 2020-11-29,1.39 336 | 2020-11-30,1.22 337 | 2020-12-01,7.01 338 | 2020-12-02,6.25 339 | 2020-12-03,5.9 340 | 2020-12-04,-0.15 341 | 2020-12-05,-0.11 342 | 
2020-12-06,-1.7 343 | 2020-12-07,0.78 344 | 2020-12-08,-0.68 345 | 2020-12-09,0.45 346 | 2020-12-10,0.71 347 | 2020-12-11,1.66 348 | 2020-12-12,7.8 349 | 2020-12-13,4.4 350 | 2020-12-14,8.25 351 | 2020-12-15,4.28 352 | 2020-12-16,2.56 353 | 2020-12-17,5.21 354 | 2020-12-18,5.79 355 | 2020-12-19,6.59 356 | 2020-12-20,4.84 357 | 2020-12-21,3.56 358 | 2020-12-22,10.91 359 | 2020-12-23,9.62 360 | 2020-12-24,3.46 361 | 2020-12-25,-0.13 362 | 2020-12-26,0.99 363 | 2020-12-27,3.94 364 | 2020-12-28,-0.49 365 | 2020-12-29,-0.39 366 | 2020-12-30,-0.94 367 | 2020-12-31,3.42 368 | -------------------------------------------------------------------------------- /notebooks/data/data-preprocessing.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.13 7 | jupytext_version: 1.13.6 8 | kernelspec: 9 | display_name: Python 3 (ipykernel) 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ## Simplified Statistical Sectors 15 | 16 | https://statbel.fgov.be/nl/open-data/statistische-sectoren-2019 17 | 18 | ```{code-cell} ipython3 19 | import geopandas 20 | ``` 21 | 22 | ```{code-cell} ipython3 23 | df = geopandas.read_file("/home/joris/Downloads/sh_statbel_statistical_sectors_20190101.shp.zip") 24 | ``` 25 | 26 | ```{code-cell} ipython3 27 | df = df.dissolve("CNIS5_2019").reset_index() 28 | ``` 29 | 30 | ```{code-cell} ipython3 31 | import topojson as tp 32 | topo = tp.Topology(df, prequantize=True) 33 | res = topo.toposimplify(1000).to_gdf() 34 | ``` 35 | 36 | ```{code-cell} ipython3 37 | res.plot() 38 | ``` 39 | 40 | ```{code-cell} ipython3 41 | res.crs = df.crs 42 | ``` 43 | 44 | ```{code-cell} ipython3 45 | res[["CNIS5_2019", "T_MUN_NL", "geometry"]].to_file("statbel_statistical_sectors_2019.shp") 46 | ``` 47 | 48 | ```{code-cell} ipython3 49 | 50 | ``` 51 | -------------------------------------------------------------------------------- /notebooks/data/fietstelpaal-coupure-links-2022-gent.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/fietstelpaal-coupure-links-2022-gent.zip -------------------------------------------------------------------------------- /notebooks/data/fietstelpaal-coupure-links-2023-gent.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/fietstelpaal-coupure-links-2023-gent.zip -------------------------------------------------------------------------------- /notebooks/data/fietstelpaal-coupure-links-gent.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/fietstelpaal-coupure-links-gent.zip -------------------------------------------------------------------------------- /notebooks/data/load_casualties.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import urllib.request 3 | import logging 4 | from tempfile import gettempdir 5 | from pathlib import Path 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | logger = logging.getLogger(__name__) 11 | def clean_casualties_data(casualties_raw): 12 | """Convert raw casualties
data to english and restructured format""" 13 | casualties = ( 14 | casualties_raw 15 | .drop(columns=[col for col in casualties_raw.columns 16 | if col.endswith("_FR")]) 17 | .drop(columns=[col for col in casualties_raw.columns 18 | if col.startswith("CD_") and not col.endswith("_REFNIS")]) 19 | .rename(columns={name: name.removeprefix("TX_").removesuffix("_DESCR_NL") 20 | for name in casualties_raw.columns}) 21 | .replace("Onbekend", None) 22 | ) 23 | casualties["gender"] = casualties["SEX"].replace( 24 | {"Vrouwelijk": "female", "Mannelijk": "male"} 25 | ) 26 | 27 | casualties["DT_HOUR"] = casualties["DT_HOUR"].replace(99, 0) 28 | casualties["datetime"] = pd.to_datetime( 29 | casualties["DT_DAY"] + " " + casualties["DT_HOUR"].astype(str) + ":00" 30 | ) 31 | 32 | casualties["age"] = casualties["AGE_CLS"].str.replace( 33 | " tot ", " - ").str.removesuffix("jaar").str.strip() 34 | casualties["age"] = casualties["age"].replace( 35 | {"": None, "75 jaar en meer": ">75", ' ': None}) 36 | 37 | casualties["DAY_OF_WEEK"] = casualties["DAY_OF_WEEK"].replace({ 38 | "maandag": "Monday", "dinsdag": "Tuesday", "woensdag": "Wednesday", 39 | "donderdag": "Thursday", "vrijdag": "Friday", "zaterdag": "Saturday", 40 | "zondag": "Sunday"}) 41 | casualties["week_day"] = pd.Categorical( 42 | casualties["DAY_OF_WEEK"], 43 | categories=["Monday", "Tuesday", "Wednesday", 44 | "Thursday", "Friday", "Saturday", "Sunday"], 45 | ordered=True 46 | ) 47 | 48 | casualties["victim_type"] = casualties["VICT_TYPE"].replace({ 49 | "Bestuurder": "Driver", "Bromfietser": "Moped driver", 50 | "Passagier": "Passenger", "Motorfietser": 'Motorcyclist', 51 | "Fietser": "Cyclist", "Voetganger": "Pedestrian", 52 | "Autres victimes": None}) 53 | 54 | casualties["build_up_area"] = casualties["BUILD_UP_AREA"].replace({ 55 | "Binnen bebouwde kom": "Inside built-up area", 56 | "Buiten bebouwde kom": "Outside built-up area", 57 | " ": None}) 58 | 59 | casualties["ROAD_USR_TYPE"] = casualties["ROAD_USR_TYPE"].replace({ 60 | 'Personenauto': 'Passenger car', 61 | 'Auto voor dubbel gebruik': 'Dual-purpose vehicle', 62 | 'Lichte vrachtauto': 'Light truck', 63 | 'Bromfiets': 'Moped', 64 | 'Bromfiets A (tweewielige)': 'Moped', 65 | 'Bromfiets B (tweewielige)': 'Moped', 66 | 'Bromfiets met 3 of 4 wielen': 'Moped', 67 | 'Motorfiets': 'Motorbike', 68 | 'Motorfiets meer dan 400 cc': 'Motorbike', 69 | 'Motorfiets niet meer dan 400 cc': 'Motorbike', 70 | 'Fiets': 'Bicycle', 71 | 'Elektrische fiets': 'Electric bicycle', 72 | 'Fiets met elektrische hulpmotor (<=250W en <=25km/u)': 'Electric bicycle', 73 | 'Gemotoriseerde fiets (<=1000W en <=25km/u)': 'Electric bicycle', 74 | 'Speed pedelec (<= 4000W en <=45km/u)': 'Speed pedelec', 75 | 'Gemotoriseerd voortbewegingstoestel (<=18km/u)': 'Electric bicycle', 76 | 'Trekker + aanhangwagen': 'Trailer', 77 | 'Trekker alleen': 'Trailer', 78 | 'Vrachtwagen': 'Truck', 79 | 'Ruiter': 'Horse rider', 80 | 'Bespannen voertuig': 'Horse rider', 81 | 'Andere voetganger': 'Pedestrian', 82 | 'Gehandicapte in rolstoel': 'Disabled person in a wheelchair', 83 | 'Voetganger die zijn (brom)fiets duwt': 'Pedestrian', 84 | 'Trolleybus, Tram': 'Tram', 85 | 'Minibus': 'Van', 86 | 'Autobus': 'Bus', 87 | 'Autocar': 'Bus', 88 | 'Autobus/Autocar': 'Bus', 89 | 'Kampeerwagen': 'Campervan', 90 | 'Landbouwtractor': 'Tractor', 91 | 'Andere weggebruiker': None, 92 | 'Niet ingevuld': None, 93 | np.nan: None 94 | }) 95 | 96 | casualties["LIGHT_COND"] = casualties["LIGHT_COND"].replace( 97 | {'Bij klaarlichte dag': 'In broad daylight', 98 | 
'Nacht, ontstoken openbare verlichting': 'Night, public lighting lit', 99 | 'Dageraad - schemering': 'Dawn', 100 | 'Nacht, openbare verlichting aanwezig, maar niet ontstoken': 'Night, no public lighting', 101 | 'Nacht, geen openbare verlichting': 'Night, no public lighting', 102 | ' ': None 103 | }) 104 | 105 | casualties["ROAD_TYPE"] = casualties["ROAD_TYPE"].replace({ 106 | 'Gemeenteweg': 'Municipal road', 107 | 'Gewestweg': 'Regional road', 108 | 'Autosnelweg': 'Motorway' 109 | }) 110 | 111 | casualties["RGN"] = casualties["RGN"].replace({ 112 | 'Vlaams Gewest': 'Flemish Region', 113 | 'Brussels Hoofdstedelijk Gewest': 'Brussels-Capital Region', 114 | 'Waals Gewest': 'Walloon Region' 115 | }) 116 | casualties["CD_RGN_REFNIS"] = casualties["CD_RGN_REFNIS"].replace( 117 | {'02000': 2000, '03000': 3000, '04000': 4000, ' ': None} 118 | ) 119 | 120 | casualties = casualties.replace(" ", None) 121 | casualties = casualties.rename(columns={ 122 | "MS_VICT": "n_victims", 123 | "MS_VIC_OK": "n_victims_ok", 124 | "MS_SLY_INJ": "n_slightly_injured", 125 | "MS_SERLY_INJ": "n_seriously_injured", 126 | "MS_DEAD_30_DAYS": "n_dead_30days", 127 | "ROAD_USR_TYPE": "road_user_type", 128 | "LIGHT_COND": "light_conditions", 129 | "ROAD_TYPE": "road_type", 130 | "RGN": "region", 131 | "CD_RGN_REFNIS": "refnis_region", 132 | "CD_MUNTY_REFNIS": "refnis_municipality", 133 | "MUNTY": "municipality" 134 | }) 135 | casualties_clean = casualties.drop( 136 | columns=[ 137 | "DT_DAY", "DT_HOUR", "DAY_OF_WEEK", "SEX", "VICT_TYPE", 138 | "BUILD_UP_AREA", "AGE_CLS", "CD_PROV_REFNIS", "PROV", 139 | "CD_DSTR_REFNIS", "ADM_DSTR"] 140 | ) 141 | 142 | return casualties_clean 143 | 144 | 145 | def main(start_year=2005, end_year=2020, 146 | processed_file_name="casualties.csv"): 147 | """Download casualties data, run cleaning function, concat and save as CSV 148 | 149 | Parameters 150 | ---------- 151 | start_year : int, default 2005 152 | Start year to download data from. 153 | end_year : int, default 2020 154 | End year to download data from. 155 | processed_file_name : str 156 | File name of the concatenated clean data set.
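        The combined data set is written into the "./data" folder under this file name.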
157 | """ 158 | download_folder = Path(tempdir) / "casualties" 159 | download_folder.mkdir(exist_ok=True) 160 | 161 | logger.info("Start processing causalties Belgium open data from {start_year} till {end_year}.") 162 | casualties_all = [] 163 | for year in range(start_year, end_year+1): 164 | logger.info(f"Handling year {year}") 165 | file_name = download_folder / f"TF_ACCIDENTS_VICTIMS_{year}_.zip" 166 | if not file_name.exists(): 167 | logger.info(f"Download year {year}.") 168 | urllib.request.urlretrieve( 169 | f"https://statbel.fgov.be/sites/default/files/files/opendata/Verkeersslachtoffers/TF_ACCIDENTS_VICTIMS_{year}.zip", 170 | file_name) 171 | casualties = pd.read_csv(file_name, compression='zip', 172 | sep="|", low_memory=False) 173 | try: 174 | casualties_clean = clean_casualties_data(casualties) 175 | casualties_all.append(casualties_clean) 176 | except: 177 | logger.error(f"Data processing of year {year} failed") 178 | logger.info("All casualties raw data set donwloads ready.") 179 | 180 | logger.info("Combining individual years to single DataFrame.") 181 | casualties_all = pd.concat(casualties_all).sort_values("datetime") 182 | 183 | if 'n_victims_ok' in casualties_all.columns: 184 | casualties = casualties_all[["datetime", "week_day", 185 | "n_victims", "n_victims_ok", "n_slightly_injured", 186 | "n_seriously_injured", "n_dead_30days", 187 | "road_user_type", "victim_type", "gender", "age", 188 | "road_type", "build_up_area", "light_conditions", 189 | "refnis_municipality", "municipality", 190 | "refnis_region", "region" 191 | ]] 192 | else: 193 | casualties = casualties_all[["datetime", "week_day", 194 | "n_victims", "n_slightly_injured", 195 | "n_seriously_injured", "n_dead_30days", 196 | "road_user_type", "victim_type", "gender", "age", 197 | "road_type", "build_up_area", "light_conditions", 198 | "refnis_municipality", "municipality", 199 | "refnis_region", "region" 200 | ]] 201 | 202 | logger.info("Writing combined casualties data file to disk.") 203 | casualties.to_csv(Path("./data") / processed_file_name, index=False) 204 | 205 | logger.info("Combined casualties data file ready.") 206 | 207 | 208 | if __name__ == "__main__": 209 | 210 | logger = logging.getLogger(__name__) 211 | 212 | parser = argparse.ArgumentParser( 213 | description='Collect and prepare casualties open data Belgium.' 
214 | ) 215 | parser.add_argument('start_year', metavar='start-year', type=int, default=2015, 216 | help='First year to download casualties data.') 217 | parser.add_argument('end_year', metavar='end-year', type=int, default=2020, 218 | help='Last year to download casualties data.') 219 | 220 | args = parser.parse_args() 221 | 222 | print("Start casualties data preparation...") 223 | main(args.start_year, args.end_year) 224 | print("...done!") -------------------------------------------------------------------------------- /notebooks/data/plot_location.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/plot_location.xlsx -------------------------------------------------------------------------------- /notebooks/data/species.csv: -------------------------------------------------------------------------------- 1 | species_id;genus;species;taxa 2 | AB;Amphispiza;bilineata;Bird 3 | AH;Ammospermophilus;harrisi;Rodent-not censused 4 | AS;Ammodramus;savannarum;Bird 5 | BA;Baiomys;taylori;Rodent 6 | CB;Campylorhynchus;brunneicapillus;Bird 7 | CM;Calamospiza;melanocorys;Bird 8 | CQ;Callipepla;squamata;Bird 9 | CS;Crotalus;scutalatus;Reptile 10 | CT;Cnemidophorus;tigris;Reptile 11 | CU;Cnemidophorus;uniparens;Reptile 12 | CV;Crotalus;viridis;Reptile 13 | DM;Dipodomys;merriami;Rodent 14 | DO;Dipodomys;ordii;Rodent 15 | DS;Dipodomys;spectabilis;Rodent 16 | DX;Dipodomys;sp.;Rodent 17 | EO;Eumeces;obsoletus;Reptile 18 | GS;Gambelia;silus;Reptile 19 | NE;Neotoma;albigula;Rodent 20 | NX;Neotoma;sp.;Rodent 21 | OL;Onychomys;leucogaster;Rodent 22 | OT;Onychomys;torridus;Rodent 23 | OX;Onychomys;sp.;Rodent 24 | PB;Chaetodipus;baileyi;Rodent 25 | PC;Pipilo;chlorurus;Bird 26 | PE;Peromyscus;eremicus;Rodent 27 | PF;Perognathus;flavus;Rodent 28 | PG;Pooecetes;gramineus;Bird 29 | PH;Perognathus;hispidus;Rodent 30 | PI;Chaetodipus;intermedius;Rodent 31 | PL;Peromyscus;leucopus;Rodent 32 | PM;Peromyscus;maniculatus;Rodent 33 | PP;Chaetodipus;penicillatus;Rodent 34 | PU;Pipilo;fuscus;Bird 35 | PX;Chaetodipus;sp.;Rodent 36 | RF;Reithrodontomys;fulvescens;Rodent 37 | RM;Reithrodontomys;megalotis;Rodent 38 | RO;Reithrodontomys;montanus;Rodent 39 | RX;Reithrodontomys;sp.;Rodent 40 | SA;Sylvilagus;audubonii;Rabbit 41 | SB;Spizella;breweri;Bird 42 | SC;Sceloporus;clarki;Reptile 43 | SF;Sigmodon;fulviventer;Rodent 44 | SH;Sigmodon;hispidus;Rodent 45 | SO;Sigmodon;ochrognathus;Rodent 46 | SS;Spermophilus;spilosoma;Rodent-not censused 47 | ST;Spermophilus;tereticaudus;Rodent-not censused 48 | SU;Sceloporus;undulatus;Reptile 49 | SX;Sigmodon;sp.;Rodent 50 | UL;Lizard;sp.;Reptile 51 | UP;Pipilo;sp.;Bird 52 | UR;Rodent;sp.;Rodent 53 | US;Sparrow;sp.;Bird 54 | XX;;;Zero Trapping Success 55 | ZL;Zonotrichia;leucophrys;Bird 56 | ZM;Zenaida;macroura;Bird 57 | -------------------------------------------------------------------------------- /notebooks/data/species_names.csv: -------------------------------------------------------------------------------- 1 | class,kingdom,order,phylum,scientificName,ID,taxa 2 | Mammalia,Animalia,Rodentia,Chordata,"Dipodomys merriami Mearns, 1890",2439521,Rodent 3 | Mammalia,Animalia,Rodentia,Chordata,"Perognathus flavus Baird, 1855",2439566,Rodent 4 | Mammalia,Animalia,Rodentia,Chordata,"Peromyscus eremicus (Baird, 1857)",2437981,Rodent 5 | Mammalia,Animalia,Rodentia,Chordata,"Sigmodon hispidus Say & Ord, 1825",2438147,Rodent 6 |
Mammalia,Animalia,Rodentia,Chordata,"Dipodomys spectabilis Merriam, 1890",2439531,Rodent 7 | Mammalia,Animalia,Rodentia,Chordata,"Chaetodipus penicillatus (Woodhouse, 1852)",2439591,Rodent 8 | Mammalia,Animalia,Rodentia,Chordata,"Onychomys torridus (Coues, 1874)",2438516,Rodent 9 | Mammalia,Animalia,Rodentia,Chordata,"Dipodomys ordii Woodhouse, 1853",2439541,Rodent 10 | Mammalia,Animalia,Rodentia,Chordata,"Spermophilus spilosoma Bennett, 1833",2437300,Rodent-not censused 11 | Mammalia,Animalia,Rodentia,Chordata,"Onychomys leucogaster (Wied-Neuwied, 1841)",2438517,Rodent 12 | Mammalia,Animalia,Rodentia,Chordata,"Reithrodontomys megalotis (Baird, 1857)",2437874,Rodent 13 | Mammalia,Animalia,Lagomorpha,Chordata,"Sylvilagus audubonii (Baird, 1858)",2436910,Rabbit 14 | Mammalia,Animalia,Rodentia,Chordata,"Peromyscus maniculatus (Wagner, 1845)",2437967,Rodent 15 | Mammalia,Animalia,Rodentia,Chordata,"Ammospermophilus harrisii (Audubon & Bachman, 1854)",2437568,Rodent-not censused 16 | Aves,Animalia,Passeriformes,Chordata,"Amphispiza bilineata (Cassin, 1850)",2491757,Bird 17 | Aves,Animalia,Passeriformes,Chordata,"Campylorhynchus brunneicapillus (Lafresnaye, 1835)",5231474,Bird 18 | Aves,Animalia,Passeriformes,Chordata,"Calamospiza melanocorys Stejneger, 1885",2491893,Bird 19 | Aves,Animalia,Galliformes,Chordata,"Callipepla squamata (Vigors, 1830)",5228075,Bird 20 | Mammalia,Animalia,Rodentia,Chordata,"Reithrodontomys fulvescens J.A.Allen, 1894",2437864,Rodent 21 | Aves,Animalia,Passeriformes,Chordata,"Pipilo chlorurus (Audubon, 1839)",2491276,Bird 22 | Aves,Animalia,Passeriformes,Chordata,"Pooecetes gramineus (J.F.Gmelin, 1789)",2491728,Bird 23 | Mammalia,Animalia,Rodentia,Chordata,"Perognathus hispidus Baird, 1858",2439584,Rodent 24 | Aves,Animalia,Passeriformes,Chordata,"Pipilo fuscus Swainson, 1827",2491244,Bird 25 | Reptilia,Animalia,Squamata,Chordata,"Crotalus viridis Rafinesque, 1818",8945077,Reptile 26 | Aves,Animalia,Passeriformes,Chordata,"Zonotrichia leucophrys (J.R.Forster, 1772)",5231132,Bird 27 | Reptilia,Animalia,Squamata,Chordata,"Sceloporus clarkii Baird & Girard, 1852",2451192,Reptile 28 | Mammalia,Animalia,Rodentia,Chordata,"Baiomys taylori (Thomas, 1887)",2438866,Rodent 29 | Mammalia,Animalia,Rodentia,Chordata,"Sigmodon fulviventer J.A.Allen, 1889",2438153,Rodent 30 | Mammalia,Animalia,Rodentia,Chordata,"Reithrodontomys montanus (Baird, 1855)",2437866,Rodent 31 | Aves,Animalia,Passeriformes,Chordata,"Ammodramus savannarum (J.F.Gmelin, 1789)",2491123,Bird 32 | Mammalia,Animalia,Rodentia,Chordata,"Sigmodon ochrognathus Bailey, 1902",2438156,Rodent 33 | Mammalia,Animalia,Rodentia,Chordata,"Chaetodipus intermedius (Merriam, 1889)",2439589,Rodent 34 | Mammalia,Animalia,Rodentia,Chordata,"Spermophilus tereticaudus Baird, 1858",2437325,Rodent-not censused 35 | Reptilia,Animalia,Squamata,Chordata,"Cnemidophorus uniparens Wright & Lowe, 1965",5227544,Reptile 36 | Reptilia,Animalia,Squamata,Chordata,"Sceloporus undulatus (Bosc & Daudin, 1801)",2451347,Reptile 37 | Mammalia,Animalia,Rodentia,Chordata,"Chaetodipus baileyi (Merriam, 1894)",2439581,Rodent 38 | Mammalia,Animalia,Rodentia,Chordata,"Peromyscus leucopus (Rafinesque, 1818)",2438019,Rodent 39 | Reptilia,Animalia,Squamata,Chordata,"Cnemidophorus tigris Grismer, 1999",8071886,Reptile 40 | -------------------------------------------------------------------------------- /notebooks/data/statbel_statistical_sectors_2019.shp.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/statbel_statistical_sectors_2019.shp.zip -------------------------------------------------------------------------------- /notebooks/data/verbruiksgegevens-per-maand.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/data/verbruiksgegevens-per-maand.xlsx -------------------------------------------------------------------------------- /notebooks/pandas_07_missing_values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8bd0774d", 6 | "metadata": {}, 7 | "source": [ 8 | "

07 - Pandas: Working with missing data
\n", 9 | "\n", 10 | "\n", 11 | "> *© 2025, Joris Van den Bossche and Stijn Van Hoey (, ). Licensed under [CC BY 4.0 Creative Commons](http://creativecommons.org/licenses/by/4.0/)*\n", 12 | "\n", 13 | "---" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "id": "fad2705f", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "6cf9e666", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "df = pd.DataFrame({'A': [1, 2, np.nan],\n", 35 | " 'B': [4, np.nan, np.nan],\n", 36 | " 'C': [7, 8, 9]})\n", 37 | "df" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "9204ffad", 43 | "metadata": {}, 44 | "source": [ 45 | "## Missing values in Pandas" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "20ebca57", 51 | "metadata": {}, 52 | "source": [ 53 | "For numerical data, the \"NaN\" (Not-A-Number) floating point value is used as missing value indicator:" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "17a6454f", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "df.loc[2, 'A']" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "35dc8450", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "np.nan" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "b116e307", 79 | "metadata": {}, 80 | "source": [ 81 | "
\n", 82 | "\n", 83 | "**NOTE**: because NaN is a float value, it is currently not possible to have integer columns with missing values. Notice how the columns in the example above were casted to float dtype.\n", 84 | "\n", 85 | "
" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "89150b7e", 91 | "metadata": {}, 92 | "source": [ 93 | "### Missing values are skipped by default in *reductions*" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "1e2b48d5", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "df['A'].mean()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "96daf776", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "df['A'].mean(skipna=False)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "604e4841", 119 | "metadata": {}, 120 | "source": [ 121 | "### ... but propagated in *element-wise arithmetic*" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "92901db0", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "df['A'] + 3" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "cf8a72a6", 137 | "metadata": {}, 138 | "source": [ 139 | "## Checking missing values" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "id": "5b50553a", 145 | "metadata": {}, 146 | "source": [ 147 | "Checking for a missing value cannot be done with an equality operation (`==`) because NaN is not equal to iself:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "id": "61a4ebe9", 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "df['A'] == np.nan" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "1acc9e71", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "np.nan == np.nan" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "b4439546", 173 | "metadata": {}, 174 | "source": [ 175 | "Therefore, dedicated methods are available: `isna()` and `notna()`" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "3c7d6670", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "df['A'].isna()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "4b95b7c2", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "df['A'].notna()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "683cccc8", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df['A'].isna().sum()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "c023dd7d", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "df.isna().sum()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "82b582da", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "df" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "id": "a8488b86", 231 | "metadata": {}, 232 | "source": [ 233 | "## Dropping missing values" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "e1440709", 239 | "metadata": {}, 240 | "source": [ 241 | "Dropping missing values can be done with `isna()`/`notna()` and boolean indexing (eg `df[df['A'].notna()]`), but pandas also provides some convenient helper functions for this:" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "788d650e", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "df.dropna()" 252 | ] 253 | }, 254 | { 255 | 
"cell_type": "markdown", 256 | "id": "c694bb08", 257 | "metadata": {}, 258 | "source": [ 259 | "By default it drop rows if there is a NaN in any of the columns. To limit this to we subset of the columns, use the `subset` keyword:" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "5bb3578c", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "df.dropna(subset=['A', 'C'])" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "00036b6f", 275 | "metadata": {}, 276 | "source": [ 277 | "## Filling missing values" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "id": "0e64082f", 283 | "metadata": {}, 284 | "source": [ 285 | "Filling missing values with a scalar:" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "94f40e9a", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "df.fillna(0)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "id": "0a73ff4c", 301 | "metadata": {}, 302 | "source": [ 303 | "Further, more advanced filling techniques are available in the ``interpolate()`` method." 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "7b57edf1", 309 | "metadata": {}, 310 | "source": [ 311 | "
\n", 312 | "\n", 313 | "**REMEMBER**:
\n", 314 | "\n", 315 | "* Missing value indicator: `np.nan` (`NaN`)\n", 316 | "* Reductions: skipped by default\n", 317 | "* Mathematical operations (eg `+`): propagate by default\n", 318 | "* Specific functions:\n", 319 | " * `isna()`, `notna()`\n", 320 | " * `dropna()`\n", 321 | " * `fillna()`, `interpolate()`\n", 322 | "\n", 323 | "
" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "id": "e1f5bf9a", 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [] 333 | } 334 | ], 335 | "metadata": { 336 | "jupytext": { 337 | "formats": "ipynb,md:myst" 338 | }, 339 | "kernelspec": { 340 | "display_name": "Python 3 (ipykernel)", 341 | "language": "python", 342 | "name": "python3" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.12.8" 355 | }, 356 | "widgets": { 357 | "application/vnd.jupyter.widget-state+json": { 358 | "state": {}, 359 | "version_major": 2, 360 | "version_minor": 0 361 | } 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 5 366 | } 367 | -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/00-jupyterlab1.py: -------------------------------------------------------------------------------- 1 | # Jupyter returns the output of the last calculation. 2 | 7 * 3 3 | 2 + 1 -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/00-jupyterlab2.py: -------------------------------------------------------------------------------- 1 | x = 6 * 7 + 12 2 | print(x) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/01-variables1.py: -------------------------------------------------------------------------------- 1 | weight_kg = 65 2 | weight_g = weight_kg * 1000 -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/01-variables2.py: -------------------------------------------------------------------------------- 1 | initial = "left" 2 | position = initial 3 | initial = "right" 4 | position -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/01-variables3.py: -------------------------------------------------------------------------------- 1 | pressure, weight = 1010, 60.5 2 | print(weight) # prints 60.5 -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/01-variables4.py: -------------------------------------------------------------------------------- 1 | # Variables must be created before they are used. 
2 | # print(pressure_p) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/01-variables5.py: -------------------------------------------------------------------------------- 1 | type(21.55) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/01-variables6.py: -------------------------------------------------------------------------------- 1 | type(3.25 + 4) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/01-variables7.py: -------------------------------------------------------------------------------- 1 | first = 1.0 2 | second = "1" 3 | third = "1.1" 4 | print(first + float(second)) # prints 2.0 5 | print(first + int(float(third))) # prints 2.0 -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/01-variables8.py: -------------------------------------------------------------------------------- 1 | int(float("3.0")) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/02-functions-use1.py: -------------------------------------------------------------------------------- 1 | math.floor(1.7) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/02-functions-use2.py: -------------------------------------------------------------------------------- 1 | experiment_label = "Lab1_C_2" 2 | experiment_label.endswith("2") -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/02-functions-use3.py: -------------------------------------------------------------------------------- 1 | import random 2 | #help(random) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/02-functions-use4.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | random.randint(1, 6) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/02-functions-use5.py: -------------------------------------------------------------------------------- 1 | # alternative using randrange 2 | random.randrange(1, 7) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/02-functions-use6.py: -------------------------------------------------------------------------------- 1 | print(""" 2 | Order of operations: 3 | - 1.1 * radiance = 1.1 4 | - 1.1 - 0.5 = 0.6 5 | - min(radiance, 0.6) = 0.6 6 | - 2.0 + 0.6 = 2.6 7 | - max(2.1, 2.6) = 2.6 8 | 9 | At the end, result = 2.6 10 | """) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/02-functions-use7.py: -------------------------------------------------------------------------------- 1 | pressure_hPa = 1010 2 | height = 2500 3 | 4 | pressure_hPa * math.exp(-gravit_acc * molar_mass_earth * height/(gas_constant * standard_temperature)) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/03-containers1.py: -------------------------------------------------------------------------------- 1 | pressures_hPa[2] = 1111 2 | pressures_hPa --------------------------------------------------------------------------------
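The `03-containers` solutions around this point mutate `pressures_hPa` in place. A short sketch contrasting index assignment with `insert()`, using a stand-in list (the notebook's actual values are not part of this listing):

```python
# Stand-in list, assumed purely for illustration.
pressures_hPa = [1013, 1003, 1010, 1020, 1032, 993, 989, 1018]

pressures_hPa[2] = 1111        # index assignment replaces the element; length unchanged
pressures_hPa.insert(4, 1212)  # insert() adds an element, shifting later ones to the right
print(pressures_hPa)
print(len(pressures_hPa))      # 9: one element replaced, one element added
```

Note that `insert(4, 1212)` never overwrites: the element previously at index 4 simply moves to index 5.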
/notebooks/python_intro/_solutions/03-containers2.py: -------------------------------------------------------------------------------- 1 | pressures_hPa.insert(4, 1212) 2 | pressures_hPa -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/03-containers3.py: -------------------------------------------------------------------------------- 1 | pressures_hPa[-3:] -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/03-containers4.py: -------------------------------------------------------------------------------- 1 | pressures_hPa[1::2] -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/03-containers5.py: -------------------------------------------------------------------------------- 1 | # Returns a sorted copy of the list (the original list `pressures_hPa` remains unchanged) 2 | print(sorted(pressures_hPa)) 3 | # The list method `sort` sorts the list in-place and does not return anything itself 4 | print(pressures_hPa.sort()) 5 | print(pressures_hPa) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/03-containers6.py: -------------------------------------------------------------------------------- 1 | a_third_list = ['red', 'blue', 'green', 'black', 'white'] 2 | a_third_list_reversed = a_third_list.copy() 3 | a_third_list_reversed.reverse() 4 | a_concatenated_list = a_third_list + a_third_list_reversed 5 | a_concatenated_list -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/03-containers7.py: -------------------------------------------------------------------------------- 1 | my_spell = "abracadabra" 2 | my_spell.upper() -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/03-containers8.py: -------------------------------------------------------------------------------- 1 | my_spell = "abracadabra" 2 | my_spell[1::2] -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/03-containers9.py: -------------------------------------------------------------------------------- 1 | report_location = "Nete" 2 | f"The measured dissolved oxygen in {report_location} on March 18th 2024 was {water_quality[report_location]} mg/l."
-------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/04-control-flow1.py: -------------------------------------------------------------------------------- 1 | # using the 'accumulator pattern' to count the number of letters 2 | acc = 0 3 | for letter in 'oxygen': 4 | acc += 1 # the in-place operator 5 | print(acc) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/04-control-flow2.py: -------------------------------------------------------------------------------- 1 | a = 0.43 2 | r = 1.35 3 | for conductivity in conductivities: 4 | print(a + conductivity * r) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/04-control-flow3.py: -------------------------------------------------------------------------------- 1 | indices = [] 2 | for j, pressure in enumerate(pressures_hPa): 3 | if pressure < 1000: 4 | indices.append(j) 5 | indices -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/04-control-flow4.py: -------------------------------------------------------------------------------- 1 | for location, do in water_quality.items(): 2 | if (do > 20) or (do < 5): 3 | print(f"Alert: Poor conditions measured at {location} with DO concentration of {do} mg/l.") -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/04-control-flow5.py: -------------------------------------------------------------------------------- 1 | for file_name in file_names: 2 | if file_name.startswith("sigma"): 3 | print(f"Processing file {file_name} with sigma pipeline.") 4 | elif file_name.startswith("ava"): 5 | print(f"Processing file {file_name} with avalanche pipeline.") 6 | else: 7 | print(f"Unrecognized file {file_name}") -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/05-functions-write1.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | def barometric_formula(pressure_sea_level, height=2500): 4 | """Apply barometric formula 5 | 6 | Apply the barometric formula to calculate the air pressure at a given height 7 | 8 | Parameters 9 | ---------- 10 | pressure_sea_level : float 11 | pressure, measured at sea level (hPa) 12 | height : float 13 | height above sea level (m) 14 | 15 | Notes 16 | ------ 17 | see https://www.math24.net/barometric-formula/ or 18 | https://en.wikipedia.org/wiki/Atmospheric_pressure 19 | """ 20 | standard_temperature = 288.15 21 | gas_constant = 8.3144598 22 | gravit_acc = 9.81 23 | molar_mass_earth = 0.02896 24 | 25 | pressure_altitude = pressure_sea_level * math.exp(-gravit_acc * molar_mass_earth * height/(gas_constant*standard_temperature)) 26 | return pressure_altitude 27 | 28 | barometric_formula(1010), barometric_formula(1010, 2750) -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/05-functions-write2.py: -------------------------------------------------------------------------------- 1 | pressures_hPa_1200 = [barometric_formula(pressure, 1200) for pressure in pressures_hPa] 2 | pressures_hPa_1200 -------------------------------------------------------------------------------- /notebooks/python_intro/_solutions/05-functions-write3.py:
-------------------------------------------------------------------------------- 1 | pressures_hPa_1200 = [] 2 | for pressure in pressures_hPa: 3 | pressures_hPa_1200.append(barometric_formula(pressure, 1200)) 4 | pressures_hPa_1200 -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/01-basic24.py: -------------------------------------------------------------------------------- 1 | a_third_list.count? -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/01-basic25.py: -------------------------------------------------------------------------------- 1 | a_third_list.index? -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/01-basic28.py: -------------------------------------------------------------------------------- 1 | a_third_list[::-1] -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/01-basic47.py: -------------------------------------------------------------------------------- 1 | [el for el in dir(list) if not el[0]=='_'] -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/01-basic49.py: -------------------------------------------------------------------------------- 1 | #split in words and get word lengths 2 | [len(word) for word in sentence.split()] -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/01-basic58.py: -------------------------------------------------------------------------------- 1 | str_key = [] 2 | for key in hourly_wage.keys(): 3 | str_key.append(str(key)) 4 | str_key -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/02-control_flow15.py: -------------------------------------------------------------------------------- 1 | # return the name of the company given a certain value between 1 and 5: 2 | for k in dd: 3 | if dd[k] == value: 4 | print(k.upper()) -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/02-control_flow16.py: -------------------------------------------------------------------------------- 1 | if 'antea' in dd.keys(): 2 | print('already in dictionary') -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/02-control_flow24.py: -------------------------------------------------------------------------------- 1 | sentence = "hello world! 
123" 2 | d = {"DIGITS": 0, "LETTERS": 0} 3 | for char in sentence: 4 | if char.isdigit(): 5 | d["DIGITS"] += 1 6 | elif char.isalpha(): 7 | d["LETTERS"] += 1 8 | else: 9 | pass 10 | print("LETTERS", d["LETTERS"]) 11 | print("DIGITS", d["DIGITS"]) -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/03-functions19.py: -------------------------------------------------------------------------------- 1 | def check_for_key(checkdict, key): 2 | """ 3 | Function checks the presence of key in dictionary checkdict and returns an 4 | exception if the key is already used in the dictionary 5 | 6 | """ 7 | if key in checkdict.keys(): 8 | raise Exception('Key already used in this dictionary') -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/03-functions27.py: -------------------------------------------------------------------------------- 1 | class Employee(): #object 2 | 3 | def __init__(self, name, wage=60.): 4 | """ 5 | Employee class to save the amount of hours worked and related earnings 6 | """ 7 | self.name = name 8 | self.wage = wage 9 | self.projects = {} 10 | 11 | def new_project(self, projectname): 12 | """ 13 | """ 14 | if projectname in self.projects: 15 | raise Exception("project already exist for", self.name) 16 | else: 17 | self.projects[projectname] = 0. 18 | 19 | 20 | def worked(self, hours, projectname): 21 | """add worked hours on a project 22 | """ 23 | try: 24 | hours = float(hours) 25 | except: 26 | raise Exception("Hours not convertable to float!") 27 | 28 | if not projectname in self.projects: 29 | raise Exception("project non-existing for", self.name) 30 | 31 | self.projects[projectname] += hours 32 | 33 | def calc_earnings(self): 34 | """ 35 | Calculate earnings 36 | """ 37 | total_hours = 0 38 | for val in self.projects.values(): 39 | total_hours += val 40 | 41 | return total_hours *self.wage 42 | 43 | def info(self): 44 | """ 45 | get info 46 | """ 47 | for proj, hour in self.projects.items(): 48 | print(hour, 'worked on project', proj) -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/05-numpy109.py: -------------------------------------------------------------------------------- 1 | # RESCALE: 2 | (Z - Z.min())/(Z.max() - Z.min()) -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/05-numpy137.py: -------------------------------------------------------------------------------- 1 | x, y = b_data[:,3], b_data[:,4] 2 | t = np.polyfit(x, y, 4) # fit a 2nd degree polynomial to the data, result is x**2 + 2x + 3 3 | t 4 | x.sort() 5 | plt.plot(x, y, 'o') 6 | plt.plot(x, t[0]*x**4 + t[1]*x**3 + t[2]*x**2 + t[3]*x +t[4], '-') -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/05-numpy34.py: -------------------------------------------------------------------------------- 1 | np.arange(10, 50, 1) -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/05-numpy35.py: -------------------------------------------------------------------------------- 1 | np.identity(3) -------------------------------------------------------------------------------- /notebooks/python_recap/_solutions/05-numpy36.py: -------------------------------------------------------------------------------- 1 | np.eye(3) 
/notebooks/python_recap/_solutions/05-numpy37.py:
--------------------------------------------------------------------------------
np.random.random((3, 3, 3))
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy58.py:
--------------------------------------------------------------------------------
vec = np.zeros(10)
vec[4] = 1.
vec
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy73.py:
--------------------------------------------------------------------------------
# SWAP the first two rows
A[[0, 1]] = A[[1, 0]]
A
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy75.py:
--------------------------------------------------------------------------------
AR[AR % 2 == 0] = 0.
AR
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/05-numpy77.py:
--------------------------------------------------------------------------------
AR[1::2] = 0
AR
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal1.py:
--------------------------------------------------------------------------------
height = 2500
pressure_hPa * math.exp(-gravit_acc * molar_mass_earth * height / (gas_constant * standard_temperature))
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal10.py:
--------------------------------------------------------------------------------
np.sqrt(AR2[AR2 > np.percentile(AR2, 75)])
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal11.py:
--------------------------------------------------------------------------------
AR3[np.isclose(AR3, -99)] = np.nan
AR3
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal12.py:
--------------------------------------------------------------------------------
[location.lower() for location in locations]
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal13.py:
--------------------------------------------------------------------------------
[location.lower() for location in locations]
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal2.py:
--------------------------------------------------------------------------------
import math

def barometric_formula(pressure_sea_level, height=2500):
    """Apply barometric formula

    Apply the barometric formula to calculate the air pressure at a given height

    Parameters
    ----------
    pressure_sea_level : float
        pressure, measured at sea level
    height : float
        height above sea level (m)

    Notes
    -----
    see https://www.math24.net/barometric-formula/ or
    https://en.wikipedia.org/wiki/Atmospheric_pressure
    """
    standard_temperature = 288.15
    gas_constant = 8.3144598
    gravit_acc = 9.81
    molar_mass_earth = 0.02896

    pressure_altitude = pressure_sea_level * math.exp(-gravit_acc *
        molar_mass_earth * height / (gas_constant * standard_temperature))
    return pressure_altitude
--------------------------------------------------------------------------------
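A quick usage sketch for the function above; the 1013.25 hPa value is only an assumed illustration (standard sea-level pressure):

barometric_formula(1013.25)               # default height of 2500 m
barometric_formula(1013.25, height=5000)  # explicit height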
/notebooks/python_recap/_solutions/python_rehearsal3.py:
--------------------------------------------------------------------------------
import math

def barometric_formula(pressure_sea_level, height=2500):
    """Apply barometric formula

    Apply the barometric formula to calculate the air pressure at a given height

    Parameters
    ----------
    pressure_sea_level : float
        pressure, measured at sea level
    height : float
        height above sea level (m)

    Notes
    -----
    see https://www.math24.net/barometric-formula/ or
    https://en.wikipedia.org/wiki/Atmospheric_pressure
    """
    if height > 11000:
        raise Exception("Barometric formula only valid for heights lower than 11000 m above sea level")

    standard_temperature = 288.15
    gas_constant = 8.3144598
    gravit_acc = 9.81
    molar_mass_earth = 0.02896

    pressure_altitude = pressure_sea_level * math.exp(-gravit_acc *
        molar_mass_earth * height / (gas_constant * standard_temperature))
    return pressure_altitude
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal4.py:
--------------------------------------------------------------------------------
for pressure in pressures_hPa:
    print(barometric_formula(pressure, 3000))
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal5.py:
--------------------------------------------------------------------------------
pressures_hPa_adjusted = [barometric_formula(pressure, 3000) for pressure in pressures_hPa]
pressures_hPa_adjusted
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal6.py:
--------------------------------------------------------------------------------
np_pressures_hPa * math.exp(-gravit_acc * molar_mass_earth * height / (gas_constant * standard_temperature))
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal7.py:
--------------------------------------------------------------------------------
sum(AR > 10)  # count the elements larger than 10
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal8.py:
--------------------------------------------------------------------------------
AR[AR % 2 == 0] = 0
AR
--------------------------------------------------------------------------------
/notebooks/python_recap/_solutions/python_rehearsal9.py:
--------------------------------------------------------------------------------
AR[1::2] = 30
AR
--------------------------------------------------------------------------------
/notebooks/python_recap/data/bogota_part_dataset.csv:
--------------------------------------------------------------------------------
DIA,SST AM,SSV AM,SSV PM,SSF PM
Unidad,mg/l,mg/l,mg/l,mg/l
,,,,
1,198,141,131,38
2,274,200,125,35
3,156,119,274,120
4,382,266,272,105
5,494,342,202,76
6,259,182,205,67
7,247,185,232,77
8,164,125,112,33
9,367,265,82,30
10,123,90,91,26
11,132,96,130,46
12,97,66,110,33
13,160,104,181,83
14,137,100,122,41
15,172,123,151,56
16,192,138,168,78
17,176,106,94,36
18,192,132,111,43
19,152,99,112,37
20,255,179,181,67
21,188,134,220,94
22,215,153,149,58
23,221,157,147,60
24,284,199,201,93
25,134,84,133,65
26,196,120,132,47
27,144,88,114,41
28,193,143,128,45
--------------------------------------------------------------------------------
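A loading sketch for this file (path assumed from the repository layout); the Spanish headers DIA and Unidad mean "day" and "unit", and the units row plus the empty row below the header are skipped:

import pandas as pd

# skip the units row (line 1) and the empty row (line 2) below the header
df = pd.read_csv("notebooks/python_recap/data/bogota_part_dataset.csv",
                 skiprows=[1, 2])
df.head()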
/notebooks/python_recap/data/out1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/python_recap/data/out1.txt
--------------------------------------------------------------------------------
/notebooks/python_recap/data/out2.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/python_recap/data/out2.txt
--------------------------------------------------------------------------------
/notebooks/python_recap/data/out3.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/python_recap/data/out3.txt
--------------------------------------------------------------------------------
/notebooks/python_recap/data/out4.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorisvandenbossche/DS-python-data-analysis/e1b5adad721f5febbff2c28d98b8b3e01d3b5d84/notebooks/python_recap/data/out4.txt
--------------------------------------------------------------------------------
/notebooks/python_recap/data/values.txt:
--------------------------------------------------------------------------------
0,09400 3,37968
0,28820 0,83214
0,06823 0,57102
0,65576 0,59619
-1,23714 0,03561
--------------------------------------------------------------------------------
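values.txt uses decimal commas, so a direct np.loadtxt call would fail; a loading sketch, assuming the repository layout above:

import io
import numpy as np

# replace decimal commas with points before parsing; the columns themselves
# are space-separated, so the replacement cannot merge fields
with open("notebooks/python_recap/data/values.txt") as f:
    cleaned = f.read().replace(",", ".")
data = np.loadtxt(io.StringIO(cleaned))
data.shape  # (5, 2)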