├── .Rprofile ├── .gitignore ├── 99-resources.Rmd ├── LICENSE ├── eda-course.Rproj ├── index.html ├── outline.md ├── proposals ├── eda-worksho.md ├── proposal-om.Rmd └── proposal-om.md ├── readme.Rmd ├── readme.md ├── renv.lock ├── renv ├── .gitignore ├── activate.R └── settings.dcf ├── renv_force.R └── slides ├── 00-intro.Rmd ├── 00-intro.html ├── 01-meet-r.Rmd ├── 01-meet-r.html ├── 01-meet-r_files └── figure-html │ ├── unnamed-chunk-3-1.svg │ ├── unnamed-chunk-49-1.svg │ └── unnamed-chunk-52-1.svg ├── 02-intro-to-tidyverse.Rmd ├── 02-intro-to-tidyverse.html ├── 03-intro-to-the-tidyverse.Rmd ├── 03-intro-to-the-tidyverse.html ├── 03-intro-to-the-tidyverse_files └── figure-html │ ├── output_squirrel-beeswarm_10-1.png │ ├── output_squirrel-beeswarm_10-1.svg │ ├── output_squirrel-beeswarm_11-1.svg │ ├── output_squirrel-beeswarm_12-1.svg │ ├── output_squirrel-beeswarm_14-1.png │ ├── output_squirrel-beeswarm_14-1.svg │ ├── output_squirrel-beeswarm_17-1.svg │ ├── output_squirrel-beeswarm_5-1.png │ ├── output_squirrel-beeswarm_5-1.svg │ ├── output_squirrel-beeswarm_6-1.png │ ├── output_squirrel-beeswarm_6-1.svg │ ├── output_squirrel-beeswarm_7-1.png │ ├── output_squirrel-beeswarm_7-1.svg │ ├── output_squirrel-beeswarm_8-1.png │ ├── output_squirrel-beeswarm_8-1.svg │ ├── output_squirrel-beeswarm_9-1.png │ ├── output_squirrel-beeswarm_9-1.svg │ ├── output_squirrel-height-boxplot_10-1.svg │ ├── output_squirrel-height-boxplot_11-1.svg │ ├── output_squirrel-height-boxplot_12-1.svg │ ├── output_squirrel-height-boxplot_13-1.svg │ ├── output_squirrel-height-boxplot_14-1.svg │ ├── output_squirrel-height-boxplot_4-1.png │ ├── output_squirrel-height-boxplot_4-1.svg │ ├── output_squirrel-height-boxplot_5-1.png │ ├── output_squirrel-height-boxplot_5-1.svg │ ├── output_squirrel-height-boxplot_6-1.png │ ├── output_squirrel-height-boxplot_6-1.svg │ ├── output_squirrel-height-boxplot_7-1.png │ ├── output_squirrel-height-boxplot_7-1.svg │ ├── output_squirrel-height-boxplot_8-1.png │ ├── 
output_squirrel-height-boxplot_8-1.svg │ ├── output_squirrel-height-boxplot_9-1.png │ ├── output_squirrel-height-boxplot_9-1.svg │ ├── output_squirrel-histogram_10-1.svg │ ├── output_squirrel-histogram_11-1.svg │ ├── output_squirrel-histogram_12-1.svg │ ├── output_squirrel-histogram_16-1.png │ ├── output_squirrel-histogram_16-1.svg │ ├── output_squirrel-histogram_17-1.png │ ├── output_squirrel-histogram_17-1.svg │ ├── output_squirrel-histogram_19-1.svg │ ├── output_squirrel-histogram_20-1.svg │ ├── output_squirrel-histogram_21-1.svg │ ├── output_squirrel-histogram_4-1.png │ ├── output_squirrel-histogram_4-1.svg │ ├── output_squirrel-histogram_6-1.png │ ├── output_squirrel-histogram_6-1.svg │ ├── output_squirrel-histogram_7-1.png │ ├── output_squirrel-histogram_7-1.svg │ ├── output_squirrel-histogram_8-1.png │ ├── output_squirrel-histogram_8-1.svg │ ├── output_squirrel-histogram_9-1.svg │ ├── output_squirrels-geo_10-1.svg │ ├── output_squirrels-geo_11-1.svg │ ├── output_squirrels-geo_12-1.svg │ ├── output_squirrels-geo_3-1.svg │ ├── output_squirrels-geo_4-1.svg │ ├── output_squirrels-geo_7-1.svg │ ├── output_squirrels-geo_8-1.svg │ ├── output_squirrels-geo_9-1.svg │ └── squirrels-geo-1.svg ├── 04-intro-to-tidyverse.Rmd ├── 04-intro-to-tidyverse.html ├── 05-intro-to-the-tidyverse.Rmd ├── 05-intro-to-the-tidyverse.html ├── 05-intro-to-the-tidyverse_files └── figure-html │ ├── output_count_na_4-1.svg │ ├── output_count_na_5-1.svg │ ├── output_count_na_6-1.svg │ ├── output_count_na_9-1.svg │ ├── output_count_na_exp_3-1.svg │ ├── output_count_na_exp_4-1.svg │ ├── output_count_na_exp_5-1.svg │ ├── output_count_na_exp_8-1.svg │ ├── unnamed-chunk-14-1.svg │ ├── unnamed-chunk-15-1.png │ └── unnamed-chunk-19-1.png ├── 06-your-turn.Rmd ├── 06-your-turn.html ├── 10-practical.Rmd ├── css └── remark.css ├── img ├── SVG │ ├── broom.svg │ ├── dplyr.svg │ ├── forcats.svg │ ├── ggplot2.svg │ ├── lubridate.svg │ ├── magrittr.svg │ ├── purrr.svg │ ├── readr.svg │ ├── readxl.svg │ ├── 
stringr.svg │ ├── tibble.svg │ ├── tidyr.svg │ └── tidyverse.svg ├── grammar-of-graphics.png ├── janitor.png ├── milano.svg ├── portrait.jpg ├── readr.svg └── squirrel-svgrepo-com.svg ├── libs-reveal └── xaringan_reveal_parentheses_balanced.R └── quotes.md /.Rprofile: -------------------------------------------------------------------------------- 1 | source("renv/activate.R") 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | data/* 4 | img/* 5 | 6 | proposals/*.docx 7 | proposals/*.pdf 8 | 9 | slides/js/* 10 | slides/libs/* 11 | slides/Rplots.* 12 | slides/index.html 13 | 14 | *.docx 15 | *.pdf 16 | # Generic R gitignore 17 | # from https://raw.githubusercontent.com/github/gitignore/master/R.gitignore 18 | 19 | # vignette stuff 20 | vignettes/.build.timestamp 21 | 22 | # History files 23 | .Rhistory 24 | .Rapp.history 25 | 26 | # Session Data files 27 | .RData 28 | 29 | # User-specific files 30 | .Ruserdata 31 | 32 | # Example code in package build process 33 | *-Ex.R 34 | 35 | # Output files from R CMD build 36 | /*.tar.gz 37 | 38 | # Output files from R CMD check 39 | /*.Rcheck/ 40 | 41 | # RStudio files 42 | .Rproj.user/ 43 | 44 | # produced vignettes 45 | vignettes/*.html 46 | vignettes/*.pdf 47 | 48 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 49 | .httr-oauth 50 | 51 | # knitr and R markdown default cache directories 52 | *_cache/ 53 | /cache/ 54 | 55 | # Temporary files created by R markdown 56 | *.utf8.md 57 | *.knit.md 58 | -------------------------------------------------------------------------------- /99-resources.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Resources" 3 | author: "Otho Mantegazza" 4 | date: "10/9/2019" 5 | output: html_document 6 | --- 7 | 8 | https://github.com/jkaupp/CEEA2017-Speaking-With-Data 9 | 
10 | https://towardsdatascience.com/a-comprehensive-guide-to-the-grammar-of-graphics-for-effective-visualization-of-multi-dimensional-1f92b4ed4149 -------------------------------------------------------------------------------- /eda-course.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Exploratory Data Analysis in R @ Unimi 6 | 7 | 8 | 9 | 10 | 11 | 12 |

Exploratory Data Analysis in R

13 | 14 |

Otho Mantegazza

15 | 16 |

17 | Github Repo 18 |

19 | 20 | 25 | -------------------------------------------------------------------------------- /outline.md: -------------------------------------------------------------------------------- 1 | 2 | # 0-Intro 3 | 4 | Data is the present. 5 | 6 | It is not the future, not something that we must prepare for. Data is the present. Most sectors today produce and deal with massive datasets and databases. The same is true for research: at this moment data are cheap to come by, heterogeneous and everywhere. 7 | 8 | 9 | # 01 - Let's make a histogram 10 | 11 | 12 | Read the data into an R object and store it into a variable. 13 | 14 | ## Load the data in an R object and store it in a variable 15 | 16 | Sorry, this part might be a bit boring, but R is the tool for data science. 17 | 18 | And to do any data science with it, you must first familiarize yourself with the tool. 19 | 20 | I'll try to give you a very minimal introduction to R, oriented to data exploration and data science. But you'll need to learn a bit of R programming anyway. 21 | 22 | ### Objects 23 | 24 | - Everything in R is an object: 25 | - R understands what you tell it, 26 | - everything is an object, and it has a class, 27 | - vectors collect objects of the same class, 28 | - We will skip matrices, everything else is a list. 29 | - lists can be nested. 30 | 31 | ### Variables 32 | 33 | - Unquoted text is interpreted as a variable 34 | - you can store in them any object you like 35 | - Once you call the variable it returns the object you associated to it, 36 | - Some variables are already occupied! 37 | 38 | Check the variables that already have data associated with them (in Rstudio), reflections on the Rectangular data format. 39 | Introduce concept of tidy data. 40 | 41 | ### Rectangular Data 42 | 43 | - Rectangular data, and rectangular data in R 44 | - most obvious and "common" form of data, 45 | - While you explore your data, eventually you will transform your data into rectangular form. 
46 | - dataframes (and tibbles) come from lists column-wise - thus each column is a vector. 47 | Each column has the same class, which makes sense from a data point of view: would you build 48 | a dataset row-wise? 49 | - But how would you import some data from outside R into R? You need functions. 50 | 51 | ### Functions 52 | 53 | - In R you do everything with functions: 54 | - Some variables already store functions. 55 | - You can inspect the body of the function, often it is just a collection of R code... 56 | it calls other functions. 57 | - a function takes some arguments and returns an output. 58 | - documentation: manual pages, vignettes, websites, articles/blog posts, books 59 | - you can provide a variable as an argument and save the output in another variable. 60 | 61 | 62 | ### Functions in packages 63 | 64 | - Why do we need to install packages? 65 | - You have primitive / low level functions and various levels of high-level functions, 66 | often redundant [readLines, read.csv, read_csv]. 67 | - Packages are a collection of functions (and metadata and documentation), 68 | - Primitive and internal functions are the building blocks of R and generally are written in C, 69 | - You can do everything with those building blocks, but we want our life easier, so we can use some blocks that are already assembled, 70 | - Those assembled blocks are the functions in packages, 71 | - Example, you need to import a CSV file in R --> readLines is primitive. 72 | - But you can call read.csv that reads the lines as if they were a CSV, 73 | - or read_csv, newer version. 74 | 75 | ### You can write your own function 76 | 77 | Better explained with scheme. 78 | 79 | ## Put it together 80 | 81 | Read the data into an R object and store it into a variable. 82 | 83 | Resource: https://evamaerey.github.io/ggplot_flipbook/ggplot_flipbook_xaringan.html 84 | 85 | 86 | # 02 - Analyze data with the tidyverse 87 | 88 | - Do you know what high level programming is? 
89 | 90 | - Tidyverse is a collection of high level packages and functions for data science! 91 | There are others, pick the one best suited for you. https://www.tidyverse.org/packages/ 92 | 93 | - I will not introduce them one by one because, to analyze data, we must use them together. 94 | 95 | - I will not go into detail on bioinformatics and Bioconductor (the other huge ecosystem of packages in R) because this is more generic and fundamental. Once you get the basic data exploration right, it is easier to switch to Bioconductor. 96 | 97 | ## Dataset 98 | 99 | NY Squirrel census: https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-10-29 100 | 101 | Because: 102 | 103 | - It is large enough to provide a challenge. 104 | - It is tidy and detailed, 105 | - It stores quantitative and categorical variables, 106 | - it stores spatial variables, 107 | - it stores time variables. 108 | 109 | 110 | ## Basic exercises of data manipulation 111 | 112 | Afterwards: 113 | 114 | ## With plots you can see things more clearly 115 | 116 | Graphical representation of data is a powerful tool to explore them. 117 | 118 | ggplot2 is one of the most powerful data visualization libraries available. 119 | 120 | It's not your only choice for data vis in R, but we will focus on it only, because it's the most widely used. 121 | 122 | Layered Grammar of Graphics. 123 | 124 | Compare ggplot2 and D3 125 | 126 | # Data cleaning 127 | 128 | This dataset has great examples of data cleaning 129 | 130 | https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-11-05 131 | 132 | # Practical session 133 | 134 | Explore your own dataset. 135 | 136 | 1. Select a dataset, I will provide three of them. But feel free to choose one of your own if you prefer. 
137 | 138 | - In that case, make sure that it is a tamed dataset, 139 | https://rstudio-pubs-static.s3.amazonaws.com/396363_adaf67178eab4bd793bd9dd17dda70b3.html 140 | https://r4ds.had.co.nz/tidy-data.html 141 | 142 | 2. Familiarize yourself with the topic. Know at least a bit of what the dataset is about. 143 | 144 | 3. Do you have a description of all the columns in the dataset? 145 | - If you don't have this information, can you gather it? 146 | - Do those descriptions match what you observe in R? If not, what does not match? 147 | - Is it an issue of how the data are loaded? Can you fix it? 148 | 149 | 4. Are the data tidy? 150 | - Are there NAs? How are NAs encoded? 151 | - Do the columns have practical names, or do they need renaming? 152 | - Are the data structured according to the tidy principles, or do they need spreading/gathering? 153 | 154 | - Each variable must have its own column. 155 | - Each observation must have its own row. 156 | - Each value must have its own cell. 157 | 158 | 159 | 5. How are the variables distributed? If they are categorical, which levels do they have? 160 | 161 | 6. Do you have a hypothesis about the main information hidden in the dataset? Do you know how to find it? 
162 | 163 | ## Possible datasets 164 | 165 | https://www.kaggle.com/osmi/mental-health-in-tech-survey 166 | 167 | 168 | ecology 169 | 170 | https://www.kaggle.com/footprintnetwork/ecological-footprint 171 | 172 | big https://www.kaggle.com/uciml/forest-cover-type-dataset 173 | 174 | https://www.kaggle.com/sogun3/uspollution 175 | 176 | https://www.kaggle.com/gustavomodelli/forest-fires-in-brazil 177 | 178 | 179 | 180 | Genetics 181 | 182 | https://www.kaggle.com/kevinarvai/clinvar-conflicting 183 | 184 | ## Deep Dives 185 | 186 | https://github.com/rfordatascience/tidytuesday/tree/master/data/2018/2018-10-02 187 | 188 | https://dati.comune.milano.it/dataset/ds417-rilevazione-qualita-aria-2019 189 | https://dati.comune.milano.it/dataset/ds417-rilevazione-qualita-aria-2019/resource/698a58e6-f276-44e1-92b1-3d2b81a4ad47 -------------------------------------------------------------------------------- /proposals/eda-worksho.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Introduction to Exploratory Data Analysis in R 3 | subtitle: Otho Mantegazza 4 | output: 5 | word_document: 6 | reference_docx: kjhealy-ref-mod2.docx 7 | --- 8 | 9 | # Introduction 10 | 11 | In this workshop, I'll introduce you to Exploratory Data Analysis (EDA). A part of statistics that gained attention and that was formalized only recently, EDA is a set of approaches and practices that allows you to explore a novel dataset and to efficiently extract observations and hypotheses from it. 12 | 13 | In this course you'll learn how to effectively analyze your data by quickly cycling through visualization and modeling methods. 14 | 15 | Since EDA is an intuitive and practical approach to data analysis, we will learn it in a practical way, by analyzing together various datasets in R. 16 | 17 | # Outline 18 | 19 | In two days, on real datasets, we will learn the basics of how to make hypotheses on data and how to test them. 
20 | 21 | Day 1 22 | : Learn how to detect patterns in your data by visualizing and transforming them. 23 | 24 | Day 2 25 | : Learn how to test hypothesis on your data with statistical models. 26 | 27 | # Prerequisites: 28 | 29 | You will need [R](https://cran.r-project.org/) and [Rstudio](https://www.rstudio.com/) installed on your laptop. 30 | 31 | Moreover, you will need to know at least a tiny bit of R before starting the workshop, 32 | 33 | I suggest that you figure out beforehand: 34 | 35 | - How to open, save and run an R script in Rstudio 36 | - How to manipulate a basic dataset (dplyr) 37 | - How to produce a simple plot from a dataset (for example a scatterplot) with ggplot2. 38 | - A bit of statistics? We will do with what you know, I'm still learning myself. 39 | 40 | To learn those skills, check the introductory chapters of any of these books. 41 | 42 | - https://r4ds.had.co.nz/ 43 | - https://bookdown.org/rdpeng/rprogdatascience/ 44 | - https://moderndive.com/2-getting-started.html 45 | 46 | At [Bookdown.org](https://bookdown.org/) You'll find many more. 47 | 48 | # Take Home 49 | 50 | 1. How to quickly produce many exploratory plots with `ggplot2`. 51 | 52 | 2. Basic modeling in R, how to produce many models. 53 | 54 | 3. Train - Test set reasoning to formulate and test hypothesis on data. 55 | -------------------------------------------------------------------------------- /proposals/proposal-om.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Exploratory Data Analysis (EDA) in R" 3 | author: "[Otho Mantegazza](https://otho.netlify.com)" 4 | date: "`r Sys.Date()`" 5 | output: pdf_document 6 | urlcolor: Cerulean 7 | --- 8 | 9 | Most experiments produce data, and if you are able to analyze those data efficiently, you'll have an edge on your work. 10 | 11 | [**EDA**](https://r4ds.had.co.nz/exploratory-data-analysis.html) is a practical and empirical approach to data analysis. 
It does not require strong mathematical basis, and it teaches you how to approach your data empirically, and how to build assumptions and hypothesis step by step. 12 | 13 | In this workshop we will learn EDA in R. 14 | 15 | # Topics 16 | 17 | The topics for the workshop are: 18 | 19 | 1. Reading/Loading Data, 20 | 2. Tidying and transforming Data, 21 | 3. Visualization, 22 | 4. Modeling (just a tiny bit) 23 | 24 | Which are all skills that you'll need to explore your data. 25 | 26 | # Tools (R) 27 | 28 | We will do everything in R with [**Tidyverse**](https://www.tidyverse.org/) packages, for their technical superiority. We will start from the basis and **you can join also if you are new to R**. 29 | 30 | # Datasets 31 | 32 | We will learn those topics by exploring and analyzing together real life datasets. 33 | 34 | We will start from a simple one, that can be loaded in R and visualized directly, and move to a more complicated one that must be tidied and transformed before it can be explored and visualized, and, last, we will explore one that requires also some level of statistical modeling to extract and detect the information that it contains. 35 | 36 | I plan to take the datasets from the [**Tidytuesday**](https://github.com/rfordatascience/tidytuesday) repo, which is a weekly social project in R, and a great tool to practice EDA. 37 | 38 | The dataset could be: 39 | 40 | 1. (Easy) [**Nobel Prize Winners**](https://github.com/rfordatascience/tidytuesday/blob/master/data/2019/2019-05-14/nobel_winners.csv), which can be loaded and visualized right away, 41 | 2. (Medium) [**Malaria Atlas**](https://github.com/rfordatascience/tidytuesday/tree/master/data/2018/2018-11-13), which requires some cleaning and aggregation before it can be explored and visualized 42 | 3. (Difficult) [**Police Stop Counts**](https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-03-19) Which could be modeled as Poisson. 
(or maybe we could use directly the count matrix of a multivariate RNAseq to get into to the molecular theme). 43 | 44 | 45 | # Duration 46 | 47 | The workshop will last 3 days. -------------------------------------------------------------------------------- /proposals/proposal-om.md: -------------------------------------------------------------------------------- 1 | Exploratory Data Analysis (EDA) in R 2 | ================ 3 | [Otho Mantegazza](https://otho.netlify.com) 4 | 2019-07 5 | 6 | Most experiments produce data, and if you are able to analyze those data 7 | efficiently, you’ll have an edge on your work. 8 | 9 | [**EDA**](https://r4ds.had.co.nz/exploratory-data-analysis.html) is a 10 | practical and empirical approach to data analysis. It does not require 11 | strong mathematical basis, and it teaches you how to approach your data 12 | empirically, and how to build assumptions and hypothesis step by step. 13 | 14 | In this workshop we will learn EDA in R. 15 | 16 | # Topics 17 | 18 | The topics for the workshop are: 19 | 20 | 1. Reading/Loading Data, 21 | 2. Tidying and transforming Data, 22 | 3. Visualization, 23 | 4. Modeling (just a tiny bit) 24 | 25 | Which are all skills that you’ll need to explore your data. 26 | 27 | # Tools (R) 28 | 29 | We will do everything in R with 30 | [**Tidyverse**](https://www.tidyverse.org/) packages, for their 31 | technical superiority. We will start from the basis and **you can join 32 | also if you are new to R**. 33 | 34 | # Datasets 35 | 36 | We will learn those topics by exploring and analyzing together real life 37 | datasets. 38 | 39 | We will start from a simple one, that can be loaded in R and visualized 40 | directly, and move to a more complicated one that must be tidied and 41 | transformed before it can be explored and visualized, and, last, we will 42 | explore one that requires also some level of statistical modeling to 43 | extract and detect the information that it contains. 
44 | 45 | I plan to take the datasets from the 46 | [**Tidytuesday**](https://github.com/rfordatascience/tidytuesday) repo, 47 | which is a weekly social project in R, and a great tool to practice EDA. 48 | 49 | The dataset could be: 50 | 51 | 1. (Easy) [**Nobel Prize 52 | Winners**](https://github.com/rfordatascience/tidytuesday/blob/master/data/2019/2019-05-14/nobel_winners.csv), 53 | which can be loaded and visualized right away, 54 | 2. (Medium) [**Malaria 55 | Atlas**](https://github.com/rfordatascience/tidytuesday/tree/master/data/2018/2018-11-13), 56 | which requires some cleaning and aggregation before it can be 57 | explored and visualized 58 | 3. (Difficult) [**Police Stop 59 | Counts**](https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-03-19) 60 | Which could be modeled as Poisson. (or maybe we could use directly 61 | the count matrix of a multivariate RNAseq to get into to the 62 | molecular theme). 63 | 64 | # Duration 65 | 66 | The workshop will last 3 days. 67 | -------------------------------------------------------------------------------- /readme.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: ':milky_way: Material for an Upcoming Course in EDA' 3 | output: 4 | github_document: 5 | toc: true 6 | --- 7 | 8 | :rocket: Work in progress :construction_worker: 9 | 10 | 11 | # :notes: Info 12 | 13 | The course will be hands-on. We have access to a computer room, but if it is possible, I would **suggest** you to **bring your own laptop**. In this way you will be sure to have R and Rstudio installed on your laptop, and after the workshop you will be ready to start making your own data explorations. 14 | 15 | 16 | ## :hammer: Tools 17 | 18 | :floppy_disk: You can install R and Rstudio to your laptop. 
19 | 20 | - :link: [R - CRAN](https://cran.r-project.org/) 21 | - :link: [Rstudio](https://rstudio.com/products/rstudio/download/#download) 22 | 23 | Afterwards, you can install the [Tidyverse :milky_way:](https://www.tidyverse.org/), which collects most of the packages that we will use for our explorations. To install it, open Rstudio and type in your R console: 24 | 25 | ```{r, eval = FALSE} 26 | install.packages("tidyverse") 27 | ``` 28 | 29 | If you get any :x: error message, we will fix it together :sparkler:. 30 | 31 | 32 | Otherwise, [Rstudio :cloud: cloud](https://rstudio.cloud/) lets you run Rstudio in the cloud. 33 | 34 | # :snowboarder: Slides 35 | 36 | 1. :link: [Introduction](https://othomantegazza.github.io/eda-class/slides/00-intro.html#1) 37 | 38 | My contact details and not much else... 39 | 40 | 1. :link: [Meet R](https://othomantegazza.github.io/eda-class/slides/01-meet-r.html#1) 41 | 42 | What is an object in R? What is a variable? Why do we need functions? 43 | 44 | 1. :link: [Load and Manipulate Data - *Tidyverse, part 1*](https://othomantegazza.github.io/eda-class/slides/02-intro-to-tidyverse.html#1) 45 | 46 | A quick introduction to the tidyverse, including how to manipulate data with [dplyr](https://dplyr.tidyverse.org/articles/dplyr.html) and how to [pipe](https://magrittr.tidyverse.org/) many steps of your analysis. 47 | 48 | 1. :link: [Visualize Data - *Tidyverse, part 2*](https://othomantegazza.github.io/eda-class/slides/03-intro-to-the-tidyverse.html#1) 49 | 50 | Build a graphical representation of your data with ggplot2. 51 | 52 | 1. :link: [Clean Data - *Tidyverse, part 3*](https://othomantegazza.github.io/eda-class/slides/04-intro-to-tidyverse.html#1) 53 | 54 | Most of the time you'll need to clean and reshape your data with [Tidyr](https://tidyr.tidyverse.org/) and [Janitor](https://sfirke.github.io/janitor/). 55 | 56 | 1. 
:link: [More practice - *Tidyverse, part 4*](https://othomantegazza.github.io/eda-class/slides/05-intro-to-the-tidyverse.html#1) 57 | 58 | Practice more Exploratory Data Analysis with Open Data from the City of Milan. 59 | 60 | 1. :link: [Your Turn!](https://othomantegazza.github.io/eda-class/slides/06-your-turn.html) 61 | 62 | Pick a dataset and explore it! 63 | 64 | # :books: Resources 65 | 66 | The R community is active online, and committed to creating a friendly and welcoming environment for everybody. 67 | 68 | This includes writing outstanding :book: open access material that you can use to learn R :whale:. 69 | 70 | 71 | ## :rice: R Building Blocks 72 | 73 | - :link: [R programming for Data Science - Roger D. Peng](https://bookdown.org/rdpeng/rprogdatascience/) - :tiger: Jump start your R! 74 | - :link: [Advanced R - Hadley Wickham](https://adv-r.hadley.nz/) - :elephant: Everything you wish to know about R. 75 | 76 | ## :milky_way: R for Data Science 77 | 78 | :saxophone: Remember to read the articles on the packages' websites!! :saxophone: 79 | 80 | - :link: [R for Data Science - Grolemund, Wickham](https://r4ds.had.co.nz/) - :bird: An overview of most data science topics, with great tips. 81 | - :link: [Introduction to Statistical Learning in R - Gareth James et al.,](https://faculty.marshall.usc.edu/gareth-james/ISL/) - :dog: Kick start your statistical models. 82 | 83 | Check the [:books: bookdown](https://bookdown.org/) repository for more books on data science, including [:earth_africa: geocomputation](https://geocompr.robinlovelace.net/), [:tophat: forecasting](https://otexts.com/fpp2/) and [:pick: text mining](https://www.tidytextmining.com/)! 84 | 85 | 86 | ## :art: Visualization in R 87 | 88 | - :link: [Data Visualization - Kieran Healy](https://socviz.co) - :tropical_fish: Communication oriented data visualization in R. 
89 | - :link: [R Graphics Cookbook - Winston Chang](https://r-graphics.org/) - :octopus: Practical introduction to visualization with ggplot2. 90 | 91 | Also, check the [Viz chapters in "R for Data science"](https://r4ds.had.co.nz/data-visualisation.html) (see above) :point_up:. 92 | 93 | ## :blossom: Life Science 94 | 95 | - :link: [HarvardX Biomedical Data Science Open Online Training - Love, Irizarry](https://rafalab.github.io/pages/harvardx.html) - :snail: Full course on R for life science. 96 | - :link: It goes together with [this book](https://rafalab.github.io/dsbook/). 97 | 98 | ## :hibiscus: Extra 99 | 100 | Did I mention that the R community is great? Online you can find wonderful learning material. 101 | 102 | ### Gina Reynolds' Flipbooks 103 | 104 | by [@EvaMaeRey](https://twitter.com/EvaMaeRey) 105 | 106 | - :link: [GGplot flipbook](https://evamaerey.github.io/ggplot_flipbook/ggplot_flipbook_xaringan.html#1). 107 | - :link: [Tidyverse in Action](https://evamaerey.github.io/tidyverse_in_action/tidyverse_in_action.html#1) 108 | - :link: [Interactive Maps](https://evamaerey.github.io/little_flipbooks_library/leaflet/leaflet#1) 109 | 110 | ...and [Others](https://github.com/EvaMaeRey/little_flipbooks_library) 111 | 112 | ### Dataviz and R blogs 113 | 114 | - :link: [Alison Hill - Data Scientist & Professional Educator](https://alison.rbind.io/), 115 | 116 | ### Data Art and Great Unconventional Viz 117 | 118 | - :link: [Fronkonstin - Experiments in R](https://fronkonstin.com/), by [@aschinchon](https://twitter.com/aschinchon). 119 | - :link: [Data Imaginist](https://www.data-imaginist.com/), by [@thomasp85](https://twitter.com/thomasp85). 
120 | - :link: [Chi's Impe[r]fect Blog](https://chichacha.netlify.com/), by [@chisatini](https://twitter.com/chisatini) 121 | 122 | 123 | Also check out the work of [Cédric Scherer](https://twitter.com/CedScherer), [Sil Aarts](https://silaarts.netlify.com/post/config-file/), [Jake Kaupp](https://twitter.com/jakekaupp) and [many other TidyTuesdayers](https://nsgrantham.shinyapps.io/tidytuesdayrocks/) with Neal Grantham's app. 124 | 125 | *This is a mostly incomplete list, suggestions are welcome!* :raised_hands: 126 | 127 | # :violin: Practice 128 | 129 | - :link: [Tidy Tuesday](https://github.com/rfordatascience/tidytuesday) - :fish_cake: Best community, weekly social data exercises in R. (check also the [R4DS learning community](https://www.jessemaegan.com/post/r4ds-the-next-iteration/)) 130 | - :link: [Kaggle](https://www.kaggle.com/) - :shaved_ice: Advanced Data Science and Machine Learning community. 131 | - :link: [Data is Beautiful - Reddit](https://www.reddit.com/r/dataisbeautiful/) :oden: - Monthly data visualization competitions. 132 | 133 | # :raised_hands: Acknowledgements 134 | 135 | I would like to thank the [University of Milano](https://www.unimi.it/it) and the [PhD School in Molecular and Cell Biology](http://eng.dbs.unimi.it/ecm/home/teaching/doctoral-schools/molecular-and-cellular-biology) for financing and hosting this workshop. Thanks to [Accurat](https://www.accurat.it/) for the great support. 136 | 137 | 138 | :mortar_board: Best! 
-------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | :milky\_way: Material for a Class in Exploratory Data Analysis 2 | ================ 3 | 4 | - [:notes: Info](#notes-info) 5 | - [:hammer: Tools](#hammer-tools) 6 | - [:snowboarder: Slides](#snowboarder-slides) 7 | - [:books: Resources](#books-resources) 8 | - [:rice: R Building Blocks](#rice-r-building-blocks) 9 | - [:milky\_way: R for Data Science](#milky_way-r-for-data-science) 10 | - [:art: Visualization in R](#art-visualization-in-r) 11 | - [:blossom: Life Science](#blossom-life-science) 12 | - [:hibiscus: Extra](#hibiscus-extra) 13 | - [Gina Reynolds’ Flipbooks](#gina-reynolds-flipbooks) 14 | - [Dataviz and R blogs](#dataviz-and-r-blogs) 15 | - [Data Art and Great Unconventional 16 | Viz](#data-art-and-great-unconventional-viz) 17 | - [:violin: Practice](#violin-practice) 18 | - [:raised\_hands: Acknowledgements](#raised_hands-acknowledgements) 19 | 20 | :rocket: Work in progress :construction\_worker: 21 | 22 | # :notes: Info 23 | 24 | The course will be hands-on. We have access to a computer room, but if 25 | it is possible, I would **suggest** you to **bring your own laptop**. In 26 | this way you will be sure to have R and Rstudio installed on your 27 | laptop, and after the workshop you will be ready to start making your 28 | own data explorations. 29 | 30 | ## :hammer: Tools 31 | 32 | :floppy\_disk: You can install R and Rstudio to your laptop. 33 | 34 | - :link: [R - CRAN](https://cran.r-project.org/) 35 | - :link: 36 | [Rstudio](https://rstudio.com/products/rstudio/download/#download) 37 | 38 | Afterwards, you can install the [Tidyverse 39 | :milky\_way:](https://www.tidyverse.org/), which collects most of the 40 | packages that we will use for our explorations. 
To install it, open 41 | Rstudio and type in your R console: 42 | 43 | ``` r 44 | install.packages("tidyverse") 45 | ``` 46 | 47 | If you get any :x: error message, we will fix it together :sparkler:. 48 | 49 | Otherwise, [Rstudio :cloud: cloud](https://rstudio.cloud/) let’s you run 50 | Rstudio in cloud computing. 51 | 52 | # :snowboarder: Slides 53 | 54 | 1. :link: 55 | [Introduction](https://othomantegazza.github.io/eda-class/slides/00-intro.html#1) 56 | 57 | My contact details and not much else… 58 | 59 | 2. :link: [Meet 60 | R](https://othomantegazza.github.io/eda-class/slides/01-meet-r.html#1) 61 | 62 | What is an object in R? What is a variable? Why do we need 63 | functions? 64 | 65 | 3. :link: [Load and Manipulate Data - *Tidyverse, part 66 | 1*](https://othomantegazza.github.io/eda-class/slides/02-intro-to-tidyverse.html#1) 67 | 68 | A quick introduction to the tidyverse, including how to manipulate 69 | data with [dplyr](https://dplyr.tidyverse.org/articles/dplyr.html) 70 | and how to [pipe](https://magrittr.tidyverse.org/) many steps of 71 | your analysis. 72 | 73 | 4. :link: [Visualize Data - *Tidyverse, part 74 | 2*](https://othomantegazza.github.io/eda-class/slides/03-intro-to-the-tidyverse.html#1) 75 | 76 | Build a graphical representation of your data with ggplot2. 77 | 78 | 5. :link: [Clean Data - *Tidyverse, part 79 | 3*](https://othomantegazza.github.io/eda-class/slides/04-intro-to-tidyverse.html#1) 80 | 81 | Most of the time you’ll need to clean and reashape your data with 82 | [Tidyr](https://tidyr.tidyverse.org/) and 83 | [Janitor](https://sfirke.github.io/janitor/). 84 | 85 | 6. :link: [More practice - *Tidyverse, part 86 | 4*](https://othomantegazza.github.io/eda-class/slides/05-intro-to-the-tidyverse.html#1) 87 | 88 | Practice more Exploratory Data Analysis with Open Data from the City 89 | of Milan. 90 | 91 | 7. 
:link: [Your 92 | Turn\!](https://othomantegazza.github.io/eda-class/slides/06-your-turn.html) 93 | 94 | Pick a dataset and explore it\! 95 | 96 | 97 | [Quotes' authors](slides/quotes.md) 98 | 99 | # :books: Resources 100 | 101 | The R community is active online, and committed to creating a friendly and 102 | welcoming environment for all newcomers. 103 | 104 | This includes writing outstanding :book: open access material that you 105 | can use to learn R :whale:. 106 | 107 | ## :rice: R Building Blocks 108 | 109 | - :link: [R programming for Data Science - Roger D. 110 | Peng](https://bookdown.org/rdpeng/rprogdatascience/) - :tiger: Jump 111 | start your R\! 112 | - :link: [Advanced R - Hadley Wickham](https://adv-r.hadley.nz/) - 113 | :elephant: Everything you wish to know about R. 114 | 115 | ## :milky\_way: R for Data Science 116 | 117 | :saxophone: Remember to read the articles on the packages’ website\!\! 118 | :saxophone: 119 | 120 | - :link: [R for Data Science - Grolemund, 121 | Wickham](https://r4ds.had.co.nz/) - :bird: An overview of most data 122 | science topics, with great tips. 123 | - :link: [Introduction to Statistical Learning in R - Gareth James et 124 | al.,](https://faculty.marshall.usc.edu/gareth-james/ISL/) - :dog: 125 | Kick start your statistical models. 126 | 127 | Check the [:books: bookdown](https://bookdown.org/) repository for more 128 | books on data science, including [:earth\_africa: 129 | geocomputation](https://geocompr.robinlovelace.net/), [:tophat: 130 | forecasting](https://otexts.com/fpp2/) and [:pick: text 131 | mining](https://www.tidytextmining.com/)\! 132 | 133 | ## :art: Visualization in R 134 | 135 | - :link: [Data Visualization - Kieran Healy](https://socviz.co) - 136 | :tropical\_fish: Communication oriented data visualization in R. 137 | - :link: [R Graphics Cookbook - Winston 138 | Chang](https://r-graphics.org/) - :octopus: Practical introduction 139 | to visualization with ggplot2. 
140 | 141 | Also, check the [Viz chapters in “R for Data 142 | science”](https://r4ds.had.co.nz/data-visualisation.html) (see above) 143 | :point\_up:. 144 | 145 | ## :blossom: Life Science 146 | 147 | - :link: [HarvardX Biomedical Data Science Open Online Training - 148 | Love, Irizarry](https://rafalab.github.io/pages/harvardx.html) - 149 | :snail: Full course on R for life science. 150 | - :link: It goes together with [this 151 | book](https://rafalab.github.io/dsbook/). 152 | 153 | ## :hibiscus: Extra 154 | 155 | Did I mention that the R community is great? Online you can find 156 | wonderful learning material. 157 | 158 | ### Gina Reynolds’ Flipbooks 159 | 160 | by \[@EvaMaeRay\]() 161 | 162 | - :link: [GGplot 163 | flipbook](https://evamaerey.github.io/ggplot_flipbook/ggplot_flipbook_xaringan.html#1). 164 | - :link: [Tidyverse in 165 | Action](https://evamaerey.github.io/tidyverse_in_action/tidyverse_in_action.html#1) 166 | - :link: [Interactive 167 | Maps](https://evamaerey.github.io/little_flipbooks_library/leaflet/leaflet#1) 168 | 169 | …and [Others](https://github.com/EvaMaeRey/little_flipbooks_library) 170 | 171 | ### Dataviz and R blogs 172 | 173 | - :link: [Alison Hill - Data Scientist & Professional 174 | Educator](https://alison.rbind.io/), 175 | 176 | ### Data Art and Great Unconventional Viz 177 | 178 | - :link: [Fronkonstion - Experiments in R](https://fronkonstin.com/), 179 | by \[@aschinchon\](). 180 | - :link: [Data Imaginist](https://www.data-imaginist.com/), by 181 | \[@thomasp85\](). 182 | - :link: [Chi’s Impe\[r\]fect Blog](https://chichacha.netlify.com/), 183 | by \[@chisatini\]() 184 | 185 | Check out also the work of [Cédric 186 | Scherer](https://twitter.com/CedScherer), [Sil 187 | Aarts](https://silaarts.netlify.com/post/config-file/), [Jake 188 | Kaupp](https://twitter.com/jakekaupp) and [many other 189 | TidyTuesdaers](https://nsgrantham.shinyapps.io/tidytuesdayrocks/) with 190 | Neal Grantham’s app. 
191 | 192 | *This is a mostly incomplete list, suggestions are welcome\!* 193 | :raised\_hands: 194 | 195 | # :violin: Practice 196 | 197 | - :link: [Tidy 198 | Tuesday](https://github.com/rfordatascience/tidytuesday) - 199 | :fish\_cake: Best community, weekly social data exercises in R. 200 | (check also the [R4DS learning 201 | community](https://www.jessemaegan.com/post/r4ds-the-next-iteration/)) 202 | - :link: [Kaggle](https://www.kaggle.com/) - :shaved\_ice: Advanced 203 | Data Science and Machine Learning community. 204 | - :link: [Data is Beautiful - 205 | Reddit](https://www.reddit.com/r/dataisbeautiful/) :oden: - Monthly 206 | data visualization competitions. 207 | 208 | # :raised\_hands: Acknowledgements 209 | 210 | I would like to thank the [University of 211 | Milano](https://www.unimi.it/it) and the [PhD School in Molecular 212 | and Cell 213 | Biology](http://eng.dbs.unimi.it/ecm/home/teaching/doctoral-schools/molecular-and-cellular-biology) 214 | for financing and hosting this workshop. Thanks to 215 | [Accurat](https://www.accurat.it/) for the great support. 216 | 217 | :mortar\_board: Best\! 
218 | -------------------------------------------------------------------------------- /renv/.gitignore: -------------------------------------------------------------------------------- 1 | library/ 2 | python/ 3 | staging/ 4 | -------------------------------------------------------------------------------- /renv/activate.R: -------------------------------------------------------------------------------- 1 | 2 | local({ 3 | 4 | # the requested version of renv 5 | version <- "0.8.3" 6 | 7 | # avoid recursion 8 | if (!is.na(Sys.getenv("RENV_R_INITIALIZING", unset = NA))) 9 | return(invisible(TRUE)) 10 | 11 | # signal that we're loading renv during R startup 12 | Sys.setenv("RENV_R_INITIALIZING" = "true") 13 | on.exit(Sys.unsetenv("RENV_R_INITIALIZING"), add = TRUE) 14 | 15 | # signal that we've consented to use renv 16 | options(renv.consent = TRUE) 17 | 18 | # load the 'utils' package eagerly -- this ensures that renv shims, which 19 | # mask 'utils' packages, will come first on the search path 20 | library(utils, lib.loc = .Library) 21 | 22 | # check to see if renv has already been loaded 23 | if ("renv" %in% loadedNamespaces()) { 24 | 25 | # if renv has already been loaded, and it's the requested version of renv, 26 | # nothing to do 27 | spec <- .getNamespaceInfo(.getNamespace("renv"), "spec") 28 | if (identical(spec[["version"]], version)) 29 | return(invisible(TRUE)) 30 | 31 | # otherwise, unload and attempt to load the correct version of renv 32 | unloadNamespace("renv") 33 | 34 | } 35 | 36 | # construct path to renv in library 37 | libpath <- local({ 38 | 39 | root <- Sys.getenv("RENV_PATHS_LIBRARY", unset = "renv/library") 40 | prefix <- paste("R", getRversion()[1, 1:2], sep = "-") 41 | 42 | # include SVN revision for development versions of R 43 | # (to avoid sharing platform-specific artefacts with released versions of R) 44 | devel <- 45 | identical(R.version[["status"]], "Under development (unstable)") || 46 | identical(R.version[["nickname"]], 
"Unsuffered Consequences") 47 | 48 | if (devel) 49 | prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") 50 | 51 | file.path(root, prefix, R.version$platform) 52 | 53 | }) 54 | 55 | # try to load renv from the project library 56 | if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) 57 | return(renv::load()) 58 | 59 | # failed to find renv locally; we'll try to install from GitHub. 60 | # first, set up download options as appropriate (try to use GITHUB_PAT) 61 | install_renv <- function() { 62 | 63 | message("Failed to find installation of renv -- attempting to bootstrap...") 64 | 65 | # ensure .Rprofile doesn't get executed 66 | rpu <- Sys.getenv("R_PROFILE_USER", unset = NA) 67 | Sys.setenv(R_PROFILE_USER = "") 68 | on.exit({ 69 | if (is.na(rpu)) 70 | Sys.unsetenv("R_PROFILE_USER") 71 | else 72 | Sys.setenv(R_PROFILE_USER = rpu) 73 | }, add = TRUE) 74 | 75 | # prepare download options 76 | pat <- Sys.getenv("GITHUB_PAT") 77 | if (nzchar(Sys.which("curl")) && nzchar(pat)) { 78 | fmt <- "--location --fail --header \"Authorization: token %s\"" 79 | extra <- sprintf(fmt, pat) 80 | saved <- options("download.file.method", "download.file.extra") 81 | options(download.file.method = "curl", download.file.extra = extra) 82 | on.exit(do.call(base::options, saved), add = TRUE) 83 | } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { 84 | fmt <- "--header=\"Authorization: token %s\"" 85 | extra <- sprintf(fmt, pat) 86 | saved <- options("download.file.method", "download.file.extra") 87 | options(download.file.method = "wget", download.file.extra = extra) 88 | on.exit(do.call(base::options, saved), add = TRUE) 89 | } 90 | 91 | # fix up repos 92 | repos <- getOption("repos") 93 | on.exit(options(repos = repos), add = TRUE) 94 | repos[repos == "@CRAN@"] <- "https://cloud.r-project.org" 95 | options(repos = repos) 96 | 97 | # check for renv on CRAN matching this version 98 | db <- as.data.frame(available.packages(), stringsAsFactors = FALSE) 99 | if 
("renv" %in% rownames(db)) { 100 | entry <- db["renv", ] 101 | if (identical(entry$Version, version)) { 102 | message("* Installing renv ", version, " ... ", appendLF = FALSE) 103 | dir.create(libpath, showWarnings = FALSE, recursive = TRUE) 104 | utils::install.packages("renv", lib = libpath, quiet = TRUE) 105 | message("Done!") 106 | return(TRUE) 107 | } 108 | } 109 | 110 | # try to download renv 111 | message("* Downloading renv ", version, " ... ", appendLF = FALSE) 112 | prefix <- "https://api.github.com" 113 | url <- file.path(prefix, "repos/rstudio/renv/tarball", version) 114 | destfile <- tempfile("renv-", fileext = ".tar.gz") 115 | on.exit(unlink(destfile), add = TRUE) 116 | utils::download.file(url, destfile = destfile, mode = "wb", quiet = TRUE) 117 | message("Done!") 118 | 119 | # attempt to install it into project library 120 | message("* Installing renv ", version, " ... ", appendLF = FALSE) 121 | dir.create(libpath, showWarnings = FALSE, recursive = TRUE) 122 | 123 | # invoke using system2 so we can capture and report output 124 | bin <- R.home("bin") 125 | exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" 126 | r <- file.path(bin, exe) 127 | args <- c("--vanilla", "CMD", "INSTALL", "-l", shQuote(libpath), shQuote(destfile)) 128 | output <- system2(r, args, stdout = TRUE, stderr = TRUE) 129 | message("Done!") 130 | 131 | # check for successful install 132 | status <- attr(output, "status") 133 | if (is.numeric(status) && !identical(status, 0L)) { 134 | text <- c("Error installing renv", "=====================", output) 135 | writeLines(text, con = stderr()) 136 | } 137 | 138 | 139 | } 140 | 141 | try(install_renv()) 142 | 143 | # try again to load 144 | if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { 145 | message("Successfully installed and loaded renv ", version, ".") 146 | return(renv::load()) 147 | } 148 | 149 | # failed to download or load renv; warn the user 150 | msg <- c( 151 | "Failed to find an renv 
installation: the project will not be loaded.", 152 | "Use `renv::activate()` to re-initialize the project." 153 | ) 154 | 155 | warning(paste(msg, collapse = "\n"), call. = FALSE) 156 | 157 | }) 158 | -------------------------------------------------------------------------------- /renv/settings.dcf: -------------------------------------------------------------------------------- 1 | external.libraries: 2 | ignored.packages: 3 | snapshot.type: packrat 4 | use.cache: TRUE 5 | -------------------------------------------------------------------------------- /renv_force.R: -------------------------------------------------------------------------------- 1 | library(svglite) 2 | library(hexbin) -------------------------------------------------------------------------------- /slides/00-intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction" 3 | author: "Otho Mantegazza
`r Sys.Date()`" 4 | subtitle: "“All R is quite useless.”" 5 | output: 6 | xaringan::moon_reader: 7 | lib_dir: libs 8 | css: "css/remark.css" 9 | # chakra: libs/remark-js-latest-min.js 10 | nature: 11 | highlightStyle: github 12 | highlightLines: true 13 | countIncrementalSlides: false 14 | ratio: '16:9' 15 | editor_options: 16 | chunk_output_type: console 17 | --- 18 | 19 | class: middle, center 20 | 21 | ```{r setup, include=FALSE} 22 | knitr::opts_chunk$set(dev = "svglite") 23 | # source(file = here::here("slides/libs-reveal/xaringan_reveal_parentheses_balanced.R")) 24 | # options(tibble.width = 55, 25 | # tibble.max_extra_cols = 20) 26 | # library(tidyverse) 27 | ``` 28 | 29 | portrait 30 | 31 | # My contacts... 32 | 33 | email: [otho.mantegazza@accurat.it](mailto:otho.mantegazza@accurat.it)
34 | Github: [@othomantegazza](https://github.com/othomantegazza)
35 | Twitter: [@othomn](https://twitter.com/othomn)
36 | Website: [otho.netlify.com](https://otho.netlify.com)
37 | 38 | 39 | 40 | --- 41 | 42 | class: blueblue, middle 43 | 44 | .verybig[Why Data?] 45 | 46 | .big[What's your experience with Data?] 47 | 48 | --- 49 | 50 | class: blueblue, middle 51 | 52 | .right[ 53 | 54 | .verybig[Why R?] 55 | 56 | .big[Let's check your R setup!] 57 | 58 | ] 59 | 60 | --- 61 | 62 | class: center, middle 63 | 64 | # How is this course structured? 65 | 66 | [All the material + Resources are here on Github](https://github.com/othomantegazza/eda-class) 67 | -------------------------------------------------------------------------------- /slides/00-intro.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Introduction 5 | 6 | 7 | 8 | 9 | 10 | 63 | 64 | 65 | 127 | 128 | 138 | 139 | 158 | 159 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /slides/01-meet-r_files/figure-html/unnamed-chunk-49-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /slides/02-intro-to-tidyverse.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Tidyverse, Part 1" 3 | subtitle: "“R does not reproduce what we see. It makes us see.”" 4 | author: "Otho Mantegazza
`r Sys.Date()`" 5 | output: 6 | xaringan::moon_reader: 7 | lib_dir: libs 8 | css: "css/remark.css" 9 | # chakra: libs/remark-js-latest-min.js 10 | nature: 11 | highlightStyle: github 12 | highlightLines: true 13 | countIncrementalSlides: false 14 | ratio: '16:9' 15 | --- 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(dev = "svglite") 19 | source(file = here::here("slides/libs-reveal/xaringan_reveal_parentheses_balanced.R")) 20 | options(tibble.width = 55, 21 | tibble.max_extra_cols = 20) 22 | library(ggbeeswarm) 23 | ``` 24 | 25 | 26 | # The Tidyverse 27 | 28 | .center[ 29 | 30 |

31 | 32 | hex-tidyverse 33 |

34 | 35 | ] 36 | 37 | --- 38 | 39 | class: blueblue, middle 40 | 41 | .verybig[The Tidyverse is a collection of packages for Data Science] 42 | 43 | --- 44 | 45 | 46 | class: blueblue, middle 47 | 48 | .big[.right[Let's practice it!]] 49 | 50 | --- 51 | 52 | # NYC Squirrel Census 53 | 54 | 55 | .pull-left[ 56 | 57 | Because: 58 | 59 | .middle[ 60 | 61 | - It is large enough to provide a challenge. 62 | - It is tidy and detailed, 63 | - It stores quantitative and categorical variables, 64 | - it stores spatial variables, 65 | - it stores time variables. 66 | 67 | credits: [NYC Squirrel census](https://www.thesquirrelcensus.com/). 68 | 69 | ] 70 | ] 71 | 72 | 73 | .pull-right[ 74 | .center[ 75 | 76 |

77 | 78 | hex-tidyverse 79 |

80 | 81 | ] 82 | ] 83 | 84 | 85 | --- 86 | 87 | # TidyTuesday 88 | 89 | ![](https://github.com/rfordatascience/tidytuesday/blob/master/static/tt_logo.png) 90 | 91 | [TidyTuesday on Github](https://github.com/rfordatascience/tidytuesday), also check the [tidytuesday hashstag on twitter](https://twitter.com/hashtag/TidyTuesday). 92 | 93 | --- 94 | 95 | # NYC Squirrel Census 96 | 97 | `r # emo::ji("squirrel")` 98 | 99 | ```{r, message=FALSE} 100 | library(tidyverse) 101 | ``` 102 | 103 | 104 | ```{r, message=FALSE} 105 | squirrels_url <- paste0("https://raw.githubusercontent.com/rfordatascience/tidytuesday/", 106 | "master/data/2019/2019-10-29/nyc_squirrels.csv") 107 | 108 | squirrels <- read_csv(squirrels_url) 109 | ``` 110 | 111 | ```{r squirrel-pipe-1, echo=FALSE, eval=FALSE} 112 | squirrels %>% 113 | select(lat_long, 114 | primary_fur_color, 115 | location) %>% 116 | filter(primary_fur_color == 117 | "Cinnamon") %>% 118 | drop_na(location) 119 | ``` 120 | 121 | --- 122 | 123 | `r apply_reveal("squirrel-pipe-1")` 124 | 125 | --- 126 | 127 | class: blueblue, middle 128 | 129 | .big[.right[Run many operations in sequence 130 | 131 | With the .orange[PIPE]]] 132 | 133 | --- 134 | 135 | # The pipe operator 136 | 137 | .pull-left[ 138 | 139 | .middle[ 140 | 141 | Most of the functions in the Tidyverse take a tibble as first argument and produce a tibble as an output. 142 | 143 | ```{r, eval=FALSE} 144 | select(.data = squirrels, 145 | lat_long, primary_fur_color) 146 | ``` 147 | 148 | 149 | The pipe take whats on the left and passes it to the first argument of the function on the right. 150 | 151 | ```{r, eval=FALSE} 152 | squirrels %>% 153 | select(lat_lon, primary_fur_color) %>% 154 | do_something() %>% 155 | and_something_else() 156 | ``` 157 | 158 | 159 | ] 160 | ] 161 | 162 | 163 | .pull-right[ 164 | .center[ 165 | 166 |

167 | 168 | hex-magrittr 169 |

170 | 171 | ] 172 | ] 173 | 174 | --- 175 | 176 | class: blueblue, middle 177 | 178 | .big[.right[Ok, but which operation?]] 179 | 180 | --- 181 | 182 | 183 | # DPLYR Verbs 184 | 185 | .pull-left[ 186 | 187 | .middle[ 188 | 189 | DPLYR contains functions that are Verbs for data manipulation. 190 | 191 | These verbs allow you to perform the operations that you want on data with a declarative syntax. You tell your computer what you want to do, not how to do it. 192 | 193 | For example you can: 194 | 195 | - select columns with `select()`, 196 | - sort rows with `arrange()`, 197 | - filter rows with `filter()`, 198 | 199 | ...and much more. 200 | 201 | 202 | ] 203 | ] 204 | 205 | 206 | .pull-right[ 207 | .center[ 208 | 209 | 

210 | 211 | hex-magrittr 212 |

213 | 214 | ] 215 | ] 216 | 217 | --- 218 | 219 | class: blueblue, middle 220 | 221 | .big[.right[ 222 | 223 | .orange[Let's try it:] 224 | 225 | How many .orange[gray] squirrels... 226 | 227 | ...were seen .orange[above ground]... 228 | 229 | ....orange[eating]... 230 | 231 | ...divided by .orange[age] 232 | 233 | ]] 234 | 235 | ```{r squirrel-pipe-2, eval = FALSE, echo = FALSE} 236 | squirrels %>% 237 | select(primary_fur_color, 238 | location, 239 | eating, age) %>% 240 | filter(primary_fur_color == 241 | "Gray") %>% 242 | filter(location == 243 | "Above Ground") %>% 244 | filter(eating) %>% 245 | count(age) 246 | ``` 247 | 248 | --- 249 | 250 | `r apply_reveal("squirrel-pipe-2")` 251 | 252 | --- 253 | 254 | class: exercise, middle 255 | 256 | .exercise-title[Exercise:] 257 | 258 | .exercise-body[ 259 | 260 | Count how many **Juvenile** squirrels... 261 | 262 | ...were seen **foraging**... 263 | 264 | ...aggregated by **primary fur color** 265 | 266 | ] 267 | 268 | --- 269 | 270 | class: exercise, middle 271 | 272 | .exercise-title[Exercise:] 273 | 274 | .exercise-body[ 275 | 276 | Which values does the column **other_activities** take? 277 | 278 | Which value, besides NA, does it take most often? 279 | 280 | ] 281 | 282 | --- 283 | 284 | class: exercise, middle 285 | 286 | .exercise-title[Exercise:] 287 | 288 | .exercise-body[ 289 | 290 | When squirrels are observed **above ground**... 291 | 292 | ...at what height are they on average? 
293 | 294 | ] 295 | 296 | ```{r, eval = FALSE, echo = FALSE} 297 | squirrels %>% 298 | filter(!is.na(above_ground_sighter_measurement)) %>% 299 | filter(above_ground_sighter_measurement != FALSE) %>% 300 | summarise(mean_height = above_ground_sighter_measurement %>% 301 | as.numeric() %>% 302 | mean(na.rm = TRUE)) 303 | ``` 304 | 305 | 306 | --- 307 | 308 | class: blueblue, middle 309 | 310 | .verybig[We have identified one column that must be cleaned] 311 | 312 | --- 313 | 314 | # Data Cleaning 1 315 | 316 | Problem: 317 | 318 | ```{r, R.options=list(max.print=30)} 319 | squirrels %>% pull(above_ground_sighter_measurement) 320 | ``` 321 | 322 | ```{r} 323 | squirrels %>% pull(above_ground_sighter_measurement) %>% class() 324 | ``` 325 | 326 | --- 327 | 328 | # Data Cleaning 1 329 | 330 | solution: 331 | 332 | ```{r, R.options=list(max.print=50)} 333 | squirrels %>% 334 | rename(height = above_ground_sighter_measurement) %>% 335 | mutate(height = height %>% { 336 | if_else(. == "FALSE", "0", ., NA_character_) 337 | }) %>% 338 | mutate(height = height %>% 339 | as.numeric()) %>% 340 | pull(height) 341 | ``` 342 | 343 | --- 344 | 345 | # Data Cleaning 2 346 | 347 | Problem: 348 | 349 | ```{r, R.options=list(max.print=30)} 350 | squirrels %>% pull(date) 351 | ``` 352 | 353 | ```{r} 354 | squirrels %>% pull(date) %>% class() 355 | ``` 356 | 357 | --- 358 | 359 | # Lubridate, because dates and times are special 360 | 361 | .pull-left[ 362 | 363 | .middle[ 364 | 365 | Instead of trying to remember how many seconds are in an hour, how many days are in which month, and which year is a leap year, R has special objects and classes to store time, and to perform operations on it. 366 | 367 | Lubridate makes it easier to deal with those objects. 
368 | 369 | ```{r, eval=FALSE} 370 | ymd(20101215) 371 | #> [1] "2010-12-15" 372 | 373 | mdy("4/1/17") 374 | #> [1] "2017-04-01" 375 | ``` 376 | 377 | 378 | (from the package examples) 379 | 380 | ] 381 | ] 382 | 383 | 384 | .pull-right[ 385 | .center[ 386 | 387 |

388 | 389 | hex-lubridate 390 |

391 | 392 | ] 393 | ] 394 | 395 | --- 396 | 397 | # Data Cleaning 2 398 | 399 | Solution: 400 | 401 | ```{r, message=FALSE} 402 | library(lubridate) 403 | ``` 404 | 405 | 406 | ```{r, R.options=list(max.print=15)} 407 | squirrels %>% 408 | mutate(date = mdy(date)) %>% 409 | pull(date) 410 | ``` 411 | 412 | ```{r} 413 | squirrels %>% 414 | mutate(date = mdy(date)) %>% 415 | pull(date) %>% 416 | class() 417 | ``` 418 | 419 | --- 420 | 421 | # Put it all together and assign it to a new variable 422 | 423 | ```{r squirrel-data-cleaning} 424 | squirrels_tidy <- 425 | # assign to a new object 426 | squirrels %>% 427 | # first part 428 | rename(height = above_ground_sighter_measurement) %>% 429 | mutate(height = height %>% { 430 | if_else(. == "FALSE", "0", ., NA_character_) 431 | }) %>% 432 | mutate(height = height %>% 433 | as.numeric()) %>% 434 | # Second part 435 | mutate(date = mdy(date)) 436 | ``` 437 | 438 | From now on we are going to work with `squirrels_tidy`! 439 | 440 | ```{r, include=FALSE} 441 | save(squirrels_tidy, file = here::here("data/squirrels_tidy.Rdata")) 442 | ``` 443 | 444 | --- 445 | 446 | class: exercise, middle 447 | 448 | .exercise-title[Exercise:] 449 | 450 | .exercise-body[ 451 | 452 | When do the observations **start**? 453 | 454 | And when do they **end**? 455 | 456 | ] 457 | 458 | --- 459 | 460 | class: exercise, middle 461 | 462 | .exercise-title[Exercise:] 463 | 464 | .exercise-body[ 465 | 466 | How many squirrels were observed in each **weekday**? 467 | 468 | ] 469 | 470 | ```{r, eval = FALSE, echo = FALSE} 471 | squirrels_tidy %>% 472 | count(date) %>% 473 | mutate(date = wday(date, label = T)) %>% 474 | group_by(date) %>% 475 | summarise(mean_n = mean(n)) 476 | ``` 477 | 478 | --- 479 | 480 | class: exercise, middle 481 | 482 | .exercise-title[Exercise:] 483 | 484 | .exercise-body[ 485 | 486 | At what **average height**... 487 | 488 | ...were squirrels from the "three" **age** groups observed? 
489 | 490 | ] 491 | 492 | ```{r, eval = FALSE, echo = FALSE} 493 | squirrels_tidy %>% 494 | group_by(age) %>% 495 | summarise(mean_height = mean(height, na.rm = T)) 496 | ``` 497 | -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Tidyverse, Part 2" 3 | subtitle: "“R is not in some far-off place.”" 4 | author: "Otho Mantegazza
`r Sys.Date()`" 5 | output: 6 | xaringan::moon_reader: 7 | lib_dir: libs 8 | css: "css/remark.css" 9 | # chakra: libs/remark-js-latest-min.js 10 | nature: 11 | highlightStyle: github 12 | highlightLines: true 13 | countIncrementalSlides: false 14 | ratio: '16:9' 15 | --- 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(dev = "svglite") 19 | source(file = here::here("slides/libs-reveal/xaringan_reveal_parentheses_balanced.R")) 20 | options(tibble.width = 55, 21 | tibble.max_extra_cols = 20) 22 | library(tidyverse) 23 | library(ggbeeswarm) 24 | load(here::here("data/squirrels_tidy.Rdata")) 25 | ``` 26 | 27 | class: blueblue, middle 28 | 29 | .big[You are already .orange[exploring data] using summary statistics.] 30 | 31 | .big[Could we learn more turning them into visual representation?] 32 | 33 | --- 34 | 35 | `r apply_reveal("squirrel-height-boxplot")` 36 | 37 | ```{r squirrel-height-boxplot, eval = FALSE, echo = FALSE} 38 | squirrels_tidy %>% 39 | drop_na(height) %>% 40 | ggplot(aes(x = age, 41 | y = height)) + 42 | geom_point() + 43 | geom_boxplot() + 44 | labs(title = "Height Above Ground of Squirrels by Age", 45 | y = "Feet from Ground", 46 | caption = "Data from data.cityofnewyork.us") + 47 | theme_gray(base_size = 18) + 48 | coord_flip() + 49 | scale_y_sqrt() 50 | ``` 51 | 52 | --- 53 | 54 | `r apply_reveal("squirrel-beeswarm")` 55 | 56 | ```{r squirrel-beeswarm, eval = FALSE, echo = FALSE} 57 | squirrels_tidy %>% 58 | drop_na(height) %>% 59 | filter(height > 0) %>% 60 | ggplot(aes(x = age, 61 | y = height)) + 62 | geom_quasirandom() + 63 | theme_gray(base_size = 18) + 64 | labs(title = "Height Above Ground of Squirrels by Age", 65 | y = "Feet from Ground", 66 | caption = "Data from data.cityofnewyork.us") + 67 | coord_flip() + 68 | scale_y_sqrt() + 69 | stat_summary(fun.y = "median", 70 | geom = "point", 71 | size = 10, 72 | alpha = .9, 73 | colour = "#FF6C0D") 74 | ``` 75 | 76 | --- 77 | 78 | `r apply_reveal("squirrel-histogram")` 79 | 80 | 
```{r squirrel-histogram, eval = FALSE, echo = FALSE} 81 | squirrels_tidy %>% 82 | drop_na(height) %>% 83 | filter(height > 0) %>% 84 | ggplot(aes(x = height)) + 85 | geom_density(fill = "grey40", 86 | alpha = .6, 87 | adjust = 1/2) + 88 | labs(title = "Height Above Ground of Squirrels by Age", 89 | x = "Feet from Ground", 90 | caption = "Data from data.cityofnewyork.us") + 91 | theme_bw(base_size = 18) + 92 | facet_grid(age ~ .) + 93 | stat_sum( 94 | aes(y = -.005, 95 | size = 1, 96 | colour = ..n.., 97 | label = "|"), 98 | geom = "text", 99 | fontface = "bold", 100 | show.legend = F) + 101 | scale_colour_viridis_c() 102 | ``` 103 | 104 | --- 105 | 106 | # GGPLOT2 for Data Visualization 107 | 108 | .pull-left[ 109 | 110 | .middle[ 111 | 112 | GGPLOT2 is a library for graphical representation of Data. 113 | 114 | 115 | - It is widely developed, you can use it to plot almost anything, 116 | - It is based on the Layered Grammar of Graphics, 117 | - It's designed both for exploratory viz and for communiacation 118 | 119 | To get used to ggplot you must think in a layered way. 120 | 121 | ] 122 | ] 123 | 124 | 125 | .pull-right[ 126 | .center[ 127 | 128 |

129 | 130 | hex-ggplot 131 |

132 | 133 | ] 134 | ] 135 | 136 | --- 137 | 138 | class: middle 139 | 140 | ![](img/grammar-of-graphics.png) 141 | 142 | Image credits: [@dgkeyes](https://twitter.com/dgkeyes), https://rfortherestofus.com/ . Found through [@W_R_Chase](https://twitter.com/W_R_Chase) 143 | 144 | --- 145 | 146 | # The Grammar of Graphics 147 | 148 | You map **data**... 149 | 150 | 1. to **Aesthetic** properties of objects, 151 | 2. according **Scale**, placed on **Coordinates** 152 | 3. Those objects are represented by **Geometric Shapes**. 153 | 4. You might need to use **statistical** transformations, to highlight those properties, 154 | 5. You might want to use **Facets** to represent multiple dimentions 155 | 156 | 157 | 158 | --- 159 | 160 | class: exercise, middle 161 | 162 | .exercise-title[Exercise:] 163 | 164 | .exercise-body-small[ 165 | 166 | Show the number of squirrels by their primary and highlight fur colour in a barchart. 167 | 168 | - Map the **highlight fur colour** to the **x axis**. Can you make the x axis vertical? 169 | 170 | - Map the **primary fur colour** to the **fill aesthetic** of the bars. Can you map them to to their real colours? 171 | 172 | - What are you mapping to the y axis, can you order the bars by their height? 173 | 174 | ] 175 | 176 | ```{r, eval=FALSE, echo=FALSE} 177 | # exercise 178 | squirrels_tidy %>% 179 | ggplot(aes(x = forcats::fct_infreq(highlight_fur_color), 180 | fill = primary_fur_color)) + 181 | geom_bar() + 182 | coord_flip() + 183 | labs(x = "Highlight Fur Color") + 184 | scale_fill_manual(values = c(Black = "black", 185 | Cinnamon = "#ff9100", 186 | Gray = "gray60"), 187 | na.value = "purple") 188 | ``` 189 | 190 | 191 | --- 192 | 193 | class: exercise, middle 194 | 195 | .exercise-title[Exercise:] 196 | 197 | .exercise-body[ 198 | 199 | Use an histogram to visualize how are the black squirrels distributed along the longitude variable. 
200 | 201 | - If you map the longitude on the x axis, which variable are you mapping on the y axis? 202 | 203 | - Are you using a statistical transformation? 204 | 205 | ] 206 | 207 | --- 208 | 209 | class: exercise, middle 210 | 211 | .exercise-title[Exercise:] 212 | 213 | .exercise-body-small[ 214 | 215 | Can you show where are the squirrels with a scatterplot? 216 | 217 | - Map the **longitude** to the **x axis** and the **latitude** to the **y axis**. 218 | 219 | - Map the colour of the points to the real fur colour of the squirrels. 220 | 221 | - Can you also split the fur colours in facets? 222 | 223 | - How can you avoid overplotting? Can you bin the squirrel position on the x and y axis? 224 | 225 | ] 226 | 227 | ```{r, eval = FALSE, echo = FALSE} 228 | p <- 229 | squirrels_tidy %>% 230 | ggplot(aes(x = long, 231 | y = lat)) + 232 | coord_quickmap() + 233 | theme_bw() 234 | 235 | p + 236 | geom_point(aes(colour = primary_fur_color), 237 | alpha = .7) + 238 | scale_colour_manual(values = c(Black = "black", 239 | Cinnamon = "#ff9100", 240 | Gray = "gray60"), 241 | na.value = "purple") + 242 | facet_grid(. 
~ primary_fur_color) 243 | 244 | p + 245 | geom_hex(colour = "#FFFFFF", 246 | size = .2, 247 | binwidth = c(.0015, 248 | .001)) 249 | ``` 250 | 251 | 252 | --- 253 | 254 | `r apply_reveal("squirrels-geo")` 255 | 256 | ```{r, squirrels-geo, eval = FALSE, echo = FALSE} 257 | squirrels_tidy %>% 258 | ggplot(aes(x = long, 259 | y = lat)) + 260 | geom_point(alpha = .1) + 261 | geom_hex(colour = "#FFFFFF", 262 | size = .2, 263 | binwidth = c(.0015, 264 | .001)) + 265 | coord_quickmap() + 266 | theme_bw() 267 | ``` 268 | 269 | 270 | ```{r, eval = FALSE, echo = FALSE} 271 | 272 | # map background with ggmap, but it does not work with hexagons 273 | 274 | tst <- 275 | ggmap::get_stamenmap(c(left = min(squirrels_tidy$long), 276 | top = max(squirrels_tidy$lat), 277 | right = max(squirrels_tidy$long), 278 | bottom = min(squirrels_tidy$lat)), 279 | zoom = 15, maptype = "toner-lite") 280 | 281 | ggmap::ggmap(tst) + 282 | geom_point(data = squirrels_tidy, 283 | aes(x = long, 284 | y = lat), 285 | alpha = .3, 286 | colour = "#0693c7") 287 | ``` 288 | 289 | -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_10-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_14-1.png 
-------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_5-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_5-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 0 40 | 50 41 | 100 42 | 150 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | ? 52 | Adult 53 | Juvenile 54 | NA 55 | age 56 | height 57 | 58 | -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_6-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_7-1.png -------------------------------------------------------------------------------- 
/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_8-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-beeswarm_9-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_4-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_4-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 0 40 | 50 41 | 100 42 | 150 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | ? 
52 | Adult 53 | Juvenile 54 | NA 55 | age 56 | height 57 | 58 | -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_5-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_6-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_7-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_8-1.png -------------------------------------------------------------------------------- 
/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-height-boxplot_9-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_16-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_17-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_4-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_4-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 
26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 0 40 | 50 41 | 100 42 | 150 43 | height 44 | 45 | -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_6-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_7-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrel-histogram_8-1.png -------------------------------------------------------------------------------- /slides/03-intro-to-the-tidyverse_files/figure-html/output_squirrels-geo_3-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 40.77 43 | 40.78 44 | 40.79 45 | 40.80 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -73.98 55 | -73.97 56 | -73.96 57 | -73.95 58 | long 59 | lat 60 | 61 | 
-------------------------------------------------------------------------------- /slides/04-intro-to-tidyverse.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Tidyverse, Part 3" 3 | subtitle: "“Without R, the crudeness of reality would make the world unbearable”" 4 | author: "Otho Mantegazza
`r Sys.Date()`" 5 | output: 6 | xaringan::moon_reader: 7 | lib_dir: libs 8 | css: "css/remark.css" 9 | # chakra: libs/remark-js-latest-min.js 10 | nature: 11 | highlightStyle: github 12 | highlightLines: true 13 | countIncrementalSlides: false 14 | ratio: '16:9' 15 | --- 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(dev = "svglite") 19 | # source(file = here::here("slides/libs-reveal/xaringan_reveal_parentheses_balanced.R")) 20 | options(tibble.width = 55, 21 | tibble.max_extra_cols = 20) 22 | library(tidyverse) 23 | load(here::here("data/squirrels_tidy.Rdata")) 24 | ``` 25 | 26 | class: blueblue, middle 27 | 28 | .big[A couple of .orange[Data Cleaning] steps.] 29 | 30 | --- 31 | 32 | # NA encoded as question marks 33 | 34 | ```{r} 35 | squirrels_tidy %>% 36 | pull(age) %>% 37 | unique() 38 | ``` 39 | 40 | -- 41 | 42 | # Two columns in one 43 | 44 | ```{r, R.options=list(max.print=40)} 45 | squirrels_tidy %>% 46 | pull(hectare) %>% 47 | unique() 48 | ``` 49 | 50 | --- 51 | 52 | # TIDYR 53 | 54 | .pull-left[ 55 | 56 | .middle[ 57 | 58 | TIDYR is a package dedicated to tidying data. It stores functions for 59 | 60 | 61 | - Pivoting, 62 | - Rectangling, 63 | - Nesting, 64 | - Separating and combining columns, 65 | - Deal with missing data. 66 | 67 | At this moment we need the last two. 68 | 69 | ] 70 | ] 71 | 72 | 73 | .pull-right[ 74 | .center[ 75 | 76 |

77 | 78 | hex-tidyr 79 |

80 | 81 | ] 82 | ] 83 | 84 | --- 85 | 86 | 87 | # How many NAs 88 | 89 | 90 | ```{r, R.options=list(max.print=10)} 91 | squirrels_tidy %>% 92 | map(~is.na(.) %>% sum()) %>% unlist() 93 | ``` 94 | 95 | --- 96 | 97 | # How many question marks 98 | 99 | ```{r, R.options=list(max.print=10)} 100 | squirrels_tidy %>% 101 | map(~str_detect(., "\\?") %>% sum(na.rm = T)) %>% unlist() 102 | ``` 103 | 104 | --- 105 | 106 | # Transform question mark to NA 107 | 108 | ```{r} 109 | squirrels_tidy <- 110 | squirrels_tidy %>% 111 | mutate_if(is.character, ~na_if(., "?")) 112 | ``` 113 | 114 | No more NAs coded as question marks. 115 | 116 | ```{r, R.options=list(max.print=10)} 117 | squirrels_tidy %>% 118 | map(~str_detect(., "\\?") %>% sum(na.rm = T)) %>% unlist() 119 | ``` 120 | 121 | --- 122 | 123 | # Separate columns 124 | 125 | 126 | ```{r} 127 | squirrels_tidy <- 128 | squirrels_tidy %>% 129 | separate(hectare, c("hectare_lat", "hectare_lon"), sep = 2) 130 | 131 | squirrels_tidy %>% select(hectare_lat, hectare_lon) 132 | ``` 133 | 134 | --- 135 | 136 | # Janitor 137 | 138 | .pull-left[ 139 | 140 | .middle[ 141 | 142 | Janitor encodes many useful routines to clean data that have been collected in spreadsheets. 143 | 144 | According to the package readme, dirtiness includes: 145 | 146 | - Dreadful column names. 147 | - Rows and columns containing Excel formatting but no data. 148 | - Dates stored as numbers. 149 | - Values spread inconsistently over [...] columns. 150 | 151 | 152 | ] 153 | ] 154 | 155 | 156 | .pull-right[ 157 | .center[ 158 | 159 |

160 | 161 | hex-tidyr 162 |

163 | 164 | ] 165 | ] 166 | 167 | --- 168 | 169 | .center[ 170 | .middle[ 171 | 172 | 173 | 174 | ] 175 | ] -------------------------------------------------------------------------------- /slides/04-intro-to-tidyverse.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | The Tidyverse, Part 3 5 | 6 | 7 | 8 | 9 | 10 | 245 | 246 | 247 | 309 | 310 | 320 | 321 | 340 | 341 | 351 | 352 | 353 | -------------------------------------------------------------------------------- /slides/05-intro-to-the-tidyverse.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Tidyverse, Part 4" 3 | author: "Otho Mantegazza
`r Sys.Date()`" 4 | subtitle: "“Science is what we understand well enough to explain to a computer; R is everything else.”" 5 | output: 6 | xaringan::moon_reader: 7 | lib_dir: libs 8 | css: "css/remark.css" 9 | # chakra: libs/remark-js-latest-min.js 10 | nature: 11 | highlightStyle: github 12 | highlightLines: true 13 | countIncrementalSlides: false 14 | ratio: '16:9' 15 | editor_options: 16 | chunk_output_type: console 17 | --- 18 | 19 | ```{r setup, include=FALSE} 20 | knitr::opts_chunk$set(dev = "svglite") 21 | source(file = here::here("slides/libs-reveal/xaringan_reveal_parentheses_balanced.R")) 22 | options(tibble.width = 55, 23 | tibble.max_extra_cols = 20) 24 | library(tidyverse) 25 | ``` 26 | 27 | class: blueblue, middle 28 | 29 | .big[More of the same...] 30 | 31 | --- 32 | 33 | # Milano Open Data 34 | 35 | .pull-left[ 36 | 37 | .middle[ 38 | 39 | The city of Milan releases carefully curated and well organized open data. 40 | 41 | Check the website: https://dati.comune.milano.it/ 42 | 43 | Also Regione Lombardia has a great open data portal: https://www.dati.lombardia.it/ 44 | 45 | Did you know that, in **Europe**, you can ask the public administration for **Open Data**? 46 | 47 | ] 48 | ] 49 | 50 | 51 | .pull-right[ 52 | .middle[ 53 | 54 |

55 | 56 | stemma-milano 57 |

58 | 59 | ] 60 | ] 61 | 62 | --- 63 | 64 | # Let's download some... 65 | 66 | 67 | Air pollution in Milan: https://dati.comune.milano.it/dataset/ds417-rilevazione-qualita-aria-2019 68 | 69 | ```{r} 70 | mil_url <- 71 | paste0("http://dati.comune.milano.it/dataset/", 72 | "3e752fec-06fd-421b-ae9b-4d5d7a177640/", 73 | "resource/698a58e6-f276-44e1-92b1-3d2b81a4ad47/download/", 74 | "qaria_datoariagiornostazione_2019-11-12.csv") 75 | 76 | mil_path <- here::here("data/qaria_datoariagiornostazione_2019-11-12.csv") 77 | ``` 78 | 79 | 80 | ```{r, eval=FALSE} 81 | download.file(url = mil_url, 82 | destfile = mil_path) 83 | ``` 84 | 85 | 86 | --- 87 | 88 | # Let's read the data... 89 | 90 | ```{r} 91 | dat_mil <- read_csv(mil_path) 92 | ``` 93 | 94 | ```{r} 95 | dat_mil %>% print(n = 6) 96 | ``` 97 | 98 | --- 99 | 100 | 101 | class: exercise, middle 102 | 103 | .exercise-title[Exercise:] 104 | 105 | .exercise-body[ 106 | 107 | Describe the data: 108 | 109 | - How are they structured? 110 | - What's stored in the columns? What's their type? 111 | - Are there any NA? 112 | 113 | ] 114 | 115 | --- 116 | 117 | # Six Pollutants over Eight Stations 118 | 119 | .pull-left[ 120 | 121 | ```{r} 122 | dat_mil %>% 123 | count(stazione_id) 124 | ``` 125 | 126 | ] 127 | 128 | .pull-right[ 129 | 130 | ```{r} 131 | dat_mil %>% 132 | count(inquinante) 133 | ``` 134 | 135 | ] 136 | 137 | --- 138 | 139 | 140 | class: exercise, middle 141 | 142 | .exercise-title[Exercise:] 143 | 144 | .exercise-body[ 145 | 146 | Use a visualization to learn when were the records taken? 
147 | 148 | ] 149 | 150 | ```{r, eval = FALSE, echo = FALSE} 151 | dat_mil %>% 152 | ggplot(aes(x = data, 153 | y = inquinante)) + 154 | geom_text(label = "|") 155 | ``` 156 | 157 | ```{r, eval = FALSE, echo = FALSE} 158 | dat_mil %>% 159 | ggplot(aes(x = data, 160 | y = stazione_id)) + 161 | geom_text(label = "|") 162 | ``` 163 | 164 | --- 165 | 166 | ```{r} 167 | dat_mil %>% 168 | group_by(stazione_id, inquinante) %>% 169 | count() 170 | ``` 171 | 172 | --- 173 | 174 | 175 | class: exercise, middle 176 | 177 | .exercise-title[Exercise:] 178 | 179 | .exercise-body[ 180 | 181 | Count the NAs per column of the dataset. 182 | 183 | ] 184 | 185 | ```{r, echo = FALSE, eval = FALSE} 186 | dat_mil %>% 187 | map( ~ is.na(.) %>% sum()) 188 | ``` 189 | 190 | 191 | --- 192 | 193 | 194 | `r apply_reveal("count_na")` 195 | 196 | ```{r count_na, echo=FALSE, eval=FALSE} 197 | dat_mil %>% 198 | ggplot(aes(x = stazione_id, 199 | y = inquinante, 200 | colour = is.na(valore))) + 201 | geom_count(alpha = .8) + 202 | theme_bw() + 203 | scale_size_continuous( 204 | limits = c(0, NA) 205 | ) 206 | ``` 207 | 208 | --- 209 | 210 | # Some missing data are implicit 211 | 212 | https://tidyr.tidyverse.org/reference/complete.html 213 | 214 | ```{r} 215 | # complete 216 | dat_mil_all <- 217 | dat_mil %>% 218 | complete(stazione_id, nesting(data, inquinante)) 219 | ``` 220 | 221 | ```{r} 222 | dat_mil_all %>% print(n = 6) 223 | ``` 224 | 225 | --- 226 | 227 | `r apply_reveal("count_na_exp")` 228 | 229 | ```{r count_na_exp, echo=FALSE, eval=FALSE} 230 | dat_mil_all %>% 231 | ggplot(aes(x = stazione_id, 232 | y = inquinante)) + 233 | geom_count() + 234 | theme_bw() + 235 | scale_size_continuous( 236 | limits = c(0, NA) 237 | ) 238 | ``` 239 | 240 | --- 241 | 242 | 243 | class: exercise, middle 244 | 245 | .exercise-title[Exercise:] 246 | 247 | .exercise-body[ 248 | 249 | Some rows might be duplicated. How would you remove them? 
250 | 251 | ] 252 | 253 | ```{r, include=FALSE} 254 | # dat_mil_all[dat_mil_all %>% duplicated(),] %>% view() 255 | 256 | dat_mil_all <- 257 | dat_mil_all %>% 258 | distinct() 259 | 260 | library(lubridate) 261 | 262 | ``` 263 | 264 | 265 | --- 266 | 267 | class: exercise, middle 268 | 269 | .exercise-title[Exercise:] 270 | 271 | .exercise-body[ 272 | 273 | How would you visualize the amount of pollutants in the air by month? 274 | 275 | Draw some sketches by hand. 276 | 277 | What value would you map to what object of your sketch? 278 | 279 | ] 280 | 281 | --- 282 | 283 | 284 | ```{r, message=FALSE, warning=FALSE, fig.height=3} 285 | dat_mil_all %>% 286 | mutate(month = month(data, label = T)) %>% 287 | ggplot(aes(x = month, 288 | y = valore)) + 289 | geom_boxplot() + 290 | aes(fill = inquinante) 291 | ``` 292 | 293 | --- 294 | 295 | ```{r, message=FALSE, warning=FALSE, fig.height=6, dev="png", fig.width=14} 296 | dat_mil_all %>% 297 | ggplot(aes(x = data, 298 | y = valore)) + 299 | geom_point(alpha = .2) + 300 | aes(colour = inquinante) + 301 | geom_smooth() 302 | ``` 303 | 304 | --- 305 | 306 | # Pivot 307 | 308 | Pivoting: https://tidyr.tidyverse.org/articles/pivot.html 309 | 310 | ![](https://d33wubrfki0l68.cloudfront.net/3aea19108d39606bbe49981acda07696c0c7fcd8/2de65/images/tidy-9.png) 311 | 312 | Image Credits: https://r4ds.had.co.nz/tidy-data.html 313 | 314 | ```{r, echo=FALSE} 315 | options(tibble.print_min = 20) 316 | ``` 317 | 318 | 319 | --- 320 | 321 | `r apply_reveal("pivot-wide")` 322 | 323 | ```{r pivot-wide, echo=FALSE, eval=FALSE} 324 | dat_mil_all %>% 325 | pivot_wider( 326 | names_from = inquinante, 327 | values_from = valore 328 | ) 329 | ``` 330 | 331 | ```{r, echo=FALSE} 332 | dat_mil_wide <- 333 | dat_mil_all %>% 334 | pivot_wider( 335 | names_from = inquinante, 336 | values_from = valore 337 | ) 338 | ``` 339 | 340 | 341 | --- 342 | 343 | `r apply_reveal("pivot-long")` 344 | 345 | ```{r pivot-long, eval=FALSE, echo=FALSE} 346 | # Pivot back 
347 | dat_mil_wide %>% 348 | pivot_longer( 349 | cols = C6H6:SO2, 350 | names_to = "inquinante", 351 | values_to = "valore" 352 | ) 353 | ``` 354 | 355 | --- 356 | 357 | # Extend GGPLOT2 358 | 359 | https://www.ggplot2-exts.org/gallery/ 360 | 361 | ```{r, eval = FALSE} 362 | # install.packages("GGally") 363 | 364 | library(GGally) 365 | 366 | ggpairs(dat_mil_wide) 367 | ``` 368 | 369 | --- 370 | 371 | .center[ 372 | 373 | ```{r, echo=FALSE, dev="png", message=FALSE, warning=FALSE, fig.height=8, fig.width=12} 374 | library(GGally) 375 | 376 | ggpairs(dat_mil_wide) 377 | ``` 378 | 379 | ] 380 | -------------------------------------------------------------------------------- /slides/05-intro-to-the-tidyverse_files/figure-html/output_count_na_4-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | C6H6 43 | CO_8h 44 | NO2 45 | O3 46 | PM10 47 | PM25 48 | SO2 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 2 61 | 4 62 | 6 63 | 8 64 | stazione_id 65 | inquinante 66 | 67 | -------------------------------------------------------------------------------- /slides/05-intro-to-the-tidyverse_files/figure-html/output_count_na_exp_3-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | C6H6 43 | CO_8h 44 | NO2 45 | O3 46 | PM10 47 | PM25 48 | SO2 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 2 61 | 4 62 | 6 63 | 8 64 | stazione_id 65 | inquinante 66 | 67 | -------------------------------------------------------------------------------- /slides/05-intro-to-the-tidyverse_files/figure-html/unnamed-chunk-15-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/05-intro-to-the-tidyverse_files/figure-html/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /slides/05-intro-to-the-tidyverse_files/figure-html/unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/05-intro-to-the-tidyverse_files/figure-html/unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /slides/06-your-turn.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Your Turn" 3 | author: "Otho Mantegazza
`r Sys.Date()`" 4 | subtitle: "I am my own experiment. I am my own work of R." 5 | output: 6 | xaringan::moon_reader: 7 | lib_dir: libs 8 | css: "css/remark.css" 9 | # chakra: libs/remark-js-latest-min.js 10 | nature: 11 | highlightStyle: github 12 | highlightLines: true 13 | countIncrementalSlides: false 14 | ratio: '16:9' 15 | editor_options: 16 | chunk_output_type: console 17 | --- 18 | 19 | ```{r setup, include=FALSE} 20 | knitr::opts_chunk$set(dev = "svglite") 21 | source(file = here::here("slides/libs-reveal/xaringan_reveal_parentheses_balanced.R")) 22 | options(tibble.width = 55, 23 | tibble.max_extra_cols = 20) 24 | library(tidyverse) 25 | ``` 26 | 27 | class: blueblue, middle 28 | 29 | .big[Pick your dataset and go...] 30 | 31 | --- 32 | 33 | class: blueblue, middle 34 | 35 | .verybig[Structured] 36 | 37 | --- 38 | 39 | class: middle 40 | 41 | # 1 - Nobel Prizes 42 | 43 | https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-05-14 44 | 45 | -- 46 | 47 | # 2 - US Births 48 | 49 | https://github.com/rfordatascience/tidytuesday/tree/master/data/2018/2018-10-02 50 | 51 | -- 52 | 53 | # 3 - Ecological Footprint 54 | 55 | https://www.kaggle.com/footprintnetwork/ecological-footprint 56 | 57 | --- 58 | 59 | class: blueblue, middle 60 | 61 | .verybig[Challenging] 62 | 63 | --- 64 | 65 | class: middle 66 | 67 | # 4 - Forest Fires in Brazil 68 | 69 | https://www.kaggle.com/gustavomodelli/forest-fires-in-brazil 70 | 71 | -- 72 | 73 | # 5 - Genetic Variations 74 | 75 | https://www.kaggle.com/kevinarvai/clinvar-conflicting 76 | 77 | --- 78 | 79 | class: blueblue, middle 80 | 81 | .verybig[When you approach new Data...] 82 | 83 | --- 84 | 85 | ## Get Familiar with the Topic 86 | 87 | Gather some background. Know at least a bit about what the dataset is about. What are the issues that this dataset can solve? 88 | 89 | -- 90 | 91 | ## Do you have a data dictionary? 92 | 93 | Which **variables are stored in the columns** and how? 
94 | 95 | - If you don't have this information, can you gather it? 96 | - Do those descriptions match what you observe in R? If not, what does not match? 97 | - Did you get warnings when you loaded the data in R? Can you fix them? 98 | 99 | 100 | --- 101 | 102 | # Are the data Tidy? 103 | 104 | Or can you make them so? 105 | 106 | 107 | - Are there missing values? How are they encoded? How are they distributed in the dataset? 108 | - Do the columns have practical names, or do they need renaming? 109 | - Are the data structured according to the tidy principles, or do they need spreading/gathering? 110 | 111 | Remember: 112 | 113 | - Each variable must have its own column. 114 | - Each observation must have its own row. 115 | - Each value must have its own cell. 116 | 117 | --- 118 | 119 | # Can you find any pattern in your data? 120 | 121 | Show it with a viz! 122 | 123 | -------------------------------------------------------------------------------- /slides/06-your-turn.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Your Turn 5 | 6 | 7 | 8 | 9 | 10 | 118 | 119 | 120 | 182 | 183 | 193 | 194 | 213 | 214 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /slides/10-practical.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Your Turn!" 3 | subtitle: "“The point of R is to inspire you to create your own.”" 4 | author: "Otho Mantegazza
`r Sys.Date()`" 5 | output: 6 | xaringan::moon_reader: 7 | lib_dir: libs 8 | css: "css/remark.css" 9 | # chakra: libs/remark-js-latest-min.js 10 | nature: 11 | highlightStyle: github 12 | highlightLines: true 13 | countIncrementalSlides: false 14 | ratio: '16:9' 15 | --- 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(dev = "svglite") 19 | # source(file = here::here("slides/libs-reveal/xaringan_reveal_parentheses_balanced.R")) 20 | options(tibble.width = 55, 21 | tibble.max_extra_cols = 20) 22 | library(tidyverse) 23 | # library(ggbeeswarm) 24 | # load(here::here("data/squirrels_tidy.Rdata")) 25 | ``` 26 | 27 | --- 28 | 29 | -------------------------------------------------------------------------------- /slides/css/remark.css: -------------------------------------------------------------------------------- 1 | /* Modified from the default css that comes from remark.js - xaringan */ 2 | /* https://github.com/yihui/xaringan */ 3 | 4 | /*from default fonts*/ 5 | 6 | @import url(https://fonts.googleapis.com/css?family=Yanone+Kaffeesatz); 7 | @import url(https://fonts.googleapis.com/css?family=Droid+Serif:400,700,400italic); 8 | @import url('https://fonts.googleapis.com/css?family=Inconsolata&display=swap'); 9 | @import url('https://fonts.googleapis.com/css?family=Indie+Flower&display=swap'); 10 | @import url('https://fonts.googleapis.com/css?family=Leckerli+One&display=swap'); 11 | @import url('https://fonts.googleapis.com/css?family=IBM+Plex+Sans&display=swap'); 12 | @import url('https://fonts.googleapis.com/css?family=News+Cycle&display=swap'); 13 | 14 | body { 15 | font-family: 'News Cycle', sans-serif; 16 | } 17 | 18 | /*From default.css*/ 19 | 20 | a, a > code { 21 | color: rgb(249, 38, 114); 22 | text-decoration: none; 23 | } 24 | .footnote { 25 | position: absolute; 26 | bottom: 3em; 27 | padding-right: 4em; 28 | font-size: 90%; 29 | } 30 | .remark-code-line-highlighted { background-color: #ffff88; } 31 | 32 | .inverse { 33 | background-color: 
#272822; 34 | color: #d6d6d6; 35 | text-shadow: 0 0 20px #333; 36 | } 37 | .inverse h1, .inverse h2, .inverse h3 { 38 | color: #f3f3f3; 39 | } 40 | /* Two-column layout */ 41 | .left-column { 42 | color: #777; 43 | width: 20%; 44 | height: 92%; 45 | float: left; 46 | } 47 | .left-column h2:last-of-type, .left-column h3:last-child { 48 | color: #000; 49 | } 50 | .right-column { 51 | width: 75%; 52 | float: right; 53 | padding-top: 1em; 54 | } 55 | .pull-left { 56 | float: left; 57 | width: 47%; 58 | } 59 | .pull-right { 60 | float: right; 61 | width: 47%; 62 | } 63 | .pull-right ~ * { 64 | clear: both; 65 | } 66 | img, video, iframe { 67 | max-width: 100%; 68 | } 69 | blockquote { 70 | border-left: solid 5px lightgray; 71 | padding-left: 1em; 72 | } 73 | .remark-slide table { 74 | margin: auto; 75 | border-top: 1px solid #666; 76 | border-bottom: 1px solid #666; 77 | } 78 | .remark-slide table thead th { border-bottom: 1px solid #ddd; } 79 | th, td { padding: 5px; } 80 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { background: #eee } 81 | 82 | @page { margin: 0; } 83 | @media print { 84 | .remark-slide-scaler { 85 | width: 100% !important; 86 | height: 100% !important; 87 | transform: scale(1) !important; 88 | top: 0 !important; 89 | left: 0 !important; 90 | } 91 | } 92 | 93 | /* Extra ------------------------------------------------------- */ 94 | 95 | /* Normal slide */ 96 | 97 | .remark-slide-content h1 { 98 | font-size: 40px; 99 | } 100 | 101 | /* Title slide ----------------------- */ 102 | .title-slide { 103 | background-image: linear-gradient(to right, #F1C232, #37C6C3); 104 | text-shadow: none; 105 | } 106 | 107 | .title-slide h1 { 108 | font-family: 'Leckerli One', cursive; 109 | color: #2087AC; 110 | text-align: left; 111 | font-size: 70px; 112 | margin-bottom: 20px; 113 | } 114 | 115 | .title-slide h2 { 116 | position: absolute; 117 | bottom: 20px; 118 | right: 80px; 119 | text-align: right; 120 | font-family: 'IBM Plex 
Sans', sans-serif; 121 | font-size: 30px; 122 | color: #707070; 123 | width: 500px; 124 | } 125 | 126 | .title-slide h3 { 127 | font-family: 'IBM Plex Sans', sans-serif; 128 | color: #2E3036; 129 | text-align: left; 130 | margin-top: 5px; 131 | margin-bottom: 5px; 132 | font-size: 24px; 133 | border-top-color: #2E3036; 134 | border-top-style: solid; 135 | border-top-width: 2.5px; 136 | box-sizing: content-box; 137 | } 138 | 139 | /* Blue transition slide -------------------- */ 140 | /* For tips and transition */ 141 | .blueblue { 142 | background-image: linear-gradient(to right, #3A70DE, #9900FF) ; 143 | color: #E97E00; 144 | font-family: 'Indie Flower', cursive; 145 | } 146 | 147 | /* Big font */ 148 | .big { 149 | font-size: 45px; 150 | color: #C0C8E7; 151 | } 152 | 153 | /* very big font */ 154 | .verybig { 155 | font-size: 82px; 156 | } 157 | 158 | .orange { 159 | color: #E97E00; 160 | } 161 | 162 | /* Scroll text vertically --------------------- */ 163 | .scroller .remark-code { 164 | overflow-y: scroll; 165 | } 166 | 167 | /* Exercise ------------------------------------*/ 168 | .exercise { 169 | background-image: linear-gradient(to right, #ffe598, #ffcd43) ; 170 | font-family: 'Indie Flower', cursive; 171 | } 172 | 173 | .exercise-title { 174 | font-size: 82px; 175 | color: #00aaff; 176 | } 177 | 178 | .exercise-body { 179 | font-size: 45px; 180 | color: rgb(87, 156, 182); 181 | } 182 | 183 | .exercise-body-small { 184 | font-size: 32px; 185 | color: rgb(87, 156, 182); 186 | } 187 | 188 | /* Code output -------------------------------------*/ 189 | 190 | 191 | code.hljs { 192 | border-radius: 10px; 193 | background-image: linear-gradient(to right, #ffffff, #E7C2FF) !important; 194 | } 195 | 196 | /* hex stickers-------------------------------------*/ 197 | 198 | .imagelink { 199 | display: inline-block; 200 | margin: auto; 201 | } 202 | 203 | 204 | .imagelink:hover { 205 | filter: hue-rotate(60deg); 206 | } 207 | 208 | /* Splits 
--------------------------------------------*/ 209 | 210 | .split-40>.column:first-of-type { 211 | width: 40%; 212 | height:100%; 213 | position: absolute; 214 | top: 0; 215 | left: 0; } 216 | 217 | .split-40>.column:nth-of-type(2) { 218 | width: 60%; 219 | height:100%; 220 | position: absolute; 221 | top: 0; 222 | right: 0;} 223 | 224 | .pull-left-reveal { 225 | float: left; 226 | width: 35%; 227 | } 228 | .pull-right-reveal { 229 | float: right; 230 | width: 60%; 231 | font-size: 15px; 232 | } 233 | 234 | .remark-code-line-highlighted { 235 | border-radius: 7px; 236 | background-image: linear-gradient(to right, rgb(251, 241, 193), #ffffff) !important; 237 | } 238 | 239 | /* Portrait -------------------------------------- */ 240 | 241 | .portrait { 242 | max-width: 20%; 243 | border-radius: 50%; 244 | } 245 | -------------------------------------------------------------------------------- /slides/img/SVG/ggplot2.svg: -------------------------------------------------------------------------------- 1 | 2 | ggplot2 -------------------------------------------------------------------------------- /slides/img/SVG/readr.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 18 | 19 | readr 20 | 22 | 23 | 27 | 29 | 31 | 34 | 37 | 40 | 43 | 45 | 47 | 49 | 51 | 53 | 56 | 59 | 61 | 63 | 66 | 68 | 71 | 73 | 75 | 77 | 80 | 82 | 84 | 86 | 87 | 88 | 90 | 91 | 93 | 96 | 100 | 104 | 106 | 107 | 110 | 113 | 116 | 118 | 120 | 123 | 126 | 128 | 132 | 134 | 136 | 138 | 141 | 143 | 147 | 148 | -------------------------------------------------------------------------------- /slides/img/SVG/tidyr.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 17 | tidyr 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 33 | 36 | 39 | 42 | 45 | 48 | 51 | 54 | 56 | 58 | 60 | 63 | 65 | 67 | 69 | 71 | 73 | 75 | 77 | 79 | 81 | 83 | 85 | 89 | 92 | 94 | 96 | 100 | 103 | 106 | 109 | 112 | 115 | 118 
| 121 | 124 | 126 | 128 | 130 | 132 | 135 | 138 | 141 | 143 | 145 | 148 | 151 | 153 | 157 | 159 | 161 | 163 | 166 | 168 | 172 | 173 | -------------------------------------------------------------------------------- /slides/img/grammar-of-graphics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/img/grammar-of-graphics.png -------------------------------------------------------------------------------- /slides/img/janitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/img/janitor.png -------------------------------------------------------------------------------- /slides/img/portrait.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/othomantegazza/eda-class/567295f0b1cec7b09469d03aa6f0b544845bb9c1/slides/img/portrait.jpg -------------------------------------------------------------------------------- /slides/img/readr.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 17 | 18 | readr 19 | 21 | 22 | 26 | 28 | 30 | 33 | 36 | 39 | 42 | 44 | 46 | 48 | 50 | 52 | 55 | 58 | 60 | 62 | 65 | 67 | 70 | 72 | 74 | 76 | 79 | 81 | 83 | 85 | 86 | 87 | 89 | 90 | 92 | 95 | 99 | 103 | 105 | 106 | 109 | 112 | 115 | 117 | 119 | 122 | 125 | 127 | 131 | 133 | 135 | 137 | 140 | 142 | 146 | 147 | -------------------------------------------------------------------------------- /slides/img/squirrel-svgrepo-com.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 9 | 11 | 13 | 15 | 17 | 18 | 21 | 25 | 26 | 30 | 31 | 33 | 36 | 37 | 39 | 41 | 43 | 45 | 47 | 49 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 
66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /slides/quotes.md: -------------------------------------------------------------------------------- 1 | "Without art, the crudeness of reality would make the world unbearable" ― George Bernard Shaw 2 | 3 | “Art is the most intense mode of individualism that the world has known.” 4 | ― Oscar Wilde 5 | 6 | “The artist always has the masters in his eyes.” 7 | ― Ralph Waldo Emerson 8 | 9 | “Science is what we understand well enough to explain to a computer; art is everything else.” 10 | ― Donald E. Knuth, Things a Computer Scientist Rarely Talks About 11 | 12 | “Art, in itself, is an attempt to bring order out of chaos.” 13 | ― Stephen Sondheim 14 | 15 | “Art never comes from happiness.” 16 | ― Chuck Palahniuk 17 | 18 | “You don’t make art out of good intentions.” 19 | ― Gustave Flaubert 20 | 21 | “Fuck art. I’ve gotta get out of the basement. I’ve gotta see the world. I’ve gotta make a difference.” 22 | ― Gerard Way 23 | 24 | “What is fair in men, passes away, but not so in art” 25 | ― Leonardo da Vinci 26 | 27 | “The point of art is to inspire you to create your own.” 28 | ― Misha Collins 29 | 30 | “Art does not reproduce what we see. It makes us see.” 31 | ― Paul Klee 32 | 33 | “We need our art to teach us how to breathe” 34 | ― Ray Bradbury 35 | 36 | “To make us feel small in the right way is a function of art.” 37 | ― E.M. Forster 38 | 39 | “Art is not in some far-off place.” 40 | ― Lydia Davis 41 | 42 | “I am my own experiment. I am my own work of art.” 43 | ― Madonna 44 | 45 | “Art and life are subjective. 
Not everybody's gonna dig what I dig, but I reserve the right to dig it.” 46 | ― Whoopi Goldberg 47 | 48 | “Art is science made clear.” 49 | ― Jean Cocteau 50 | 51 | “All art is quite useless.” 52 | ― Oscar Wilde, The Picture of Dorian Gray 53 | 54 | “Art is the proper task of life.” 55 | ― Friedrich Nietzsche --------------------------------------------------------------------------------