├── NAMESPACE
├── .Rbuildignore
├── .gitignore
├── inspect.Rproj
├── DESCRIPTION
├── man
│   └── eda.Rd
├── R
│   └── fun.R
├── README.md
└── inst
    ├── mpg.csv
    └── eda.Rmd
/NAMESPACE:
--------------------------------------------------------------------------------
1 | exportPattern("^[[:alpha:]]+")
2 | 
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 
--------------------------------------------------------------------------------
/inspect.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 | 
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 | 
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | 
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: inspect
2 | Type: Package
3 | Title: Automated Exploratory Data Analysis Reports
4 | Version: 0.1.0
5 | Authors@R: 
6 |     person(given = "Steven",
7 |            family = "Ge",
8 |            role = c("aut", "cre"),
9 |            email = "gelabinfo@gmail.com")
10 | Maintainer: Steven Ge <gelabinfo@gmail.com>
11 | Description: Generates a comprehensive exploratory data analysis (EDA) report from a data frame or matrix,
12 |     covering missing values, outliers, distributions, correlations, and group differences, and opens the rendered HTML report in the default web browser.
13 | License: MIT
14 | Encoding: UTF-8
15 | LazyData: true
16 | RoxygenNote: 7.2.3
17 | Imports: 
18 |     dplyr,
19 |     ggplot2,
20 |     knitr,
21 |     rmarkdown,
22 |     gridExtra,
23 |     e1071,
24 |     corrplot,
25 |     GGally,
26 |     hexbin
27 | 
--------------------------------------------------------------------------------
/man/eda.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fun.R
3 | \name{eda}
4 | \alias{eda}
5 | \title{Create a report for exploratory data analysis (EDA)}
6 | \usage{
7 | eda(df, target = NULL)
8 | }
9 | \arguments{
10 | \item{df}{A data.frame or a matrix with at least 2 columns.}
11 | 
12 | \item{target}{A selected target variable.}
13 | }
14 | \value{
15 | This function does not return a value. Instead, it generates an HTML file containing the EDA report and attempts to open this file in the default web browser.
16 | }
17 | \description{
18 | This function generates a comprehensive EDA report and attempts to open it in a browser.
19 | }
20 | \details{
21 | The function performs a check on the input data to ensure it is either a data frame or matrix.
22 | It also checks that the input data has at least two columns. The function uses a predefined Rmarkdown template (`eda.Rmd`)
23 | to generate the report. This template should be present in the `inspect` package.
24 | } 25 | \examples{ 26 | # Example with default parameters (using iris dataset) 27 | eda(iris) 28 | 29 | # Example with a target variable 30 | eda(iris, "Species") 31 | 32 | # Example with custom data frame and target variable 33 | my_data <- data.frame(x = rnorm(100), y = rnorm(100)) 34 | eda(df = my_data, target = "x") 35 | 36 | } 37 | \seealso{ 38 | \code{\link[rmarkdown]{render}} for details on the rendering of Rmarkdown documents. 39 | \code{\link[utils]{browseURL}} for how URLs or files are opened in browsers. 40 | } 41 | -------------------------------------------------------------------------------- /R/fun.R: -------------------------------------------------------------------------------- 1 | # This is test 2 | 3 | # this is another test 4 | 5 | #' Create a report for exploratory data analysis (EDA) 6 | #' 7 | #' This function generates a comprehensive EDA report and attempts to open it in a browser. 8 | #' 9 | #' @param df A data.frame or a matrix with at least 2 columns. 10 | #' @param target A selected target variable. 11 | #' @examples 12 | #' # Example with default parameters (using iris dataset) 13 | #' eda(iris) 14 | #' 15 | #' # Example with a target variable 16 | #' eda(iris, "Species") 17 | #' 18 | #' # Example with custom data frame and target variable 19 | #' my_data <- data.frame(x = rnorm(100), y = rnorm(100)) 20 | #' eda(df = my_data, target = "x") 21 | #' 22 | #' @details 23 | #' The function performs a check on the input data to ensure it is either a data frame or matrix. 24 | #' It also checks that the input data has at least two columns. The function uses a predefined Rmarkdown template (`eda.Rmd`) 25 | #' to generate the report. This template should be present in the `inspect` package. 26 | #' 27 | #' @seealso 28 | #' \code{\link[rmarkdown]{render}} for details on the rendering of Rmarkdown documents. 29 | #' \code{\link[utils]{browseURL}} for how URLs or files are opened in browsers. 30 | #' 31 | #' @return This function does not return a value. Instead, it generates an HTML file containing the EDA report and attempts to open this file in the default web browser. 
32 | #' @export
33 | eda <- function(df, target = NULL) {
34 |   # Validate input
35 |   if(is.null(df)) {
36 |     stop("No data frame provided.")
37 |   }
38 |   if(!is.data.frame(df) && !is.matrix(df)) {
39 |     stop("The provided data is neither a data frame nor a matrix.")
40 |   }
41 |   if(ncol(df) < 2) {
42 |     stop("The data frame/matrix should have at least two columns.")
43 |   }
44 | 
45 |   # Define file paths
46 |   rmd_file <- "inspect_EDA.Rmd"
47 |   output_file <- "inspect_EDA_report.html"
48 | 
49 |   # Copy the Rmd file template
50 |   file.copy(from = system.file("eda.Rmd", package = "inspect"), to = rmd_file, overwrite = TRUE)
51 | 
52 |   # Set up parameters to pass to the Rmd document
53 |   params <- list(df = df, target = target)
54 | 
55 |   # Render the Rmarkdown document
56 |   tryCatch({
57 |     rmarkdown::render(input = rmd_file, output_file = output_file, params = params, envir = new.env(parent = globalenv()))
58 |     # Attempt to open the HTML report in a browser
59 |     utils::browseURL(output_file)
60 |   }, error = function(e) {
61 |     cat("Error when generating the report: ", e$message, "\n")
62 |   }, finally = {
63 |     cat("Report generation process completed.\n")
64 |   })
65 | }
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Inspect: An R package for automated exploratory data analysis (EDA)
3 | Written mostly by GPT-4, this R package renders an EDA report based on this [R Markdown file](https://raw.githubusercontent.com/gexijin/inspect/main/inst/eda.Rmd). It can generate an EDA [report like this](https://rpubs.com/ge600/eda) from any data set. You can also generate this report using the Shiny app [RTutor](https://RTutor.ai). Contact [Steven Ge](https://www.linkedin.com/in/steven-ge-ab016947/) with questions or feedback.
4 | 
5 | # Install & use
6 | ```
7 | library("remotes")
8 | install_github("gexijin/inspect")
9 | library(inspect)
10 | 
11 | eda(mtcars)           # Generate an EDA report for a data frame, e.g., mtcars
12 | eda(iris, "Species")  # Specify a dependent/target variable
13 | ```
14 | # Main goal
15 | Exploratory data analysis (EDA) is an essential first step in any data science project. Consider it the equivalent of an annual doctor’s check-up, but for data sets. I have long believed that EDA can be automated, as the tasks are very general. While there are existing R packages for EDA such as DataExplorer, summarytools, tableone, and GGally, none of them offered quite what I was looking for. Leveraging GPT-4, I was able to create an EDA script in just a few hours.
16 | 
17 | Given a data set, the main idea is to streamline these steps:
18 | 1. Start with a data summary.
19 | 2. Check for missing values and outliers.
20 | 3. Plot the distributions of numerical variables using histograms and QQ plots. When excessive skewness is present, a log transformation is recommended.
21 | 4. Plot the distributions of categorical variables.
22 | 5. Provide a general data overview with a heatmap.
23 | 6. Compute and plot a correlation matrix (corrplot).
24 | 7. Use scatter plots to examine correlations between numerical variables.
25 | 8. Use violin plots and ANOVA to study the differences between groups delineated by categorical variables.
26 | 9. Test whether categorical variables are independent of each other, using chi-squared tests and bar plots.
27 | 
28 | To use this R Markdown file, you just need to obtain a copy from this GitHub repository.
Replace the demo data file with your own, specify a target variable, and you’re ready to render the report. 29 | 30 | If that sounds like too much work, simply upload your data file to [RTutor.ai](https://RTutor.ai), and click on the EDA tab. A comprehensive report will be generated in 2 minutes. The template was originally written for RTutor. 31 | 32 | # Example plots 33 | 34 | ![Missing](https://github.com/gexijin/gEDA/assets/18232433/3b4d49cc-a9db-49ff-9790-2e6c5f6f5f4d) 35 | 36 | ## 37 | ![Correlation](https://github.com/gexijin/gEDA/assets/18232433/1c925e74-2b8d-41fd-9542-015e396c2f3c) 38 | 39 | ## 40 | 41 | ![Heatmap](https://github.com/gexijin/gEDA/assets/18232433/d16b5db4-4e32-4872-b7ac-df727a1b6a67) 42 | 43 | ## 44 | 45 | 46 | ![Histogram](https://github.com/gexijin/gEDA/assets/18232433/e67f51c7-be2e-403d-b56f-6130791650d3) 47 | 48 | ## 49 | 50 | 51 | ![Barplot](https://github.com/gexijin/gEDA/assets/18232433/5cef4db8-fc23-49e9-b6aa-0b822ecdc2b5) 52 | 53 | ## 54 | 55 | 56 | 57 | ![Scatter plot](https://github.com/gexijin/gEDA/assets/18232433/7ff6f681-7f91-4030-aefa-6bc7990e999b) 58 | 59 | ## 60 | 61 | 62 | ![Boxplot](https://github.com/gexijin/gEDA/assets/18232433/0f71123a-cce5-4a6a-9d98-217063951c24) 63 | 64 | ## 65 | 66 | 67 | ![Combination](https://github.com/gexijin/gEDA/assets/18232433/a57e1be7-7187-4b9c-9e10-2d884170d2f9) 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /inst/mpg.csv: -------------------------------------------------------------------------------- 1 | displ,cyl,drv,cty,hwy,class 2 | ,6,f,16,24,subcompact 3 | ,6,f,17,24,compact 4 | ,4,f,22,31,compact 5 | ,4,f,21,29,compact 6 | ,4,f,21,30,midsize 7 | ,8,r,11,15,suv 8 | ,4,4,19,26,subcompact 9 | ,8,4,12,18,suv 10 | ,8,4,11,15,pickup 11 | 6.2,8,r,16,26,2seater 12 | ,8,4,11,17,pickup 13 | 2.5,4,4,20,27,compact 14 | ,6,4,15,20,suv 15 | ,8,4,13,19,suv 16 | ,4,f,25,32,subcompact 17 | 2.5,4,4,20,27,suv 18 | ,8,4,,16,suv 19 | 2.2,4,4,,26,subcompact 20 | ,6,f,,23,minivan 21 | 2.8,6,f,,26,compact 22 | 4.6,8,4,,16,pickup 23 | 4.6,8,4,,17,pickup 24 | 5.2,8,4,,16,pickup 25 | 3.3,,f,16,22,minivan 26 | 3.8,,f,16,23,minivan 27 | 2,,4,20,28,compact 28 | 2.7,,4,16,20,suv 29 | 2.4,,f,19,27,midsize 30 | 2,,f,19,26,compact 31 | 1.8,,f,24,30,compact 32 | 5.2,,,11,15,pickup 33 | 2,,,20,28,subcompact 34 | 2,,,21,30,compact 35 | 5.4,,,14,20,subcompact 36 | 4.2,,,12,18,suv 37 | 5.4,,,12,18,suv 38 | 3.8,,,18,25,subcompact 39 | 2.7,,,,20,pickup 40 | 7,8,,,24,2seater 41 | 4,6,,,17,suv 42 | 3.1,6,,,26,midsize 43 | 4,6,,,19,suv 44 | 4,6,,,17,suv 45 | 1.8,4,f,,34,subcompact 46 | 5.7,8,4,,18,suv 47 | 3.8,6,f,,21,minivan 48 | 2.2,4,4,,26,subcompact 49 | 4.7,8,4,9,12,suv 50 | 4,6,4,13,19,suv 51 | 2,4,4,19,27,compact 52 | 6.2,8,r,15,25,2seater 53 | 2.5,4,4,20,26,suv 54 | 3.7,6,4,15,19,suv 55 | 5.7,8,4,11,15,suv 56 | 5.4,8,r,11,16,suv 57 | 3,6,4,17,22,suv 58 | 3.5,6,f,19,26,midsize 59 | 3.3,6,4,14,17,suv 60 | 3,6,f,17,24,minivan 61 | 3,6,f,19,25,midsize 62 | 5.4,8,4,13,17,pickup 63 | 2.7,6,f,17,24,subcompact 64 | 4.6,8,4,13,16,pickup 65 | 5.9,8,4,11,15,suv 66 | 4.7,8,4,12,16,pickup 67 | 5,8,4,13,17,suv 68 | 3.5,6,f,19,25,midsize 69 | 1.8,4,4,18,26,compact 70 | 2.5,4,4,19,25,suv 71 | 6,8,r,12,17,suv 72 | 4,6,4,16,20,suv 73 | 4.7,8,4,14,17,suv 74 | 3.8,6,f,16,26,midsize 75 | 2,4,f,19,29,subcompact 76 | 3.4,6,4,15,17,suv 77 | 3.5,6,f,19,27,midsize 78 | 4,6,4,14,20,suv 79 | 2,4,f,21,29,subcompact 80 | 3.1,6,f,18,27,compact 81 | 2.5,5,f,21,29,compact 82 | 3.5,6,f,19,28,midsize 83 | 5.3,8,4,14,19,suv 
84 | 3.1,6,f,18,26,midsize 85 | 3.7,6,4,15,19,pickup 86 | 3.3,6,f,11,17,minivan 87 | 5.3,8,r,14,20,suv 88 | 3.8,6,f,18,28,midsize 89 | 2,4,f,21,29,subcompact 90 | 1.6,4,f,24,32,subcompact 91 | 4.6,8,r,15,22,subcompact 92 | 2,4,f,22,29,compact 93 | 2.8,6,4,15,24,midsize 94 | 2.2,4,f,21,27,compact 95 | 5.4,8,r,11,17,suv 96 | 3,6,f,18,26,compact 97 | 2.7,4,4,17,22,pickup 98 | 4.7,8,4,9,12,pickup 99 | 1.8,4,f,24,36,subcompact 100 | 4.4,8,4,12,18,suv 101 | 3.8,6,f,17,27,midsize 102 | 4.6,8,r,11,17,suv 103 | 2.4,4,f,21,31,midsize 104 | 5.7,8,4,13,18,suv 105 | 2.8,6,f,16,23,compact 106 | 4.6,8,4,13,19,suv 107 | 2.4,4,f,19,27,compact 108 | 3.3,6,f,17,24,minivan 109 | 1.8,4,f,26,35,compact 110 | 2.4,4,f,18,24,minivan 111 | 3.8,6,f,15,22,minivan 112 | 1.8,4,4,16,25,compact 113 | 4.7,8,4,11,15,suv 114 | 3.9,6,4,14,17,pickup 115 | 2.5,4,4,18,25,suv 116 | 2.4,4,f,21,31,midsize 117 | 3.6,6,f,17,26,midsize 118 | 5.7,8,r,13,17,suv 119 | 4.7,8,4,14,17,suv 120 | 2.8,6,f,17,24,compact 121 | 5,8,4,13,17,suv 122 | 2,4,f,19,26,compact 123 | 1.8,4,f,18,29,compact 124 | 4.7,8,4,13,17,suv 125 | 2.5,4,4,18,24,suv 126 | 1.8,4,f,24,33,compact 127 | 2.4,4,f,18,26,midsize 128 | 2.5,4,f,23,32,midsize 129 | 2.2,4,f,21,29,midsize 130 | 3.9,6,4,13,17,pickup 131 | 4.7,8,4,9,12,suv 132 | 2.8,6,f,16,26,midsize 133 | 4.7,8,4,14,19,suv 134 | 2,4,f,19,26,subcompact 135 | 2.5,4,f,23,31,midsize 136 | 3.5,6,f,18,29,midsize 137 | 4,6,4,15,18,pickup 138 | 2.5,6,f,18,26,midsize 139 | 4,6,r,16,24,subcompact 140 | 4.2,6,4,14,17,pickup 141 | 5.7,8,r,15,23,2seater 142 | 6.5,8,4,14,17,suv 143 | 5.4,8,r,12,18,suv 144 | 2,4,f,22,29,compact 145 | 3.4,6,4,15,19,pickup 146 | 1.9,4,f,29,41,subcompact 147 | 4.7,8,4,14,19,pickup 148 | 1.8,4,f,21,29,compact 149 | 2.5,6,f,18,26,midsize 150 | 4.7,8,4,9,12,pickup 151 | 1.9,4,f,33,44,compact 152 | 4.6,8,r,15,22,subcompact 153 | 2.5,5,f,21,29,compact 154 | 1.8,4,f,26,35,compact 155 | 5.3,8,r,14,20,suv 156 | 2.8,6,4,15,25,compact 157 | 4.7,8,4,13,17,pickup 158 | 1.8,4,f,25,36,subcompact 159 | 6.1,8,4,11,14,suv 160 | 2.4,4,f,22,30,midsize 161 | 4,6,4,16,20,pickup 162 | 3.3,6,4,15,17,suv 163 | 3.9,6,4,13,17,suv 164 | 5.7,8,r,16,26,2seater 165 | 2.7,4,4,15,20,pickup 166 | 3,6,f,18,26,compact 167 | 2,4,f,20,31,compact 168 | 4.6,8,r,15,23,subcompact 169 | 2.7,6,f,17,24,subcompact 170 | 2.7,4,4,15,20,suv 171 | 2.5,4,4,20,25,compact 172 | 3.3,6,f,19,28,midsize 173 | 4.2,6,4,14,17,pickup 174 | 5.3,8,4,11,14,suv 175 | 2.4,4,f,21,31,compact 176 | 2,4,f,21,29,compact 177 | 2,4,f,21,29,compact 178 | 2.5,5,f,20,28,subcompact 179 | 5.4,8,r,11,17,suv 180 | 4.7,8,4,14,19,pickup 181 | 3.4,6,4,15,17,pickup 182 | 4.7,8,4,13,17,suv 183 | 2,4,f,21,29,midsize 184 | 4.7,8,4,9,12,pickup 185 | 3,6,f,18,26,midsize 186 | 2.5,4,4,20,27,compact 187 | 2.5,4,4,19,25,compact 188 | 4.2,8,4,16,23,midsize 189 | 2.5,4,4,18,23,suv 190 | 3.3,6,f,16,22,minivan 191 | 2.4,4,f,21,31,midsize 192 | 2.4,4,f,21,29,compact 193 | 1.8,4,f,18,29,midsize 194 | 4.6,8,4,11,15,suv 195 | 3.6,6,f,17,26,midsize 196 | 2.8,6,f,18,26,midsize 197 | 2,4,f,20,27,subcompact 198 | 5.4,8,4,11,15,pickup 199 | 4,8,4,11,15,suv 200 | 2,4,f,19,26,subcompact 201 | 2.2,4,f,21,27,midsize 202 | 3,6,f,18,26,midsize 203 | 2.4,4,f,18,27,midsize 204 | 1.8,4,f,28,37,compact 205 | 5.2,8,4,11,15,pickup 206 | 4.6,8,r,15,21,subcompact 207 | 4,6,4,14,17,suv 208 | 5.3,8,f,16,25,midsize 209 | 1.6,4,f,28,33,subcompact 210 | 3,6,f,18,26,midsize 211 | 3.1,6,4,17,25,compact 212 | 3.1,6,4,17,25,midsize 213 | 1.9,4,f,35,44,subcompact 214 | 2.5,4,4,19,26,subcompact 215 | 2,4,f,19,28,midsize 216 | 
3.7,6,4,14,18,pickup
217 | 5.7,8,4,13,17,pickup
218 | 1.8,4,f,21,29,midsize
219 | 4,6,4,15,19,suv
220 | 3.3,6,f,18,27,compact
221 | 3.4,6,4,15,19,suv
222 | 2.5,5,f,20,29,subcompact
223 | 4.7,8,4,12,16,pickup
224 | 2,4,f,21,29,compact
225 | 4.7,8,4,13,17,pickup
226 | 2.8,6,f,16,26,compact
227 | 5.7,8,4,13,18,suv
228 | 3.8,6,r,18,26,subcompact
229 | 3.3,6,f,17,24,minivan
230 | 2.2,4,f,21,29,compact
231 | 4,6,r,17,26,subcompact
232 | 1.6,4,f,23,29,subcompact
233 | 1.6,4,f,24,32,subcompact
234 | 2.8,6,4,17,25,compact
235 | 3.1,6,4,15,25,compact
236 | 
--------------------------------------------------------------------------------
/inst/eda.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Exploratory data analysis (EDA) report"
3 | author: "Generated by the inspect package by Steven Ge"
4 | date: "`r Sys.Date()`"
5 | output:
6 |   html_document:
7 |     number_sections: true
8 |     code_folding: hide
9 |     toc: true
10 | params:
11 |   df:
12 |   target:
13 |   date:
14 |     label: "Date: "
15 |     value: !r Sys.Date()
16 |   printcode:
17 |     label: "Display Code"
18 |     value: TRUE
19 |     input: checkbox
20 | ---
21 | The inspect package can be installed from [GitHub](https://github.com/gexijin/inspect).
22 | 
23 | ```{r setup, include=FALSE}
24 | knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
25 | library(dplyr)
26 | library(ggplot2)
27 | max_data_points_eda <- 2000
28 | target_bin <- NULL
29 | target_var <- NULL
30 | ```
31 | 
32 | # Read data
33 | ```{r}
34 | df <- params$df
35 | target_var <- params$target
36 | ```
37 | 
38 | 
39 | # Data pre-process
40 | 
41 | If a numeric variable has just a few unique values, it might make more sense to convert it into a categorical variable.
42 | If the number of unique values is less than 5% of the total rows and below 13, convert it.
43 | ```{r convert-factor}
44 | # Iterate over each column of the data frame
45 | for (i in seq_along(df)) {
46 |   # Convert if the column is numeric and its unique values are below 5% of rows and fewer than 13
47 |   if (is.numeric(df[[i]]) && length(unique(df[[i]])) / nrow(df) < 0.05 && length(unique(df[[i]])) < 13) {
48 |     # Convert the column to a factor
49 |     df[[i]] <- as.factor(df[[i]])
50 |     cat("\nColumn ", colnames(df)[i], "was converted to a factor.")
51 |   }
52 | }
53 | ```
54 | 
55 | If a non-numeric variable has too many unique values, it might hold names or IDs that are not useful in the analysis.
56 | ```{r convert-id}
57 | cutoff <- 0.8
58 | # Initialize a vector to store the names of columns to be removed
59 | cols_to_remove <- c()
60 | 
61 | # Loop through each column
62 | for (col_name in names(df)) {
63 |   # Check if column is non-numeric and has unique values > 80% of total rows
64 |   if (!is.numeric(df[[col_name]]) && length(unique(df[[col_name]])) > cutoff * nrow(df)) {
65 |     cols_to_remove <- c(cols_to_remove, col_name)
66 |     cat("\nColumn", col_name, "was excluded from analysis.")
67 |   }
68 | }
69 | 
70 | # Remove the identified columns
71 | df <- df %>% select(-one_of(cols_to_remove))
72 | ```
73 | 
74 | If a target variable is specified, bin it into a new variable called target_bin.
75 | ```{r generate-target} 76 | target_bin <- NULL 77 | # Check if the target_var is in the dataframe and is not NA 78 | if (!is.null(target_var)) { 79 | if (target_var %in% names(df)) { 80 | # Check if the target variable is numerical 81 | if (is.numeric(df[[target_var]])) { 82 | # Create bins for the numerical target variable 83 | target_bin <- cut( 84 | df[[target_var]], 85 | breaks = 5, 86 | labels = c("low", "low_mid", "mid", "upper_mid", "high"), 87 | include.lowest = TRUE 88 | ) 89 | print(paste("Binned target variable", target_var, " as", "target_bin")) 90 | } else { 91 | print(paste("The target variable, ", target_var, ", is a categorical variable.")) 92 | } 93 | } else { 94 | print("Selected variable is not part of the data. Proceeding with general EDA.") 95 | target_var <- NULL 96 | } 97 | } else { 98 | print("No valid target variable selected. Proceeding with general EDA.") 99 | target_var <- NULL 100 | } 101 | ``` 102 | 103 | 104 | 105 | 106 | # Basic summary 107 | 108 | ```{R summary} 109 | str(df) 110 | summary(df) 111 | ``` 112 | 113 | # Missing values 114 | ```{r missing-value} 115 | # Calculate the total number of missing values per column 116 | missing_values <- sapply(df, function(x) sum(is.na(x))) 117 | 118 | # Calculate the number of cases with at least one missing value 119 | cases_with_missing <- sum(apply(df, 1, function(x) any(is.na(x)))) 120 | 121 | # Check if there are any missing values 122 | if (all(missing_values == 0)) { 123 | print("There is no missing data in any columns.") 124 | } else { 125 | # Create a data frame for plotting 126 | missing_data_df <- data.frame( 127 | Column = c(names(missing_values), "At Least One Missing"), 128 | MissingValues = c(missing_values, cases_with_missing) 129 | ) 130 | # Calculate the percentage of missing values per column 131 | # missing_percentage <- (missing_values / nrow(df)) * 100 132 | # Plot the number of missing values for all columns with labels 133 | ggplot(missing_data_df, aes(x = Column, y = MissingValues, fill = Column)) + 134 | geom_bar(stat = "identity") + 135 | geom_text(aes(label = sprintf("%.0f%%", MissingValues / nrow(df) * 100)), hjust = -0.3) + # Add labels to the bars 136 | # geom_text(aes(label = sprintf("%.2f%%", MissingPercentage)), hjust = -0.3) + 137 | coord_flip() + # Makes the bars horizontal 138 | labs(title = "Number of Missing Values by Column", x = "Column", y = "Number of Missing Values") + 139 | scale_fill_brewer(palette = "Set3") + # Use a color palette for different bars 140 | theme(legend.position = "none", axis.title.y = element_blank()) + # Remove the legend 141 | scale_y_continuous(expand = expansion(mult = c(0, 0.2))) # Extend the y-axis limits by 10% 142 | } 143 | ``` 144 | 145 | 146 | 147 | ```{r outliers} 148 | detect_outliers_mad <- function(df, accuracy = 0.99) { 149 | # Function to calculate MAD 150 | mad_function <- function(x) { 151 | median(abs(x - median(x))) 152 | } 153 | 154 | # Calculate z-score equivalent for the given accuracy 155 | z_threshold <- qnorm(accuracy + (1 - accuracy) / 2) 156 | 157 | # Calculate MAD threshold 158 | mad_threshold <- z_threshold 159 | 160 | # Initialize a list to store outlier indices for each numeric column 161 | outliers_list <- list() 162 | 163 | # Initialize a vector to keep track of rows with outliers 164 | rows_with_outliers <- rep(FALSE, nrow(df)) 165 | 166 | # Loop through each column in the dataframe 167 | for (col_name in names(df)) { 168 | # Check if the column is numeric 169 | if (is.numeric(df[[col_name]])) { 170 | # Calculate 
MAD and median for the column 171 | mad_value <- mad_function(df[[col_name]]) 172 | median_value <- median(df[[col_name]]) 173 | 174 | # Calculate the deviation scores (using a modified z-score) 175 | deviation_scores <- 0.6745 * (df[[col_name]] - median_value) / mad_value 176 | 177 | # Identify indices of outliers 178 | outlier_indices <- which(abs(deviation_scores) > mad_threshold) 179 | 180 | # Store the indices in the list 181 | outliers_list[[col_name]] <- outlier_indices 182 | 183 | # Update rows with outliers 184 | rows_with_outliers[outlier_indices] <- TRUE 185 | } 186 | } 187 | 188 | # Calculate the number of outliers in each column 189 | num_outliers_each_col <- sapply(outliers_list, length) 190 | 191 | # Calculate the number of rows with at least one outlier 192 | num_rows_with_outliers <- sum(rows_with_outliers) 193 | 194 | # Combine the results into one vector 195 | combined_results <- c(num_outliers_each_col, "Rows w/ Outliers" = num_rows_with_outliers) 196 | 197 | # Return the combined results 198 | return(combined_results) 199 | } 200 | 201 | # Detect outliers using the previously defined function 202 | outliers_info <- detect_outliers_mad(df) 203 | 204 | # Check if there are any outliers 205 | if (all(outliers_info == 0)) { 206 | print("There are no outliers in any columns.") 207 | } else { 208 | # Create a data frame for plotting 209 | outliers_data_df <- data.frame( 210 | Column = names(outliers_info), 211 | Outliers = outliers_info, 212 | OutlierPercentage = (outliers_info / nrow(df)) * 100 # Calculate the percentage of outliers 213 | ) 214 | 215 | # Plot the number of outliers for all columns with labels 216 | ggplot(outliers_data_df, aes(x = Column, y = Outliers, fill = Column)) + 217 | geom_bar(stat = "identity") + 218 | geom_text(aes(label = sprintf("%.2f%%", OutlierPercentage)), hjust = -0.3, vjust = 0) + # Add labels to the bars 219 | coord_flip() + # Makes the bars horizontal 220 | labs(title = "Number of Outliers by Column", x = "Column", y = "Number of Outliers") + 221 | scale_fill_brewer(palette = "Set3") + # Use a color palette for different bars 222 | theme(legend.position = "none", axis.title.y = element_blank()) + # Remove the legend 223 | scale_y_continuous(expand = expansion(mult = c(0, 0.2))) # Extend the y-axis limits 224 | } 225 | ``` 226 | 227 | # Univariate distributions: Numeric 228 | 229 | ```{r distribution-numbers} 230 | library(gridExtra) 231 | library(e1071) # for skewness 232 | 233 | # Function to check if the variable is highly skewed 234 | is_highly_skewed <- function(x) { 235 | # Remove missing values before computing skewness 236 | x <- na.omit(x) 237 | abs(e1071::skewness(x)) > 1 238 | } 239 | 240 | create_plots <- function(df, var) { 241 | skewness_value <- round(e1071::skewness(na.omit(df[[var]])), 2) 242 | 243 | # Histogram 244 | p1 <- ggplot(df, aes_string(x = var)) + 245 | geom_histogram(bins = 15, fill = "skyblue", color = "black") + 246 | ggtitle(paste("Histogram of", var)) + 247 | annotate("text", x = Inf, y = Inf, label = paste("Skewness:", skewness_value), hjust = 1.1, vjust = 1.1) 248 | 249 | # QQ Plot with reference line 250 | p2 <- ggplot(df, aes_string(sample = var)) + 251 | stat_qq() + 252 | stat_qq_line(color = "red") + 253 | ggtitle(paste("QQ Plot of", var)) 254 | 255 | if (is_highly_skewed(df[[var]])) { 256 | df$log_transformed <- log(df[[var]] + 1) 257 | skewness_log_value <- round(e1071::skewness(na.omit(df$log_transformed)), 2) 258 | 259 | # Histogram after log transformation 260 | p3 <- ggplot(df, aes(x = 
log_transformed)) +
261 |       geom_histogram(bins = 15, fill = "lightgreen", color = "black") +
262 |       ggtitle(paste("Log-transformed", var)) +
263 |       annotate("text", x = Inf, y = Inf, label = paste("Skewness:", skewness_log_value), hjust = 1.1, vjust = 1.1)
264 | 
265 |     # QQ Plot after log transformation
266 |     p4 <- ggplot(df, aes(sample = log_transformed)) +
267 |       stat_qq() +
268 |       stat_qq_line(color = "red") +
269 |       ggtitle(paste("QQ Plot of log-transformed", var))
270 | 
271 |     return(grid.arrange(p1, p2, p3, p4, ncol = 2))
272 |   } else {
273 |     return(grid.arrange(p1, p2, ncol = 2))
274 |   }
275 | }
276 | # If the dataframe has more than max_data_points_eda rows, randomly sample that many rows
277 | df_reduced <- df
278 | if (nrow(df) > max_data_points_eda) {
279 |   set.seed(123) # Set a random seed for reproducibility
280 |   df_reduced <- df_reduced[sample(nrow(df_reduced), max_data_points_eda), ]
281 |   cat("Since there are too many rows, ", max_data_points_eda, " randomly selected rows are shown in the plots below.")
282 | }
283 | 
284 | # Apply the function to each numeric variable in the dataframe
285 | lapply(names(df_reduced)[sapply(df_reduced, is.numeric)], function(var) create_plots(df_reduced, var))
286 | ```
287 | 
288 | 
289 | # Univariate distribution: Categorical variables
290 | ```{r distribution-categories}
291 | # Function to create bar plots
292 | create_bar_plot <- function(df, column_name) {
293 |   # Checking if the column is a factor
294 |   if (!is.numeric(df[[column_name]])) {
295 |     df[[column_name]] <- as.factor(df[[column_name]])
296 |     factor_levels <- levels(df[[column_name]])
297 |   } else {
298 |     factor_levels <- NULL
299 |   }
300 | 
301 |   # Modify the data for plotting
302 |   plot_data <- df %>%
303 |     count(!!sym(column_name)) %>%
304 |     arrange(desc(n)) %>%
305 |     mutate(value = ifelse(row_number() > 12, "Other", as.character(!!sym(column_name)))) %>%
306 |     group_by(value) %>%
307 |     summarize(n = sum(n))
308 | 
309 |   # If the column is a factor, adjust the value names
310 |   if (!is.null(factor_levels)) {
311 |     plot_data$value <- factor(plot_data$value, levels = unique(c(factor_levels, "Other")))
312 |   }
313 | 
314 |   # Finding the maximum value for adjusting y-axis
315 |   max_value <- max(plot_data$n)
316 | 
317 |   # Creating the bar plot
318 |   p <- ggplot(plot_data, aes(x = value, y = n, fill = value)) +
319 |     geom_bar(stat = "identity") +
320 |     geom_text(aes(label = paste0(round(n / sum(n) * 100, 1), "%")),
321 |       vjust = -0.5
322 |     ) +
323 |     labs(title = paste("Bar Plot for", column_name), y = "Count") +
324 |     theme(
325 |       axis.text.x = element_text(angle = 45, hjust = 1),
326 |       axis.title.x = element_blank()
327 |     ) +
328 |     theme(legend.position = "none") + # Remove the legend
329 |     scale_y_continuous(limits = c(0, max_value * 1.2)) # Extend y-axis by 20%
330 | 
331 |   print(p) # Explicitly print the plot
332 | }
333 | 
334 | # Iterating over each column
335 | for (column_name in names(df)) {
336 |   if (!is.numeric(df[[column_name]])) { # Checking if the column is non-numeric
337 |     unique_values <- length(unique(df[[column_name]]))
338 |     if (unique_values != nrow(df)) { # Ignoring columns with all unique values
339 |       create_bar_plot(df, column_name)
340 |     }
341 |   }
342 | }
343 | ```
344 | 
345 | # Overview heatmap
346 | ```{r heatmap, fig.height=8}
347 | data_for_heatmap <- df
348 | if(!is.null(target_bin)) {
349 |   data_for_heatmap$target_bin <- target_bin
350 | }
351 | numeric_cols <- sapply(data_for_heatmap, is.numeric)
352 | 
353 | # Prepare data for the heatmap
354 | # Filter out rows with missing values
355 | data_for_heatmap <- 
na.omit(data_for_heatmap) 356 | 357 | # more than 2 numeric columns 358 | if (sum(numeric_cols) > 1) { 359 | # If the dataframe has more than 2000 rows, sample 2000 rows randomly 360 | if (nrow(data_for_heatmap) > max_data_points_eda) { 361 | set.seed(123) # Set a random seed for reproducibility 362 | data_for_heatmap <- data_for_heatmap[sample(nrow(data_for_heatmap), 2000), ] 363 | cat("Since there are too many rows, ", max_data_points_eda," randomly selected rows are shown in this heatmap") 364 | } 365 | 366 | # Determine whether to include row names based on the number of rows 367 | show_row_names <- nrow(data_for_heatmap) <= 100 368 | 369 | # Define a color palette from yellow to blue 370 | my_palette <- colorRampPalette(c("yellow", "blue"))(n = 299) 371 | 372 | # Initialize the RowSideColors parameter as NULL 373 | row_side_colors <- NULL 374 | 375 | dist_pearson <- function(x, ...) 376 | as.dist(1-cor(t(x), method="pearson")) 377 | 378 | # Check if target_var is provided and not null 379 | if (!is.null(target_var) && target_var %in% names(data_for_heatmap)) { 380 | # Use the 'target_bin' column if it exists, otherwise use the column specified by target_var 381 | target_col <- ifelse("target_bin" %in% names(data_for_heatmap), 'target_bin', target_var) 382 | 383 | # define groups of rows 384 | groups <- data_for_heatmap[[target_col]] 385 | groups_colors <- as.numeric(factor(groups)) 386 | unique_groups <- unique(groups) 387 | row_side_colors <- rainbow(length(unique_groups))[groups_colors] 388 | 389 | # Create the heatmap with clustering trees and color bar 390 | gplots::heatmap.2( 391 | as.matrix(data_for_heatmap[numeric_cols]), 392 | scale = "column", # Data is already scaled, so no scaling is needed here 393 | distfun = dist_pearson, # for distance between rows 394 | hclustfun = function(x) hclust(x, method = "average"), # for hierarchical clustering 395 | dendrogram = "both", # only row dendrograms 396 | trace = "none", # turns off trace lines inside the heatmap 397 | density.info = "none", # turns off density plot inside color legend 398 | margins = c(8, 8), # adjusts the margins around the plot 399 | Colv = TRUE, # cluster columns 400 | Rowv = TRUE, # always cluster rows 401 | labRow = if(show_row_names) rownames(data_for_heatmap) else NA, # show/hide row names 402 | key = TRUE, # whether to show the color key 403 | keysize = 1, # size of the color key 404 | symbreaks = FALSE , # whether to make color breaks symmetrical around zero 405 | col = my_palette, # yellow and blue 406 | RowSideColors = row_side_colors, # add side color bar if side_colors is not NULL 407 | cexCol = 0.9, # Make column labels smaller 408 | srtCol = 45, # Rotate column labels 45 degrees 409 | adjCol = c(1,0) # Adjust the position of the column labels 410 | ) 411 | legend("topright", legend = unique_groups, fill = rainbow(length(unique_groups)), title = target_var) 412 | 413 | } else { # RowSideColors = NULL gives errors. 
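  # Note (assumption, consistent with the comment above): gplots::heatmap.2() expects RowSideColors to be
  # a character vector with one colour per row, so passing NULL errors; the call is therefore repeated
  # below without that argument instead of setting RowSideColors = NULL.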
414 | # Create the heatmap without the color bar 415 | gplots::heatmap.2( 416 | as.matrix(data_for_heatmap[numeric_cols]), 417 | scale = "column", # Data is already scaled, so no scaling is needed here 418 | distfun = dist_pearson, # for distance between rows 419 | hclustfun = function(x) hclust(x, method = "average"), # for hierarchical clustering 420 | dendrogram = "both", # only row dendrograms 421 | trace = "none", # turns off trace lines inside the heatmap 422 | density.info = "none", # turns off density plot inside color legend 423 | margins = c(8, 8), # adjusts the margins around the plot 424 | Colv = TRUE, # cluster columns 425 | Rowv = TRUE, # always cluster rows 426 | labRow = if(show_row_names) rownames(data_for_heatmap) else NA, # show/hide row names 427 | key = TRUE, # whether to show the color key 428 | keysize = 1, # size of the color key 429 | symbreaks = FALSE , # whether to make color breaks symmetrical around zero 430 | col = my_palette, # yellow and blue 431 | cexCol = 0.9, # Make column labels smaller 432 | srtCol = 45, # Rotate column labels 45 degrees 433 | adjCol = c(1,0) # Adjust the position of the column labels 434 | ) 435 | } 436 | } 437 | ``` 438 | 439 | 440 | # Bivariate correlation 441 | 442 | ## Correlation matrix (Blank indicates not significant, P > 0.05) 443 | ```{r correlation-map, height = 10, width =10} 444 | library(corrplot) 445 | df2 <- df[sapply(df, is.numeric)] 446 | df2 <- na.omit(df2) 447 | M <- cor(df2) 448 | testRes <- cor.mtest(df2, conf.level = 0.95) 449 | ## leave blank on non-significant coefficient 450 | ## add significant correlation coefficients 451 | corrplot(M, 452 | p.mat = testRes$p, method = "circle", type = "lower", insig = "blank", 453 | addCoef.col = "black", number.cex = 0.8, order = "AOE", diag = FALSE 454 | ) 455 | ``` 456 | 457 | ## Correlation between numeric variables 458 | 459 | ```{r two-numbers} 460 | library(GGally) 461 | library(hexbin) 462 | 463 | num_cols <- names(df)[sapply(df, is.numeric)] 464 | if(length(num_cols) > 1) { 465 | num_col_pairs <- combn(num_cols, 2, simplify = FALSE) 466 | 467 | for (pair in num_col_pairs) { 468 | col_x <- pair[1] 469 | col_y <- pair[2] 470 | 471 | # Perform correlation test 472 | corr_test <- cor.test(df[[col_x]], df[[col_y]], method = "pearson") 473 | 474 | if (corr_test$p.value < 0.01 && abs(corr_test$estimate) > 0.1) { 475 | corr_label <- paste("R =", round(corr_test$estimate, 2), "\nP =", format(corr_test$p.value, scientific = TRUE, digits = 2)) 476 | 477 | if (nrow(df) > 5000) { 478 | # Use hexbin plot 479 | p <- ggplot(df, aes_string(x = col_x, y = col_y)) + 480 | labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y) + 481 | geom_hex(bins = 70) + 482 | scale_fill_continuous(type = "viridis") 483 | } else { 484 | # Use scatter plot 485 | if (!is.null(target_var) && !is.na(target_var)) { 486 | if(target_var != col_x && target_var != col_y){ # target is not in the pair 487 | df_ggplot <- df 488 | if(!is.null(target_bin)) { 489 | df_ggplot$target_bin <- target_bin 490 | } 491 | 492 | color_var <- ifelse(!is.numeric(df[[target_var]]), target_var, "target_bin") 493 | p <- ggplot(df_ggplot, aes_string(x = col_x, y = col_y, color = color_var)) + 494 | geom_point(alpha = 0.7) + 495 | labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y) + 496 | guides(color = guide_legend(title = color_var)) 497 | } else { # target is not in the pair 498 | p <- ggplot(df, aes_string(x = col_x, y = col_y)) + 499 | geom_point(alpha = 0.7) + 500 | labs(title = paste(col_x, "vs", col_y), x 
= col_x, y = col_y)
501 |         }
502 |       } else { # No target
503 |         p <- ggplot(df, aes_string(x = col_x, y = col_y)) +
504 |           geom_point(alpha = 0.7) +
505 |           labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y)
506 |       }
507 |     }
508 | 
509 |     p <- p + annotate("text", x = Inf, y = Inf, label = corr_label, hjust = 1.1, vjust = 1.1, size = 4)
510 |     print(p)
511 |     }
512 |   }
513 | } else {
514 |   cat("Not applicable.")
515 | }
516 | ```
517 | 
518 | 
519 | 
520 | ## Correlation between a numeric and a categorical variable
521 | ```{r numeric-categorical}
522 | num_cols <- sapply(df, is.numeric)
523 | cat_cols <- sapply(df, is.factor)
524 | if(sum(num_cols) > 0 && sum(cat_cols) > 0) {
525 |   # Perform ANOVA and create violin plots for significant cases
526 |   for (num_var in names(df)[num_cols]) {
527 |     for (cat_var in names(df)[cat_cols]) {
528 |       if (cat_var != "target_bin") {
529 |         anova_result <- aov(df[[num_var]] ~ df[[cat_var]], data = df)
530 |         p_value <- summary(anova_result)[[1]]$"Pr(>F)"[1]
531 |         if (p_value < 0.01) {
532 |           plot <- ggplot(df, aes_string(x = cat_var, y = num_var)) +
533 |             geom_violin(trim = FALSE, fill = "lightblue", color = "black") +
534 |             geom_boxplot(width = 0.2, fill = "white", color = "black") +
535 |             labs(title = paste(num_var, "by", cat_var, "(ANOVA P =", format(p_value, scientific = TRUE, digits = 2), ")"), x = cat_var, y = num_var)
536 | 
537 |           if (nrow(df) < 300) {
538 |             plot <- plot + geom_jitter(width = 0.2, color = "black")
539 |           }
540 |           print(plot)
541 |         }
542 |       }
543 |     }
544 |   }
545 | } else {
546 |   cat("Not applicable.")
547 | }
548 | ```
549 | 
550 | ## Correlation between two categorical variables
551 | 
552 | ```{r two-categories}
553 | cat_cols <- !(sapply(df, is.numeric)) & names(df) != "target_bin"
554 | cat_var_names <- names(df)[cat_cols]
555 | if(length(cat_var_names) > 1) {
556 |   # Perform chi-squared tests and create stacked bar plots if p-value < 0.01
557 |   for (i in 1:(length(cat_var_names) - 1)) {
558 |     for (j in (i + 1):length(cat_var_names)) {
559 |       tab <- table(df[[cat_var_names[i]]], df[[cat_var_names[j]]])
560 |       chi_test <- chisq.test(tab)
561 |       if (is.na(chi_test$p.value)) next
562 |       if (chi_test$p.value < 0.01) {
563 |         p <- ggplot(df, aes_string(x = cat_var_names[i], fill = cat_var_names[j])) +
564 |           geom_bar(position = "fill") +
565 |           labs(
566 |             title = paste(cat_var_names[i], "vs", cat_var_names[j], "(Chisq P =", format(chi_test$p.value, scientific = TRUE, digits = 2), ")"),
567 |             x = paste(cat_var_names[i]),
568 |             y = "Proportion"
569 |           ) +
570 |           coord_flip()
571 |         print(p)
572 |       }
573 |     }
574 |   }
575 | } else {
576 |   cat("There is at most one categorical variable.")
577 | }
578 | ```
579 | 
580 | 
581 | This R Markdown document was written by [Steven Ge](https://www.linkedin.com/in/steven-ge-ab016947/), assisted heavily by GPT-4 (which wrote 90% of the code). Source code is on [GitHub](https://github.com/gexijin/inspect). No guarantees. No rights reserved.
582 | 
--------------------------------------------------------------------------------