├── NAMESPACE
├── .Rbuildignore
├── .gitignore
├── inspect.Rproj
├── DESCRIPTION
├── man
│   └── eda.Rd
├── R
│   └── fun.R
├── README.md
└── inst
    ├── mpg.csv
    └── eda.Rmd
/NAMESPACE:
--------------------------------------------------------------------------------
1 | exportPattern("^[[:alpha:]]+")
2 | 
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 
--------------------------------------------------------------------------------
/inspect.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 | 
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 | 
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | 
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: inspect
2 | Type: Package
3 | Title: Automated Exploratory Data Analysis Reports
4 | Version: 0.1.0
5 | Authors@R: 
6 |     person(given = "Steven",
7 |            family = "Ge",
8 |            role = c("aut", "cre"),
9 |            email = "gelabinfo@gmail.com")
10 | Maintainer: Steven Ge <gelabinfo@gmail.com>
11 | Description: Generates a comprehensive exploratory data analysis (EDA) report from a data frame or matrix,
12 |     covering missing values, outliers, distributions, correlations, and group differences, and opens the rendered HTML report in the default web browser.
13 | License: MIT
14 | Encoding: UTF-8
15 | LazyData: true
16 | RoxygenNote: 7.2.3
17 | Imports: 
18 |     dplyr,
19 |     ggplot2,
20 |     knitr,
21 |     rmarkdown,
22 |     gridExtra,
23 |     e1071,
24 |     corrplot,
25 |     GGally,
26 |     hexbin
27 | 
--------------------------------------------------------------------------------
/man/eda.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fun.R
3 | \name{eda}
4 | \alias{eda}
5 | \title{Create a report for exploratory data analysis (EDA)}
6 | \usage{
7 | eda(df, target = NULL)
8 | }
9 | \arguments{
10 | \item{df}{A data.frame or a matrix with at least 2 columns.}
11 | 
12 | \item{target}{A selected target variable.}
13 | }
14 | \value{
15 | This function does not return a value. Instead, it generates an HTML file containing the EDA report and attempts to open this file in the default web browser.
16 | }
17 | \description{
18 | This function generates a comprehensive EDA report and attempts to open it in a browser.
19 | }
20 | \details{
21 | The function performs a check on the input data to ensure it is either a data frame or matrix.
22 | It also checks that the input data has at least two columns. The function uses a predefined Rmarkdown template (`eda.Rmd`)
23 | to generate the report. This template should be present in the `inspect` package.
24 | } 25 | \examples{ 26 | # Example with default parameters (using iris dataset) 27 | eda(iris) 28 | 29 | # Example with a target variable 30 | eda(iris, "Species") 31 | 32 | # Example with custom data frame and target variable 33 | my_data <- data.frame(x = rnorm(100), y = rnorm(100)) 34 | eda(df = my_data, target = "x") 35 | 36 | } 37 | \seealso{ 38 | \code{\link[rmarkdown]{render}} for details on the rendering of Rmarkdown documents. 39 | \code{\link[utils]{browseURL}} for how URLs or files are opened in browsers. 40 | } 41 | -------------------------------------------------------------------------------- /R/fun.R: -------------------------------------------------------------------------------- 1 | # This is test 2 | 3 | # this is another test 4 | 5 | #' Create a report for exploratory data analysis (EDA) 6 | #' 7 | #' This function generates a comprehensive EDA report and attempts to open it in a browser. 8 | #' 9 | #' @param df A data.frame or a matrix with at least 2 columns. 10 | #' @param target A selected target variable. 11 | #' @examples 12 | #' # Example with default parameters (using iris dataset) 13 | #' eda(iris) 14 | #' 15 | #' # Example with a target variable 16 | #' eda(iris, "Species") 17 | #' 18 | #' # Example with custom data frame and target variable 19 | #' my_data <- data.frame(x = rnorm(100), y = rnorm(100)) 20 | #' eda(df = my_data, target = "x") 21 | #' 22 | #' @details 23 | #' The function performs a check on the input data to ensure it is either a data frame or matrix. 24 | #' It also checks that the input data has at least two columns. The function uses a predefined Rmarkdown template (`eda.Rmd`) 25 | #' to generate the report. This template should be present in the `inspect` package. 26 | #' 27 | #' @seealso 28 | #' \code{\link[rmarkdown]{render}} for details on the rendering of Rmarkdown documents. 29 | #' \code{\link[utils]{browseURL}} for how URLs or files are opened in browsers. 30 | #' 31 | #' @return This function does not return a value. Instead, it generates an HTML file containing the EDA report and attempts to open this file in the default web browser. 
32 | #' @export
33 | eda <- function(df, target = NULL) {
34 |   # Validate input
35 |   if(is.null(df)) {
36 |     stop("No data frame provided.")
37 |   }
38 |   if(!is.data.frame(df) && !is.matrix(df)) {
39 |     stop("The provided data is neither a data frame nor a matrix.")
40 |   }
41 |   if(ncol(df) < 2) {
42 |     stop("The data frame/matrix should have at least two columns.")
43 |   }
44 | 
45 |   # Define file paths
46 |   rmd_file <- "inspect_EDA.Rmd"
47 |   output_file <- "inspect_EDA_report.html"
48 | 
49 |   # Copy the Rmd file template
50 |   file.copy(from = system.file("eda.Rmd", package = "inspect"), to = rmd_file, overwrite = TRUE)
51 | 
52 |   # Set up parameters to pass to the Rmd document
53 |   params <- list(df = df, target = target)
54 | 
55 |   # Render the Rmarkdown document
56 |   tryCatch({
57 |     rmarkdown::render(input = rmd_file, output_file = output_file, params = params, envir = new.env(parent = globalenv()))
58 |     # Attempt to open the HTML report in a browser
59 |     utils::browseURL(output_file)
60 |   }, error = function(e) {
61 |     cat("Error when generating the report: ", e$message, "\n")
62 |   }, finally = {
63 |     cat("Report generation process completed.\n")
64 |   })
65 | }
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Inspect: An R package for automated exploratory data analysis (EDA)
3 | Written mostly by GPT-4, this R package renders an EDA report based on this [R Markdown file](https://raw.githubusercontent.com/gexijin/inspect/main/inst/eda.Rmd). It can generate an EDA [report like this](https://rpubs.com/ge600/eda) from any data set. You can also generate this report using the Shiny app [RTutor](https://RTutor.ai). Contact [Steven Ge](https://www.linkedin.com/in/steven-ge-ab016947/) with questions or feedback.
4 | 
5 | # Install & use
6 | ```
7 | library("remotes")
8 | install_github("gexijin/inspect")
9 | library(inspect)
10 | 
11 | eda(mtcars)           # Generate an EDA report for a data frame, e.g., mtcars
12 | eda(iris, "Species")  # Specify a dependent/target variable
13 | ```
14 | # Main goal
15 | Exploratory data analysis (EDA) is an essential first step in any data science project. Consider it the equivalent of an annual doctor’s check-up, but for data sets. I have long believed that EDA can be automated, as the tasks are very general. While there are existing R packages for EDA such as DataExplorer, summarytools, tableone, and GGally, none of them offered quite what I was looking for. Leveraging GPT-4, I was able to create an EDA script in just a few hours.
16 | 
17 | Given a data set, the main idea is to streamline these steps:
18 | 1. Start with a data summary.
19 | 2. Check for missing values and outliers.
20 | 3. Plot the distributions of numerical variables using histograms and QQ plots. When excessive skewness is present, a log transformation is recommended.
21 | 4. Plot the distributions of categorical variables.
22 | 5. Provide a general data overview with a heatmap.
23 | 6. Compute and plot a correlation matrix (corrplot).
24 | 7. Use scatter plots to examine correlations between numerical variables.
25 | 8. Use violin plots and ANOVA to study the differences between groups delineated by categorical variables.
26 | 9. Test whether categorical variables are independent of each other, using chi-squared tests and bar plots.
27 | 
28 | To use this R Markdown file, you just need to obtain a copy from this GitHub repository.
Replace the demo data file with your own, specify a target variable, and you’re ready to render the report. 29 | 30 | If that sounds like too much work, simply upload your data file to [RTutor.ai](https://RTutor.ai), and click on the EDA tab. A comprehensive report will be generated in 2 minutes. The template was originally written for RTutor. 31 | 32 | # Example plots 33 | 34 | ![Missing](https://github.com/gexijin/gEDA/assets/18232433/3b4d49cc-a9db-49ff-9790-2e6c5f6f5f4d) 35 | 36 | ## 37 | ![Correlation](https://github.com/gexijin/gEDA/assets/18232433/1c925e74-2b8d-41fd-9542-015e396c2f3c) 38 | 39 | ## 40 | 41 | ![Heatmap](https://github.com/gexijin/gEDA/assets/18232433/d16b5db4-4e32-4872-b7ac-df727a1b6a67) 42 | 43 | ## 44 | 45 | 46 | ![Histogram](https://github.com/gexijin/gEDA/assets/18232433/e67f51c7-be2e-403d-b56f-6130791650d3) 47 | 48 | ## 49 | 50 | 51 | ![Barplot](https://github.com/gexijin/gEDA/assets/18232433/5cef4db8-fc23-49e9-b6aa-0b822ecdc2b5) 52 | 53 | ## 54 | 55 | 56 | 57 | ![Scatter plot](https://github.com/gexijin/gEDA/assets/18232433/7ff6f681-7f91-4030-aefa-6bc7990e999b) 58 | 59 | ## 60 | 61 | 62 | ![Boxplot](https://github.com/gexijin/gEDA/assets/18232433/0f71123a-cce5-4a6a-9d98-217063951c24) 63 | 64 | ## 65 | 66 | 67 | ![Combination](https://github.com/gexijin/gEDA/assets/18232433/a57e1be7-7187-4b9c-9e10-2d884170d2f9) 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /inst/mpg.csv: -------------------------------------------------------------------------------- 1 | displ,cyl,drv,cty,hwy,class 2 | ,6,f,16,24,subcompact 3 | ,6,f,17,24,compact 4 | ,4,f,22,31,compact 5 | ,4,f,21,29,compact 6 | ,4,f,21,30,midsize 7 | ,8,r,11,15,suv 8 | ,4,4,19,26,subcompact 9 | ,8,4,12,18,suv 10 | ,8,4,11,15,pickup 11 | 6.2,8,r,16,26,2seater 12 | ,8,4,11,17,pickup 13 | 2.5,4,4,20,27,compact 14 | ,6,4,15,20,suv 15 | ,8,4,13,19,suv 16 | ,4,f,25,32,subcompact 17 | 2.5,4,4,20,27,suv 18 | ,8,4,,16,suv 19 | 2.2,4,4,,26,subcompact 20 | ,6,f,,23,minivan 21 | 2.8,6,f,,26,compact 22 | 4.6,8,4,,16,pickup 23 | 4.6,8,4,,17,pickup 24 | 5.2,8,4,,16,pickup 25 | 3.3,,f,16,22,minivan 26 | 3.8,,f,16,23,minivan 27 | 2,,4,20,28,compact 28 | 2.7,,4,16,20,suv 29 | 2.4,,f,19,27,midsize 30 | 2,,f,19,26,compact 31 | 1.8,,f,24,30,compact 32 | 5.2,,,11,15,pickup 33 | 2,,,20,28,subcompact 34 | 2,,,21,30,compact 35 | 5.4,,,14,20,subcompact 36 | 4.2,,,12,18,suv 37 | 5.4,,,12,18,suv 38 | 3.8,,,18,25,subcompact 39 | 2.7,,,,20,pickup 40 | 7,8,,,24,2seater 41 | 4,6,,,17,suv 42 | 3.1,6,,,26,midsize 43 | 4,6,,,19,suv 44 | 4,6,,,17,suv 45 | 1.8,4,f,,34,subcompact 46 | 5.7,8,4,,18,suv 47 | 3.8,6,f,,21,minivan 48 | 2.2,4,4,,26,subcompact 49 | 4.7,8,4,9,12,suv 50 | 4,6,4,13,19,suv 51 | 2,4,4,19,27,compact 52 | 6.2,8,r,15,25,2seater 53 | 2.5,4,4,20,26,suv 54 | 3.7,6,4,15,19,suv 55 | 5.7,8,4,11,15,suv 56 | 5.4,8,r,11,16,suv 57 | 3,6,4,17,22,suv 58 | 3.5,6,f,19,26,midsize 59 | 3.3,6,4,14,17,suv 60 | 3,6,f,17,24,minivan 61 | 3,6,f,19,25,midsize 62 | 5.4,8,4,13,17,pickup 63 | 2.7,6,f,17,24,subcompact 64 | 4.6,8,4,13,16,pickup 65 | 5.9,8,4,11,15,suv 66 | 4.7,8,4,12,16,pickup 67 | 5,8,4,13,17,suv 68 | 3.5,6,f,19,25,midsize 69 | 1.8,4,4,18,26,compact 70 | 2.5,4,4,19,25,suv 71 | 6,8,r,12,17,suv 72 | 4,6,4,16,20,suv 73 | 4.7,8,4,14,17,suv 74 | 3.8,6,f,16,26,midsize 75 | 2,4,f,19,29,subcompact 76 | 3.4,6,4,15,17,suv 77 | 3.5,6,f,19,27,midsize 78 | 4,6,4,14,20,suv 79 | 2,4,f,21,29,subcompact 80 | 3.1,6,f,18,27,compact 81 | 2.5,5,f,21,29,compact 82 | 3.5,6,f,19,28,midsize 83 | 5.3,8,4,14,19,suv 
84 | 3.1,6,f,18,26,midsize 85 | 3.7,6,4,15,19,pickup 86 | 3.3,6,f,11,17,minivan 87 | 5.3,8,r,14,20,suv 88 | 3.8,6,f,18,28,midsize 89 | 2,4,f,21,29,subcompact 90 | 1.6,4,f,24,32,subcompact 91 | 4.6,8,r,15,22,subcompact 92 | 2,4,f,22,29,compact 93 | 2.8,6,4,15,24,midsize 94 | 2.2,4,f,21,27,compact 95 | 5.4,8,r,11,17,suv 96 | 3,6,f,18,26,compact 97 | 2.7,4,4,17,22,pickup 98 | 4.7,8,4,9,12,pickup 99 | 1.8,4,f,24,36,subcompact 100 | 4.4,8,4,12,18,suv 101 | 3.8,6,f,17,27,midsize 102 | 4.6,8,r,11,17,suv 103 | 2.4,4,f,21,31,midsize 104 | 5.7,8,4,13,18,suv 105 | 2.8,6,f,16,23,compact 106 | 4.6,8,4,13,19,suv 107 | 2.4,4,f,19,27,compact 108 | 3.3,6,f,17,24,minivan 109 | 1.8,4,f,26,35,compact 110 | 2.4,4,f,18,24,minivan 111 | 3.8,6,f,15,22,minivan 112 | 1.8,4,4,16,25,compact 113 | 4.7,8,4,11,15,suv 114 | 3.9,6,4,14,17,pickup 115 | 2.5,4,4,18,25,suv 116 | 2.4,4,f,21,31,midsize 117 | 3.6,6,f,17,26,midsize 118 | 5.7,8,r,13,17,suv 119 | 4.7,8,4,14,17,suv 120 | 2.8,6,f,17,24,compact 121 | 5,8,4,13,17,suv 122 | 2,4,f,19,26,compact 123 | 1.8,4,f,18,29,compact 124 | 4.7,8,4,13,17,suv 125 | 2.5,4,4,18,24,suv 126 | 1.8,4,f,24,33,compact 127 | 2.4,4,f,18,26,midsize 128 | 2.5,4,f,23,32,midsize 129 | 2.2,4,f,21,29,midsize 130 | 3.9,6,4,13,17,pickup 131 | 4.7,8,4,9,12,suv 132 | 2.8,6,f,16,26,midsize 133 | 4.7,8,4,14,19,suv 134 | 2,4,f,19,26,subcompact 135 | 2.5,4,f,23,31,midsize 136 | 3.5,6,f,18,29,midsize 137 | 4,6,4,15,18,pickup 138 | 2.5,6,f,18,26,midsize 139 | 4,6,r,16,24,subcompact 140 | 4.2,6,4,14,17,pickup 141 | 5.7,8,r,15,23,2seater 142 | 6.5,8,4,14,17,suv 143 | 5.4,8,r,12,18,suv 144 | 2,4,f,22,29,compact 145 | 3.4,6,4,15,19,pickup 146 | 1.9,4,f,29,41,subcompact 147 | 4.7,8,4,14,19,pickup 148 | 1.8,4,f,21,29,compact 149 | 2.5,6,f,18,26,midsize 150 | 4.7,8,4,9,12,pickup 151 | 1.9,4,f,33,44,compact 152 | 4.6,8,r,15,22,subcompact 153 | 2.5,5,f,21,29,compact 154 | 1.8,4,f,26,35,compact 155 | 5.3,8,r,14,20,suv 156 | 2.8,6,4,15,25,compact 157 | 4.7,8,4,13,17,pickup 158 | 1.8,4,f,25,36,subcompact 159 | 6.1,8,4,11,14,suv 160 | 2.4,4,f,22,30,midsize 161 | 4,6,4,16,20,pickup 162 | 3.3,6,4,15,17,suv 163 | 3.9,6,4,13,17,suv 164 | 5.7,8,r,16,26,2seater 165 | 2.7,4,4,15,20,pickup 166 | 3,6,f,18,26,compact 167 | 2,4,f,20,31,compact 168 | 4.6,8,r,15,23,subcompact 169 | 2.7,6,f,17,24,subcompact 170 | 2.7,4,4,15,20,suv 171 | 2.5,4,4,20,25,compact 172 | 3.3,6,f,19,28,midsize 173 | 4.2,6,4,14,17,pickup 174 | 5.3,8,4,11,14,suv 175 | 2.4,4,f,21,31,compact 176 | 2,4,f,21,29,compact 177 | 2,4,f,21,29,compact 178 | 2.5,5,f,20,28,subcompact 179 | 5.4,8,r,11,17,suv 180 | 4.7,8,4,14,19,pickup 181 | 3.4,6,4,15,17,pickup 182 | 4.7,8,4,13,17,suv 183 | 2,4,f,21,29,midsize 184 | 4.7,8,4,9,12,pickup 185 | 3,6,f,18,26,midsize 186 | 2.5,4,4,20,27,compact 187 | 2.5,4,4,19,25,compact 188 | 4.2,8,4,16,23,midsize 189 | 2.5,4,4,18,23,suv 190 | 3.3,6,f,16,22,minivan 191 | 2.4,4,f,21,31,midsize 192 | 2.4,4,f,21,29,compact 193 | 1.8,4,f,18,29,midsize 194 | 4.6,8,4,11,15,suv 195 | 3.6,6,f,17,26,midsize 196 | 2.8,6,f,18,26,midsize 197 | 2,4,f,20,27,subcompact 198 | 5.4,8,4,11,15,pickup 199 | 4,8,4,11,15,suv 200 | 2,4,f,19,26,subcompact 201 | 2.2,4,f,21,27,midsize 202 | 3,6,f,18,26,midsize 203 | 2.4,4,f,18,27,midsize 204 | 1.8,4,f,28,37,compact 205 | 5.2,8,4,11,15,pickup 206 | 4.6,8,r,15,21,subcompact 207 | 4,6,4,14,17,suv 208 | 5.3,8,f,16,25,midsize 209 | 1.6,4,f,28,33,subcompact 210 | 3,6,f,18,26,midsize 211 | 3.1,6,4,17,25,compact 212 | 3.1,6,4,17,25,midsize 213 | 1.9,4,f,35,44,subcompact 214 | 2.5,4,4,19,26,subcompact 215 | 2,4,f,19,28,midsize 216 | 
3.7,6,4,14,18,pickup
217 | 5.7,8,4,13,17,pickup
218 | 1.8,4,f,21,29,midsize
219 | 4,6,4,15,19,suv
220 | 3.3,6,f,18,27,compact
221 | 3.4,6,4,15,19,suv
222 | 2.5,5,f,20,29,subcompact
223 | 4.7,8,4,12,16,pickup
224 | 2,4,f,21,29,compact
225 | 4.7,8,4,13,17,pickup
226 | 2.8,6,f,16,26,compact
227 | 5.7,8,4,13,18,suv
228 | 3.8,6,r,18,26,subcompact
229 | 3.3,6,f,17,24,minivan
230 | 2.2,4,f,21,29,compact
231 | 4,6,r,17,26,subcompact
232 | 1.6,4,f,23,29,subcompact
233 | 1.6,4,f,24,32,subcompact
234 | 2.8,6,4,17,25,compact
235 | 3.1,6,4,15,25,compact
236 | 
--------------------------------------------------------------------------------
/inst/eda.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Exploratory data analysis (EDA) report"
3 | author: "Generated by the inspect package by Steven Ge"
4 | date: "`r Sys.Date()`"
5 | output:
6 |   html_document:
7 |     number_sections: true
8 |     code_folding: hide
9 |     toc: true
10 | params:
11 |   df:
12 |   target:
13 |   date:
14 |     label: "Date: "
15 |     value: !r Sys.Date()
16 |   printcode:
17 |     label: "Display Code"
18 |     value: TRUE
19 |     input: checkbox
20 | ---
21 | The inspect package can be installed from [GitHub](https://github.com/gexijin/inspect).
22 | 
23 | ```{r setup, include=FALSE}
24 | knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
25 | library(dplyr)
26 | library(ggplot2)
27 | max_data_points_eda <- 2000
28 | target_bin <- NULL
29 | target_var <- NULL
30 | ```
31 | 
32 | # Read data
33 | ```{r}
34 | df <- params$df
35 | target_var <- params$target
36 | ```
37 | 
38 | 
39 | # Data pre-process
40 | 
41 | If a numeric variable has just a few unique values, it might make more sense to convert it into a categorical variable.
42 | If the number of unique values is less than 5% of the total rows and below 13, convert it.
43 | ```{r convert-factor}
44 | # Iterate over each column of the data frame
45 | for (i in seq_along(df)) {
46 |   # Convert if the column is numeric and its unique values are below 5% of rows and fewer than 13
47 |   if (is.numeric(df[[i]]) && length(unique(df[[i]])) / nrow(df) < 0.05 && length(unique(df[[i]])) < 13) {
48 |     # Convert the column to a factor
49 |     df[[i]] <- as.factor(df[[i]])
50 |     cat("\nColumn ", colnames(df)[i], "was converted to a factor.")
51 |   }
52 | }
53 | ```
54 | 
55 | If a non-numeric variable has too many unique values, it might hold names or IDs that are not useful in the analysis.
56 | ```{r convert-id}
57 | cutoff <- 0.8
58 | # Initialize a vector to store the names of columns to be removed
59 | cols_to_remove <- c()
60 | 
61 | # Loop through each column
62 | for (col_name in names(df)) {
63 |   # Check if column is non-numeric and has unique values > 80% of total rows
64 |   if (!is.numeric(df[[col_name]]) && length(unique(df[[col_name]])) > cutoff * nrow(df)) {
65 |     cols_to_remove <- c(cols_to_remove, col_name)
66 |     cat("\nColumn", col_name, "was excluded from analysis.")
67 |   }
68 | }
69 | 
70 | # Remove the identified columns
71 | df <- df %>% select(-one_of(cols_to_remove))
72 | ```
73 | 
74 | If a target variable is specified, bin it into a new variable called target_bin.
75 | ```{r generate-target} 76 | target_bin <- NULL 77 | # Check if the target_var is in the dataframe and is not NA 78 | if (!is.null(target_var)) { 79 | if (target_var %in% names(df)) { 80 | # Check if the target variable is numerical 81 | if (is.numeric(df[[target_var]])) { 82 | # Create bins for the numerical target variable 83 | target_bin <- cut( 84 | df[[target_var]], 85 | breaks = 5, 86 | labels = c("low", "low_mid", "mid", "upper_mid", "high"), 87 | include.lowest = TRUE 88 | ) 89 | print(paste("Binned target variable", target_var, " as", "target_bin")) 90 | } else { 91 | print(paste("The target variable, ", target_var, ", is a categorical variable.")) 92 | } 93 | } else { 94 | print("Selected variable is not part of the data. Proceeding with general EDA.") 95 | target_var <- NULL 96 | } 97 | } else { 98 | print("No valid target variable selected. Proceeding with general EDA.") 99 | target_var <- NULL 100 | } 101 | ``` 102 | 103 | 104 | 105 | 106 | # Basic summary 107 | 108 | ```{R summary} 109 | str(df) 110 | summary(df) 111 | ``` 112 | 113 | # Missing values 114 | ```{r missing-value} 115 | # Calculate the total number of missing values per column 116 | missing_values <- sapply(df, function(x) sum(is.na(x))) 117 | 118 | # Calculate the number of cases with at least one missing value 119 | cases_with_missing <- sum(apply(df, 1, function(x) any(is.na(x)))) 120 | 121 | # Check if there are any missing values 122 | if (all(missing_values == 0)) { 123 | print("There is no missing data in any columns.") 124 | } else { 125 | # Create a data frame for plotting 126 | missing_data_df <- data.frame( 127 | Column = c(names(missing_values), "At Least One Missing"), 128 | MissingValues = c(missing_values, cases_with_missing) 129 | ) 130 | # Calculate the percentage of missing values per column 131 | # missing_percentage <- (missing_values / nrow(df)) * 100 132 | # Plot the number of missing values for all columns with labels 133 | ggplot(missing_data_df, aes(x = Column, y = MissingValues, fill = Column)) + 134 | geom_bar(stat = "identity") + 135 | geom_text(aes(label = sprintf("%.0f%%", MissingValues / nrow(df) * 100)), hjust = -0.3) + # Add labels to the bars 136 | # geom_text(aes(label = sprintf("%.2f%%", MissingPercentage)), hjust = -0.3) + 137 | coord_flip() + # Makes the bars horizontal 138 | labs(title = "Number of Missing Values by Column", x = "Column", y = "Number of Missing Values") + 139 | scale_fill_brewer(palette = "Set3") + # Use a color palette for different bars 140 | theme(legend.position = "none", axis.title.y = element_blank()) + # Remove the legend 141 | scale_y_continuous(expand = expansion(mult = c(0, 0.2))) # Extend the y-axis limits by 10% 142 | } 143 | ``` 144 | 145 | 146 | 147 | ```{r outliers} 148 | detect_outliers_mad <- function(df, accuracy = 0.99) { 149 | # Function to calculate MAD 150 | mad_function <- function(x) { 151 | median(abs(x - median(x))) 152 | } 153 | 154 | # Calculate z-score equivalent for the given accuracy 155 | z_threshold <- qnorm(accuracy + (1 - accuracy) / 2) 156 | 157 | # Calculate MAD threshold 158 | mad_threshold <- z_threshold 159 | 160 | # Initialize a list to store outlier indices for each numeric column 161 | outliers_list <- list() 162 | 163 | # Initialize a vector to keep track of rows with outliers 164 | rows_with_outliers <- rep(FALSE, nrow(df)) 165 | 166 | # Loop through each column in the dataframe 167 | for (col_name in names(df)) { 168 | # Check if the column is numeric 169 | if (is.numeric(df[[col_name]])) { 170 | # Calculate 
MAD and median for the column 171 | mad_value <- mad_function(df[[col_name]]) 172 | median_value <- median(df[[col_name]]) 173 | 174 | # Calculate the deviation scores (using a modified z-score) 175 | deviation_scores <- 0.6745 * (df[[col_name]] - median_value) / mad_value 176 | 177 | # Identify indices of outliers 178 | outlier_indices <- which(abs(deviation_scores) > mad_threshold) 179 | 180 | # Store the indices in the list 181 | outliers_list[[col_name]] <- outlier_indices 182 | 183 | # Update rows with outliers 184 | rows_with_outliers[outlier_indices] <- TRUE 185 | } 186 | } 187 | 188 | # Calculate the number of outliers in each column 189 | num_outliers_each_col <- sapply(outliers_list, length) 190 | 191 | # Calculate the number of rows with at least one outlier 192 | num_rows_with_outliers <- sum(rows_with_outliers) 193 | 194 | # Combine the results into one vector 195 | combined_results <- c(num_outliers_each_col, "Rows w/ Outliers" = num_rows_with_outliers) 196 | 197 | # Return the combined results 198 | return(combined_results) 199 | } 200 | 201 | # Detect outliers using the previously defined function 202 | outliers_info <- detect_outliers_mad(df) 203 | 204 | # Check if there are any outliers 205 | if (all(outliers_info == 0)) { 206 | print("There are no outliers in any columns.") 207 | } else { 208 | # Create a data frame for plotting 209 | outliers_data_df <- data.frame( 210 | Column = names(outliers_info), 211 | Outliers = outliers_info, 212 | OutlierPercentage = (outliers_info / nrow(df)) * 100 # Calculate the percentage of outliers 213 | ) 214 | 215 | # Plot the number of outliers for all columns with labels 216 | ggplot(outliers_data_df, aes(x = Column, y = Outliers, fill = Column)) + 217 | geom_bar(stat = "identity") + 218 | geom_text(aes(label = sprintf("%.2f%%", OutlierPercentage)), hjust = -0.3, vjust = 0) + # Add labels to the bars 219 | coord_flip() + # Makes the bars horizontal 220 | labs(title = "Number of Outliers by Column", x = "Column", y = "Number of Outliers") + 221 | scale_fill_brewer(palette = "Set3") + # Use a color palette for different bars 222 | theme(legend.position = "none", axis.title.y = element_blank()) + # Remove the legend 223 | scale_y_continuous(expand = expansion(mult = c(0, 0.2))) # Extend the y-axis limits 224 | } 225 | ``` 226 | 227 | # Univariate distributions: Numeric 228 | 229 | ```{r distribution-numbers} 230 | library(gridExtra) 231 | library(e1071) # for skewness 232 | 233 | # Function to check if the variable is highly skewed 234 | is_highly_skewed <- function(x) { 235 | # Remove missing values before computing skewness 236 | x <- na.omit(x) 237 | abs(e1071::skewness(x)) > 1 238 | } 239 | 240 | create_plots <- function(df, var) { 241 | skewness_value <- round(e1071::skewness(na.omit(df[[var]])), 2) 242 | 243 | # Histogram 244 | p1 <- ggplot(df, aes_string(x = var)) + 245 | geom_histogram(bins = 15, fill = "skyblue", color = "black") + 246 | ggtitle(paste("Histogram of", var)) + 247 | annotate("text", x = Inf, y = Inf, label = paste("Skewness:", skewness_value), hjust = 1.1, vjust = 1.1) 248 | 249 | # QQ Plot with reference line 250 | p2 <- ggplot(df, aes_string(sample = var)) + 251 | stat_qq() + 252 | stat_qq_line(color = "red") + 253 | ggtitle(paste("QQ Plot of", var)) 254 | 255 | if (is_highly_skewed(df[[var]])) { 256 | df$log_transformed <- log(df[[var]] + 1) 257 | skewness_log_value <- round(e1071::skewness(na.omit(df$log_transformed)), 2) 258 | 259 | # Histogram after log transformation 260 | p3 <- ggplot(df, aes(x = 
log_transformed)) +
261 |       geom_histogram(bins = 15, fill = "lightgreen", color = "black") +
262 |       ggtitle(paste("Log-transformed", var)) +
263 |       annotate("text", x = Inf, y = Inf, label = paste("Skewness:", skewness_log_value), hjust = 1.1, vjust = 1.1)
264 | 
265 |     # QQ Plot after log transformation
266 |     p4 <- ggplot(df, aes(sample = log_transformed)) +
267 |       stat_qq() +
268 |       stat_qq_line(color = "red") +
269 |       ggtitle(paste("QQ Plot of log-transformed", var))
270 | 
271 |     return(grid.arrange(p1, p2, p3, p4, ncol = 2))
272 |   } else {
273 |     return(grid.arrange(p1, p2, ncol = 2))
274 |   }
275 | }
276 | # If the dataframe has more than max_data_points_eda rows, randomly sample that many rows
277 | df_reduced <- df
278 | if (nrow(df) > max_data_points_eda) {
279 |   set.seed(123) # Set a random seed for reproducibility
280 |   df_reduced <- df_reduced[sample(nrow(df_reduced), max_data_points_eda), ]
281 |   cat("Since there are too many rows, ", max_data_points_eda, " randomly selected rows are shown in the plots below.")
282 | }
283 | 
284 | # Apply the function to each numeric variable in the dataframe
285 | lapply(names(df_reduced)[sapply(df_reduced, is.numeric)], function(var) create_plots(df_reduced, var))
286 | ```
287 | 
288 | 
289 | # Univariate distribution: Categorical variables
290 | ```{r distribution-categories}
291 | # Function to create bar plots
292 | create_bar_plot <- function(df, column_name) {
293 |   # Checking if the column is a factor
294 |   if (!is.numeric(df[[column_name]])) {
295 |     df[[column_name]] <- as.factor(df[[column_name]])
296 |     factor_levels <- levels(df[[column_name]])
297 |   } else {
298 |     factor_levels <- NULL
299 |   }
300 | 
301 |   # Modify the data for plotting
302 |   plot_data <- df %>%
303 |     count(!!sym(column_name)) %>%
304 |     arrange(desc(n)) %>%
305 |     mutate(value = ifelse(row_number() > 12, "Other", as.character(!!sym(column_name)))) %>%
306 |     group_by(value) %>%
307 |     summarize(n = sum(n))
308 | 
309 |   # If the column is a factor, adjust the value names
310 |   if (!is.null(factor_levels)) {
311 |     plot_data$value <- factor(plot_data$value, levels = unique(c(factor_levels, "Other")))
312 |   }
313 | 
314 |   # Finding the maximum value for adjusting y-axis
315 |   max_value <- max(plot_data$n)
316 | 
317 |   # Creating the bar plot
318 |   p <- ggplot(plot_data, aes(x = value, y = n, fill = value)) +
319 |     geom_bar(stat = "identity") +
320 |     geom_text(aes(label = paste0(round(n / sum(n) * 100, 1), "%")),
321 |       vjust = -0.5
322 |     ) +
323 |     labs(title = paste("Bar Plot for", column_name), y = "Count") +
324 |     theme(
325 |       axis.text.x = element_text(angle = 45, hjust = 1),
326 |       axis.title.x = element_blank()
327 |     ) +
328 |     theme(legend.position = "none") + # Remove the legend
329 |     scale_y_continuous(limits = c(0, max_value * 1.2)) # Extend y-axis by 20%
330 | 
331 |   print(p) # Explicitly print the plot
332 | }
333 | 
334 | # Iterating over each column
335 | for (column_name in names(df)) {
336 |   if (!is.numeric(df[[column_name]])) { # Checking if the column is non-numeric
337 |     unique_values <- length(unique(df[[column_name]]))
338 |     if (unique_values != nrow(df)) { # Ignoring columns with all unique values
339 |       create_bar_plot(df, column_name)
340 |     }
341 |   }
342 | }
343 | ```
344 | 
345 | # Overview heatmap
346 | ```{r heatmap, fig.height=8}
347 | data_for_heatmap <- df
348 | if(!is.null(target_bin)) {
349 |   data_for_heatmap$target_bin <- target_bin
350 | }
351 | numeric_cols <- sapply(data_for_heatmap, is.numeric)
352 | 
353 | # Prepare data for the heatmap
354 | # Filter out rows with missing values
355 | data_for_heatmap <- 
na.omit(data_for_heatmap) 356 | 357 | # more than 2 numeric columns 358 | if (sum(numeric_cols) > 1) { 359 | # If the dataframe has more than 2000 rows, sample 2000 rows randomly 360 | if (nrow(data_for_heatmap) > max_data_points_eda) { 361 | set.seed(123) # Set a random seed for reproducibility 362 | data_for_heatmap <- data_for_heatmap[sample(nrow(data_for_heatmap), 2000), ] 363 | cat("Since there are too many rows, ", max_data_points_eda," randomly selected rows are shown in this heatmap") 364 | } 365 | 366 | # Determine whether to include row names based on the number of rows 367 | show_row_names <- nrow(data_for_heatmap) <= 100 368 | 369 | # Define a color palette from yellow to blue 370 | my_palette <- colorRampPalette(c("yellow", "blue"))(n = 299) 371 | 372 | # Initialize the RowSideColors parameter as NULL 373 | row_side_colors <- NULL 374 | 375 | dist_pearson <- function(x, ...) 376 | as.dist(1-cor(t(x), method="pearson")) 377 | 378 | # Check if target_var is provided and not null 379 | if (!is.null(target_var) && target_var %in% names(data_for_heatmap)) { 380 | # Use the 'target_bin' column if it exists, otherwise use the column specified by target_var 381 | target_col <- ifelse("target_bin" %in% names(data_for_heatmap), 'target_bin', target_var) 382 | 383 | # define groups of rows 384 | groups <- data_for_heatmap[[target_col]] 385 | groups_colors <- as.numeric(factor(groups)) 386 | unique_groups <- unique(groups) 387 | row_side_colors <- rainbow(length(unique_groups))[groups_colors] 388 | 389 | # Create the heatmap with clustering trees and color bar 390 | gplots::heatmap.2( 391 | as.matrix(data_for_heatmap[numeric_cols]), 392 | scale = "column", # Data is already scaled, so no scaling is needed here 393 | distfun = dist_pearson, # for distance between rows 394 | hclustfun = function(x) hclust(x, method = "average"), # for hierarchical clustering 395 | dendrogram = "both", # only row dendrograms 396 | trace = "none", # turns off trace lines inside the heatmap 397 | density.info = "none", # turns off density plot inside color legend 398 | margins = c(8, 8), # adjusts the margins around the plot 399 | Colv = TRUE, # cluster columns 400 | Rowv = TRUE, # always cluster rows 401 | labRow = if(show_row_names) rownames(data_for_heatmap) else NA, # show/hide row names 402 | key = TRUE, # whether to show the color key 403 | keysize = 1, # size of the color key 404 | symbreaks = FALSE , # whether to make color breaks symmetrical around zero 405 | col = my_palette, # yellow and blue 406 | RowSideColors = row_side_colors, # add side color bar if side_colors is not NULL 407 | cexCol = 0.9, # Make column labels smaller 408 | srtCol = 45, # Rotate column labels 45 degrees 409 | adjCol = c(1,0) # Adjust the position of the column labels 410 | ) 411 | legend("topright", legend = unique_groups, fill = rainbow(length(unique_groups)), title = target_var) 412 | 413 | } else { # RowSideColors = NULL gives errors. 
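  # Note (assumption, consistent with the comment above): gplots::heatmap.2() expects RowSideColors to be
  # a character vector with one colour per row, so passing NULL errors; the call is therefore repeated
  # below without that argument instead of setting RowSideColors = NULL.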
414 | # Create the heatmap without the color bar 415 | gplots::heatmap.2( 416 | as.matrix(data_for_heatmap[numeric_cols]), 417 | scale = "column", # Data is already scaled, so no scaling is needed here 418 | distfun = dist_pearson, # for distance between rows 419 | hclustfun = function(x) hclust(x, method = "average"), # for hierarchical clustering 420 | dendrogram = "both", # only row dendrograms 421 | trace = "none", # turns off trace lines inside the heatmap 422 | density.info = "none", # turns off density plot inside color legend 423 | margins = c(8, 8), # adjusts the margins around the plot 424 | Colv = TRUE, # cluster columns 425 | Rowv = TRUE, # always cluster rows 426 | labRow = if(show_row_names) rownames(data_for_heatmap) else NA, # show/hide row names 427 | key = TRUE, # whether to show the color key 428 | keysize = 1, # size of the color key 429 | symbreaks = FALSE , # whether to make color breaks symmetrical around zero 430 | col = my_palette, # yellow and blue 431 | cexCol = 0.9, # Make column labels smaller 432 | srtCol = 45, # Rotate column labels 45 degrees 433 | adjCol = c(1,0) # Adjust the position of the column labels 434 | ) 435 | } 436 | } 437 | ``` 438 | 439 | 440 | # Bivariate correlation 441 | 442 | ## Correlation matrix (Blank indicates not significant, P > 0.05) 443 | ```{r correlation-map, height = 10, width =10} 444 | library(corrplot) 445 | df2 <- df[sapply(df, is.numeric)] 446 | df2 <- na.omit(df2) 447 | M <- cor(df2) 448 | testRes <- cor.mtest(df2, conf.level = 0.95) 449 | ## leave blank on non-significant coefficient 450 | ## add significant correlation coefficients 451 | corrplot(M, 452 | p.mat = testRes$p, method = "circle", type = "lower", insig = "blank", 453 | addCoef.col = "black", number.cex = 0.8, order = "AOE", diag = FALSE 454 | ) 455 | ``` 456 | 457 | ## Correlation between numeric variables 458 | 459 | ```{r two-numbers} 460 | library(GGally) 461 | library(hexbin) 462 | 463 | num_cols <- names(df)[sapply(df, is.numeric)] 464 | if(length(num_cols) > 1) { 465 | num_col_pairs <- combn(num_cols, 2, simplify = FALSE) 466 | 467 | for (pair in num_col_pairs) { 468 | col_x <- pair[1] 469 | col_y <- pair[2] 470 | 471 | # Perform correlation test 472 | corr_test <- cor.test(df[[col_x]], df[[col_y]], method = "pearson") 473 | 474 | if (corr_test$p.value < 0.01 && abs(corr_test$estimate) > 0.1) { 475 | corr_label <- paste("R =", round(corr_test$estimate, 2), "\nP =", format(corr_test$p.value, scientific = TRUE, digits = 2)) 476 | 477 | if (nrow(df) > 5000) { 478 | # Use hexbin plot 479 | p <- ggplot(df, aes_string(x = col_x, y = col_y)) + 480 | labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y) + 481 | geom_hex(bins = 70) + 482 | scale_fill_continuous(type = "viridis") 483 | } else { 484 | # Use scatter plot 485 | if (!is.null(target_var) && !is.na(target_var)) { 486 | if(target_var != col_x && target_var != col_y){ # target is not in the pair 487 | df_ggplot <- df 488 | if(!is.null(target_bin)) { 489 | df_ggplot$target_bin <- target_bin 490 | } 491 | 492 | color_var <- ifelse(!is.numeric(df[[target_var]]), target_var, "target_bin") 493 | p <- ggplot(df_ggplot, aes_string(x = col_x, y = col_y, color = color_var)) + 494 | geom_point(alpha = 0.7) + 495 | labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y) + 496 | guides(color = guide_legend(title = color_var)) 497 | } else { # target is not in the pair 498 | p <- ggplot(df, aes_string(x = col_x, y = col_y)) + 499 | geom_point(alpha = 0.7) + 500 | labs(title = paste(col_x, "vs", col_y), x 
= col_x, y = col_y)
501 |         }
502 |       } else { # No target
503 |         p <- ggplot(df, aes_string(x = col_x, y = col_y)) +
504 |           geom_point(alpha = 0.7) +
505 |           labs(title = paste(col_x, "vs", col_y), x = col_x, y = col_y)
506 |       }
507 |     }
508 | 
509 |     p <- p + annotate("text", x = Inf, y = Inf, label = corr_label, hjust = 1.1, vjust = 1.1, size = 4)
510 |     print(p)
511 |     }
512 |   }
513 | } else {
514 |   cat("Not applicable.")
515 | }
516 | ```
517 | 
518 | 
519 | 
520 | ## Correlation between a numeric and a categorical variable
521 | ```{r numeric-categorical}
522 | num_cols <- sapply(df, is.numeric)
523 | cat_cols <- sapply(df, is.factor)
524 | if(sum(num_cols) > 0 && sum(cat_cols) > 0) {
525 |   # Perform ANOVA and create violin plots for significant cases
526 |   for (num_var in names(df)[num_cols]) {
527 |     for (cat_var in names(df)[cat_cols]) {
528 |       if (cat_var != "target_bin") {
529 |         anova_result <- aov(df[[num_var]] ~ df[[cat_var]], data = df)
530 |         p_value <- summary(anova_result)[[1]]$"Pr(>F)"[1]
531 |         if (p_value < 0.01) {
532 |           plot <- ggplot(df, aes_string(x = cat_var, y = num_var)) +
533 |             geom_violin(trim = FALSE, fill = "lightblue", color = "black") +
534 |             geom_boxplot(width = 0.2, fill = "white", color = "black") +
535 |             labs(title = paste(num_var, "by", cat_var, "(ANOVA P =", format(p_value, scientific = TRUE, digits = 2), ")"), x = cat_var, y = num_var)
536 | 
537 |           if (nrow(df) < 300) {
538 |             plot <- plot + geom_jitter(width = 0.2, color = "black")
539 |           }
540 |           print(plot)
541 |         }
542 |       }
543 |     }
544 |   }
545 | } else {
546 |   cat("Not applicable.")
547 | }
548 | ```
549 | 
550 | ## Correlation between two categorical variables
551 | 
552 | ```{r two-categories}
553 | cat_cols <- !(sapply(df, is.numeric)) & names(df) != "target_bin"
554 | cat_var_names <- names(df)[cat_cols]
555 | if(length(cat_var_names) > 1) {
556 |   # Perform chi-squared tests and create stacked bar plots if p-value < 0.01
557 |   for (i in 1:(length(cat_var_names) - 1)) {
558 |     for (j in (i + 1):length(cat_var_names)) {
559 |       tab <- table(df[[cat_var_names[i]]], df[[cat_var_names[j]]])
560 |       chi_test <- chisq.test(tab)
561 |       if (is.na(chi_test$p.value)) next
562 |       if (chi_test$p.value < 0.01) {
563 |         p <- ggplot(df, aes_string(x = cat_var_names[i], fill = cat_var_names[j])) +
564 |           geom_bar(position = "fill") +
565 |           labs(
566 |             title = paste(cat_var_names[i], "vs", cat_var_names[j], "(Chisq P =", format(chi_test$p.value, scientific = TRUE, digits = 2), ")"),
567 |             x = paste(cat_var_names[i]),
568 |             y = "Proportion"
569 |           ) +
570 |           coord_flip()
571 |         print(p)
572 |       }
573 |     }
574 |   }
575 | } else {
576 |   cat("There is at most one categorical variable.")
577 | }
578 | ```
579 | 
580 | 
581 | This R Markdown document was written by [Steven Ge](https://www.linkedin.com/in/steven-ge-ab016947/), assisted heavily by GPT-4 (which wrote 90% of the code). Source code is on [GitHub](https://github.com/gexijin/inspect). No guarantees. No rights reserved.
582 | 
--------------------------------------------------------------------------------