├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── Plot.R
    ├── bivariate.R
    ├── charSummary.R
    ├── numSummary.R
    └── removeSpecial.R
├── README.md
├── images
    ├── Plot.png
    ├── Plot1.png
    ├── Plot2.png
    ├── bivariate.png
    ├── charSummary.png
    ├── new_charsummary.png
    ├── new_numsummary.png
    └── numSummary.png
├── man
    ├── Plot.Rd
    ├── bivariate.Rd
    ├── charSummary.Rd
    ├── numSummary.Rd
    └── removeSpecial.Rd
└── xda.Rproj


/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: xda
 2 | Version: 0.2
 3 | Title: Functions to perform exploratory analysis with any dataframe
 4 | Description: This package contains several functions to perform initial exploratory analysis with any dataframe
 5 | Depends: R (>= 3.1.3)
 6 | Imports:
 7 |     stats,
 8 |     graphics
 9 | Author: Ujjwal Karn (ujwlkarn@gmail.com) with contributions from Shanti Jha
10 | Maintainer: Ujjwal Karn <ujwlkarn@gmail.com>
11 | BugReports: https://github.com/ujwlkarn/xda/issues
12 | License: GPL-3
13 | LazyData: true
14 | URL: http://github.com/ujwlkarn/xda
15 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2 (4.1.0): do not edit by hand
2 | 
3 | export(Plot)
4 | export(bivariate)
5 | export(charSummary)
6 | export(numSummary)
7 | export(removeSpecial)
8 | 


--------------------------------------------------------------------------------
/R/Plot.R:
--------------------------------------------------------------------------------
 1 | #'Plots all variables of a data frame against the specified dependent variable
 2 | #'
 3 | #'Draws one plot per remaining column of \code{df} (x axis) against
 4 | #'\code{dep.var} (y axis), four plots per page in a 2 x 2 layout, and captions
 5 | #'every page with its figure number. The plot layout in force before the call
 6 | #'is restored when the function exits.
 7 | #'
 8 | #'@param df name of the data frame
 9 | #'@param dep.var name of the dependent variable (must be a column of \code{df})
10 | #'@param range specify which variables to plot using numeric range (default is
11 | #'  'all' which plots all variables); positions are counted after
12 | #'  \code{dep.var} has been removed from \code{df} and positions beyond the
13 | #'  last column are skipped
14 | #'@return draws the plots as a side effect and returns \code{NULL} invisibly
15 | #'@examples
16 | #'data(iris)
17 | #'Plot(iris,'Species')
18 | #'Plot(mtcars,'mpg',1:2)
19 | #'@export
20 | Plot <- function(df, dep.var, range = 'all') {
21 | 
22 |   if (!dep.var %in% names(df)) stop("Dependant Variable not in dataset")
23 | 
24 |   # drop = FALSE keeps a data frame even when a single column remains
25 |   cols <- df[, !names(df) %in% dep.var, drop = FALSE]
26 |   n <- ncol(cols)
27 |   if (n == 0) stop("No variables left to plot against the dependent variable")
28 | 
29 |   if (!is.numeric(range) && !identical(range, 'all')) {
30 |     stop("Please provide 'all' or numeric range")
31 |   }
32 | 
33 |   if (is.numeric(range)) {
34 |     if (length(range) == 0 || anyNA(range) || length(range) > n) {
35 |       stop("Please provide correct range")
36 |     }
37 |     if (min(range) < 1) stop("Please provide a minimum value of range as 1")
38 |     # positions past the last column are skipped, not treated as errors
39 |     selected <- range[range <= n]
40 |   } else {
41 |     selected <- seq_len(n)
42 |   }
43 | 
44 |   # par() hands back the previous layout so it can be restored on exit
45 |   old_par <- graphics::par(mfrow = c(2, 2))
46 |   on.exit(graphics::par(old_par), add = TRUE)
47 | 
48 |   # consecutive groups of at most four selected columns, one group per page;
49 |   # this works for a range of any length (no reliance on a 4th element)
50 |   pages <- split(selected, ceiling(seq_along(selected) / 4))
51 |   for (k in seq_along(pages)) {
52 |     graphics::par(mfrow = c(2, 2))  # reset the 2 x 2 layout for this page
53 |     for (i in pages[[k]]) {
54 |       graphics::plot(cols[[i]], df[[dep.var]],
55 |                      xlab = names(cols)[i], ylab = dep.var)
56 |     }
57 |     graphics::mtext(bquote("Figure Number" == ~ .(k)), outer = TRUE, line = -3)
58 |   }
59 | 
60 |   invisible(NULL)
61 | }
62 | 
63 | # Test
64 | # Plot(iris,'Species')
65 | # Plot(mtcars,'mpg')
66 | # 
67 | # Plot(mtcars,'mpg',1:7)
68 | # 
69 | # Plot(mtcars,'mpg',1:6)
70 | # Plot(mtcars,'mpg',1:8)
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/R/bivariate.R:
--------------------------------------------------------------------------------
  1 | #'Bivariate analysis of a dependent variable against an independent variable
  2 | #'
  3 | #'Groups the rows of \code{df} by the independent variable (a numeric variable
  4 | #'is first cut into \code{n.bins} equal-width bins) and summarises the
  5 | #'dependent variable within each group: min, max and mean when it is numeric,
  6 | #'number of rows per level when it is a factor or character.
  7 | #'
  8 | #'@param df name of the data frame
  9 | #'@param dep.var name of the dependent variable
 10 | #'@param indep.var name of the independent variable
 11 | #'@param n.bins number of bins to create when \code{indep.var} is numeric
 12 | #'  (capped at its number of distinct non-missing values)
 13 | #'@param na.rm should missing values of a numeric \code{dep.var} be dropped
 14 | #'  before min, max and mean are computed?
 15 | #'@return returns bivariate analysis summary dataframe with one row per bin or
 16 | #'  level of \code{indep.var}; \code{NULL} (with a warning) when a variable is
 17 | #'  neither numeric nor factor/character
 18 | #'@examples
 19 | #'data(iris)
 20 | #'bivariate(iris,'Species','Petal.Width',n.bins=3)
 21 | #'@export
 22 | bivariate <- function(df, dep.var, indep.var, n.bins = 4, na.rm = TRUE) {
 23 | 
 24 |   # report the names that are actually missing from df
 25 |   absent <- setdiff(c(dep.var, indep.var), names(df))
 26 |   if (length(absent) > 0) {
 27 |     stop(paste0(paste0(absent, collapse = ","), " column(s) not present"))
 28 |   }
 29 | 
 30 |   dep   <- df[[dep.var]]
 31 |   indep <- df[[indep.var]]
 32 | 
 33 |   # is.factor()/is.numeric() also accept multi-class columns such as ordered
 34 |   # factors, which a comparison on class() cannot handle
 35 |   is_categorical <- function(x) is.factor(x) || is.character(x)
 36 |   is_supported   <- function(x) is.numeric(x) || is_categorical(x)
 37 | 
 38 |   if (!is_supported(dep) || !is_supported(indep)) {
 39 |     warning("bivariate() handles numeric, factor and character columns only")
 40 |     return(invisible(NULL))
 41 |   }
 42 | 
 43 |   # factor whose levels follow the order of first appearance, so that rows
 44 |   # and columns of the result come out in the order the values occur
 45 |   by_appearance <- function(x) {
 46 |     x <- as.character(x)
 47 |     factor(x, levels = unique(x[!is.na(x)]))
 48 |   }
 49 | 
 50 |   # grouping variable: bins for a numeric indep.var, its own values otherwise
 51 |   if (is.numeric(indep)) {
 52 |     n.bins <- min(n.bins, length(unique(indep[!is.na(indep)])))
 53 |     if (n.bins < 2) {
 54 |       stop(paste0("Cannot bin ", indep.var, ": fewer than 2 bins requested ",
 55 |                   "or fewer than 2 distinct values present"))
 56 |     }
 57 |     groups      <- cut(x = indep, breaks = n.bins)
 58 |     group_label <- paste0("bin_", indep.var)
 59 |   } else {
 60 |     groups      <- by_appearance(indep)
 61 |     group_label <- indep.var
 62 |   }
 63 | 
 64 |   if (is.numeric(dep)) {
 65 |     # tapply() returns one value per level of groups, in level order, so the
 66 |     # row labels are taken from those same levels
 67 |     stat_names <- c("min", "max", "mean")
 68 |     stats <- lapply(stat_names, function(f) {
 69 |       sprintf(fmt = "%.02f",
 70 |               tapply(X = dep, INDEX = groups, FUN = f, na.rm = na.rm))
 71 |     })
 72 |     df.result <- data.frame(levels(groups), stats[[1]], stats[[2]], stats[[3]],
 73 |                             stringsAsFactors = FALSE)
 74 |     colnames(df.result) <- c(group_label, paste0(stat_names, "_", dep.var))
 75 |   } else {
 76 |     # cross-tabulate the groups against the values of the dependent variable;
 77 |     # the counts stay integers
 78 |     dep_levels <- by_appearance(dep)
 79 |     counts     <- table(groups, dep_levels)
 80 |     df.result  <- data.frame(rownames(counts), unclass(counts),
 81 |                              check.names = FALSE, stringsAsFactors = FALSE)
 82 |     colnames(df.result) <- c(group_label, levels(dep_levels))
 83 |     rownames(df.result) <- NULL
 84 |   }
 85 | 
 86 |   df.result
 87 | }
 88 | 
 89 | # #sample data
 90 | # test<-data.frame(a=sample(x = 1:100,
 91 | #                              size = 25,
 92 | #                              replace = TRUE),
 93 | #                     b=sample(x = 10:20,
 94 | #                              size=25,
 95 | #                              replace = TRUE))
 96 | # 
 97 | # test2<-data.frame(b=sample(x = c('north','south'),
 98 | #                           size = 25,
 99 | #                           replace = TRUE),
100 | #                  a=sample(x = c('food/drug','supercenter','supermarket','superstore','supercombo'),
101 | #                           size=25,
102 | #                           replace = TRUE))
103 | # 
104 | # bivariate(df=test,dep.var='b',
105 | #           indep.var='a',
106 | #           n.bins=4,
107 | #           na.rm=TRUE)
108 | # 
109 | # bivariate(df=test2,dep.var='b',
110 | #                  indep.var='a',
111 | #                  n.bins=4,
112 | #                  na.rm=TRUE)
113 | # 
114 | # bivariate(df=iris,
115 | #           dep.var='Species',
116 | #           indep.var='Petal.Width',
117 | #           n.bins=3,
118 | #           na.rm=TRUE)
119 | # 
120 | # bivariate(df=mtcars,
121 | #           dep.var='mpg',
122 | #           indep.var='hp',
123 | #           na.rm=TRUE)
124 | 


--------------------------------------------------------------------------------
/R/charSummary.R:
--------------------------------------------------------------------------------
 1 | #'Automatically detects character/factor variables and gives a comprehensive summary
 2 | #'
 3 | #'For every character or factor column (ordered factors included) the summary
 4 | #'reports the number of non-missing values, the number and percentage of
 5 | #'missing values, the number of unique values (a missing value counts as one)
 6 | #'and the five most frequent values with their counts.
 7 | #'
 8 | #'@param df name of your data frame
 9 | #'@return Returns the summary data frame (one row per character/factor
10 | #'  column), or \code{NULL} invisibly when \code{df} has no such column
11 | #'@examples
12 | #'data(iris)
13 | #'charSummary(iris)
14 | #'@export
15 | charSummary <- function(df) {
16 | 
17 |   # is.factor()/is.character() also cope with ordered factors, whose class()
18 |   # has two elements and would break a comparison with ==
19 |   is_char <- vapply(df, function(x) is.factor(x) || is.character(x), logical(1))
20 |   if (!any(is_char)) return(invisible(NULL))
21 | 
22 |   # a list of character vectors keeps one entry per column whatever the
23 |   # number of rows; sapply() would collapse a one-row data frame
24 |   cols   <- lapply(df[is_char], as.character)
25 |   n_rows <- nrow(df)
26 | 
27 |   #top-5 level count, formatted as "level:count, level:count, ..."
28 |   topfivelevel <- function(x) {
29 |     tbl_x <- table(x)
30 |     if (length(tbl_x) == 0) return("")  # column holds nothing but NA
31 |     tbl_x   <- sort(tbl_x, decreasing = TRUE)
32 |     topfive <- tbl_x[seq_len(min(5, length(tbl_x)))]
33 |     paste0(names(topfive), ":", topfive, collapse = ", ")
34 |   }
35 | 
36 |   #missing value computation
37 |   n    <- vapply(cols, function(x) sum(!is.na(x)), integer(1))
38 |   miss <- vapply(cols, function(x) sum(is.na(x)), integer(1))
39 | 
40 |   m4 <- data.frame(n,
41 |                    miss,
42 |                    (miss / n_rows) * 100,
43 |                    vapply(cols, function(x) length(unique(x)), integer(1)),
44 |                    vapply(cols, topfivelevel, character(1)),
45 |                    row.names = names(cols),
46 |                    stringsAsFactors = FALSE)
47 |   colnames(m4) <- c("n", "miss", "miss%", "unique", "top5levels:count")
48 | 
49 |   m4
50 | }
51 | 


--------------------------------------------------------------------------------
/R/numSummary.R:
--------------------------------------------------------------------------------
  1 | #'Automatically detects numeric variables and gives a comprehensive summary
  2 | #'
  3 | #'For every numeric (integer or double) column the summary reports the number
  4 | #'of non-missing values, mean, standard deviation, maximum, minimum, range,
  5 | #'number of unique values, number of zeros, interquartile range, the outlier
  6 | #'bounds (quartiles -/+ 1.5 times the interquartile range) and the number of
  7 | #'values outside them, kurtosis, skewness, mode, the number and percentage of
  8 | #'missing values and the 1st, 5th, 25th, 50th, 75th, 95th and 99th
  9 | #'percentiles.
 10 | #'
 11 | #'The function does not change any session option; print the result with
 12 | #'\code{print(x, digits = 3)} for a compact display.
 13 | #'
 14 | #'@param df name of your data frame
 15 | #'@return Returns the summary data frame (one row per numeric column), or
 16 | #'  \code{NULL} invisibly when \code{df} has no numeric column
 17 | #'@examples
 18 | #'data(iris)
 19 | #'numSummary(iris)
 20 | #'@export
 21 | numSummary <- function(df) {
 22 | 
 23 |   # is.numeric() is TRUE for integer and double columns and FALSE for factors
 24 |   is_num <- vapply(df, is.numeric, logical(1))
 25 |   if (!any(is_num)) return(invisible(NULL))
 26 | 
 27 |   # a list of double vectors keeps one entry per column whatever the number
 28 |   # of rows; sapply() would collapse a one-row data frame into a vector
 29 |   cols   <- lapply(df[is_num], as.numeric)
 30 |   n_rows <- nrow(df)
 31 | 
 32 |   per_col_dbl <- function(f) vapply(cols, f, numeric(1))
 33 |   per_col_int <- function(f) vapply(cols, f, integer(1))
 34 | 
 35 |   #kurtosis computation (missing values are ignored)
 36 |   kurtosis <- function(x) {
 37 |     x       <- x[!is.na(x)]
 38 |     x_count <- length(x)
 39 |     m2      <- sum((x - mean(x))^2) / x_count
 40 |     m4      <- sum((x - mean(x))^4) / x_count
 41 |     ((m4 / m2^2 - 3) + 3) * (1 - 1 / x_count)^2 - 3
 42 |   }
 43 | 
 44 |   #skewness calculation (missing values are ignored)
 45 |   skewness <- function(x) {
 46 |     x       <- x[!is.na(x)]
 47 |     x_count <- length(x)
 48 |     m2      <- sum((x - mean(x))^2) / x_count
 49 |     m3      <- sum((x - mean(x))^3) / x_count
 50 |     (m3 / m2^(3.0/2)) * (1 - 1 / x_count)^(3.0/2)
 51 |   }
 52 | 
 53 |   #mode computation (most frequent value; ties go to the first one seen)
 54 |   Mode <- function(x) {
 55 |     if (length(x) == 0) return(NA_real_)
 56 |     ux <- unique(x)
 57 |     ux[which.max(tabulate(match(x, ux)))]
 58 |   }
 59 | 
 60 |   #outlier summary: iqr, lower bound, upper bound, number of outliers
 61 |   # (IQR() is called with type = 4, the quartiles use the default type)
 62 |   outlier_stats <- function(x) {
 63 |     iqr        <- stats::IQR(x, na.rm = TRUE, type = 4)
 64 |     quartiles  <- stats::quantile(x, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
 65 |     lowerbound <- quartiles[1] - (1.5 * iqr)
 66 |     upperbound <- quartiles[2] + (1.5 * iqr)
 67 |     c(iqr, lowerbound, upperbound,
 68 |       sum(x > upperbound | x < lowerbound, na.rm = TRUE))
 69 |   }
 70 | 
 71 |   col_max <- per_col_dbl(function(x) max(x, na.rm = TRUE))
 72 |   col_min <- per_col_dbl(function(x) min(x, na.rm = TRUE))
 73 |   miss    <- per_col_int(function(x) sum(is.na(x)))
 74 |   probs   <- c(.01, .05, .25, .5, .75, .95, .99)
 75 | 
 76 |   # vapply() returns one column per variable, hence the two transposes
 77 |   d3 <- data.frame(
 78 |     per_col_int(function(x) sum(!is.na(x))),
 79 |     per_col_dbl(function(x) mean(x, na.rm = TRUE)),
 80 |     per_col_dbl(function(x) stats::sd(x, na.rm = TRUE)),
 81 |     col_max,
 82 |     col_min,
 83 |     col_max - col_min,
 84 |     per_col_int(function(x) length(unique(x))),
 85 |     per_col_int(function(x) sum(x == 0, na.rm = TRUE)),
 86 |     t(vapply(cols, outlier_stats, numeric(4))),
 87 |     per_col_dbl(kurtosis),
 88 |     per_col_dbl(skewness),
 89 |     per_col_dbl(Mode),
 90 |     miss,
 91 |     (miss / n_rows) * 100,
 92 |     t(vapply(cols,
 93 |              function(x) stats::quantile(x, probs, na.rm = TRUE, names = FALSE),
 94 |              numeric(length(probs)))),
 95 |     row.names = names(cols)
 96 |   )
 97 |   colnames(d3) <- c("n", "mean", "sd", "max", "min", "range", "nunique",
 98 |                     "nzeros", "iqr", "lowerbound", "upperbound", "noutlier",
 99 |                     "kurtosis", "skewness", "mode", "miss", "miss%",
100 |                     "1%", "5%", "25%", "50%", "75%", "95%", "99%")
101 | 
102 |   d3
103 | }
104 | 


--------------------------------------------------------------------------------
/R/removeSpecial.R:
--------------------------------------------------------------------------------
 1 | #'Replaces special characters in your data frame to NA
 2 | #'
 3 | #'A built-in list of markers (NaN, <NA>, ?, the at sign, the empty string, a
 4 | #'single space and N/A) is always replaced; \code{vec} adds further values.
 5 | #'A cell is replaced only when its whole value equals one of these strings.
 6 | #'
 7 | #'@param df name of your data frame
 8 | #'@param vec vector containing the special characters you want to replace with NA
 9 | #'  (optional; by default only the built-in markers are replaced)
10 | #'@return Returns the modified data frame
11 | #'@examples
12 | #'data(iris)
13 | #'iris[1,2]<-"?"
14 | #'iris[2,2]<-"@@"
15 | #'iris[3,2]<-"???"
16 | #'iris<-removeSpecial(iris,c("@@","???"))
17 | #'head(iris)  
18 | #'@export
19 | removeSpecial <- function(df, vec = NULL) {
20 |   # markers that are always treated as missing, followed by the caller's own
21 |   specials <- c("NaN", "<NA>", "?", "@", "", " ", "N/A", as.character(vec))
22 | 
23 |   # %in% never yields NA and copes with an empty vec, which a loop over
24 |   # 1:length(vec) does not
25 |   for (j in seq_along(df)) {
26 |     hit <- df[[j]] %in% specials
27 |     if (any(hit)) df[[j]][hit] <- NA
28 |   }
29 | 
30 |   df
31 | }
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # xda: R package for exploratory data analysis
  2 | 
  3 | This package contains several tools to perform initial exploratory analysis on any input dataset. It includes custom functions for plotting the data as well as performing different kinds of analyses such as univariate, bivariate and multivariate investigation which is the first step of any predictive modeling pipeline. This package can be used to get a good sense of any dataset before jumping on to building predictive models.
  4 | 
  5 | The package is constantly under development and more functionalities will be added soon. Pull requests to add more functions are welcome!
  6 | 
  7 | The functions currently included in the package are mentioned below:
  8 | 
  9 | - `numSummary(mydata)` function automatically detects all numeric columns in the dataframe `mydata` and provides their summary statistics 
 10 | - `charSummary(mydata)` function automatically detects all character columns in the dataframe `mydata` and provides their summary statistics 
 11 | - `Plot(mydata, dep.var)` plots all independent variables in the dataframe `mydata` against the dependent variable specified by the `dep.var` parameter 
 12 | - `removeSpecial(mydata, vec)` replaces all special characters (specified by vector `vec`) in the dataframe `mydata` with `NA` 
 13 | - `bivariate(mydata, dep.var, indep.var)` performs bivariate analysis between dependent variable `dep.var` and independent variable `indep.var` in the dataframe `mydata`
 14 | 
 15 | More functions to be added soon.
 16 |  
 17 | **Note:** All functions mentioned above expect `mydata` to be a data.frame - please convert your input dataset to a data.frame before using any function from this package.
 18 | 
 19 | # Installation
 20 | 
 21 | - The best way to install `xda` package is to install `devtools` package first. To install `devtools`, please follow instructions [here](https://github.com/hadley/devtools). Then, use the following commands to install `xda`:
 22 | 
 23 |   ```s
 24 |   library(devtools)
 25 |   install_github("ujjwalkarn/xda")
 26 |   ```
 27 | 
 28 | - Alternatively, you may also use the `githubinstall` package for installing `xda`:
 29 | 
 30 |   ```s
 31 |   install.packages("githubinstall")
 32 |   library(githubinstall)
 33 |   githubinstall("xda")
 34 |   ```
 35 | 
 36 | # Usage
 37 | 
 38 | For the examples below, the [popular iris dataset](https://en.wikipedia.org/wiki/Iris_flower_data_set) and the [warpbreaks dataset](https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/warpbreaks.html) have been used. Please refer to the documentation of each function to understand how to use it. For example, to see the documentation for the `numSummary()` function, use `?numSummary`.
 39 | 
 40 | ```s
 41 | ## load the package into the current session
 42 | 
 43 | library(xda)
 44 | ```
 45 | ## numSummary()
 46 | ```s
 47 | ## to view a comprehensive summary for all numeric columns in the iris dataset
 48 | 
 49 | numSummary(iris)
 50 | 
 51 | ## n = number of non-missing values for that variable
 52 | ## nunique = number of unique values
 53 | ## nzeros = number of zeros
 54 | ## iqr = interquartile range
 55 | ## noutlier = number of outliers
 56 | ## miss = number of rows with missing value
 57 | ## miss% = percentage of total rows with missing values ((miss / total number of rows) * 100)
 58 | ## 5% = 5th percentile value of that variable (value below which 5 percent of the observations may be found)
 59 | ## the percentile values are helpful in detecting outliers
 60 | ```
 61 | ##### Output
 62 | ```s
 63 | > numSummary(iris)
 64 | 
 65 |                 n mean    sd max min range nunique nzeros  iqr lowerbound upperbound noutlier kurtosis skewness mode miss miss%   1%   5% 25%  50% 75%  95%  99%
 66 |  Sepal.Length 150 5.84 0.828 7.9 4.3   3.6      35      0 1.30       3.15       8.35        0   -0.606    0.309  5.0    0     0 4.40 4.60 5.1 5.80 6.4 7.25 7.70
 67 |  Sepal.Width  150 3.06 0.436 4.4 2.0   2.4      23      0 0.50       2.05       4.05        4    0.139    0.313  3.0    0     0 2.20 2.34 2.8 3.00 3.3 3.80 4.15
 68 |  Petal.Length 150 3.76 1.765 6.9 1.0   5.9      43      0 3.55      -3.72      10.42        0   -1.417   -0.269  1.4    0     0 1.15 1.30 1.6 4.35 5.1 6.10 6.70
 69 |  Petal.Width  150 1.20 0.762 2.5 0.1   2.4      22      0 1.50      -1.95       4.05        0   -1.358   -0.101  0.2    0     0 0.10 0.20 0.3 1.30 1.8 2.30 2.50
 70 | 
 71 | ```
 72 | 
 73 | ## charSummary()
 74 | ```s
 75 | ## to view a comprehensive summary for all character columns in the warpbreaks dataset
 76 | 
 77 | charSummary(warpbreaks)
 78 | 
 79 | ## n = number of non-missing values for that variable
 80 | ## miss = number of rows with missing value
 81 | ## miss% = percentage of total rows with missing values ((miss / total number of rows) * 100)
 82 | ## unique = number of unique levels of that variable
 83 | ## top5levels:count = top 5 levels (unique values) in each column sorted by count
 84 | ## for example, wool has 2 unique levels 'A' and 'B' each with count of 27 
 85 | 
 86 | ```
 87 | ##### Output
 88 | ```s
 89 | > charSummary(warpbreaks)
 90 | 
 91 |           n miss miss% unique top5levels:count
 92 |  wool    54    0     0      2       A:27, B:27
 93 |  tension 54    0     0      3 H:18, L:18, M:18
 94 | 
 95 | ```
 96 | 
 97 | ## bivariate()
 98 | ```s
 99 | ## to perform bivariate analysis between 'Species' and 'Sepal.Length' in the iris dataset
100 | 
101 | bivariate(iris,'Species','Sepal.Length')
102 | 
103 | ## bin_Sepal.Length = 'Sepal.Length' variable has been binned into 4 equal intervals (original range is [4.3,7.9])
104 | ## for each interval of 'Sepal.Length', the number of samples from each category of 'Species' is shown 
105 | ## i.e. 39 of the 50 setosa samples have a Sepal.Length in the range (4.3,5.2], and so on.
106 | ## the number of intervals (4 in this case) can be customized (see documentation)
107 | 
108 | ```
109 | ##### Output
110 | ```s
111 | > bivariate(iris,'Species','Sepal.Length')
112 | 
113 |    bin_Sepal.Length setosa versicolor virginica
114 |  1        (4.3,5.2]     39          5         1
115 |  2        (5.2,6.1]     11         29        10
116 |  3          (6.1,7]      0         16        27
117 |  4          (7,7.9]      0          0        12
118 | 
119 | ```
120 | 
121 | ## Plot()
122 | ```s
123 | ## to plot all other variables against the 'Petal.Length' variable in the iris dataset
124 | 
125 | Plot(iris,'Petal.Length')
126 | 
127 | ## some interesting patterns can be seen in the plots below and these insights can be used for predictive modeling
128 | ```
129 | ##### Output
130 | ```s
131 | > Plot(iris,'Petal.Length')
132 | ```
133 | ![Plot(iris,'Petal.Length') Output](/images/Plot2.png?raw=true)
134 | 
135 | 
136 | 


--------------------------------------------------------------------------------
/images/Plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/Plot.png


--------------------------------------------------------------------------------
/images/Plot1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/Plot1.png


--------------------------------------------------------------------------------
/images/Plot2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/Plot2.png


--------------------------------------------------------------------------------
/images/bivariate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/bivariate.png


--------------------------------------------------------------------------------
/images/charSummary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/charSummary.png


--------------------------------------------------------------------------------
/images/new_charsummary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/new_charsummary.png


--------------------------------------------------------------------------------
/images/new_numsummary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/new_numsummary.png


--------------------------------------------------------------------------------
/images/numSummary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/numSummary.png


--------------------------------------------------------------------------------
/man/Plot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.0): do not edit by hand
 2 | % Please edit documentation in R/Plot.R
 3 | \name{Plot}
 4 | \alias{Plot}
 5 | \title{Plots all variables of a data frame against the specified dependant variable}
 6 | \usage{
 7 | Plot(df, dep.var, range = "all")
 8 | }
 9 | \arguments{
10 | \item{df}{name of the data frame}
11 | 
12 | \item{dep.var}{name the dependant variable}
13 | 
14 | \item{range}{specify which variables to plot using numeric range (default is 'all' which plots all variables)}
15 | }
16 | \value{
17 | returns multiple plots
18 | }
19 | \description{
20 | Plots all variables of a data frame against the specified dependant variable
21 | }
22 | \examples{
23 | data(iris)
24 | Plot(iris,'Species')
25 | }
26 | 
27 | 


--------------------------------------------------------------------------------
/man/bivariate.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.0): do not edit by hand
 2 | % Please edit documentation in R/bivariate.R
 3 | \name{bivariate}
 4 | \alias{bivariate}
 5 | \title{Bivariate analysis of a dependent variable against an independent variable}
 6 | \usage{
 7 | bivariate(df, dep.var, indep.var, n.bins = 4, na.rm = TRUE)
 8 | }
 9 | \arguments{
10 | \item{df}{name of the data frame}
11 | 
12 | \item{dep.var}{name the dependant variable}
13 | 
14 | \item{indep.var}{name the independant variable}
15 | 
16 | \item{n.bins}{number of bins to create}
17 | }
18 | \value{
19 | returns bivariate analysis summary dataframe
20 | }
21 | \description{
22 | Bivariate analysis of a dependent variable against an independent variable
23 | }
24 | \examples{
25 | data(iris)
26 | bivariate(iris,'Species','Petal.Width',n.bins=3)
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/man/charSummary.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.0): do not edit by hand
 2 | % Please edit documentation in R/charSummary.R
 3 | \name{charSummary}
 4 | \alias{charSummary}
 5 | \title{Automatically detects character/factor variables and gives a comprehensive summary}
 6 | \usage{
 7 | charSummary(df)
 8 | }
 9 | \arguments{
10 | \item{df}{name of your data frame}
11 | }
12 | \value{
13 | Returns the summary data frame
14 | }
15 | \description{
16 | Automatically detects character/factor variables and gives a comprehensive summary
17 | }
18 | \examples{
19 | data(iris)
20 | charSummary(iris)
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/numSummary.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.0): do not edit by hand
 2 | % Please edit documentation in R/numSummary.R
 3 | \name{numSummary}
 4 | \alias{numSummary}
 5 | \title{Automatically detects numeric variables and gives a comprehensive summary}
 6 | \usage{
 7 | numSummary(df)
 8 | }
 9 | \arguments{
10 | \item{df}{name of your data frame}
11 | }
12 | \value{
13 | Returns the summary data frame
14 | }
15 | \description{
16 | Automatically detects numeric variables and gives a comprehensive summary
17 | }
18 | \examples{
19 | data(iris)
20 | numSummary(iris)
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/removeSpecial.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.0): do not edit by hand
 2 | % Please edit documentation in R/removeSpecial.R
 3 | \name{removeSpecial}
 4 | \alias{removeSpecial}
 5 | \title{Replaces special characters in your data frame to NA}
 6 | \usage{
 7 | removeSpecial(df, vec)
 8 | }
 9 | \arguments{
10 | \item{df}{name of your data frame}
11 | 
12 | \item{vec}{vector containing the special characters you want to replace with NA}
13 | }
14 | \value{
15 | Returns the modified data frame
16 | }
17 | \description{
18 | Replaces special characters in your data frame to NA
19 | }
20 | \examples{
21 | data(iris)
22 | iris[1,2]<-"?"
23 | iris[2,2]<-"@"
24 | iris[3,2]<-"???"
25 | iris<-removeSpecial(iris,c("@","???"))
26 | head(iris)
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/xda.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | Encoding: UTF-8
 9 | 
10 | AutoAppendNewline: Yes
11 | StripTrailingWhitespace: Yes
12 | 
13 | BuildType: Package
14 | PackageUseDevtools: Yes
15 | PackageInstallArgs: --no-multiarch --with-keep.source
16 | PackageRoxygenize: rd,collate,namespace
17 | 


--------------------------------------------------------------------------------