├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── Plot.R ├── bivariate.R ├── charSummary.R ├── numSummary.R └── removeSpecial.R ├── README.md ├── images ├── Plot.png ├── Plot1.png ├── Plot2.png ├── bivariate.png ├── charSummary.png ├── new_charsummary.png ├── new_numsummary.png └── numSummary.png ├── man ├── Plot.Rd ├── bivariate.Rd ├── charSummary.Rd ├── numSummary.Rd └── removeSpecial.Rd └── xda.Rproj /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: xda 2 | Version: 0.2 3 | Title: Functions to perform exploratory analysis with any dataframe 4 | Description: This package contains several functions to perform initial exploratory analysis with any dataframe 5 | Depends: R (>= 3.1.3) 6 | Imports: 7 | stats, 8 | graphics 9 | Author: Ujjwal Karn (ujwlkarn@gmail.com) with contributions from Shanti Jha 10 | Maintainer: Ujjwal Karn 11 | BugReports: https://github.com/ujwlkarn/xda/issues 12 | License: GPL-3 13 | LazyData: true 14 | URL: http://github.com/ujwlkarn/xda 15 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2 (4.1.0): do not edit by hand 2 | 3 | export(Plot) 4 | export(bivariate) 5 | export(charSummary) 6 | export(numSummary) 7 | export(removeSpecial) 8 | -------------------------------------------------------------------------------- /R/Plot.R: -------------------------------------------------------------------------------- 1 | #'Plots all variables of a data frame against the specified dependant variable 2 | #'@param df name of the data frame 3 | #'@param dep.var name the dependant variable 4 | #'@param range specify which 
#'Plots all variables of a data frame against the specified dependant variable
#'
#'Every column of \code{df} other than \code{dep.var} is drawn on the x axis
#'against \code{dep.var} on the y axis, four plots (a 2 x 2 grid) per page.
#'The graphical parameters in force before the call are restored on exit.
#'@param df name of the data frame
#'@param dep.var name the dependant variable
#'@param range specify which variables to plot using numeric range (default is
#'  'all' which plots all variables). Positions refer to the columns of
#'  \code{df} that remain once \code{dep.var} has been removed; positions past
#'  the last such column are ignored.
#'@return returns multiple plots (the function value itself is an invisible
#'  \code{NULL})
#'@examples
#'data(iris)
#'Plot(iris,'Species')
#'@export

Plot <- function(df, dep.var, range = 'all') {

  if (!dep.var %in% names(df)) stop("Dependant Variable not in dataset")

  # drop = FALSE keeps a data frame even when a single predictor is left;
  # otherwise ncol() would be NULL for two-column inputs.
  cols <- df[, !names(df) %in% dep.var, drop = FALSE]
  n <- ncol(cols)

  plot_all <- !is.numeric(range) && isTRUE(range[1] == 'all')
  if (!is.numeric(range) && !plot_all) {
    stop("Please provide 'all' or numeric range")
  }

  if (plot_all) {
    idx <- seq_len(n)
  } else {
    if (length(range) == 0 || anyNA(range)) stop("Please provide correct range")
    if (min(range) < 1) stop("Please provide a minimum value of range as 1")
    if (length(range) > n) stop("Please provide correct range")
    # Plot exactly the requested positions. The previous implementation walked
    # from range[1] to range[4], which failed for fewer than four positions
    # and drew unrequested columns for non-contiguous input.
    idx <- range[range <= n]
  }

  if (length(idx) == 0) return(invisible(NULL))

  # One page per group of (up to) four predictors.
  pages <- split(idx, ceiling(seq_along(idx) / 4))

  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)

  for (k in seq_along(pages)) {
    # Re-issuing mfrow restarts the grid so every page begins top-left.
    par(mfrow = c(2, 2))
    for (i in pages[[k]]) {
      plot(cols[[i]], df[[dep.var]], xlab = names(cols)[i], ylab = dep.var)
    }
    mtext(bquote("Figure Number"== ~ .(k)), outer = TRUE, line = -3)
  }

  invisible(NULL)
}

# Test
# Plot(iris,'Species')
# Plot(mtcars,'mpg')
#
# Plot(mtcars,'mpg',1:7)
#
# Plot(mtcars,'mpg',1:6)
# Plot(mtcars,'mpg',1:8)
#'Bivariate summary of a dependant variable against an independant variable
#'
#'The shape of the result depends on the types of the two columns:
#'\itemize{
#'  \item numeric dependant, numeric independant: min/max/mean of the dependant
#'    variable within each bin of the independant variable;
#'  \item numeric dependant, categorical independant: min/max/mean of the
#'    dependant variable within each category;
#'  \item categorical dependant: number of rows of every dependant category
#'    per category (or per bin, when numeric) of the independant variable.
#'}
#'Statistics are returned as strings formatted with two decimals; counts are
#'returned as integers. Rows whose independant value is missing are left out.
#'@param df name of the data frame
#'@param dep.var name the dependant variable
#'@param indep.var name the independant variable
#'@param n.bins number of bins to create
#'@param na.rm should missing values be dropped when computing min/max/mean
#'@return returns bivariate analysis summary dataframe
#'@examples
#'data(iris)
#'bivariate(iris,'Species','Petal.Width',n.bins=3)
#'@export
#'
bivariate <- function(df, dep.var, indep.var, n.bins = 4, na.rm = TRUE) {

  # Report the requested names that are absent (not whatever happens to sit
  # at positions 1/2 of names(df)).
  requested <- c(dep.var, indep.var)
  missing_cols <- requested[!requested %in% names(df)]
  if (length(missing_cols) > 0) {
    stop(paste0(paste0(missing_cols, collapse = ","), " column(s) not present"))
  }

  dep <- df[[dep.var]]
  indep <- df[[indep.var]]

  # is.factor() also accepts ordered factors, whose class vector has length 2.
  is_categorical <- function(x) is.factor(x) || is.character(x)

  # Cut a numeric vector into at most n.bins equal-width intervals.
  bin_numeric <- function(x) {
    n <- min(n.bins, length(unique(x[!is.na(x)])))
    if (n < 2) {
      stop("At least 2 bins are required: 'n.bins' must be >= 2 and '",
           indep.var, "' must have at least 2 distinct values")
    }
    cut(x = x, breaks = n)
  }

  # min/max/mean of the dependant variable per group. Labels are taken from
  # the tapply() result itself so they always line up with the statistics.
  summarise_by <- function(groups, label_name) {
    stats <- c("min", "max", "mean")
    cells <- lapply(stats, function(stat) {
      tapply(X = dep, INDEX = groups, FUN = stat, na.rm = na.rm)
    })
    labels <- as.character(names(cells[[1]]))
    formatted <- lapply(cells, function(v) sprintf(fmt = "%.02f", v))
    names(formatted) <- stats
    out <- data.frame(label = labels, formatted, stringsAsFactors = FALSE)
    colnames(out) <- c(label_name, paste0(stats, "_", dep.var))
    out
  }

  # Cross-tabulate groups (rows) against dependant categories (columns),
  # both in order of first appearance.
  count_by <- function(groups, label_name) {
    row_values <- as.character(groups)
    col_values <- as.character(dep)
    row_levels <- unique(row_values[!is.na(row_values)])
    col_levels <- unique(col_values[!is.na(col_values)])
    counts <- table(factor(row_values, levels = row_levels),
                    factor(col_values, levels = col_levels))
    count_df <- as.data.frame(matrix(as.integer(counts),
                                     nrow = length(row_levels),
                                     ncol = length(col_levels)))
    out <- data.frame(label = row_levels, count_df, stringsAsFactors = FALSE)
    colnames(out) <- c(label_name, col_levels)
    rownames(out) <- NULL
    out
  }

  dep_num <- is.numeric(dep)
  indep_num <- is.numeric(indep)
  dep_cat <- is_categorical(dep)
  indep_cat <- is_categorical(indep)

  if (dep_num && indep_num) {
    summarise_by(bin_numeric(indep), paste0("bin_", indep.var))
  } else if (dep_cat && indep_cat) {
    count_by(indep, indep.var)
  } else if (dep_cat && indep_num) {
    count_by(bin_numeric(indep), paste0("bin_", indep.var))
  } else if (dep_num && indep_cat) {
    summarise_by(indep, indep.var)
  } else {
    # Previously this fell through and returned NULL without any hint.
    warning("bivariate() supports numeric, factor and character columns only")
    invisible(NULL)
  }
}

# #sample data
# test<-data.frame(a=sample(x = 1:100,
#                           size = 25,
#                           replace = TRUE),
#                  b=sample(x = 10:20,
#                           size=25,
#                           replace = TRUE))
#
# test2<-data.frame(b=sample(x = c('north','south'),
#                            size = 25,
#                            replace = TRUE),
#                   a=sample(x = c('food/drug','supercenter','supermarket','superstore','supercombo'),
#                            size=25,
#                            replace = TRUE))
#
# bivariate(df=test,dep.var='b',
#           indep.var='a',
#           n.bins=4,
#           na.rm=TRUE)
#
# bivariate(df=test2,dep.var='b',
#           indep.var='a',
#           n.bins=4,
#           na.rm=TRUE)
#
# bivariate(df=iris,
#           dep.var='Species',
#           indep.var='Petal.Width',
#           n.bins=3,
#           na.rm=TRUE)
#
# bivariate(df=mtcars,
#           dep.var='mpg',
#           indep.var='hp',
#           na.rm=TRUE)
#'Automatically detects character/factor variables and gives a comprehensive summary
#'
#'Each character or factor column of \code{df} becomes one row of the result,
#'with the columns \code{n} (non-missing values), \code{miss} (missing values),
#'\code{miss\%} (missing values as a percentage of all rows), \code{unique}
#'(number of distinct values, a missing value counting as one) and
#'\code{top5levels:count} (the five most frequent values with their counts).
#'@param df name of your data frame
#'@return Returns the summary data frame, or an invisible \code{NULL} when
#'  \code{df} has no character/factor column
#'@examples
#'data(iris)
#'charSummary(iris)
#'@export

charSummary <- function(df) {

  # is.factor() also matches ordered factors, whose class() has two elements
  # and used to break the scalar comparison inside `if`.
  is_char <- vapply(df,
                    function(col) is.factor(col) || is.character(col),
                    logical(1))
  if (!any(is_char)) {
    return(invisible(NULL))
  }

  # A list of character vectors keeps one entry per column even for a
  # single-row data frame (sapply() would collapse that case to a vector).
  values <- lapply(df[is_char], as.character)
  n_rows <- nrow(df)

  n <- vapply(values, function(x) sum(!is.na(x)), integer(1))

  #missing value computation
  miss <- vapply(values, function(x) sum(is.na(x)), integer(1))
  miss_pct <- (miss / n_rows) * 100

  #top-5 level count
  top_five_levels <- function(x) {
    counts <- sort(table(x), decreasing = TRUE)
    top <- counts[seq_len(min(5, length(counts)))]
    if (length(top) == 0) {
      return("")  # column holds nothing but NA
    }
    paste0(names(top), ":", top, collapse = ", ")
  }

  n_unique <- vapply(values, function(x) length(unique(x)), integer(1))
  top5 <- vapply(values, top_five_levels, character(1))

  data.frame(n = n,
             miss = miss,
             "miss%" = miss_pct,
             unique = n_unique,
             "top5levels:count" = top5,
             row.names = names(values),
             check.names = FALSE,
             stringsAsFactors = FALSE)
}
52 | } 53 | -------------------------------------------------------------------------------- /R/numSummary.R: -------------------------------------------------------------------------------- 1 | #'Automatically detects numeric variables and gives a comprehensive summary 2 | #'@param df name of your data frame 3 | #'@return Returns the summary data frame 4 | #'@examples 5 | #'data(iris) 6 | #'numSummary(iris) 7 | #'@export 8 | 9 | numSummary <- function(df){ 10 | 11 | num <- vector(mode = "character") 12 | char <- vector(mode = "character") 13 | for (var in 1:ncol(df)) { 14 | if (class(df[[var]])=="numeric" || class(df[[var]])=="integer") { 15 | num <- c(num,names(df[var])) 16 | }else if (class(df[[var]])=="factor" || class(df[[var]])=="character") { 17 | char <- c(char,names(df[var])) 18 | } 19 | } 20 | 21 | dfnum <- subset(df,select=num) 22 | D <- sapply(dfnum, function(x) as.numeric(x,na.rm=TRUE)) 23 | DD <- as.data.frame(D) 24 | 25 | #kurtosis computation 26 | kurtosis <- function(x,na.rm = TRUE){ 27 | if(na.rm){ 28 | x <- x[which(!is.na(x))] 29 | } 30 | x_mean <- mean(x) 31 | x_count <- length(x) 32 | s2 <- sum((x-x_mean)^2) 33 | s4 <- sum((x-x_mean)^4) 34 | m2 <- s2/x_count 35 | m4 <- s4/x_count 36 | res <- ((m4 / m2^2 - 3) + 3) * (1 - 1 / x_count)^2 - 3 37 | } 38 | 39 | #skewness calculation 40 | skewness <- function(x,na.rm = TRUE){ 41 | if(na.rm){ 42 | x <- x[which(!is.na(x))] 43 | } 44 | x_mean <- mean(x) 45 | x_count <- length(x) 46 | s2 <- sum((x-x_mean)^2) 47 | s3 <- sum((x-x_mean)^3) 48 | m2 <- s2/x_count 49 | m3 <- s3/x_count 50 | res <- (m3 / m2^(3.0/2)) * (1 - 1 / x_count)^(3.0/2) 51 | } 52 | 53 | options(digits = 3) 54 | n <- sapply(DD, function(x) sum(!is.na(x))) 55 | mean <- sapply(DD, function(x) mean(x,na.rm=TRUE)) 56 | sd <- sapply(DD, function(x) sd(x,na.rm=TRUE)) 57 | max <- sapply(DD, function(x) max(x,na.rm=TRUE)) 58 | min <- sapply(DD, function(x) min(x,na.rm=TRUE)) 59 | range <- max - min 60 | nzero <- sapply(DD, function(x) 
length(which(x == 0))) 61 | nunique <- sapply(DD, function(x) length(unique(x))) 62 | outliersummary <- t(sapply(DD, function(x) { 63 | iqr <- IQR(x,na.rm = TRUE,type = 4) 64 | lowerbound <- quantile(x,0.25,na.rm=TRUE)-(1.5*iqr) 65 | upperbound <- quantile(x,0.75,na.rm=TRUE)+(1.5*iqr) 66 | noofoutliers <- length (which(x > upperbound | x " ] = NA 16 | df[ df == "?" ] = NA 17 | df[ df == "@" ] = NA 18 | df[ df== "" ] = NA 19 | df[ df == " " ] = NA 20 | df[ df == "N/A" ] = NA 21 | for (i in 1:length(vec)){df[ df == vec[i] ] = 22 | NA} 23 | return(df) 24 | } 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xda: R package for exploratory data analysis 2 | 3 | This package contains several tools to perform initial exploratory analysis on any input dataset. It includes custom functions for plotting the data as well as performing different kinds of analyses such as univariate, bivariate and multivariate investigation which is the first step of any predictive modeling pipeline. This package can be used to get a good sense of any dataset before jumping on to building predictive models. 4 | 5 | The package is constantly under development and more functionalities will be added soon. Pull requests to add more functions are welcome! 
6 | 7 | The functions currently included in the package are mentioned below: 8 | 9 | - `numSummary(mydata)` function automatically detects all numeric columns in the dataframe `mydata` and provides their summary statistics 10 | - `charSummary(mydata)` function automatically detects all character columns in the dataframe `mydata` and provides their summary statistics 11 | - `Plot(mydata, dep.var)` plots all independent variables in the dataframe `mydata` against the dependent variable specified by the `dep.var` parameter 12 | - `removeSpecial(mydata, vec)` replaces all special characters (specified by vector `vec`) in the dataframe `mydata` with `NA` 13 | - `bivariate(mydata, dep.var, indep.var)` performs bivariate analysis between dependent variable `dep.var` and independent variable `indep.var` in the dataframe `mydata` 14 | 15 | More functions to be added soon. 16 | 17 | **Note:** All functions mentioned above expect `mydata` to be a data.frame - please convert your input dataset to a data.frame before using any function from this package. 18 | 19 | # Installation 20 | 21 | - The best way to install the `xda` package is to install the `devtools` package first. To install `devtools`, please follow instructions [here](https://github.com/hadley/devtools). Then, use the following commands to install `xda`: 22 | 23 | ```s 24 | library(devtools) 25 | install_github("ujjwalkarn/xda") 26 | ``` 27 | 28 | - Alternatively, you may also use the `githubinstall` package for installing `xda`: 29 | 30 | ```s 31 | install.packages("githubinstall") 32 | library(githubinstall) 33 | githubinstall("xda") 34 | ``` 35 | 36 | # Usage 37 | 38 | For examples below, the [popular iris dataset](https://en.wikipedia.org/wiki/Iris_flower_data_set) and the [warpbreaks dataset](https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/warpbreaks.html) have been used. Please refer to the documentation of each function to understand how to use it. 
For example, to see the documentation for the `numSummary()` function, use `?numSummary`. 39 | 40 | ```s 41 | ## load the package into the current session 42 | 43 | library(xda) 44 | ``` 45 | ## numSummary() 46 | ```s 47 | ## to view a comprehensive summary for all numeric columns in the iris dataset 48 | 49 | numSummary(iris) 50 | 51 | ## n = number of non-missing values of that variable 52 | ## nunique = number of unique values 53 | ## nzeros = number of zeroes 54 | ## iqr = interquartile range 55 | ## noutlier = number of outliers 56 | ## miss = number of rows with missing value 57 | ## miss% = percentage of total rows with missing values ((miss/n)*100) 58 | ## 5% = 5th percentile value of that variable (value below which 5 percent of the observations may be found) 59 | ## the percentile values are helpful in detecting outliers 60 | ``` 61 | ##### Output 62 | ```s 63 | > numSummary(iris) 64 | 65 | n mean sd max min range nunique nzeros iqr lowerbound upperbound noutlier kurtosis skewness mode miss miss% 1% 5% 25% 50% 75% 95% 99% 66 | Sepal.Length 150 5.84 0.828 7.9 4.3 3.6 35 0 1.30 3.15 8.35 0 -0.606 0.309 5.0 0 0 4.40 4.60 5.1 5.80 6.4 7.25 7.70 67 | Sepal.Width 150 3.06 0.436 4.4 2.0 2.4 23 0 0.50 2.05 4.05 4 0.139 0.313 3.0 0 0 2.20 2.34 2.8 3.00 3.3 3.80 4.15 68 | Petal.Length 150 3.76 1.765 6.9 1.0 5.9 43 0 3.55 -3.72 10.42 0 -1.417 -0.269 1.4 0 0 1.15 1.30 1.6 4.35 5.1 6.10 6.70 69 | Petal.Width 150 1.20 0.762 2.5 0.1 2.4 22 0 1.50 -1.95 4.05 0 -1.358 -0.101 0.2 0 0 0.10 0.20 0.3 1.30 1.8 2.30 2.50 70 | 71 | ``` 72 | 73 | ## charSummary() 74 | ```s 75 | ## to view a comprehensive summary for all character columns in the warpbreaks dataset 76 | 77 | charSummary(warpbreaks) 78 | 79 | ## n = number of non-missing values of that variable 80 | ## miss = number of rows with missing value 81 | ## miss% = percentage of total rows with missing values ((miss/total rows)*100) 82 | ## unique = number of unique levels of that variable 83 | ## top5levels:count = top 5 levels (unique values) 
in each column sorted by count 84 | ## for example, wool has 2 unique levels 'A' and 'B' each with count of 27 85 | 86 | ``` 87 | ##### Output 88 | ```s 89 | > charSummary(warpbreaks) 90 | 91 | n miss miss% unique top5levels:count 92 | wool 54 0 0 2 A:27, B:27 93 | tension 54 0 0 3 H:18, L:18, M:18 94 | 95 | ``` 96 | 97 | ## bivariate() 98 | ```s 99 | ## to perform bivariate analysis between 'Species' and 'Sepal.Length' in the iris dataset 100 | 101 | bivariate(iris,'Species','Sepal.Length') 102 | 103 | ## bin_Sepal.Length = 'Sepal.Length' variable has been binned into 4 equal intervals (original range is [4.3,7.9]) 104 | ## for each interval of 'Sepal.Length', the number of samples from each category of 'Species' is shown 105 | ## i.e. 39 of the 50 samples of Setosa have Sepal.Length in the range (4.3,5.2], and so on. 106 | ## the number of intervals (4 in this case) can be customized (see documentation) 107 | 108 | ``` 109 | ##### Output 110 | ```s 111 | > bivariate(iris,'Species','Sepal.Length') 112 | 113 | bin_Sepal.Length setosa versicolor virginica 114 | 1 (4.3,5.2] 39 5 1 115 | 2 (5.2,6.1] 11 29 10 116 | 3 (6.1,7] 0 16 27 117 | 4 (7,7.9] 0 0 12 118 | 119 | ``` 120 | 121 | ## Plot() 122 | ```s 123 | ## to plot all other variables against the 'Petal.Length' variable in the iris dataset 124 | 125 | Plot(iris,'Petal.Length') 126 | 127 | ## some interesting patterns can be seen in the plots below and these insights can be used for predictive modeling 128 | ``` 129 | ##### Output 130 | ```s 131 | > Plot(iris,'Petal.Length') 132 | ``` 133 | ![Plot(iris,'Petal.Length') Output](/images/Plot2.png?raw=true) 134 | 135 | 136 | -------------------------------------------------------------------------------- /images/Plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/Plot.png 
-------------------------------------------------------------------------------- /images/Plot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/Plot1.png -------------------------------------------------------------------------------- /images/Plot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/Plot2.png -------------------------------------------------------------------------------- /images/bivariate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/bivariate.png -------------------------------------------------------------------------------- /images/charSummary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/charSummary.png -------------------------------------------------------------------------------- /images/new_charsummary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/new_charsummary.png -------------------------------------------------------------------------------- /images/new_numsummary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/new_numsummary.png -------------------------------------------------------------------------------- /images/numSummary.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ujjwalkarn/xda/86cf14dbfaa96b805a702261e2b078052ccbab70/images/numSummary.png -------------------------------------------------------------------------------- /man/Plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.0): do not edit by hand 2 | % Please edit documentation in R/Plot.R 3 | \name{Plot} 4 | \alias{Plot} 5 | \title{Plots all variables of a data frame against the specified dependant variable} 6 | \usage{ 7 | Plot(df, dep.var, range = "all") 8 | } 9 | \arguments{ 10 | \item{df}{name of the data frame} 11 | 12 | \item{dep.var}{name the dependant variable} 13 | 14 | \item{range}{specify which variables to plot using numeric range (default is 'all' which plots all variables)} 15 | } 16 | \value{ 17 | returns multiple plots 18 | } 19 | \description{ 20 | Plots all variables of a data frame against the specified dependant variable 21 | } 22 | \examples{ 23 | data(iris) 24 | Plot(iris,'Species') 25 | } 26 | 27 | -------------------------------------------------------------------------------- /man/bivariate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.0): do not edit by hand 2 | % Please edit documentation in R/bivariate.R 3 | \name{bivariate} 4 | \alias{bivariate} 5 | \title{Plots all variables of a data frame against the specified dependant variable} 6 | \usage{ 7 | bivariate(df, dep.var, indep.var, n.bins = 4, na.rm = TRUE) 8 | } 9 | \arguments{ 10 | \item{df}{name of the data frame} 11 | 12 | \item{dep.var}{name the dependant variable} 13 | 14 | \item{indep.var}{name the independant variable} 15 | 16 | \item{n.bins}{number of bins to create} 17 | } 18 | \value{ 19 | returns bivariate analysis summary dataframe 20 | } 21 | \description{ 22 | Plots all variables of a data frame against the specified dependant variable 23 | } 24 | \examples{ 25 | data(iris) 26 | 
bivariate(iris,'Species','Petal.Width',n.bins=3) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/charSummary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.0): do not edit by hand 2 | % Please edit documentation in R/charSummary.R 3 | \name{charSummary} 4 | \alias{charSummary} 5 | \title{Automatically detects character/factor variables and gives a comprehensive summary} 6 | \usage{ 7 | charSummary(df) 8 | } 9 | \arguments{ 10 | \item{df}{name of your data frame} 11 | } 12 | \value{ 13 | Returns the summary data frame 14 | } 15 | \description{ 16 | Automatically detects character/factor variables and gives a comprehensive summary 17 | } 18 | \examples{ 19 | data(iris) 20 | charSummary(iris) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/numSummary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.0): do not edit by hand 2 | % Please edit documentation in R/numSummary.R 3 | \name{numSummary} 4 | \alias{numSummary} 5 | \title{Automatically detects numeric variables and gives a comprehensive summary} 6 | \usage{ 7 | numSummary(df) 8 | } 9 | \arguments{ 10 | \item{df}{name of your data frame} 11 | } 12 | \value{ 13 | Returns the summary data frame 14 | } 15 | \description{ 16 | Automatically detects numeric variables and gives a comprehensive summary 17 | } 18 | \examples{ 19 | data(iris) 20 | numSummary(iris) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/removeSpecial.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.0): do not edit by hand 2 | % Please edit documentation in R/removeSpecial.R 3 | \name{removeSpecial} 4 | \alias{removeSpecial} 5 | \title{Replaces special characters in your data frame 
to NA} 6 | \usage{ 7 | removeSpecial(df, vec) 8 | } 9 | \arguments{ 10 | \item{df}{name of your data frame} 11 | 12 | \item{vec}{vector containing the special characters you want to replace with NA} 13 | } 14 | \value{ 15 | Returns the modified data frame 16 | } 17 | \description{ 18 | Replaces special characters in your data frame to NA 19 | } 20 | \examples{ 21 | data(iris) 22 | iris[1,2]<-"?" 23 | iris[2,2]<-"@" 24 | iris[3,2]<-"???" 25 | iris<-removeSpecial(iris,c("@","???")) 26 | head(iris) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /xda.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | Encoding: UTF-8 9 | 10 | AutoAppendNewline: Yes 11 | StripTrailingWhitespace: Yes 12 | 13 | BuildType: Package 14 | PackageUseDevtools: Yes 15 | PackageInstallArgs: --no-multiarch --with-keep.source 16 | PackageRoxygenize: rd,collate,namespace 17 | --------------------------------------------------------------------------------