├── .Rbuildignore ├── .gitignore ├── AUX ├── auto_data_prep.R ├── data_integ_test.R ├── data_integrity_model.R ├── messy_data.txt ├── metadata_models_integrity.R └── test.R ├── DESCRIPTION ├── LICENSE.md ├── NAMESPACE ├── R ├── attach.R ├── common_lib.R ├── cross_plot.R ├── data.R ├── data_integrity.R ├── data_preparation.R ├── discretize.R ├── exploratory_data_analysis.R ├── funModeling.R ├── information_theory.R ├── models_lib.R ├── outliers.R └── target_profiling.R ├── README.Rmd ├── README.html ├── README.md ├── data ├── data_country.rda ├── data_golf.rda ├── heart_disease.rda └── metadata_models.rda ├── docs ├── 404.html ├── LICENSE.html ├── articles │ ├── discre1.png │ ├── dslb.png │ ├── funModeling_quickstart.html │ ├── funModeling_quickstart_files │ │ └── figure-html │ │ │ ├── boxplot_analysis-1.png │ │ │ ├── boxplot_analysis-2.png │ │ │ ├── cluster_performance-1.png │ │ │ ├── density_histogram-1.png │ │ │ ├── distribution1-1.png │ │ │ ├── distribution1-2.png │ │ │ ├── performance-1.png │ │ │ ├── profiling1-1.png │ │ │ ├── profiling1-2.png │ │ │ ├── unnamed-chunk-3-1.png │ │ │ └── unnamed-chunk-4-1.png │ └── index.html ├── authors.html ├── docsearch.css ├── docsearch.js ├── index.html ├── link.svg ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml └── reference │ ├── auto_grouping.html │ ├── categ_analysis.html │ ├── compare_df.html │ ├── concatenate_n_vars.html │ ├── convert_df_to_categoric.html │ ├── coord_plot.html │ ├── correlation_table.html │ ├── cross_plot.html │ ├── data_country.html │ ├── data_golf.html │ ├── data_integrity.html │ ├── data_integrity_model.html │ ├── desc_groups.html │ ├── desc_groups_rank.html │ ├── df_status.html │ ├── discretize_df.html │ ├── discretize_get_bins.html │ ├── discretize_rgr.html │ ├── entropy_2.html │ ├── equal_freq.html │ ├── export_plot.html │ ├── fibonacci.html │ ├── figures │ ├── README-boxplot_analysis-1.png │ ├── README-boxplot_analysis-2.png │ ├── README-density_histogram-1.png │ ├── README-distribution1-1.png │ ├── README-distribution1-2.png │ ├── README-performance-1.png │ ├── README-profiling1-1.png │ ├── README-profiling1-2.png │ └── README-unnamed-chunk-3-1.png │ ├── freq-1.png │ ├── freq-2.png │ ├── freq-3.png │ ├── freq.html │ ├── funModeling-package.html │ ├── gain_lift-1.png │ ├── gain_lift.html │ ├── gain_ratio.html │ ├── get_sample.html │ ├── hampel_outlier.html │ ├── heart_disease.html │ ├── index.html │ ├── infor_magic.html │ ├── information_gain.html │ ├── metadata_models.html │ ├── plot_num.html │ ├── plotar.html │ ├── prep_outliers.html │ ├── profiling_num.html │ ├── range01.html │ ├── status.html │ ├── tukey_outlier.html │ ├── v_compare.html │ └── var_rank_info.html ├── funModeling.Rproj ├── man ├── auto_grouping.Rd ├── categ_analysis.Rd ├── compare_df.Rd ├── concatenate_n_vars.Rd ├── convert_df_to_categoric.Rd ├── coord_plot.Rd ├── correlation_table.Rd ├── cross_plot.Rd ├── data_country.Rd ├── data_golf.Rd ├── data_integrity.Rd ├── data_integrity_model.Rd ├── desc_groups.Rd ├── desc_groups_rank.Rd ├── df_status.Rd ├── discretize_df.Rd ├── discretize_get_bins.Rd ├── discretize_rgr.Rd ├── entropy_2.Rd ├── equal_freq.Rd ├── export_plot.Rd ├── fibonacci.Rd ├── figures │ ├── README-boxplot_analysis-1.png │ ├── README-boxplot_analysis-2.png │ ├── README-density_histogram-1.png │ ├── README-distribution1-1.png │ ├── README-distribution1-2.png │ ├── README-performance-1.png │ ├── README-profiling1-1.png │ ├── README-profiling1-2.png │ └── README-unnamed-chunk-3-1.png ├── freq.Rd ├── funModeling-package.Rd ├── gain_lift.Rd ├── 
gain_ratio.Rd ├── get_sample.Rd ├── hampel_outlier.Rd ├── heart_disease.Rd ├── infor_magic.Rd ├── information_gain.Rd ├── metadata_models.Rd ├── plot_num.Rd ├── plotar.Rd ├── prep_outliers.Rd ├── profiling_num.Rd ├── range01.Rd ├── status.Rd ├── tukey_outlier.Rd ├── v_compare.Rd └── var_rank_info.Rd ├── myTESTS ├── test_freq.R ├── test_prep_outliers.R ├── tests_cross_plot.R └── tests_plotar.R └── vignettes ├── discre1.png ├── dslb.png ├── funModeling_quickstart.R ├── funModeling_quickstart.Rmd ├── funModeling_quickstart.html ├── img ├── data-science-live-book.png └── funmodeling_logo_wh.png └── man └── figures ├── README-boxplot_analysis-1.png ├── README-boxplot_analysis-2.png ├── README-density_histogram-1.png ├── README-distribution1-1.png ├── README-distribution1-2.png ├── README-performance-1.png ├── README-profiling1-1.png ├── README-profiling1-2.png └── README-unnamed-chunk-3-1.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | README.* 4 | vignettes/figure 5 | myTESTS 6 | LICENSE.md 7 | AUX 8 | docs 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | vignettes/figure/ 5 | IGNORE_ME/ 6 | vignettes/funModeling.md 7 | vignettes/funModeling.R 8 | .Rproj.user 9 | -------------------------------------------------------------------------------- /AUX/auto_data_prep.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | contr.ltfr <- caret::contr.ltfr 3 | 4 | #' @title xxx 5 | #' @description xxx 6 | #' 7 | #' @param df xxx 8 | 9 | #' @return xxx 10 | #' @export 11 | one_hot_encoding <- function(df, fullRank=F) 12 | { 13 | dmy = caret::dummyVars(formula=" ~ .", data = df, fullRank=fullRank) 14 | dummy_data = data.frame(predict(dmy, newdata = df)) 15 | dummy_data[is.na(dummy_data)] = 0 16 | 17 | return(dummy_data) 18 | } 19 | 20 | #' @export 21 | data_to_num <- function(data, target=NULL, fullRank=F) 22 | { 23 | #data=heart_disease 24 | #target="has_heart_disease" 25 | #library(tidyverse) 26 | 27 | ## removing target from one hot 28 | if(!missing(target)) 29 | { 30 | df_to_prep=select(data, -target) 31 | } else { 32 | df_to_prep=data 33 | } 34 | 35 | stat=status(df_to_prep) 36 | 37 | ## NA treatment 38 | var_num_NA=filter(stat, q_na>0, type %in% c("numeric", "integer")) %>% pull(variable) 39 | 40 | if(length(var_num_NA)>0) 41 | { 42 | NUM_BINS=5 43 | # df_to_prep[1,'oldpeak']=NA 44 | d_bins=discretize_get_bins(data = df_to_prep, n_bins = NUM_BINS, input = var_num_NA) 45 | 46 | if(length(d_bins)>0) 47 | { 48 | df_disc=suppressMessages(discretize_df(df_to_prep, d_bins, stringsAsFactors = F)) 49 | } 50 | } 51 | 52 | ## removing categorical variables with more than MAX_CATEGORIES 53 | MAX_CATEGORIES=50 54 | 55 | status2=status %>% 56 | mutate(cat_to_prep=ifelse(unique > MAX_CATEGORIES & type %in% c('factor', 'character'), 1, 0)) 57 | 58 | ## reporting the others 59 | vars_high_card=filter(status2, cat_to_prep==1) %>% pull(variable) 60 | if(length(vars_high_card)>0) 61 | { 62 | message(sprintf("Skipping high cardinallity variables (> MAX_CATEGORIES): %s", paste(vars_high_card, collapse = ', '))) 63 | } 64 | 65 | ## keeping valid categorical variables 66 | df_cat_onehot=select(df_to_prep, filter(status2, cat_to_prep==0) %>% pull(variable)) 67 | 68 | ## One hot 69 | if(nrow(df_cat_onehot)>0) 70 
| { 71 | df_onehot=funModeling::one_hot_encoding(df_cat_onehot) 72 | } 73 | 74 | ## Adding discretized numeric variable 75 | if(exists("df_disc") & exists("df_onehot")) 76 | { 77 | d1=cbind(df_disc, df_onehot) 78 | } else if(!exists("df_onehot")) { 79 | d1=df_disc 80 | } else { 81 | d1=df_onehot 82 | } 83 | 84 | if(nrow(d1)==0) stop("No data.") 85 | 86 | ## Adding the target if exists 87 | if(!missing(target)) 88 | { 89 | d1=cbind(d1, select(data, target)) 90 | } 91 | 92 | return(d1) 93 | } 94 | 95 | 96 | # prep_categorical( 97 | 98 | # que pasa si es categ > 50 y no es tgt binario 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /AUX/data_integ_test.R: -------------------------------------------------------------------------------- 1 | #library(funModeling) 2 | data=heart_disease 3 | data$has_heart_disease2=ifelse(data$has_heart_disease==1,T, F) 4 | data$fecha=Sys.Date() 5 | data$fecha2=as.POSIXct(Sys.Date()) 6 | data$gender=as.character(data$gender) 7 | data$gender[1]=NA 8 | data$fecha[1]=Sys.Date()+1 9 | data$fecha2[2]=as.POSIXct(Sys.Date())+1 10 | data$max_heart_rate=as.character(data$max_heart_rate) 11 | data$constant=999 12 | data$id=as.character(seq(1:nrow(data))) 13 | 14 | write_delim(data, "messy_data.txt", delim = ';') 15 | 16 | # 17 | #library(stringr) 18 | 19 | status(data) 20 | di=data_integrity(data) 21 | summary(di) 22 | 23 | di$results$vars_other 24 | 25 | 26 | ## all ok: 27 | library(tidyverse) 28 | o5=data_integrity(dplyr::select(data_country, -country));summary(o5) 29 | print(o5) 30 | 31 | 32 | ############## 33 | 34 | 35 | o4=data_integrity(data);summary(o4) 36 | print(o4) 37 | reprex() 38 | 39 | 40 | o1=data_integrity_model(data, 'xgboost') 41 | print(o1) 42 | 43 | o1$model_selected 44 | 45 | o1=data_integrity_model(data, 'xgboost', MAX_UNIQUE = 100);print(o1) 46 | o2=data_integrity_model(data, 'randomForest');print(o2) 47 | o3=data_integrity_model(data, 'no_na');print(o3) 48 | ibrary(tidyverse) 49 | 50 | -------------------------------------------------------------------------------- /AUX/data_integrity_model.R: -------------------------------------------------------------------------------- 1 | #' @title data integrity model (UNDER DEVELOPMENT) 2 | #' @description data_integrity_model 3 | #' @param data data frame or a single vector 4 | #' @param model_name model name 5 | #' @param MAX_UNIQUE max unique 6 | #' @return metrics 7 | data_integrity_model <- function(data, model_name, MAX_UNIQUE=35) 8 | { 9 | model_selected=metadata_models %>% dplyr::filter(name==model_name) 10 | 11 | if(nrow(model_selected)==0) { 12 | stop("Configuration not available") 13 | } 14 | 15 | l_err_msgs=list() 16 | 17 | # Run the data_integrity with custom max unique due to random forest 18 | if(model_selected$max_unique != "Inf") { 19 | MAX_UNIQUE = model_selected$max_unique 20 | } 21 | 22 | san=data_integrity(data, MAX_UNIQUE = MAX_UNIQUE) 23 | 24 | # Check NA in both: numerical or categorical variables 25 | if(!model_selected$allow_NA & (nrow(san$vars_num_with_NA)>0 | nrow(san$vars_cat_with_NA)>0)) 26 | { 27 | vars_NA=c(san$vars_num_with_NA$variable, san$vars_cat_with_NA$variable) 28 | vars_NA=paste(vars_NA, collapse = ", ") 29 | err_msg_NA=sprintf("{NA detected} %s", vars_NA) 30 | l_err_msgs=c(l_err_msgs, msg=err_msg_NA) 31 | } 32 | 33 | # Check if only numeric (categorical and other are not allow) 34 | if(model_selected$only_numeric & (length(san$vars_cat)>0 | length(san$vars_other)>0)) 35 | { 36 | vars_non_num=c(san$vars_cat, 
san$vars_other) 37 | vars_non_num=paste(vars_non_num, collapse = ", ") 38 | err_msg_non_num=sprintf("{Non-numeric detected} %s", vars_non_num) 39 | l_err_msgs=c(l_err_msgs, msg=err_msg_non_num) 40 | } 41 | 42 | # Check if character are not allowed 43 | if(!model_selected$allow_character & length(san$vars_char)) 44 | { 45 | vars_char=paste(san$vars_char, collapse = ", ") 46 | err_msg_char=sprintf("{Character detected} %s", vars_char) 47 | l_err_msgs=c(l_err_msgs, msg=err_msg_char) 48 | } 49 | 50 | # Check categorical variables with high cardinality 51 | if(nrow(san$vars_cat_high_card)>0) 52 | { 53 | vars_high_card=paste(san$vars_cat_high_card$variable, collapse = ", ") 54 | err_msg_high_card=sprintf("{High cardinality detected (MAX_UNIQUE > %s)} %s", MAX_UNIQUE, vars_high_card) 55 | l_err_msgs=c(l_err_msgs, msg=err_msg_high_card) 56 | } 57 | 58 | 59 | final_msg="" 60 | if(length(l_err_msgs)>0) 61 | { 62 | for(i in 1:length(l_err_msgs)) 63 | { 64 | 65 | if(str_detect(l_err_msgs[[i]], "High cardinality")) { 66 | emoji_mark=ifelse(model_selected$max_unique != Inf, emo::ji('x'), emo::ji('warning')) 67 | } else { 68 | emoji_mark=emo::ji('x') 69 | } 70 | 71 | final_msg=str_c(final_msg, str_c(emoji_mark, l_err_msgs[[i]], sep=" "), sep = "\n") 72 | data_ok=F 73 | } 74 | 75 | } else { 76 | final_msg=str_c(str_c(emo::ji('white_check_mark') , "Data integrity ok!", sep=" "), sep = "\n") 77 | data_ok=T 78 | } 79 | 80 | # Creating S3 object 81 | obj_chk=list(final_msg=final_msg, data_ok=data_ok, model_selected=model_selected, checked=san) 82 | class(obj_chk)="integritymodel" 83 | 84 | return(obj_chk) 85 | 86 | } 87 | 88 | #' @export 89 | print.integritymodel <- function(object) { 90 | cat(object$final_msg) 91 | } 92 | 93 | #' @export 94 | summary.integritymodel <- function(object) { 95 | return(object$data_ok) 96 | } 97 | 98 | -------------------------------------------------------------------------------- /AUX/metadata_models_integrity.R: -------------------------------------------------------------------------------- 1 | #library(datapasta) 2 | # tribble_paste 3 | d_models=tibble::tribble( 4 | ~name, ~allow_NA, ~max_unique, ~allow_factor, ~allow_character, ~only_numeric, 5 | "randomForest", FALSE, 53, TRUE, FALSE, FALSE, 6 | "xgboost", TRUE, Inf, FALSE, FALSE, TRUE, 7 | "num_no_na", FALSE, Inf, FALSE, FALSE, TRUE, 8 | "no_na", FALSE, Inf, TRUE, TRUE, TRUE, 9 | "kmeans", FALSE, Inf, TRUE, TRUE, TRUE, 10 | "hclust", FALSE, Inf, TRUE, TRUE, TRUE, 11 | "hdbscan", FALSE, Inf, TRUE, TRUE, TRUE, 12 | "dbscan", FALSE, Inf, TRUE, TRUE, TRUE, 13 | "umap", FALSE, Inf, TRUE, TRUE, TRUE, 14 | "pca", FALSE, Inf, TRUE, TRUE, TRUE, 15 | "rpart", TRUE, Inf, TRUE, TRUE, FALSE 16 | 17 | ) 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /AUX/test.R: -------------------------------------------------------------------------------- 1 | 2 | o1=data_integrity_model(data, 'xgboost');print(o1) 3 | o2=data_integrity_model(data, 'randomForest');print(o2) 4 | 5 | o3=data_integrity_model(data, 'no_na');print(o3) 6 | 7 | data_models 8 | o1$model_selected 9 | o1$checked 10 | 11 | san.sancheck(o1) 12 | model.sancheck(o1) 13 | 14 | 15 | 16 | df_status2(data) 17 | 18 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: funModeling 2 | Type: Package 3 | Title: Exploratory Data Analysis and Data Preparation Tool-Box 4 | Description: Around 10% of 
almost any predictive modeling project is spent in predictive modeling, 'funModeling' and the book Data Science Live Book () are intended to cover remaining 90%: data preparation, profiling, selecting best variables 'dataViz', assessing model performance and other functions. 5 | Version: 1.9.4 6 | Date: 2020-06-14 7 | Authors@R: person("Pablo", "Casas", role = c("aut", "cre"), email = "pcasas.biz@gmail.com") 8 | Maintainer: Pablo Casas 9 | License: GPL-2 10 | BugReports: https://github.com/pablo14/funModeling/issues 11 | URL: https://livebook.datascienceheroes.com 12 | LazyData: true 13 | Encoding: UTF-8 14 | Imports: 15 | ROCR, 16 | ggplot2, 17 | gridExtra, 18 | pander, 19 | reshape2, 20 | scales, 21 | dplyr, 22 | lazyeval, 23 | utils, 24 | RColorBrewer, 25 | moments, 26 | entropy, 27 | cli, 28 | stringr 29 | Depends: 30 | R (>= 3.4.0), 31 | Hmisc (>= 3.17.1) 32 | Suggests: knitr, rmarkdown 33 | VignetteBuilder: knitr 34 | RoxygenNote: 7.1.0 35 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Pablo Casas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,integrity) 4 | S3method(print,integritymodel) 5 | S3method(summary,integrity) 6 | export(auto_grouping) 7 | export(categ_analysis) 8 | export(compare_df) 9 | export(concatenate_n_vars) 10 | export(convert_df_to_categoric) 11 | export(coord_plot) 12 | export(correlation_table) 13 | export(cross_plot) 14 | export(data_integrity) 15 | export(data_integrity_model) 16 | export(desc_groups) 17 | export(desc_groups_rank) 18 | export(df_status) 19 | export(discretize_df) 20 | export(discretize_get_bins) 21 | export(discretize_rgr) 22 | export(entropy_2) 23 | export(equal_freq) 24 | export(export_plot) 25 | export(fibonacci) 26 | export(freq) 27 | export(gain_lift) 28 | export(gain_ratio) 29 | export(get_sample) 30 | export(hampel_outlier) 31 | export(infor_magic) 32 | export(information_gain) 33 | export(plot_num) 34 | export(plotar) 35 | export(prep_outliers) 36 | export(profiling_num) 37 | export(range01) 38 | export(status) 39 | export(tukey_outlier) 40 | export(v_compare) 41 | export(var_rank_info) 42 | import(dplyr) 43 | import(ggplot2) 44 | importFrom(Hmisc,cut2) 45 | importFrom(RColorBrewer,brewer.pal) 46 | importFrom(ROCR,performance) 47 | importFrom(ROCR,plot) 48 | importFrom(ROCR,prediction) 49 | importFrom(cli,symbol) 50 | importFrom(entropy,entropy) 51 | importFrom(grDevices,colorRampPalette) 52 | importFrom(grDevices,dev.off) 53 | importFrom(grDevices,jpeg) 54 | importFrom(grDevices,rainbow) 55 | importFrom(graphics,abline) 56 | importFrom(graphics,grid) 57 | importFrom(gridExtra,grid.arrange) 58 | importFrom(lazyeval,interp) 59 | importFrom(moments,kurtosis) 60 | importFrom(moments,skewness) 61 | importFrom(pander,pandoc.table) 62 | importFrom(reshape2,dcast) 63 | importFrom(reshape2,melt) 64 | importFrom(scales,percent) 65 | importFrom(stats,IQR) 66 | importFrom(stats,cor) 67 | importFrom(stats,cutree) 68 | importFrom(stats,dist) 69 | importFrom(stats,frequency) 70 | importFrom(stats,hclust) 71 | importFrom(stats,kmeans) 72 | importFrom(stats,mad) 73 | importFrom(stats,median) 74 | importFrom(stats,na.omit) 75 | importFrom(stats,predict) 76 | importFrom(stats,quantile) 77 | importFrom(stats,rbeta) 78 | importFrom(stats,sd) 79 | importFrom(stringr,str_c) 80 | importFrom(stringr,str_detect) 81 | importFrom(utils,head) 82 | importFrom(utils,packageVersion) 83 | importFrom(utils,tail) 84 | -------------------------------------------------------------------------------- /R/attach.R: -------------------------------------------------------------------------------- 1 | .onAttach <- function(...) 2 | { 3 | packageStartupMessage(sprintf("funModeling v.%s :)\nExamples and tutorials at livebook.datascienceheroes.com\n / Now in Spanish: librovivodecienciadedatos.ai", packageVersion("funModeling"))) 4 | } 5 | -------------------------------------------------------------------------------- /R/common_lib.R: -------------------------------------------------------------------------------- 1 | #' @title Fibonacci series 2 | #' @description 3 | #' It retrieves a vector containing the first N numbers specified in 'length' parameter of the Fibonacci series. 4 | #' @param length data frame 5 | #' @param remove_first removes the first value of the series, because first 2 elements are the same (number=1). False by default. 
6 | #' @examples 7 | #' # Get the first 4 elements of Fibonacci series 8 | #' fibonacci(4) 9 | #' @return vector 10 | #' @export 11 | fibonacci <- function(length, remove_first=F) 12 | { 13 | fibvals = numeric(length) 14 | fibvals[1] = 1 15 | fibvals[2] = 1 16 | for (i in 3:length) { 17 | fibvals[i] = fibvals[i-1] + fibvals[i-2] 18 | } 19 | 20 | if(remove_first) 21 | fibvals=fibvals[-1] 22 | 23 | return(fibvals) 24 | } 25 | 26 | remove_na_target <- function(data, target) 27 | { 28 | ## Removing NA from target variable ######### 29 | data_tmp=subset(data, !is.na(data[[target]])) 30 | if(nrow(data) > nrow(data_tmp)) 31 | { 32 | warning(sprintf("There were removed %d rows with NA values in target variable '%s'.", nrow(data)-nrow(data_tmp), target)) 33 | 34 | ## Keeping with cleaned data 35 | data=data_tmp 36 | } 37 | 38 | return(data) 39 | } 40 | 41 | 42 | check_target_2_values <- function(data, target) 43 | { 44 | ## Stop if target is not binary 45 | if(length(unique(data[[target]]))>2) 46 | { 47 | stop(sprintf("Target variable '%s' does not have 2 unique values.", target)) 48 | } 49 | } 50 | 51 | 52 | check_target_existence <- function(data, target) 53 | { 54 | ## Checking for variable existence. 55 | if(!(target %in% colnames(data))) stop(sprintf("Target variable '%s' does not exists in the data", target)) 56 | } 57 | 58 | give_me_num_vars <- function(data, target=NULL) 59 | { 60 | ## 61 | stat=status(data) 62 | di=data_integrity(data) 63 | 64 | ## keeping numeric variables 65 | input=di$results$vars_num 66 | 67 | ## Excluding variables with less than two unique value 68 | ex_variables=stat[stat$unique<=2, 'variable'] 69 | input=input[!(input %in% ex_variables)] 70 | 71 | if(length(ex_variables)>0) 72 | sprintf('Excluding variables with 1 or 2 unique values: %s', paste(ex_variables, collapse = ', ')) 73 | 74 | 75 | return(input) 76 | } 77 | 78 | give_me_character_vars <- function(data, target=NULL) 79 | { 80 | ## 81 | status=df_status(data, print_results = F) 82 | 83 | ## Excluding not numeric variables 84 | input=status[status$type %in% "factor" | status$type %in% "character", 'variable'] 85 | 86 | return(input) 87 | } 88 | 89 | #' @title Export plot to jpeg file 90 | #' @description 91 | #' Export 'object_plot' to jpeg file under the name 'file_name' in the directory 'path_out' 92 | #' @param object_plot Object plot to export (like ggplot2) 93 | #' @param path_out path directory to export the output, if it has a value the plot is saved, 94 | #' if the directory doesn't existis it will try to create it. To save in current directory path must be dot: "." 95 | #' @param file_name output file name 96 | #' @return none 97 | #' @export 98 | export_plot <- function(object_plot, path_out, file_name) 99 | { 100 | ## Save plot into a jpeg file 101 | dir.create(path_out, showWarnings = F) 102 | 103 | if(dir.exists(path_out)) 104 | { 105 | file_name_png=sprintf("%s/%s.png", path_out, file_name) 106 | suppressMessages(jpeg(file_name_png, width= 12.25, height= 6.25, units="in",res=200, quality = 90)) 107 | plot(object_plot) 108 | suppressMessages(dev.off()) 109 | } else { 110 | warning(sprintf("Directory '%s' doesn't exist or it couldn't be created", path_out)) 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Heart Disease Data 2 | #' 3 | #' There are variables related to patient clinic trial. The variable to predict is `has_heart_disease`. 
4 | #' 5 | #' @format A data frame with 303 rows and 16 variables: 6 | #' \itemize{ 7 | #' 8 | #' \url{https://archive.ics.uci.edu/ml/datasets/Heart+Disease} 9 | #' } 10 | "heart_disease" 11 | 12 | #' People with flu data 13 | #' 14 | #' Each row represents a person from different countries indicating if he or she has or not flu. 15 | #' Colmuns 16 | #' person: unique id 17 | #' country: country of the person, 70 different countries 18 | #' has_flu: character variable with values "yes" or "no" indicating if the person has flu 19 | #' 20 | #' @format A data frame with 910 rows and 3 variables 21 | #' 22 | "data_country" 23 | 24 | #' Play golf 25 | #' 26 | #' This well known small data frame containst 14 cases indicating wheter or not play golf based on wheather conditions. Target variable: 'play_golf.' 27 | #' 28 | #' @format A data frame with 14 rows and 3 variables 29 | #' 30 | "data_golf" 31 | 32 | #' Metadata models data integrity 33 | #' 34 | #' 35 | #' 36 | #' @format Tibble 37 | #' 38 | "metadata_models" 39 | -------------------------------------------------------------------------------- /R/data_preparation.R: -------------------------------------------------------------------------------- 1 | #' @title Reduce cardinality in categorical variable by automatic grouping 2 | #' @description Reduce the cardinality of an input variable based on a target -binary by now- variable based on attribitues of accuracy and representativity, 3 | #' for both input and target variable. It uses a cluster model to create the new groups. 4 | #' Full documentation can be found at: 5 | #' \url{https://livebook.datascienceheroes.com/data-preparation.html#high_cardinality_predictive_modeling} 6 | #' @param data data frame source 7 | #' @param input categorical variable indicating 8 | #' @param target string of the variable to optimize the re-grouping 9 | #' @param n_groups number of groups for the new category based on input, normally between 3 and 10. 10 | #' @param model is the clustering model used to create the grouping, supported models: "kmeans" (default) or "hclust" (hierarchical clustering). 
11 | #' @param seed optional, random number used internally for the k-means, changing this value will change the model 12 | #' @examples 13 | #' \dontrun{ 14 | #' # Reducing quantity of countries based on has_flu variable 15 | #' auto_grouping(data=data_country, input='country', target="has_flu", n_groups=8) 16 | #' } 17 | #' @return A list containing 3 elements: recateg_results which contains the description of the target variable with the new groups; 18 | #' df_equivalence is a data frame containing the input category and the new category; fit_cluster which is the cluster model used to do the re-grouping 19 | #' @export 20 | auto_grouping <- function(data, input, target, n_groups, model="kmeans", seed=999) 21 | { 22 | data[[input]]=as.character(data[[input]]) 23 | 24 | df_categ=categ_analysis(data, input , target) 25 | 26 | d=select_(df_categ, "perc_target", "perc_rows") 27 | 28 | set.seed(seed) 29 | if(model=="kmeans") { 30 | fit_cluster=kmeans(scale(data.frame(d)), n_groups) 31 | # Checking results: it_cluster$centers;fit_cluster$size 32 | cluster_vec=fit_cluster$cluster 33 | } else if(model=="hclust"){ 34 | # hclust 35 | fit_cluster=hclust(dist(scale(data.frame(d)))) 36 | cluster_vec=cutree(fit_cluster, k=n_groups) 37 | } else { 38 | stop("Parameter 'model' can be 'kmeans' or 'hclust'") 39 | } 40 | 41 | 42 | ## Equivalence table 43 | var_rec=paste(input, "rec", sep="_") 44 | df_categ[, var_rec]=paste("group_", cluster_vec, sep = "") 45 | 46 | ## See new profiling based on new groups 47 | data_rec=merge(select_(data, input, target), select_(df_categ, input, var_rec), by=input) 48 | recateg_results=categ_analysis(data_rec, var_rec, target) 49 | 50 | l_res=list() 51 | l_res$recateg_results=recateg_results 52 | l_res$df_equivalence=arrange_(unique(select_(data_rec, input, var_rec)), var_rec) 53 | l_res$fit_cluster=fit_cluster 54 | 55 | return(l_res) 56 | } 57 | 58 | #' @title Transform a variable into the [0-1] range 59 | #' @description Range a variable into [0-1], assigning 0 to the min and 1 to the max of the input variable. All NA values will be removed. 60 | #' @param var numeric input vector 61 | #' @examples 62 | #' range01(mtcars$cyl) 63 | #' @return vector with the values scaled into the 0 to 1 range 64 | #' @export 65 | range01 <- function(var) 66 | { 67 | return((var-min(var, na.rm=T))/(max(var, na.rm=T)-min(var, na.rm=T))) 68 | } 69 | 70 | -------------------------------------------------------------------------------- /R/funModeling.R: -------------------------------------------------------------------------------- 1 | #' funModeling: Exploratory data analysis, data preparation and model performance 2 | #' 3 | #' funModeling is intimately related to the Data Science Live Book -Open Source- (2017) in the sense that most 4 | #' of its functionality is used to explain different topics addressed by the book. 
5 | #' 6 | #' To start using funModeling you can start by the vignette: 7 | #' `browseVignettes(package = "funModeling")` 8 | #' 9 | #' Or you can read the Data Science Live Book, fully accessible at: \url{https://livebook.datascienceheroes.com} 10 | #' 11 | #' 12 | 13 | #' @importFrom grDevices dev.off jpeg rainbow 14 | #' @importFrom graphics abline grid 15 | #' @importFrom stats predict frequency 16 | #' @importFrom pander pandoc.table 17 | #' @importFrom Hmisc cut2 18 | #' @import ggplot2 19 | #' @import dplyr 20 | #' @importFrom cli symbol 21 | #' @importFrom reshape2 dcast melt 22 | #' @importFrom utils packageVersion 23 | #' @importFrom scales percent 24 | #' @importFrom lazyeval interp 25 | #' @importFrom gridExtra grid.arrange 26 | #' @importFrom ROCR prediction performance plot 27 | #' @importFrom stats cor quantile 28 | #' @importFrom RColorBrewer brewer.pal 29 | #' @importFrom grDevices colorRampPalette 30 | #' @importFrom stats kmeans rbeta hclust cutree dist IQR na.omit sd mad median 31 | #' @importFrom utils head tail 32 | #' @importFrom moments skewness kurtosis 33 | #' @importFrom entropy entropy 34 | #' @importFrom stringr str_c str_detect 35 | 36 | "_PACKAGE" 37 | 38 | utils::globalVariables(names=c("fum","element_blank","value","ratio","aes","variable","geom_bar","geom_text","position", 39 | "guides","labs","theme","element_text","scale_y_continuous","position_dodge","ylim","guide_legend","scale_fill_discrete", 40 | "aes_string", "geom_boxplot","stat_summary", "theme_bw", "freq", "geom_vline", "geom_density", "margin", 41 | "scale_colour_continuous",'Var1','label','coord_flip','ylab','xlab','geom_label','unit','Population','Gain', 42 | 'Score.Point','geom_line','geom_point','xlim','geom_segment','Lift', 'Freq', 'sum_pos', 'likelih','.','one_of', 43 | 'grp_mean', 'mean_target',"'colorRampPalette","head","tail","rbeta","p_10","p_90","sd" ,"std_dev","variation_coef", 44 | "iqr", "type","gr","discretize_bins", "cuts", "p_na", "q_na", "str_c","metadata_models","name","category"), package = "funModeling", add = F) 45 | -------------------------------------------------------------------------------- /R/information_theory.R: -------------------------------------------------------------------------------- 1 | #' @title Computes the entropy between two variables 2 | #' @description It calculates the entropy between two categorical variables using log2. 3 | #' This log2 is mentioned in most of the Claude Shannon bibliography. 4 | #' Input/target can be numeric or character. 5 | #' @param input numeric/character vector 6 | #' @param target numeric/character vector 7 | #' @examples 8 | #' \dontrun{ 9 | #' # Measuring entropy between input and target variable 10 | #' entropy_2(input=data_golf$outlook, target=data_golf$play_golf) 11 | #' } 12 | #' @return Entropy measured in bits 13 | #' @export 14 | entropy_2 <- function(input, target) 15 | { 16 | # converting x input into frequency table 17 | tbl_input=table(input) 18 | 19 | # cell percentages (distribution) 20 | probs_input=prop.table(tbl_input) 21 | 22 | tbl=table(input, target) 23 | 24 | # get partial entropy 25 | df_tbl=as.data.frame.matrix(tbl) 26 | res_entropy=data.frame(t(df_tbl)) %>% mutate_all(funs(entropy(., unit = "log2"))) %>% head(.,1) 27 | 28 | # computing total entropy 29 | total_en=sum(probs_input*res_entropy) 30 | 31 | return(total_en) 32 | } 33 | 34 | #' @title Information gain 35 | #' @description Computes the information gain between an 'input' and 'target' variable (using log2). 
In general terms, the higher the more predictable the input is. 36 | #' @param input numeric/character vector 37 | #' @param target numeric/character vector 38 | #' @examples 39 | #' \dontrun{ 40 | #' information_gain(input=data_golf$outlook, target=data_golf$play_golf) 41 | #' } 42 | #' @return information gain 43 | #' @export 44 | information_gain <- function(input, target) 45 | { 46 | tbl=table(target) 47 | en_y=entropy::entropy(tbl, unit = "log2") 48 | en=entropy_2(input, target) 49 | info_gain=en_y-en 50 | 51 | return(info_gain) 52 | } 53 | 54 | #' @title Gain ratio 55 | #' @description Computes the information gain between an 'input' and 'target' variable (using log2). Similar to information gain but less sensitive to high cardinality variables. 56 | #' @param input numeric/character vector 57 | #' @param target numeric/character vector 58 | #' @examples 59 | #' \dontrun{ 60 | #' gain_ratio(input=data_golf$outlook, target=data_golf$play_golf) 61 | #' } 62 | #' @return gain ratio 63 | #' @export 64 | gain_ratio <- function(input, target) 65 | { 66 | ig=information_gain(input, target) 67 | split=information_gain(input, input) 68 | 69 | gain_r=ig/split 70 | 71 | return(gain_r) 72 | } 73 | 74 | #' @title Importance variable ranking based on information theory 75 | #' @description Retrieves a data frame containing several metrics related to information theory. 76 | #' Metrics are: entropy (en), mutual information (mi), information gain (ig) and gain ratio (gr). 77 | #' 78 | #' @param data input data frame, all the variables will be evaluated against the variable defined in 'target' parameter 79 | #' @param target string variable name containing the output variable. 80 | #' @examples 81 | #' \dontrun{ 82 | #' var_rank_info(data_golf, "play_golf") 83 | #' } 84 | #' @return data frame ordered by gain ratio metric 85 | #' @export 86 | var_rank_info <- function(data, target) 87 | { 88 | nam=colnames(data) 89 | nam=nam[nam!=target] 90 | 91 | df_res=data.frame(var=NULL, en=NULL, mi=NULL, ig=NULL, gr=NULL, stringsAsFactors = F) 92 | 93 | for(var in nam) 94 | { 95 | r=infor_magic(data[[var]], data[[target]]) 96 | df_res=rbind(df_res, data.frame(var=var, en=r[1], mi=r[2], ig=r[3], gr=r[4])) 97 | } 98 | 99 | df_res$var=as.character(df_res$var) 100 | 101 | df_res=df_res %>% arrange(-gr) 102 | 103 | return(df_res) 104 | } 105 | 106 | #' @title Computes several information theory metrics between two vectors 107 | #' @description It retrieves the same as \code{\link{var_rank_info}} but receiving two vectors. 108 | #' Metrics are: entropy (en), mutual information (mi), information gain (ig) and gain ratio (gr). 109 | #' 110 | #' @param input vector to be evaluated against the variable defined in 'target' parameter 111 | #' @param target vector containing the output variable. 112 | #' @examples 113 | #' \dontrun{ 114 | #' infor_magic(data_golf$outlook, data_golf$play_golf) 115 | #' } 116 | #' @return Matrix of 1 row and 4 columns, where each column represent the mentioned metrics 117 | #' @export 118 | infor_magic <- function(input, target) 119 | { 120 | tbl_2v=table(input, target) 121 | 122 | # computing maximum total entropy 123 | en=round(entropy::entropy(tbl_2v, unit = "log2") ,3) 124 | 125 | # other way of computing max chaos... 
126 | # log(nrow(tbl_2v)*ncol(tbl_2v)) 127 | 128 | # mutual entropy based on 'entropy' package 129 | mi=round(entropy::mi.empirical(tbl_2v, unit = "log2"),3) 130 | 131 | # mutual information or entroy based on 'infotheo' package 132 | # mi=mutinformation(data[[col1]],data[[col2]]) 133 | 134 | # Computing information gain between input and target variable 135 | ig=information_gain(input, target) 136 | 137 | # Computing information gain between input and target variable. 138 | gr=gain_ratio(input, target) 139 | 140 | res=c(en=en, mi=mi, ig=ig , gr=gr) 141 | 142 | return(res) 143 | } 144 | 145 | -------------------------------------------------------------------------------- /R/target_profiling.R: -------------------------------------------------------------------------------- 1 | #' @title Correlation plots 2 | #' @description Visual correlation analysis. Plot different graphs in order to expose the inner information of any numeric variable against the target variable 3 | #' @param data data frame source 4 | #' @param input string input variable (if empty, it runs for all numeric variable), it can take a single character value or a character vector. 5 | #' @param target string of the variable to predict, it supports binary or multinominal values. 6 | #' @param plot_type Indicates the type of plot to retrieve, available values: "boxplot" or "histdens". 7 | #' @param path_out path directory, if it has a value the plot is saved. To save in current directory path must be dot: "." 8 | #' @examples 9 | #' \dontrun{ 10 | #' ## It runs for all numeric variables automatically 11 | #' plotar(data=heart_disease, target="has_heart_disease", plot_type="histdens") 12 | #' 13 | #' plotar(heart_disease, input = 'age', target = 'chest_pain', plot_type = "boxplot") 14 | #' } 15 | #' @return Single or multiple plots specified by 'plot_type' parameter 16 | #' @export 17 | plotar <- function(data, input, target, plot_type, path_out) 18 | { 19 | data=as.data.frame(data) 20 | 21 | ## Parameters & Error handlers 22 | if(missing(plot_type)) 23 | stop("Parameter 'plot_type' cannot be missing, available values: 'histdens' or 'boxplot'.") 24 | 25 | if(!(plot_type %in% c('histdens','boxplot'))) 26 | stop("Value for 'plot_type' is not valid: available values: 'histdens' or 'boxplot'.") 27 | 28 | check_target_existence(data, target=target) 29 | 30 | data=remove_na_target(data, target=target) 31 | 32 | #check_target_2_values(data, target=target) 33 | 34 | ## Convert to factor target variable 35 | data[[target]]=as.factor(data[[target]]) 36 | 37 | if(missing(path_out)) path_out=NA 38 | 39 | if(missing(input)) 40 | { 41 | data_2=data[, !(names(data) %in% target)] 42 | input=give_me_num_vars(data_2) 43 | } 44 | 45 | ## Begin iterator logic 46 | for(i in 1:length(input)) 47 | { 48 | sprintf("Plotting '%s' (%s)", input[i], plot_type) 49 | 50 | ## Get the desiered plot 51 | target_plot=get_target_plot(data, input[i], target, plot_type) 52 | 53 | plot(target_plot) 54 | } 55 | 56 | } 57 | 58 | 59 | get_target_plot <- function(data, input, target, plot_type) 60 | { 61 | ## Retrieve the desiered plot 62 | if(plot_type=="histdens") 63 | plot_target=histdens_target(data, input, target) 64 | 65 | if(plot_type=="boxplot") 66 | plot_target=boxplot_target(data, input, target) 67 | 68 | return(plot_target) 69 | } 70 | 71 | 72 | histdens_target <- function(data, input, target) 73 | { 74 | cdf=group_by_(data, target) %>% summarise_(var.mean=interp(~mean(v, na.rm=T), v=as.name(input))) 75 | 76 | cdf$var.mean=round(cdf$var.mean, 2) 77 | 78 | 
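# Density curves of 'input' for each level of the target; the dashed vertical
# lines mark the per-class means computed in 'cdf' above.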
plot_histdens=ggplot(data, aes_string(x=input, colour=target)) + geom_density() + geom_vline(data=cdf, aes_string(xintercept="var.mean", colour=target), linetype="dashed", size=0.5) + 79 | 80 | theme_bw() + 81 | 82 | theme( 83 | plot.background = element_blank(), 84 | panel.grid.minor = element_blank(), 85 | panel.border = element_blank(), 86 | axis.title.x=element_text(margin=margin(15,0,0,0)), 87 | axis.title.y=element_text(margin=margin(0,33,0,0)) 88 | ) 89 | 90 | return(plot_histdens) 91 | 92 | } 93 | 94 | boxplot_target <- function(data, input, target) 95 | { 96 | plot_box=ggplot(data, aes_string(x=target, y=input, fill=target)) + geom_boxplot() + 97 | guides(fill=FALSE)+stat_summary(fun.y=mean, geom="point", shape=5, size=4) + 98 | 99 | theme_bw() + 100 | 101 | theme( 102 | plot.background = element_blank(), 103 | panel.grid.minor = element_blank(), 104 | panel.border = element_blank(), 105 | axis.title.x=element_text(margin=margin(15,0,0,0)), 106 | axis.title.y=element_text(margin=margin(0,15,0,0)) 107 | 108 | ) 109 | 110 | return(plot_box) 111 | } 112 | 113 | #' @title Profiling analysis of categorical vs. target variable 114 | #' @param data input data containing the variable to describe 115 | #' @param input string input variable (if empty, it runs for all categorical variable), it can take a single character value or a character vector. 116 | #' @param target string target variable. Binary or two class is only supported by now. 117 | #' @examples 118 | #' categ_analysis(data_country, "country", "has_flu") 119 | #' @description Retrieves a complete summary of the grouped input variable against the target variable. Type of target variable must be binary for now. A positive case will be the less representative one. It returns the total positive cases (sum_target)); pecentage of total positive cases (perc_target) that fell in that category (this column sums 1); likelihood or mean of positive cases (mean_target) measured by the total positive cases over total cases in that category; quantity of rows of that category (q_rows) and in percentage (perc_rows) -this column sums 1. 120 | #' @return if input has 1 variable, it retrurns a data frame indicating all the metrics, otherwise prints in console all variable results. 121 | #' @export 122 | categ_analysis<-function(data, input, target) 123 | { 124 | data=as.data.frame(data) 125 | 126 | ## Parameters & Error handlers ##################### 127 | check_target_existence(data, target=target) 128 | 129 | data=remove_na_target(data, target=target) 130 | 131 | check_target_2_values(data, target=target) 132 | ##################################################### 133 | 134 | ## If missing it runs for all categorical variables 135 | if(missing(input)) 136 | { 137 | data_2=data[, !(names(data) %in% target)] 138 | input=give_me_character_vars(data_2) 139 | } 140 | 141 | ## Iterator 142 | q_vars=length(input) 143 | if(q_vars==1) 144 | { 145 | res=categ_analysis_logic(data = data, input=input, target=target) 146 | return(res) 147 | } else { 148 | for(i in 1:q_vars) 149 | { 150 | res=categ_analysis_logic(data = data, input=input[i], target=target) 151 | print(res) 152 | cat("", sep="\n") 153 | } 154 | 155 | cat("", sep="\n") 156 | return(sprintf("Variables processed: %s", paste(input, collapse = ", "))) 157 | } 158 | 159 | } 160 | 161 | categ_analysis_logic <- function(data, input, target) 162 | { 163 | ## Infering positive class as the less representative class. 
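# The positive class is inferred as the least frequent level of the target:
# class counts are sorted ascending, the smallest becomes 'pred_class', and
# 'tot_pos' holds its total number of rows.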
164 | data[,target]=as.character(data[,target]) 165 | grp_class=group_by(data, data[,target]) %>% summarise(q=n()) %>% arrange(q) 166 | pred_class=as.character(grp_class[1,1]) 167 | tot_pos=sum(data[,target]==pred_class) 168 | 169 | ## profiling 170 | grp=group_by_(data, input) %>% summarise_( 171 | mean_target=interp(~round(mean(var==pred_class, na.rm = TRUE), 3), var = as.name(target)), 172 | sum_target=interp(~sum(var==pred_class, na.rm = TRUE), var = as.name(target)), 173 | perc_target=interp(~round(sum(var==pred_class, na.rm = TRUE)/tot_pos,3), var = as.name(target)), 174 | q_rows=~n(), 175 | perc_rows=~round(n()/nrow(data), 3) 176 | ) %>% arrange(-mean_target) 177 | 178 | #colnames(grp)[colnames(grp)=='sum_target']=paste("sum", target, sep="_") 179 | #colnames(grp)[colnames(grp)=='perc_target']=paste("perc", target, sep="_") 180 | #colnames(grp)[colnames(grp)=='mean_target']=paste("mean", target , sep="_") 181 | 182 | grp=data.frame(grp, stringsAsFactors = F) 183 | 184 | #print(sprintf("Variable: '%s'", input)) 185 | 186 | # colnames(grp)[1]="category" 187 | 188 | return(grp) 189 | } 190 | 191 | 192 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | ```{r setup, echo = FALSE} 2 | knitr::opts_chunk$set( 3 | fig.path = "man/figures/README-", 4 | out.width = "200px" 5 | ) 6 | ``` 7 | 8 | [![metacran downloads](https://cranlogs.r-pkg.org/badges/grand-total/funModeling)](https://cran.r-project.org/package=funModeling) 9 | 10 | [![metacran downloads](https://cranlogs.r-pkg.org/badges/funModeling)](https://cran.r-project.org/package=funModeling) 11 | 12 | # Hello! 13 | 14 | This package contains a set of functions related to exploratory data analysis, data preparation, and model performance. It is used by people coming from business, research, and teaching (professors and students). 15 | 16 | 17 | funModeling 18 | 19 | funModeling 20 | 21 | 22 | 23 | ## Books 24 | 25 | `funModeling` is intimately related to the _Data Science Live Book_ -Open Source- (2017) in the sense that most of its functionality is used to explain different topics addressed by the book. 26 | 27 | Data Science Live Book 28 | 29 | Versions: 30 | 31 | * EN: [Data Science Live Book](https://livebook.datascienceheroes.com/) 32 | * ES: [Libro Vivo de Ciencia de Datos](https://librovivodecienciadedatos.ai) 33 | 34 | In the _Download_ section, you can buy (name your price) a digital copy of the book in PDF, mobi and pub. 35 | 36 | ## Blog posts based on `funModeling`: 37 | 38 | * [Exploratory Data Analysis in R (introduction)](https://blog.datascienceheroes.com/exploratory-data-analysis-in-r-intro/) 39 | * [Automatic data types checking in predictive models](https://blog.datascienceheroes.com/automatic-data-types-checking-in-predictive-models/) 40 | * [Fast data exploration for predictive modeling](https://blog.datascienceheroes.com/fast-data-exploration-for-predictive-modeling/) 41 | * [New discretization method: Recursive information gain ratio maximization](https://blog.datascienceheroes.com/discretization-recursive-gain-ratio-maximization/) 42 | 43 | ## Official page 44 | 45 | * [funModeling official webpage](http://pablo14.github.io/funModeling/) 46 | * Check the vignette [here](http://pablo14.github.io/funModeling/articles/funModeling_quickstart.html). 47 | 48 | ## If you speak Spanish... 
49 | 50 | Escuela de Datos Vivos 51 | 52 | You are invited to the [Escuela de Datos Vivos](https://escueladedatosvivos.ai/), a data school founded by the same funModeling / DSLB author. There you can find free and paid courses, blog post, youtube channel, using R and Python. 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![metacran downloads](https://cranlogs.r-pkg.org/badges/grand-total/funModeling)](https://cran.r-project.org/package=funModeling) 4 | 5 | [![metacran downloads](https://cranlogs.r-pkg.org/badges/funModeling)](https://cran.r-project.org/package=funModeling) 6 | 7 | # Hello! 8 | 9 | This package contains a set of functions related to exploratory data analysis, data preparation, and model performance. It is used by people coming from business, research, and teaching (professors and students). 10 | 11 | 12 | funModeling 13 | 14 | funModeling 15 | 16 | 17 | 18 | ## Books 19 | 20 | `funModeling` is intimately related to the _Data Science Live Book_ -Open Source- (2017) in the sense that most of its functionality is used to explain different topics addressed by the book. 21 | 22 | Data Science Live Book 23 | 24 | Versions: 25 | 26 | * EN: [Data Science Live Book](https://livebook.datascienceheroes.com/) 27 | * ES: [Libro Vivo de Ciencia de Datos](https://librovivodecienciadedatos.ai) 28 | 29 | In the _Download_ section, you can buy (name your price) a digital copy of the book in PDF, mobi and pub. 30 | 31 | ## Blog posts based on `funModeling`: 32 | 33 | * [Exploratory Data Analysis in R (introduction)](https://blog.datascienceheroes.com/exploratory-data-analysis-in-r-intro/) 34 | * [Automatic data types checking in predictive models](https://blog.datascienceheroes.com/automatic-data-types-checking-in-predictive-models/) 35 | * [Fast data exploration for predictive modeling](https://blog.datascienceheroes.com/fast-data-exploration-for-predictive-modeling/) 36 | * [New discretization method: Recursive information gain ratio maximization](https://blog.datascienceheroes.com/discretization-recursive-gain-ratio-maximization/) 37 | 38 | ## Official page 39 | 40 | * [funModeling official webpage](http://pablo14.github.io/funModeling/) 41 | * Check the vignette [here](http://pablo14.github.io/funModeling/articles/funModeling_quickstart.html). 42 | 43 | ## If you speak Spanish... 44 | 45 | Escuela de Datos Vivos 46 | 47 | You are invited to the [Escuela de Datos Vivos](https://escueladedatosvivos.ai/), a data school founded by the same funModeling / DSLB author. There you can find free and paid courses, blog post, youtube channel, using R and Python. 
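## Quick start

A minimal sketch of a first exploration, using the `heart_disease` dataset shipped with the package (any data frame can be used in the same way):

```r
library(funModeling)

# Data quality per variable: zeros, NAs, infinites, type and unique values
df_status(heart_disease)

# Frequency tables and bar plots for every factor/character variable
freq(heart_disease)

# Histograms for every numeric variable
plot_num(heart_disease)

# Numeric profiling: mean, standard deviation, percentiles, skewness and more
profiling_num(heart_disease)
```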
48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /data/data_country.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/data/data_country.rda -------------------------------------------------------------------------------- /data/data_golf.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/data/data_golf.rda -------------------------------------------------------------------------------- /data/heart_disease.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/data/heart_disease.rda -------------------------------------------------------------------------------- /data/metadata_models.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/data/metadata_models.rda -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Page not found (404) • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 51 | 52 | 53 | 54 | 55 | 56 | 57 |
Content not found. Please use links in the navbar.
135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /docs/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | NA • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 51 | 52 | 53 | 54 | 55 | 56 | 57 |
The MIT License (MIT)
Copyright (c) 2017 Pablo Casas <pcasas.biz at gmail.com>
141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /docs/articles/discre1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/discre1.png -------------------------------------------------------------------------------- /docs/articles/dslb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/dslb.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/boxplot_analysis-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/boxplot_analysis-1.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/boxplot_analysis-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/boxplot_analysis-2.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/cluster_performance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/cluster_performance-1.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/density_histogram-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/density_histogram-1.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/distribution1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/distribution1-1.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/distribution1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/distribution1-2.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/performance-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/performance-1.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/profiling1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/profiling1-1.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/profiling1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/profiling1-2.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /docs/articles/funModeling_quickstart_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/articles/funModeling_quickstart_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 51 | 52 | 53 | 54 | 55 | 56 | 57 |
58 |
59 | 105 | 106 | 107 | 108 |
109 | 110 |
111 |
112 | 115 | 116 |
117 |

All vignettes

118 |

119 | 120 | 123 |
124 |
125 |
126 | 127 | 128 |
129 | 132 | 133 |
134 |


135 |
136 | 137 |
138 |
139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 51 | 52 | 53 | 54 | 55 | 56 | 57 |
58 |
59 | 105 | 106 | 107 | 108 |
109 | 110 |
111 |
112 | 115 | 116 |
    117 |
  • 118 |

    Pablo Casas. Author, maintainer. 119 |

    120 |
  • 121 |
122 | 123 |
124 | 125 |
126 | 127 | 128 | 129 |
130 | 133 | 134 |
135 |


136 |
137 | 138 |
139 |
140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. "?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body > .container { 21 | display: flex; 22 | height: 100%; 23 | flex-direction: column; 24 | } 25 | 26 | body > .container .row { 27 | flex: 1 0 auto; 28 | } 29 | 30 | footer { 31 | margin-top: 45px; 32 | padding: 35px 0 36px; 33 | border-top: 1px solid #e5e5e5; 34 | color: #666; 35 | display: 
flex; 36 | flex-shrink: 0; 37 | } 38 | footer p { 39 | margin-bottom: 0; 40 | } 41 | footer div { 42 | flex: 1; 43 | } 44 | footer .pkgdown { 45 | text-align: right; 46 | } 47 | footer p { 48 | margin-bottom: 0; 49 | } 50 | 51 | img.icon { 52 | float: right; 53 | } 54 | 55 | img { 56 | max-width: 100%; 57 | } 58 | 59 | /* Fix bug in bootstrap (only seen in firefox) */ 60 | summary { 61 | display: list-item; 62 | } 63 | 64 | /* Typographic tweaking ---------------------------------*/ 65 | 66 | .contents .page-header { 67 | margin-top: calc(-60px + 1em); 68 | } 69 | 70 | /* Section anchors ---------------------------------*/ 71 | 72 | a.anchor { 73 | margin-left: -30px; 74 | display:inline-block; 75 | width: 30px; 76 | height: 30px; 77 | visibility: hidden; 78 | 79 | background-image: url(./link.svg); 80 | background-repeat: no-repeat; 81 | background-size: 20px 20px; 82 | background-position: center center; 83 | } 84 | 85 | .hasAnchor:hover a.anchor { 86 | visibility: visible; 87 | } 88 | 89 | @media (max-width: 767px) { 90 | .hasAnchor:hover a.anchor { 91 | visibility: hidden; 92 | } 93 | } 94 | 95 | 96 | /* Fixes for fixed navbar --------------------------*/ 97 | 98 | .contents h1, .contents h2, .contents h3, .contents h4 { 99 | padding-top: 60px; 100 | margin-top: -40px; 101 | } 102 | 103 | /* Sidebar --------------------------*/ 104 | 105 | #sidebar { 106 | margin-top: 30px; 107 | position: -webkit-sticky; 108 | position: sticky; 109 | top: 70px; 110 | } 111 | #sidebar h2 { 112 | font-size: 1.5em; 113 | margin-top: 1em; 114 | } 115 | 116 | #sidebar h2:first-child { 117 | margin-top: 0; 118 | } 119 | 120 | #sidebar .list-unstyled li { 121 | margin-bottom: 0.5em; 122 | } 123 | 124 | .orcid { 125 | height: 16px; 126 | /* margins are required by official ORCID trademark and display guidelines */ 127 | margin-left:4px; 128 | margin-right:4px; 129 | vertical-align: middle; 130 | } 131 | 132 | /* Reference index & topics ----------------------------------------------- */ 133 | 134 | .ref-index th {font-weight: normal;} 135 | 136 | .ref-index td {vertical-align: top;} 137 | .ref-index .icon {width: 40px;} 138 | .ref-index .alias {width: 40%;} 139 | .ref-index-icons .alias {width: calc(40% - 40px);} 140 | .ref-index .title {width: 60%;} 141 | 142 | .ref-arguments th {text-align: right; padding-right: 10px;} 143 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 144 | .ref-arguments .name {width: 20%;} 145 | .ref-arguments .desc {width: 80%;} 146 | 147 | /* Nice scrolling for wide elements --------------------------------------- */ 148 | 149 | table { 150 | display: block; 151 | overflow: auto; 152 | } 153 | 154 | /* Syntax highlighting ---------------------------------------------------- */ 155 | 156 | pre { 157 | word-wrap: normal; 158 | word-break: normal; 159 | border: 1px solid #eee; 160 | } 161 | 162 | pre, code { 163 | background-color: #f8f8f8; 164 | color: #333; 165 | } 166 | 167 | pre code { 168 | overflow: auto; 169 | word-wrap: normal; 170 | white-space: pre; 171 | } 172 | 173 | pre .img { 174 | margin: 5px 0; 175 | } 176 | 177 | pre .img img { 178 | background-color: #fff; 179 | display: block; 180 | height: auto; 181 | } 182 | 183 | code a, pre a { 184 | color: #375f84; 185 | } 186 | 187 | a.sourceLine:hover { 188 | text-decoration: none; 189 | } 190 | 191 | .fl {color: #1514b5;} 192 | .fu {color: #000000;} /* function */ 193 | .ch,.st {color: #036a07;} /* string */ 194 | .kw {color: #264D66;} /* keyword */ 195 | .co {color: #888888;} /* comment */ 196 | 197 | .message { 
color: black; font-weight: bolder;} 198 | .error { color: orange; font-weight: bolder;} 199 | .warning { color: #6A0366; font-weight: bolder;} 200 | 201 | /* Clipboard --------------------------*/ 202 | 203 | .hasCopyButton { 204 | position: relative; 205 | } 206 | 207 | .btn-copy-ex { 208 | position: absolute; 209 | right: 0; 210 | top: 0; 211 | visibility: hidden; 212 | } 213 | 214 | .hasCopyButton:hover button.btn-copy-ex { 215 | visibility: visible; 216 | } 217 | 218 | /* headroom.js ------------------------ */ 219 | 220 | .headroom { 221 | will-change: transform; 222 | transition: transform 200ms linear; 223 | } 224 | .headroom--pinned { 225 | transform: translateY(0%); 226 | } 227 | .headroom--unpinned { 228 | transform: translateY(-100%); 229 | } 230 | 231 | /* mark.js ----------------------------*/ 232 | 233 | mark { 234 | background-color: rgba(255, 255, 51, 0.5); 235 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 236 | padding: 1px; 237 | } 238 | 239 | /* vertical spacing after htmlwidgets */ 240 | .html-widget { 241 | margin-bottom: 10px; 242 | } 243 | 244 | /* fontawesome ------------------------ */ 245 | 246 | .fab { 247 | font-family: "Font Awesome 5 Brands" !important; 248 | } 249 | 250 | /* don't display links in code chunks when printing */ 251 | /* source: https://stackoverflow.com/a/10781533 */ 252 | @media print { 253 | code a:link:after, code a:visited:after { 254 | content: ""; 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('.navbar-fixed-top').headroom(); 6 | 7 | $('body').css('padding-top', $('.navbar').height() + 10); 8 | $(window).resize(function(){ 9 | $('body').css('padding-top', $('.navbar').height() + 10); 10 | }); 11 | 12 | $('body').scrollspy({ 13 | target: '#sidebar', 14 | offset: 60 15 | }); 16 | 17 | $('[data-toggle="tooltip"]').tooltip(); 18 | 19 | var cur_path = paths(location.pathname); 20 | var links = $("#navbar ul li a"); 21 | var max_length = -1; 22 | var pos = -1; 23 | for (var i = 0; i < links.length; i++) { 24 | if (links[i].getAttribute("href") === "#") 25 | continue; 26 | // Ignore external links 27 | if (links[i].host !== location.host) 28 | continue; 29 | 30 | var nav_path = paths(links[i].pathname); 31 | 32 | var length = prefix_length(nav_path, cur_path); 33 | if (length > max_length) { 34 | max_length = length; 35 | pos = i; 36 | } 37 | } 38 | 39 | // Add class to parent
  • , and enclosing
  • if in dropdown 40 | if (pos >= 0) { 41 | var menu_anchor = $(links[pos]); 42 | menu_anchor.parent().addClass("active"); 43 | menu_anchor.closest("li.dropdown").addClass("active"); 44 | } 45 | }); 46 | 47 | function paths(pathname) { 48 | var pieces = pathname.split("/"); 49 | pieces.shift(); // always starts with / 50 | 51 | var end = pieces[pieces.length - 1]; 52 | if (end === "index.html" || end === "") 53 | pieces.pop(); 54 | return(pieces); 55 | } 56 | 57 | // Returns -1 if not found 58 | function prefix_length(needle, haystack) { 59 | if (needle.length > haystack.length) 60 | return(-1); 61 | 62 | // Special case for length-0 haystack, since for loop won't run 63 | if (haystack.length === 0) { 64 | return(needle.length === 0 ? 0 : -1); 65 | } 66 | 67 | for (var i = 0; i < haystack.length; i++) { 68 | if (needle[i] != haystack[i]) 69 | return(i); 70 | } 71 | 72 | return(haystack.length); 73 | } 74 | 75 | /* Clipboard --------------------------*/ 76 | 77 | function changeTooltipMessage(element, msg) { 78 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 79 | element.setAttribute('data-original-title', msg); 80 | $(element).tooltip('show'); 81 | element.setAttribute('data-original-title', tooltipOriginalTitle); 82 | } 83 | 84 | if(ClipboardJS.isSupported()) { 85 | $(document).ready(function() { 86 | var copyButton = ""; 87 | 88 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 89 | 90 | // Insert copy buttons: 91 | $(copyButton).prependTo(".hasCopyButton"); 92 | 93 | // Initialize tooltips: 94 | $('.btn-copy-ex').tooltip({container: 'body'}); 95 | 96 | // Initialize clipboard: 97 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 98 | text: function(trigger) { 99 | return trigger.parentNode.textContent; 100 | } 101 | }); 102 | 103 | clipboardBtnCopies.on('success', function(e) { 104 | changeTooltipMessage(e.trigger, 'Copied!'); 105 | e.clearSelection(); 106 | }); 107 | 108 | clipboardBtnCopies.on('error', function() { 109 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 110 | }); 111 | }); 112 | } 113 | })(window.jQuery || window.$) 114 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.3.1 2 | pkgdown: 1.4.1 3 | pkgdown_sha: ~ 4 | articles: 5 | funModeling_quickstart: funModeling_quickstart.html 6 | 7 | -------------------------------------------------------------------------------- /docs/reference/data_country.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | People with flu data — data_country • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
    64 |
    65 | 111 | 112 | 113 | 114 |
    115 | 116 |
    117 |
    118 | 123 | 124 |
    125 |

Each row represents a person from different countries, indicating whether or not he or she has flu. 126 | Columns 127 | person: unique id 128 | country: country of the person, 70 different countries 129 | has_flu: character variable with values "yes" or "no" indicating whether the person has flu

    130 |
    131 | 132 |
    data_country
    133 | 134 | 135 |

    Format

    136 | 137 |

    A data frame with 910 rows and 3 variables

    138 | 139 |
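A minimal exploration sketch for this dataset, assuming funModeling is attached; both calls below come from examples documented elsewhere in this package (df_status and categ_analysis):

library(funModeling)

# overview of zeros, missing values, data types and unique values per column
df_status(data_country)

# profile every country against the target, as in the categ_analysis example
categ_analysis(data_country, "country", "has_flu")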
    140 | 147 |
    148 | 149 | 150 |
    151 | 154 | 155 |
    156 |


    157 |
    158 | 159 |
    160 |
    161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /docs/reference/data_golf.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Play golf — data_golf • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
    60 |
    61 | 107 | 108 | 109 | 110 |
    111 | 112 |
    113 |
    114 | 119 | 120 |
    121 |

This well-known small data frame contains 14 cases indicating whether or not to play golf based on weather conditions. Target variable: 'play_golf'.

    122 |
    123 | 124 |
    data_golf
    125 | 126 | 127 |

    Format

    128 | 129 |

    A data frame with 14 rows and 3 variables

    130 | 131 |
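A short sketch of how this toy dataset is typically fed to the package's information-theory helpers; both calls mirror the documented examples of gain_ratio and entropy_2:

library(funModeling)

# association between a weather condition and the target, two different measures
gain_ratio(input=data_golf$outlook, target=data_golf$play_golf)
entropy_2(input=data_golf$outlook, target=data_golf$play_golf)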
    132 | 139 |
    140 | 141 | 142 |
    143 | 146 | 147 |
    148 |


    149 |
    150 | 151 |
    152 |
    153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /docs/reference/export_plot.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Export plot to jpeg file — export_plot • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
    60 |
    61 | 107 | 108 | 109 | 110 |
    111 | 112 |
    113 |
    114 | 119 | 120 |
    121 |

Export 'object_plot' to a jpeg file under the name 'file_name' in the directory 'path_out'

    122 |
    123 | 124 |
    export_plot(object_plot, path_out, file_name)
    125 | 126 |

    Arguments

    127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 137 | 138 | 139 | 140 | 141 | 142 |
    object_plot

    Object plot to export (like ggplot2)

    path_out

path of the directory to export the output to; if it has a value the plot is saved, 136 | and if the directory doesn't exist it will try to create it. To save in the current directory the path must be a dot: "."

    file_name

    output file name

    143 | 144 |

    Value

    145 | 146 |

    none

    147 | 148 |
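Since this page ships no example, the following is only a hedged usage sketch: the ggplot2 object is illustrative and not part of the original docs, while the export_plot arguments follow the signature shown above.

library(funModeling)
library(ggplot2)

# any plot object can be exported; this histogram of age is just an illustration
p = ggplot(heart_disease, aes(x=age)) + geom_histogram(bins=20)

# save it in the current directory (path_out="." as documented above);
# whether a .jpeg extension is appended automatically is not stated in the docs
export_plot(object_plot=p, path_out=".", file_name="age_histogram")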
    149 | 157 |
    158 | 159 | 160 |
    161 | 164 | 165 |
    166 |


    167 |
    168 | 169 |
    170 |
    171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /docs/reference/fibonacci.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Fibonacci series — fibonacci • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
    60 |
    61 | 107 | 108 | 109 | 110 |
    111 | 112 |
    113 |
    114 | 119 | 120 |
    121 |

It retrieves a vector containing the first N numbers of the Fibonacci series, where N is given by the 'length' parameter.

    122 |
    123 | 124 |
    fibonacci(length, remove_first = F)
    125 | 126 |

    Arguments

    127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 |
    length

length of the series, i.e. the quantity of Fibonacci numbers to return

    remove_first

removes the first value of the series, because the first 2 elements are the same (the number 1). FALSE by default.

    138 | 139 |

    Value

    140 | 141 |

    vector

    142 | 143 |

    Examples

    144 |
    # Get the first 4 elements of Fibonacci series 145 | fibonacci(4)
    #> [1] 1 1 2 3
    146 |
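The 'remove_first' parameter described above can be exercised in the same way; per its description it drops the duplicated leading 1 (exact output not shown here, so treat this as a sketch):

# first 6 Fibonacci numbers: 1 1 2 3 5 8
fibonacci(6)

# same call, dropping the duplicated leading 1 as documented for remove_first
fibonacci(6, remove_first = TRUE)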
    147 | 156 |
    157 | 158 | 159 |
    160 | 163 | 164 |
    165 |


    166 |
    167 | 168 |
    169 |
    170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /docs/reference/figures/README-boxplot_analysis-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-boxplot_analysis-1.png -------------------------------------------------------------------------------- /docs/reference/figures/README-boxplot_analysis-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-boxplot_analysis-2.png -------------------------------------------------------------------------------- /docs/reference/figures/README-density_histogram-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-density_histogram-1.png -------------------------------------------------------------------------------- /docs/reference/figures/README-distribution1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-distribution1-1.png -------------------------------------------------------------------------------- /docs/reference/figures/README-distribution1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-distribution1-2.png -------------------------------------------------------------------------------- /docs/reference/figures/README-performance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-performance-1.png -------------------------------------------------------------------------------- /docs/reference/figures/README-profiling1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-profiling1-1.png -------------------------------------------------------------------------------- /docs/reference/figures/README-profiling1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-profiling1-2.png -------------------------------------------------------------------------------- /docs/reference/figures/README-unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/figures/README-unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /docs/reference/freq-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/freq-1.png -------------------------------------------------------------------------------- /docs/reference/freq-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/freq-2.png -------------------------------------------------------------------------------- /docs/reference/freq-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/freq-3.png -------------------------------------------------------------------------------- /docs/reference/funModeling-package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | funModeling: Exploratory data analysis, data preparation and model performance — funModeling-package • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 54 | 55 | 56 | 57 | 58 | 59 | 60 |
    61 |
    62 | 108 | 109 | 110 | 111 |
    112 | 113 |
    114 |
    115 | 120 | 121 |
    122 |

    funModeling is intimately related to the Data Science Live Book -Open Source- (2017) in the sense that most 123 | of its functionality is used to explain different topics addressed by the book.

    124 |
    125 | 126 | 127 | 128 |

    Details

    129 | 130 |

To start using funModeling you can begin with the vignette: 131 | `browseVignettes(package = "funModeling")`

    132 |

    Or you can read the Data Science Live Book, fully accessible at: https://livebook.datascienceheroes.com

    133 |
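A minimal getting-started sketch, assuming the package is installed; every call below is taken from examples documented elsewhere in this package:

library(funModeling)

# open the quick-start vignette mentioned above
browseVignettes(package = "funModeling")

# a first look at the bundled heart_disease dataset
df_status(heart_disease)
correlation_table(data=heart_disease, target="has_heart_disease")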

    See also

    134 | 135 |

    Useful links:

    139 | 140 |
    141 | 142 |
    143 | 153 |
    154 | 155 | 156 |
    157 | 160 | 161 |
    162 |


    163 |
    164 | 165 |
    166 |
    167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /docs/reference/gain_lift-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/docs/reference/gain_lift-1.png -------------------------------------------------------------------------------- /docs/reference/gain_ratio.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Gain ratio — gain_ratio • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
    60 |
    61 | 107 | 108 | 109 | 110 |
    111 | 112 |
    113 |
    114 | 119 | 120 |
    121 |

Computes the gain ratio between an 'input' and a 'target' variable (using log2). It is similar to information gain but less sensitive to high-cardinality variables.

    122 |
    123 | 124 |
    gain_ratio(input, target)
    125 | 126 |

    Arguments

    127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 |
    input

    numeric/character vector

    target

    numeric/character vector

    138 | 139 |

    Value

    140 | 141 |

    gain ratio

    142 | 143 |

    Examples

    144 |
    if (FALSE) { 145 | gain_ratio(input=data_golf$outlook, target=data_golf$play_golf) 146 | }
    147 |
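The description above contrasts this measure with information gain; a side-by-side check might look like the sketch below. The information_gain call assumes the same input/target signature, which is not shown on this page:

# gain ratio vs. plain information gain for the same pair of variables
gain_ratio(input=data_golf$outlook, target=data_golf$play_golf)
information_gain(input=data_golf$outlook, target=data_golf$play_golf)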
    148 | 157 |
    158 | 159 | 160 |
    161 | 164 | 165 |
    166 |


    167 |
    168 | 169 |
    170 |
    171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /docs/reference/heart_disease.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Heart Disease Data — heart_disease • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
    60 |
    61 | 107 | 108 | 109 | 110 |
    111 | 112 |
    113 |
    114 | 119 | 120 |
    121 |

The variables come from a patient clinical trial. The variable to predict is `has_heart_disease`.

    122 |
    123 | 124 |
    heart_disease
    125 | 126 | 127 |

    Format

    128 | 129 |

    A data frame with 303 rows and 16 variables:

    132 | 133 | 134 |
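A brief profiling sketch for this dataset, assuming funModeling is attached; both calls come from examples documented elsewhere in this package:

library(funModeling)

# zeros, missing values, data types and unique values per variable
df_status(heart_disease)

# how chest_pain relates to the target, as in the cross_plot example
cross_plot(data=heart_disease, input="chest_pain", target="has_heart_disease")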
    135 | 142 |
    143 | 144 | 145 |
    146 | 149 | 150 |
    151 |


    152 |
    153 | 154 |
    155 |
    156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/reference/metadata_models.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Metadata models data integrity — metadata_models • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
    60 |
    61 | 107 | 108 | 109 | 110 |
    111 | 112 |
    113 |
    114 | 119 | 120 |
    121 |

    Metadata models data integrity

    122 |
    123 | 124 |
    metadata_models
    125 | 126 | 127 |

    Format

    128 | 129 |

    Tibble

    130 | 131 |
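As the data_integrity_model documentation elsewhere in this package explains, this tibble lists the constraints checked for each supported model; a quick way to inspect and exercise it is sketched below (the pca call comes from that documentation):

library(funModeling)

# list the supported model names and their data constraints
metadata_models

# check a data frame against one of those models
data_integrity_model(data=heart_disease, model_name="pca")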
    132 | 139 |
    140 | 141 | 142 |
    143 | 146 | 147 |
    148 |


    149 |
    150 | 151 |
    152 |
    153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /docs/reference/range01.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Transform a variable into the [0-1] range — range01 • funModeling 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 53 | 54 | 55 | 56 | 57 | 58 | 59 |
    60 |
    61 | 107 | 108 | 109 | 110 |
    111 | 112 |
    113 |
    114 | 119 | 120 |
    121 |

Rescales a variable into the [0-1] range, assigning 0 to the minimum and 1 to the maximum of the input variable. All NA values will be removed.

    122 |
    123 | 124 |
    range01(var)
    125 | 126 |

    Arguments

    127 | 128 | 129 | 130 | 131 | 132 | 133 |
    var

    numeric input vector

    134 | 135 |

    Value

    136 | 137 |

    vector with the values scaled into the 0 to 1 range

    138 | 139 |

    Examples

    140 |
    range01(mtcars$cyl)
    #> [1] 0.5 0.5 0.0 0.5 1.0 0.5 1.0 0.0 0.0 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 141 | #> [20] 0.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0 0.5 1.0 0.0
    142 |
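A second, hedged sketch on a hand-made vector; the expected values follow from the linear min-max scaling described above (0 for the minimum, 1 for the maximum):

# expected: 0, ~0.33, ~0.67, 1
range01(c(10, 20, 30, 40))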
    143 | 152 |
    153 | 154 | 155 | 165 |
    166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /funModeling.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: No 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace,vignette 23 | -------------------------------------------------------------------------------- /man/auto_grouping.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data_preparation.R 3 | \name{auto_grouping} 4 | \alias{auto_grouping} 5 | \title{Reduce cardinality in categorical variable by automatic grouping} 6 | \usage{ 7 | auto_grouping(data, input, target, n_groups, model = "kmeans", seed = 999) 8 | } 9 | \arguments{ 10 | \item{data}{data frame source} 11 | 12 | \item{input}{categorical variable indicating} 13 | 14 | \item{target}{string of the variable to optimize the re-grouping} 15 | 16 | \item{n_groups}{number of groups for the new category based on input, normally between 3 and 10.} 17 | 18 | \item{model}{is the clustering model used to create the grouping, supported models: "kmeans" (default) or "hclust" (hierarchical clustering).} 19 | 20 | \item{seed}{optional, random number used internally for the k-means, changing this value will change the model} 21 | } 22 | \value{ 23 | A list containing 3 elements: recateg_results which contains the description of the target variable with the new groups; 24 | df_equivalence is a data frame containing the input category and the new category; fit_cluster which is the cluster model used to do the re-grouping 25 | } 26 | \description{ 27 | Reduce the cardinality of an input variable based on a target -binary by now- variable based on attribitues of accuracy and representativity, 28 | for both input and target variable. It uses a cluster model to create the new groups. 29 | Full documentation can be found at: 30 | \url{https://livebook.datascienceheroes.com/data-preparation.html#high_cardinality_predictive_modeling} 31 | } 32 | \examples{ 33 | \dontrun{ 34 | # Reducing quantity of countries based on has_flu variable 35 | auto_grouping(data=data_country, input='country', target="has_flu", n_groups=8) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/categ_analysis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/target_profiling.R 3 | \name{categ_analysis} 4 | \alias{categ_analysis} 5 | \title{Profiling analysis of categorical vs. target variable} 6 | \usage{ 7 | categ_analysis(data, input, target) 8 | } 9 | \arguments{ 10 | \item{data}{input data containing the variable to describe} 11 | 12 | \item{input}{string input variable (if empty, it runs for all categorical variable), it can take a single character value or a character vector.} 13 | 14 | \item{target}{string target variable. 
Binary or two class is only supported by now.} 15 | } 16 | \value{ 17 | if input has 1 variable, it retrurns a data frame indicating all the metrics, otherwise prints in console all variable results. 18 | } 19 | \description{ 20 | Retrieves a complete summary of the grouped input variable against the target variable. Type of target variable must be binary for now. A positive case will be the less representative one. It returns the total positive cases (sum_target)); pecentage of total positive cases (perc_target) that fell in that category (this column sums 1); likelihood or mean of positive cases (mean_target) measured by the total positive cases over total cases in that category; quantity of rows of that category (q_rows) and in percentage (perc_rows) -this column sums 1. 21 | } 22 | \examples{ 23 | categ_analysis(data_country, "country", "has_flu") 24 | } 25 | -------------------------------------------------------------------------------- /man/compare_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exploratory_data_analysis.R 3 | \name{compare_df} 4 | \alias{compare_df} 5 | \title{Compare two data frames by keys} 6 | \usage{ 7 | compare_df(dfcomp_x, dfcomp_y, keys_x, keys_y = NA, compare_values = FALSE) 8 | } 9 | \arguments{ 10 | \item{dfcomp_x}{first data frame to compare} 11 | 12 | \item{dfcomp_y}{second data frame to compare} 13 | 14 | \item{keys_x}{keys of the first dataframe} 15 | 16 | \item{keys_y}{(optional) keys of the second dataframe, if missing both data frames will be compared with the keys_x} 17 | 18 | \item{compare_values}{(optional) if TRUE it will not only compare keys, but also will check if the values of non-key matching columns have the same values} 19 | } 20 | \value{ 21 | Differences and coincident values 22 | } 23 | \description{ 24 | Obtain differences between two data frames 25 | } 26 | \examples{ 27 | data(heart_disease) 28 | a=heart_disease 29 | b=heart_disease 30 | a=subset(a, age >45) 31 | b=subset(b, age <50) 32 | b$gender='male' 33 | b$chest_pain=ifelse(b$chest_pain ==3, 4, b$chest_pain) 34 | res=compare_df(a, b, c('age', 'gender')) 35 | # Print the keys that didn't match 36 | res 37 | # Accessing the keys not present in the first data frame 38 | res[[1]]$rows_not_in_X 39 | # Accessing the keys not present in the second data frame 40 | res[[1]]$rows_not_in_Y 41 | # Accessing the keys which coincide completely 42 | res[[1]]$coincident 43 | # Accessing the rows whose values did not coincide 44 | res[[1]]$different_values 45 | } 46 | -------------------------------------------------------------------------------- /man/concatenate_n_vars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/discretize.R 3 | \name{concatenate_n_vars} 4 | \alias{concatenate_n_vars} 5 | \title{Concatenate 'N' variables} 6 | \usage{ 7 | concatenate_n_vars(data, vars) 8 | } 9 | \arguments{ 10 | \item{data}{data frame containing the two variables to concatenate} 11 | 12 | \item{vars}{character vector containing all variables to concatenate} 13 | } 14 | \value{ 15 | vector containing the concatenated values for the given variables 16 | } 17 | \description{ 18 | Concatenate 'N' variables using the char pipe: <|>. 
19 | This function is used when there is the need of measuring the mutual information and/or the information 20 | gain between 'N' input variables an against a target variable. This function makes sense when it is used based on 21 | categorical data. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | new_variable=concatenate_n_vars(mtcars, c("cyl", "disp")) 26 | # Checking new variable 27 | head(new_variable) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/convert_df_to_categoric.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/discretize.R 3 | \name{convert_df_to_categoric} 4 | \alias{convert_df_to_categoric} 5 | \title{Convert every column in a data frame to character} 6 | \usage{ 7 | convert_df_to_categoric(data, n_bins) 8 | } 9 | \arguments{ 10 | \item{data}{input data frame to discretize} 11 | 12 | \item{n_bins}{number of bins/segments for each variable} 13 | } 14 | \value{ 15 | data frame containing all variables as character 16 | } 17 | \description{ 18 | It converts all the variables present in 'data' to character. Criteria conversion is based on 19 | two functions, \code{\link{discretize_get_bins}} plus \code{\link{discretize_df}}, which will discretize 20 | all the numerical variables based on equal frequency criteria, with the number of bins equal to 'n_bins'. 21 | This only applies for numerical variables which unique valuesare more than 'n_bins' parameter. 22 | After this step, it may happen that variables remain non-character, so these variables will be converting 23 | directly into character. 24 | } 25 | \examples{ 26 | \dontrun{ 27 | # before 28 | df_status(heart_disease) 29 | 30 | # after 31 | new_df=convert_df_to_categoric(data=heart_disease, n_bins=5) 32 | df_status(new_df) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /man/coord_plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models_lib.R 3 | \name{coord_plot} 4 | \alias{coord_plot} 5 | \title{Coordinate plot} 6 | \usage{ 7 | coord_plot(data, group_var, group_func = mean, print_table = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{input data source} 11 | 12 | \item{group_var}{variable to make the group by} 13 | 14 | \item{group_func}{the data type of this parameter is a function, not an string, this is the function to be used in the group by, the default value is: mean} 15 | 16 | \item{print_table}{False by default, if true it retrieves the mean table used to generate the plot.} 17 | } 18 | \value{ 19 | coordinate plot, if print_table=T it also prints a table with the average per column plus the average of the whole column 20 | } 21 | \description{ 22 | Calculate the means (or other function defined in 'group_func' parameter) per group to analyze how each segment behave. It scales each variable mean inti the 0 to 1 range to easily profile the groups according to its mean. It also calculate the mean regardless the grouping. This function is also useful when you want to profile cluster results in terms of its means. 
23 | } 24 | \examples{ 25 | \dontrun{ 26 | # calculating the differences based on function 'mean' 27 | coord_plot(data=mtcars, group_var="cyl") 28 | # printing the table used to generate the coord_plot 29 | coord_plot(data=mtcars, group_var="cyl", print_table=TRUE) 30 | # printing the table used to generate the coord_plot 31 | coord_plot(data=mtcars, group_var="cyl", group_func=median, print_table=TRUE) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/correlation_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exploratory_data_analysis.R 3 | \name{correlation_table} 4 | \alias{correlation_table} 5 | \title{Get correlation against target variable} 6 | \usage{ 7 | correlation_table(data, target) 8 | } 9 | \arguments{ 10 | \item{data}{data frame} 11 | 12 | \item{target}{string variable to predict} 13 | } 14 | \value{ 15 | Correlation index for all data input variable 16 | } 17 | \description{ 18 | Obtain correlation table for all variables against target variable. Only numeric variables are analyzed (factor/character are skippted automatically). 19 | } 20 | \examples{ 21 | correlation_table(data=heart_disease, target="has_heart_disease") 22 | } 23 | -------------------------------------------------------------------------------- /man/cross_plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cross_plot.R 3 | \name{cross_plot} 4 | \alias{cross_plot} 5 | \title{Cross-plotting input variable vs. target variable} 6 | \usage{ 7 | cross_plot(data, input, target, path_out, auto_binning, plot_type = "both") 8 | } 9 | \arguments{ 10 | \item{data}{data frame source} 11 | 12 | \item{input}{input variable name (if empty, it runs for all numeric variable), it can take a single character value or a character vector.} 13 | 14 | \item{target}{variable name to predict} 15 | 16 | \item{path_out}{path directory, if it has a value the plot is saved} 17 | 18 | \item{auto_binning}{indicates the automatic binning of input variable based on equal frequency (function 'equal_freq'), default value=TRUE} 19 | 20 | \item{plot_type}{indicates if the output is the 'percentual' plot, the 'quantity' or 'both' (default).} 21 | } 22 | \value{ 23 | cross plot 24 | } 25 | \description{ 26 | The cross_plot shows how the input variable is correlated with the target variable, getting the likelihood rates for each input's bin/bucket . 
27 | } 28 | \examples{ 29 | \dontrun{ 30 | ## Example 1: 31 | cross_plot(data=heart_disease, input="chest_pain", target="has_heart_disease") 32 | 33 | ## Example 2: Disabling auto_binning: 34 | cross_plot(data=heart_disease, input="oldpeak", 35 | target="has_heart_disease", auto_binning=FALSE) 36 | 37 | ## Example 3: Saving the plot into a folder: 38 | cross_plot(data=heart_disease, input="oldpeak", 39 | target="has_heart_disease", path_out = "my_folder") 40 | 41 | ## Example 4: Running with multiple input variables at the same time: 42 | cross_plot(data=heart_disease, input=c("age", "oldpeak", "max_heart_rate"), 43 | target="has_heart_disease") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /man/data_country.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_country} 5 | \alias{data_country} 6 | \title{People with flu data} 7 | \format{ 8 | A data frame with 910 rows and 3 variables 9 | } 10 | \usage{ 11 | data_country 12 | } 13 | \description{ 14 | Each row represents a person from different countries indicating if he or she has or not flu. 15 | Colmuns 16 | person: unique id 17 | country: country of the person, 70 different countries 18 | has_flu: character variable with values "yes" or "no" indicating if the person has flu 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/data_golf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data_golf} 5 | \alias{data_golf} 6 | \title{Play golf} 7 | \format{ 8 | A data frame with 14 rows and 3 variables 9 | } 10 | \usage{ 11 | data_golf 12 | } 13 | \description{ 14 | This well known small data frame containst 14 cases indicating wheter or not play golf based on wheather conditions. Target variable: 'play_golf.' 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/data_integrity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data_integrity.R 3 | \name{data_integrity} 4 | \alias{data_integrity} 5 | \title{Data integrity} 6 | \usage{ 7 | data_integrity(data, MAX_UNIQUE = 35) 8 | } 9 | \arguments{ 10 | \item{data}{data frame or a single vector} 11 | 12 | \item{MAX_UNIQUE}{max unique threshold to flag a categorical variable as a high cardinality one. Normally above 35 values it is needed to reduce the number of different values.} 13 | } 14 | \value{ 15 | An 'integrity' object. 16 | } 17 | \description{ 18 | A handy function to return different vectors of variable names aimed to quickly filter NA, categorical (factor / character), numerical and other types (boolean, date, posix). 19 | It also returns a vector of variables which have high cardinality. 
20 | It returns an 'integrity' object, which has: 'status_now' (comes from status function), and 'results' list, following elements can be found: 21 | 22 | vars_cat: Vector containing the categorical variables names (factor or character) 23 | 24 | vars_num: Vector containing the numerical variables names 25 | 26 | vars_char: Vector containing the character variables names 27 | 28 | vars_factor: Vector containing the factor variables names 29 | 30 | vars_other: Vector containing the other variables names (date time, posix and boolean) 31 | 32 | vars_num_with_NA: Summary table for numerical variables with NA 33 | 34 | vars_cat_with_NA: Summary table for categorical variables with NA 35 | 36 | vars_cat_high_card: Summary table for high cardinality variables (where thershold = MAX_UNIQUE parameter) 37 | 38 | vars_one_value: Vector containing the variables names with 1 unique different value 39 | 40 | Explore the NA and high cardinality variables by doing summary(integrity_object), or a full summary by doing print(integrity_object) 41 | } 42 | \examples{ 43 | # Example 1: 44 | data_integrity(heart_disease) 45 | # Example 2: 46 | # changing the default minimum threshold to flag a variable as high cardiniality 47 | data_integrity(data=data_country, MAX_UNIQUE=50) 48 | } 49 | -------------------------------------------------------------------------------- /man/data_integrity_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data_integrity.R 3 | \name{data_integrity_model} 4 | \alias{data_integrity_model} 5 | \title{Check data integrity model} 6 | \usage{ 7 | data_integrity_model(data, model_name, MAX_UNIQUE = 35) 8 | } 9 | \arguments{ 10 | \item{data}{data frame or a single vector} 11 | 12 | \item{model_name}{model name, you can check all the available models by printing `metadata_models` data frame.} 13 | 14 | \item{MAX_UNIQUE}{max unique threshold to flag a categorical variable as a high cardinality one. Normally above 35 values it is needed to reduce the number of different values. 15 | # Example 1: 16 | data_integrity_model(data=heart_disease, model_name="pca") 17 | # Example 2: 18 | # changing the default minimum threshold to flag a variable as high cardiniality 19 | data_integrity_model(data=iris, model_name="xgboost", MAX_UNIQUE=50)} 20 | } 21 | \value{ 22 | an `integritymodel` object 23 | } 24 | \description{ 25 | Given a data frame, we need to create models (xgboost, random forest, regression, etc). Each one of them has its constraints regarding data types. Many errors appear when we are creating models just because of data format. 26 | This function returns, given a certain model, which are the constraints that the data is not satisfying. This way we can anticipate and correct errors before we call for model creation. This function is quite related to \code{\link{data_integrity}}. 
27 | } 28 | -------------------------------------------------------------------------------- /man/desc_groups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models_lib.R 3 | \name{desc_groups} 4 | \alias{desc_groups} 5 | \title{Profiling categorical variable} 6 | \usage{ 7 | desc_groups(data, group_var, group_func = mean, add_all_data_row = T) 8 | } 9 | \arguments{ 10 | \item{data}{input data source} 11 | 12 | \item{group_var}{variable to make the group by} 13 | 14 | \item{group_func}{the data type of this parameter is a function, not an string, this is the function to be used in the group by, the default value is: mean} 15 | 16 | \item{add_all_data_row}{flag indicating if final data contains the row: 'All_Data', which is the function applied regardless the grouping. Useful to compare with the rest of the values.} 17 | } 18 | \value{ 19 | grouped data frame 20 | } 21 | \description{ 22 | Calculate the means (or other function) per group to analyze how each segment behave. It scales each variable mean inti the 0 to 1 range to easily profile the groups according to its mean. It also calculate the mean regardless the grouping. This function is also useful when you want to profile cluster results in terms of its means. It automatically adds a row representing the sumarization of the column regardless the group_var categories, this is useful to compare each segement with the whole population. It will exclude all factor/character variables. 23 | } 24 | \examples{ 25 | # default grouping function: mean 26 | desc_groups(data=mtcars, group_var="cyl") 27 | 28 | # using the median as the grouping function 29 | desc_groups(data=mtcars, group_var="cyl", group_func=median) 30 | 31 | # using the max as the grouping function 32 | desc_groups(data=mtcars, group_var="gear", group_func=max) 33 | } 34 | -------------------------------------------------------------------------------- /man/desc_groups_rank.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models_lib.R 3 | \name{desc_groups_rank} 4 | \alias{desc_groups_rank} 5 | \title{Profiling categorical variable (rank)} 6 | \usage{ 7 | desc_groups_rank(data, group_var, group_func = mean) 8 | } 9 | \arguments{ 10 | \item{data}{input data source} 11 | 12 | \item{group_var}{variable to make the group by} 13 | 14 | \item{group_func}{the data type of this parameter is a function, not an string, this is the function to be used in the group by, the default value is: mean} 15 | } 16 | \value{ 17 | grouped data frame, showing the rank instead of the absolute values/ 18 | } 19 | \description{ 20 | Similar to 'desc_groups' function, this one computes the rank of each value in order to quickly know what is the value in each segment that has the highest value (rank=1). 1 represent the highest number. It will exclude all factor/character variables. 
21 | } 22 | \examples{ 23 | # default grouping function: mean 24 | desc_groups_rank(data=mtcars, group_var="gear") 25 | 26 | # using the median as the grouping function 27 | desc_groups(data=mtcars, group_var="cyl", group_func=median) 28 | 29 | # using the max as the grouping function 30 | desc_groups_rank(data=mtcars, group_var="gear", group_func=max) 31 | } 32 | -------------------------------------------------------------------------------- /man/df_status.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exploratory_data_analysis.R 3 | \name{df_status} 4 | \alias{df_status} 5 | \title{Get a summary for the given data frame (o vector).} 6 | \usage{ 7 | df_status(data, print_results) 8 | } 9 | \arguments{ 10 | \item{data}{data frame or a single vector} 11 | 12 | \item{print_results}{if FALSE then there is not a print in the console, TRUE by default.} 13 | } 14 | \value{ 15 | Metrics data frame 16 | } 17 | \description{ 18 | For each variable it returns: Quantity and percentage of zeros (q_zeros and p_zeros respectevly). Same metrics for NA values (q_NA/p_na), and infinite values (q_inf/p_inf). Last two columns indicates data type and quantity of unique values. 19 | This function print and return the results. 20 | } 21 | \examples{ 22 | df_status(heart_disease) 23 | } 24 | -------------------------------------------------------------------------------- /man/discretize_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/discretize.R 3 | \name{discretize_df} 4 | \alias{discretize_df} 5 | \title{Discretize a data frame} 6 | \usage{ 7 | discretize_df(data, data_bins, stringsAsFactors = T) 8 | } 9 | \arguments{ 10 | \item{data}{Input data frame} 11 | 12 | \item{data_bins}{data frame generated by 'discretize_get_bins' function. It contains the variable name and the 13 | thresholds for each bin, or segment.} 14 | 15 | \item{stringsAsFactors}{Boolean variable which indicates if the discretization result is character or factor. 16 | When TRUE, the segments are ordered. TRUE by default.} 17 | } 18 | \value{ 19 | Data frame with the transformed variables 20 | } 21 | \description{ 22 | Converts all numerical variables into factor or character, depending on 'stringsAsFactors' parameter, 23 | based on equal frequency criteria. The thresholds for each segment in each variable are generated based on the 24 | output of \code{\link{discretize_get_bins}} function, which returns a data frame 25 | containing the threshold for each variable. This result is must be the 'data_bins' parameter input. 26 | Important to note that the returned data frame contains the non-transformed variables plus the transformed ones. 27 | More info about converting numerical into categorical variables 28 | can be found at: \url{https://livebook.datascienceheroes.com/data-preparation.html#data_types} 29 | } 30 | \examples{ 31 | \dontrun{ 32 | # Getting the bins thresholds for each. If input is missing, will run for all numerical variables. 
33 | d_bins=discretize_get_bins(data=heart_disease, 34 | input=c("resting_blood_pressure", "oldpeak"), n_bins=5) 35 | 36 | # Now it can be applied on the same data frame, 37 | # or in a new one (for example in a predictive model that change data over time) 38 | heart_disease_discretized=discretize_df(data=heart_disease, data_bins=d_bins, stringsAsFactors=T) 39 | 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /man/discretize_get_bins.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/discretize.R 3 | \name{discretize_get_bins} 4 | \alias{discretize_get_bins} 5 | \title{Get the data frame thresholds for discretization} 6 | \usage{ 7 | discretize_get_bins(data, n_bins = 5, input = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{Data frame source} 11 | 12 | \item{n_bins}{The number of desired bins (or segments) that each variable will have.} 13 | 14 | \item{input}{Vector of string containing all the variables that will be processed. 15 | If empty it will run for all numerical variables that match the following condition, the number of unique values 16 | must be higher than the ones defined at 'n_bins' parameter. NAs values are automatically handled by converting 17 | them into another category (more info about it at 18 | \url{https://livebook.datascienceheroes.com/data-preparation.html#treating-missing-values-in-numerical-variables}). 19 | This function must be used with \link{discretize_df}. 20 | If it is needed a different number of bins per variable, then the function must be called more than once.} 21 | } 22 | \value{ 23 | Data frame containing the thresholds or cuts to bin every variable 24 | } 25 | \description{ 26 | It takes a data frame and returns another data frame indicating the threshold for each bin (or segment) 27 | in order to discretize the variable. 28 | } 29 | \examples{ 30 | \dontrun{ 31 | # Getting the bins thresholds for each. If input is missing, will run for all numerical variables. 
32 | d_bins=discretize_get_bins(data=heart_disease, 33 | input=c("resting_blood_pressure", "oldpeak"), 34 | n_bins=5) 35 | 36 | # Now it can be applied on the same data frame, 37 | # or in a new one (for example in a predictive model that change data over time) 38 | heart_disease_discretized=discretize_df(data=heart_disease, data_bins=d_bins, stringsAsFactors=T) 39 | 40 | # Checking results 41 | df_status(heart_disease_discretized) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /man/discretize_rgr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/discretize.R 3 | \name{discretize_rgr} 4 | \alias{discretize_rgr} 5 | \title{Variable discretization by gain ratio maximization} 6 | \usage{ 7 | discretize_rgr(input, target, min_perc_bins = 0.1, max_n_bins = 5) 8 | } 9 | \arguments{ 10 | \item{input}{numeric input vector to discretize} 11 | 12 | \item{target}{character or factor multi-calss target variable} 13 | 14 | \item{min_perc_bins}{minimum percetange of rows for each split or segment (controls the sample size), 0,1 (or 10 percent) as default} 15 | 16 | \item{max_n_bins}{maximum number of bins or segments to split the input variable, 5 bins as default} 17 | } 18 | \value{ 19 | discretized variable (factor) 20 | } 21 | \description{ 22 | Discretize numeric variable by maximizing the gain ratio 23 | between each bucket and the target variable. 24 | } 25 | \examples{ 26 | \dontrun{ 27 | library(funModeling) 28 | data=heart_disease 29 | input=data$oldpeak 30 | target=as.character(data$has_heart_disease) 31 | 32 | input2=discretize_rgr(input, target) 33 | 34 | # checking: 35 | summary(input2) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/entropy_2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/information_theory.R 3 | \name{entropy_2} 4 | \alias{entropy_2} 5 | \title{Computes the entropy between two variables} 6 | \usage{ 7 | entropy_2(input, target) 8 | } 9 | \arguments{ 10 | \item{input}{numeric/character vector} 11 | 12 | \item{target}{numeric/character vector} 13 | } 14 | \value{ 15 | Entropy measured in bits 16 | } 17 | \description{ 18 | It calculates the entropy between two categorical variables using log2. 19 | This log2 is mentioned in most of the Claude Shannon bibliography. 20 | Input/target can be numeric or character. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | # Measuring entropy between input and target variable 25 | entropy_2(input=data_golf$outlook, target=data_golf$play_golf) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/equal_freq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cross_plot.R 3 | \name{equal_freq} 4 | \alias{equal_freq} 5 | \title{Equal frequency binning} 6 | \usage{ 7 | equal_freq(var, n_bins) 8 | } 9 | \arguments{ 10 | \item{var}{input variable} 11 | 12 | \item{n_bins}{number of bins to split 'var' by equal frequency, if it not possible to calculate for the desired bins, it returns the closest number} 13 | } 14 | \value{ 15 | The binned variable. 
16 | } 17 | \description{ 18 | Equal frequency tries to put the same quantity of cases per bin when possible. It's a wrapper of the cut2 function from the Hmisc package. 19 | } 20 | \examples{ 21 | ## Example 1 22 | summary(heart_disease$age) 23 | age_2=equal_freq(var=heart_disease$age, n_bins = 10) 24 | summary(age_2) 25 | 26 | ## Example 2 27 | age_3=equal_freq(var=heart_disease$age, n_bins = 5) 28 | summary(age_3) 29 | } 30 | -------------------------------------------------------------------------------- /man/export_plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/common_lib.R 3 | \name{export_plot} 4 | \alias{export_plot} 5 | \title{Export plot to jpeg file} 6 | \usage{ 7 | export_plot(object_plot, path_out, file_name) 8 | } 9 | \arguments{ 10 | \item{object_plot}{Plot object to export (e.g. a ggplot2 object)} 11 | 12 | \item{path_out}{path directory to export the output; if it has a value, the plot is saved, 13 | and if the directory doesn't exist, it will try to create it. To save in the current directory, the path must be a dot: "."} 14 | 15 | \item{file_name}{output file name} 16 | } 17 | \value{ 18 | none 19 | } 20 | \description{ 21 | Exports 'object_plot' to a jpeg file under the name 'file_name' in the directory 'path_out'. 22 | } 23 | -------------------------------------------------------------------------------- /man/fibonacci.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/common_lib.R 3 | \name{fibonacci} 4 | \alias{fibonacci} 5 | \title{Fibonacci series} 6 | \usage{ 7 | fibonacci(length, remove_first = F) 8 | } 9 | \arguments{ 10 | \item{length}{quantity of numbers of the series to retrieve} 11 | 12 | \item{remove_first}{removes the first value of the series, because the first 2 elements are the same (number=1). FALSE by default.} 13 | } 14 | \value{ 15 | vector 16 | } 17 | \description{ 18 | It retrieves a vector containing the first N numbers of the Fibonacci series, where N is given by the 'length' parameter.
19 | } 20 | \examples{ 21 | # Get the first 4 elements of Fibonacci series 22 | fibonacci(4) 23 | } 24 | -------------------------------------------------------------------------------- /man/figures/README-boxplot_analysis-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-boxplot_analysis-1.png -------------------------------------------------------------------------------- /man/figures/README-boxplot_analysis-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-boxplot_analysis-2.png -------------------------------------------------------------------------------- /man/figures/README-density_histogram-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-density_histogram-1.png -------------------------------------------------------------------------------- /man/figures/README-distribution1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-distribution1-1.png -------------------------------------------------------------------------------- /man/figures/README-distribution1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-distribution1-2.png -------------------------------------------------------------------------------- /man/figures/README-performance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-performance-1.png -------------------------------------------------------------------------------- /man/figures/README-profiling1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-profiling1-1.png -------------------------------------------------------------------------------- /man/figures/README-profiling1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-profiling1-2.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/man/figures/README-unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /man/freq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exploratory_data_analysis.R 3 | \name{freq} 4 | \alias{freq} 5 | \title{Frequency table for categorical variables} 6 | \usage{ 7 | freq(data, 
input = NA, plot = TRUE, na.rm = FALSE, path_out) 8 | } 9 | \arguments{ 10 | \item{data}{input data containing the variable to describe} 11 | 12 | \item{input}{string input variable (if empty, it runs for all factor/character variables); it can take a single character value or a character vector.} 13 | 14 | \item{plot}{flag indicating if the plot is desired, TRUE by default} 15 | 16 | \item{na.rm}{flag indicating if NA values should be excluded from the analysis, FALSE by default (NA values are shown)} 17 | 18 | \item{path_out}{path directory, if it has a value the plot is saved} 19 | } 20 | \value{ 21 | frequency table containing the count, percentage and cumulative percentage for each category of 'input' 22 | } 23 | \description{ 24 | Retrieves the frequency and percentage for 'input' 25 | } 26 | \examples{ 27 | freq(data=heart_disease$thal) 28 | freq(data=heart_disease, input = c('thal','chest_pain')) 29 | } 30 | -------------------------------------------------------------------------------- /man/funModeling-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/funModeling.R 3 | \docType{package} 4 | \name{funModeling-package} 5 | \alias{funModeling} 6 | \alias{funModeling-package} 7 | \title{funModeling: Exploratory data analysis, data preparation and model performance} 8 | \description{ 9 | funModeling is intimately related to the Data Science Live Book -Open Source- (2017) in the sense that most 10 | of its functionality is used to explain different topics addressed by the book. 11 | } 12 | \details{ 13 | To start using funModeling you can begin with the vignette: 14 | `browseVignettes(package = "funModeling")` 15 | 16 | Or you can read the Data Science Live Book, fully accessible at: \url{https://livebook.datascienceheroes.com} 17 | } 18 | \seealso{ 19 | Useful links: 20 | \itemize{ 21 | \item \url{https://livebook.datascienceheroes.com} 22 | \item Report bugs at \url{https://github.com/pablo14/funModeling/issues} 23 | } 24 | 25 | } 26 | \author{ 27 | \strong{Maintainer}: Pablo Casas \email{pcasas.biz@gmail.com} 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/gain_lift.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models_lib.R 3 | \name{gain_lift} 4 | \alias{gain_lift} 5 | \title{Generates lift and cumulative gain performance table and plot} 6 | \usage{ 7 | gain_lift(data, score, target, q_segments = 10) 8 | } 9 | \arguments{ 10 | \item{data}{input data source} 11 | 12 | \item{score}{the variable which contains the score number, or likelihood of being positive class} 13 | 14 | \item{target}{target binary variable indicating class label} 15 | 16 | \item{q_segments}{quantity of segments to split score variable, valid values: 5, 10 or 20} 17 | } 18 | \value{ 19 | lift/gain table; the 'gain' column indicates how many positive cases are captured if the cut point to define the 20 | positive class is set to the column "Score Point" 21 | } 22 | \description{ 23 | It retrieves the cumulative positive rate -gain curve- and the lift chart & plot when the score is divided 24 | into 5, 10 or 20 segments. Both metrics give a quality measure of how well the model predicts. 25 | Higher values at the beginning of the population imply a better model.
More info at: 26 | \url{https://livebook.datascienceheroes.com/model-performance.html#scoring_data} 27 | } 28 | \examples{ 29 | fit_glm=glm(has_heart_disease ~ age + oldpeak, data=heart_disease, family = binomial) 30 | heart_disease$score=predict(fit_glm, newdata=heart_disease, type='response') 31 | gain_lift(data=heart_disease, score='score', target='has_heart_disease') 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/gain_ratio.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/information_theory.R 3 | \name{gain_ratio} 4 | \alias{gain_ratio} 5 | \title{Gain ratio} 6 | \usage{ 7 | gain_ratio(input, target) 8 | } 9 | \arguments{ 10 | \item{input}{numeric/character vector} 11 | 12 | \item{target}{numeric/character vector} 13 | } 14 | \value{ 15 | gain ratio 16 | } 17 | \description{ 18 | Computes the gain ratio between an 'input' and 'target' variable (using log2). Similar to information gain but less sensitive to high-cardinality variables. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | gain_ratio(input=data_golf$outlook, target=data_golf$play_golf) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/get_sample.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models_lib.R 3 | \name{get_sample} 4 | \alias{get_sample} 5 | \title{Sampling training and test data} 6 | \usage{ 7 | get_sample(data, percentage_tr_rows = 0.8, seed = 987) 8 | } 9 | \arguments{ 10 | \item{data}{input data source} 11 | 12 | \item{percentage_tr_rows}{percentage of training rows, ranging from 0.1 to 0.99, default value=0.8 (80 percent of training data)} 13 | 14 | \item{seed}{seed used to generate the sample randomly, default value=987} 15 | } 16 | \value{ 17 | TRUE/FALSE vector of the same length as 'data'. TRUE indicates that the row position belongs to the training data 18 | } 19 | \description{ 20 | Splits the input data into training and test sets, always retrieving the same sample by setting the seed. 21 | } 22 | \examples{ 23 | # Training and test data. Percentage of training cases default value=80\%. 24 | index_sample=get_sample(data=heart_disease, percentage_tr_rows=0.8) 25 | # Generating the samples 26 | data_tr=heart_disease[index_sample,] 27 | data_ts=heart_disease[!index_sample,] 28 | } 29 | -------------------------------------------------------------------------------- /man/hampel_outlier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/outliers.R 3 | \name{hampel_outlier} 4 | \alias{hampel_outlier} 5 | \title{Hampel Outlier Threshold} 6 | \usage{ 7 | hampel_outlier(input, k_mad_value = 3) 8 | } 9 | \arguments{ 10 | \item{input}{Numeric variable vector} 11 | 12 | \item{k_mad_value}{'K' multiplier for the median absolute deviation. The higher the value, the wider the thresholds and the fewer outliers flagged. Default value=3 (a standard value)} 13 | } 14 | \value{ 15 | A two-item vector, the first value represents the bottom threshold, while the second one is the top threshold 16 | } 17 | \description{ 18 | Retrieves the bottom and top boundaries to flag outliers or extreme values, according to the Hampel method.
This technique takes into account the median and the MAD value, which is a robust measure of the variability of a univariate sample of quantitative data (Wikipedia). Similar to standard deviation but less sensitive to outliers. 19 | This function is used by the 'prep_outliers' function. All `NA` values are automatically excluded. More information at: \url{https://livebook.datascienceheroes.com/data-preparation.html#how_to_deal_with_outliers_in_r}. 20 | } 21 | \examples{ 22 | \dontrun{ 23 | hampel_outlier(heart_disease$age) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/heart_disease.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{heart_disease} 5 | \alias{heart_disease} 6 | \title{Heart Disease Data} 7 | \format{ 8 | A data frame with 303 rows and 16 variables: 9 | \itemize{ 10 | 11 | \url{https://archive.ics.uci.edu/ml/datasets/Heart+Disease} 12 | } 13 | } 14 | \usage{ 15 | heart_disease 16 | } 17 | \description{ 18 | Variables related to a patient clinical trial. The variable to predict is `has_heart_disease`. 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/infor_magic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/information_theory.R 3 | \name{infor_magic} 4 | \alias{infor_magic} 5 | \title{Computes several information theory metrics between two vectors} 6 | \usage{ 7 | infor_magic(input, target) 8 | } 9 | \arguments{ 10 | \item{input}{vector to be evaluated against the variable defined in the 'target' parameter} 11 | 12 | \item{target}{vector containing the output variable.} 13 | } 14 | \value{ 15 | Matrix of 1 row and 4 columns, where each column represents one of the mentioned metrics 16 | } 17 | \description{ 18 | It retrieves the same as \code{\link{var_rank_info}} but receiving two vectors. 19 | Metrics are: entropy (en), mutual information (mi), information gain (ig) and gain ratio (gr). 20 | } 21 | \examples{ 22 | \dontrun{ 23 | infor_magic(data_golf$outlook, data_golf$play_golf) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/information_gain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/information_theory.R 3 | \name{information_gain} 4 | \alias{information_gain} 5 | \title{Information gain} 6 | \usage{ 7 | information_gain(input, target) 8 | } 9 | \arguments{ 10 | \item{input}{numeric/character vector} 11 | 12 | \item{target}{numeric/character vector} 13 | } 14 | \value{ 15 | information gain 16 | } 17 | \description{ 18 | Computes the information gain between an 'input' and 'target' variable (using log2). In general terms, the higher the value, the more predictive the input is.
19 | } 20 | \examples{ 21 | \dontrun{ 22 | information_gain(input=data_golf$outlook, target=data_golf$play_golf) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/metadata_models.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{metadata_models} 5 | \alias{metadata_models} 6 | \title{Metadata models data integrity} 7 | \format{ 8 | Tibble 9 | } 10 | \usage{ 11 | metadata_models 12 | } 13 | \description{ 14 | Metadata models data integrity 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/plot_num.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exploratory_data_analysis.R 3 | \name{plot_num} 4 | \alias{plot_num} 5 | \title{Plotting numerical data} 6 | \usage{ 7 | plot_num(data, bins = 10, path_out = NA) 8 | } 9 | \arguments{ 10 | \item{data}{data frame} 11 | 12 | \item{bins}{number of bars (bins) to plot each histogram, 10 by default} 13 | 14 | \item{path_out}{path directory to export the output; if it has a value, the plot is saved, 15 | and if the directory doesn't exist, it will try to create it. To save in the current directory, the path must be a dot: "."} 16 | } 17 | \value{ 18 | plot containing all numerical variables 19 | } 20 | \description{ 21 | Retrieves one plot containing all the histograms for numerical variables. NA values will not be displayed. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | plot_num(mtcars) 26 | # changing the bins parameter and exporting the plot 27 | plot_num(data=mtcars, bins=5, path_out="my_folder") 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/plotar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/target_profiling.R 3 | \name{plotar} 4 | \alias{plotar} 5 | \title{Correlation plots} 6 | \usage{ 7 | plotar(data, input, target, plot_type, path_out) 8 | } 9 | \arguments{ 10 | \item{data}{data frame source} 11 | 12 | \item{input}{string input variable (if empty, it runs for all numeric variables); it can take a single character value or a character vector.} 13 | 14 | \item{target}{string of the variable to predict, it supports binary or multinomial values.} 15 | 16 | \item{plot_type}{Indicates the type of plot to retrieve, available values: "boxplot" or "histdens".} 17 | 18 | \item{path_out}{path directory, if it has a value the plot is saved. To save in the current directory, the path must be a dot: "."} 19 | } 20 | \value{ 21 | Single or multiple plots specified by the 'plot_type' parameter 22 | } 23 | \description{ 24 | Visual correlation analysis.
Plots different graphs in order to expose the relationship of any numeric variable against the target variable. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | ## It runs for all numeric variables automatically 29 | plotar(data=heart_disease, target="has_heart_disease", plot_type="histdens") 30 | 31 | plotar(heart_disease, input = 'age', target = 'chest_pain', plot_type = "boxplot") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/prep_outliers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/outliers.R 3 | \name{prep_outliers} 4 | \alias{prep_outliers} 5 | \title{Outliers Data Preparation} 6 | \usage{ 7 | prep_outliers( 8 | data, 9 | input = NA, 10 | type = NA, 11 | method = NA, 12 | bottom_percent = NA, 13 | top_percent = NA, 14 | k_mad_value = NA 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data frame or a single vector. If it's a data frame, the function returns a data frame, otherwise it returns a vector.} 19 | 20 | \item{input}{string input variable (if empty, it runs for all numeric variables).} 21 | 22 | \item{type}{can be 'stop' or 'set_na'; in the first case all values falling outside the thresholds will be converted to the threshold value, in the other case all of these values will be set to NA.} 23 | 24 | \item{method}{indicates the method used to flag the outliers, it can be: "bottom_top", "tukey" or "hampel".} 25 | 26 | \item{bottom_percent}{value from 0 to 1, represents the lowest X percentage of values to treat. Valid only when method="bottom_top".} 27 | 28 | \item{top_percent}{value from 0 to 1, represents the highest X percentage of values to treat. Valid only when method="bottom_top".} 29 | 30 | \item{k_mad_value}{only used when method='hampel', 3 by default, which might seem quite restrictive. Set a higher number to flag fewer outliers.} 31 | } 32 | \value{ 33 | A data frame with the desired outlier transformation 34 | } 35 | \description{ 36 | Deals with outliers by setting them to NA or by 'stopping' them at a certain threshold. 37 | There are three supported methods to flag the values as outliers: "bottom_top", "tukey" and "hampel". 38 | The parameters 'top_percent' and/or 'bottom_percent' are used only when method="bottom_top". 39 | 40 | For a full reference please check the official documentation at: \url{https://livebook.datascienceheroes.com/data-preparation.html#treatment_outliers} 41 | Setting NA is recommended when doing statistical analysis, parameter: type='set_na'. 42 | Stopping is recommended when creating a predictive model without biasing the result due to outliers, parameter: type='stop'. 43 | 44 | The function can take a data frame and return the same data plus the transformations specified in the 'input' parameter. Or it can take a single vector (in the same 'data' parameter), and it returns a vector.
45 | } 46 | \examples{ 47 | \dontrun{ 48 | # Creating a data frame with outliers 49 | set.seed(10) 50 | df=data.frame(var1=rchisq(1000,df = 1), var2=rnorm(1000)) 51 | df=rbind(df, 1135, 2432) # forcing outliers 52 | df$id=as.character(seq(1:1002)) 53 | 54 | # for var1: mean is ~ 4.56, and max 2432 55 | summary(df) 56 | 57 | ######################################################## 58 | ### PREPARING OUTLIERS FOR DESCRIPTIVE STATISTICS 59 | ######################################################## 60 | 61 | #### EXAMPLE 1: Removing the top 1\% for a single variable 62 | # checking the value for the top 1\% of highest values (percentile 0.99), which is ~ 7.05 63 | quantile(df$var1, 0.99) 64 | 65 | # Setting type='set_na' sets to NA the highest values specified by top_percent. 66 | # In this case the 'data' parameter is a single vector, thus it returns a single vector as well. 67 | var1_treated=prep_outliers(data = df$var1, type='set_na', top_percent = 0.01, method = "bottom_top") 68 | 69 | # now the mean (~ 1) is more accurate, and note that the 1st, median and 3rd 70 | # quartiles remain very similar to the original variable. 71 | summary(var1_treated) 72 | 73 | #### EXAMPLE 2: Removing the top and bottom 1\% for the specified input variables. 74 | vars_to_process=c('var1', 'var2') 75 | df_treated3=prep_outliers(data = df, input = vars_to_process, type='set_na', 76 | bottom_percent = 0.01, top_percent = 0.01, method = "bottom_top") 77 | summary(df_treated3) 78 | 79 | ######################################################## 80 | ### PREPARING OUTLIERS FOR PREDICTIVE MODELING 81 | ######################################################## 82 | 83 | data_prep_h=funModeling::prep_outliers(data = heart_disease, 84 | input = c('age','resting_blood_pressure'), 85 | method = "hampel", type='stop') 86 | 87 | # Using the Hampel method to flag outliers: 88 | summary(heart_disease$age);summary(data_prep_h$age) 89 | # the min changed from 29 to 29.31, and the max remains the same at 77 90 | hampel_outlier(heart_disease$age) # checking the thresholds 91 | 92 | data_prep_a=funModeling::prep_outliers(data = heart_disease, 93 | input = c('age','resting_blood_pressure'), 94 | method = "tukey", type='stop') 95 | 96 | max(heart_disease$age);max(data_prep_a$age) 97 | # remains the same (77) because the max threshold for age is 100 98 | tukey_outlier(heart_disease$age) 99 | 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /man/profiling_num.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exploratory_data_analysis.R 3 | \name{profiling_num} 4 | \alias{profiling_num} 5 | \title{Profiling numerical data} 6 | \usage{ 7 | profiling_num(data) 8 | } 9 | \arguments{ 10 | \item{data}{data frame} 11 | } 12 | \value{ 13 | metrics table 14 | } 15 | \description{ 16 | Gets a metrics table with many indicators for all numerical variables, automatically skipping the non-numerical variables. Current metrics are: 17 | mean; std_dev: standard deviation; all the p_XX: percentile at XX number; skewness; kurtosis; iqr: interquartile range; variation_coef: the ratio of sd/mean; range_98: the limits within which 98\% of the values fall; range_80: similar to range_98 but with 80\%. All NA values will be skipped from calculations.
18 | } 19 | \examples{ 20 | profiling_num(mtcars) 21 | } 22 | -------------------------------------------------------------------------------- /man/range01.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data_preparation.R 3 | \name{range01} 4 | \alias{range01} 5 | \title{Transform a variable into the [0-1] range} 6 | \usage{ 7 | range01(var) 8 | } 9 | \arguments{ 10 | \item{var}{numeric input vector} 11 | } 12 | \value{ 13 | vector with the values scaled into the 0 to 1 range 14 | } 15 | \description{ 16 | Scales a variable into the [0-1] range, assigning 0 to the min and 1 to the max of the input variable. All NA values will be removed. 17 | } 18 | \examples{ 19 | range01(mtcars$cyl) 20 | } 21 | -------------------------------------------------------------------------------- /man/status.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data_integrity.R 3 | \name{status} 4 | \alias{status} 5 | \title{Get a summary for the given data frame (or vector).} 6 | \usage{ 7 | status(data) 8 | } 9 | \arguments{ 10 | \item{data}{data frame, tibble or a single vector} 11 | } 12 | \value{ 13 | Tibble with metrics 14 | } 15 | \description{ 16 | For each variable it returns: quantity and percentage of zeros (q_zeros and p_zeros respectively), the same metrics for NA values (q_na/p_na) and for infinite values (q_inf/p_inf). The last two columns indicate the data type and the quantity of unique values. 17 | The 'status' function is the evolution of 'df_status'. The main change is that ratios are now kept as decimals instead of percentages; for example, p_na=0.04 in 'status' corresponds to 4\% in 'df_status'. 18 | This makes it easier to embed in a data process flow and to take actions based on these numbers. 19 | } 20 | \examples{ 21 | status(heart_disease) 22 | } 23 | -------------------------------------------------------------------------------- /man/tukey_outlier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/outliers.R 3 | \name{tukey_outlier} 4 | \alias{tukey_outlier} 5 | \title{Tukey Outlier Threshold} 6 | \usage{ 7 | tukey_outlier(input) 8 | } 9 | \arguments{ 10 | \item{input}{Numeric variable vector} 11 | } 12 | \value{ 13 | A two-item vector, the first value represents the bottom threshold, while the second one is the top threshold 14 | } 15 | \description{ 16 | Retrieves the bottom and top boundaries to flag outliers or extreme values, according to Tukey's test. More info at \url{https://en.wikipedia.org/wiki/Outlier#Tukey.27s_test} 17 | This function is used by the 'prep_outliers' function. All `NA` values are automatically excluded. More information at: \url{https://livebook.datascienceheroes.com/data-preparation.html#how_to_deal_with_outliers_in_r}.
18 | } 19 | \examples{ 20 | \dontrun{ 21 | tukey_outlier(heart_disease$age) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /man/v_compare.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exploratory_data_analysis.R 3 | \name{v_compare} 4 | \alias{v_compare} 5 | \title{Compare two vectors} 6 | \usage{ 7 | v_compare(vector_x, vector_y) 8 | } 9 | \arguments{ 10 | \item{vector_x}{1st vector to compare} 11 | 12 | \item{vector_y}{2nd vector to compare} 13 | } 14 | \value{ 15 | The coincident elements and the elements present in only one of the two vectors 16 | } 17 | \description{ 18 | Obtains the coincident and non-coincident elements between two vectors. 19 | } 20 | \examples{ 21 | v1=c("height","weight","age") 22 | v2=c("height","weight","location","q_visits") 23 | res=v_compare(vector_x=v1, vector_y=v2) 24 | # Print the keys that didn't match 25 | res 26 | # Accessing the keys not present in each vector 27 | } 28 | -------------------------------------------------------------------------------- /man/var_rank_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/information_theory.R 3 | \name{var_rank_info} 4 | \alias{var_rank_info} 5 | \title{Variable importance ranking based on information theory} 6 | \usage{ 7 | var_rank_info(data, target) 8 | } 9 | \arguments{ 10 | \item{data}{input data frame; all the variables will be evaluated against the variable defined in the 'target' parameter} 11 | 12 | \item{target}{string variable name containing the output variable.} 13 | } 14 | \value{ 15 | data frame ordered by the gain ratio metric 16 | } 17 | \description{ 18 | Retrieves a data frame containing several metrics related to information theory. 19 | Metrics are: entropy (en), mutual information (mi), information gain (ig) and gain ratio (gr).
20 | } 21 | \examples{ 22 | \dontrun{ 23 | var_rank_info(data_golf, "play_golf") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /myTESTS/test_freq.R: -------------------------------------------------------------------------------- 1 | library(funModeling) 2 | 3 | ### 1 var 4 | freq(heart_disease$gender) 5 | 6 | ### 2 var 7 | freq(heart_disease, c("gender","thal")) 8 | 9 | ### high card, see layout 10 | freq(data_country$country) 11 | 12 | ### warn message 13 | a=as.factor(1:300) 14 | b=freq(a) 15 | 16 | ### no vars -> all 17 | freq(heart_disease) 18 | 19 | ## 20 | a=c(NA,NA,NA) 21 | b=factor(c("aa","vv","vv")) 22 | tt=data.frame(a,b) 23 | 24 | # factor var 25 | funModeling::freq(tt$b) 26 | 27 | # factor var 28 | funModeling::freq(b) 29 | 30 | # all NA 31 | funModeling::freq(a) 32 | 33 | # all numerical 34 | funModeling::freq(mtcars) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /myTESTS/test_prep_outliers.R: -------------------------------------------------------------------------------- 1 | library(funModeling);library(dplyr);library(Hmisc) 2 | 3 | ############################################################################################ 4 | ## method="bottom_top" 5 | ############################################################################################ 6 | ## 1 var, stop ############################################## 7 | data_prep_a=prep_outliers(data = heart_disease, input = c('age','resting_blood_pressure'), 8 | top_percent = 0.05, method = "bottom_top", type='stop') 9 | 10 | 11 | max(heart_disease$age);max(data_prep_a$age) # has been modified, from 77 to 68 12 | max(heart_disease$resting_blood_pressure);max(data_prep_a$resting_blood_pressure) # has been modified, from 200 to 160 13 | max(heart_disease$max_heart_rate);max(data_prep_a$max_heart_rate) # remains the same, 202, not in the input 14 | 15 | ## passing a vector returns a vector 16 | tail(heart_disease$age[order(heart_disease$age)]) 17 | b=prep_outliers(data=heart_disease$age, top_percent = 0.01, type='stop', method = "bottom_top") 18 | tail(b[order(b)], 10) # now the max is 71 (not 77), and the result is a vector 19 | 20 | 21 | 22 | ## 2 var, set na ############################################## 23 | df_res=prep_outliers(data = heart_disease, input = c('age', 'max_heart_rate'), top_percent = 0.01, type='set_na', method = "bottom_top") 24 | 25 | describe(select(heart_disease, age, max_heart_rate));describe(select(df_res, age, max_heart_rate)) ## returns a data frame, 26 | status(select(heart_disease, age, max_heart_rate));status(select(df_res, age, max_heart_rate)) ## and it has NAs 27 | 28 | ## testing with a skewed variable ############################ 29 | set.seed(10) 30 | df=data.frame(var=rchisq(1000,df = 1)) 31 | df=rbind(df, c(-1000,1135, 2432)) # forcing outliers 32 | 33 | options(scipen = 999) 34 | df_2=prep_outliers(data = df, input = c('var'), bottom_percent = 0.01, top_percent = 0.01, type='set_na', method = "bottom_top") 35 | profiling_num(df) 36 | profiling_num(df_2) # the mean and variation coef changed a lot 37 | 38 | ## testing warning message ############################ 39 | df_3=data.frame(var=c(1,1,1,1,1,1,1,1,1,1,1,1,1,4)) 40 | 41 | df_3_b=prep_outliers(data = df_3, input = c('var'), bottom_percent = 0.01, top_percent = 0.01, type='set_na', method = "bottom_top") 42 | profiling_num(df_3) 43 | profiling_num(df_3_b) # remains the same 44 | 45 | ### only bottom 46 | df_3_c=prep_outliers(data = df,
input = c('var'), bottom_percent = 0.01, type='set_na', method = "bottom_top") 47 | tail(df$var);tail(df_3_c$var) # the value "-1000" is NA 48 | 49 | # fails because it doesn't have bottom/top percent and method is bottom_top 50 | df_3_c=prep_outliers(data = df, input = c('var'),type='set_na', method = "bottom_top") 51 | 52 | 53 | ############################################################################################ 54 | ## method="tukey" 55 | ############################################################################################ 56 | data_prep_a=funModeling::prep_outliers(data = heart_disease, input = c('age','resting_blood_pressure'), method = "tukey", type='stop') 57 | 58 | max(heart_disease$age);max(data_prep_a$age) # remains the same (77) because the max threshold for age is 100 59 | tukey_outlier(heart_disease$age) 60 | # forcing two outliers 61 | data_prep_a$age[1]=1 62 | data_prep_a$age[2]=110 63 | 64 | v_age=funModeling::prep_outliers(data = data_prep_a$age, method = "tukey", type='stop') 65 | summary(v_age) # now the min is 7 and max is 101.5, also I tested the single vector treatment 66 | tukey_outlier(data_prep_a$age) # it matches the threshold, great! 67 | 68 | ############################################################################################ 69 | ## method="hampel" 70 | ############################################################################################ 71 | data_prep_h=funModeling::prep_outliers(data = heart_disease, input = c('age','resting_blood_pressure'), method = "hampel", type='stop') 72 | summary(heart_disease$age);summary(data_prep_h$age) # remains the same 73 | ## forcing outliers 74 | data_prep_h$age[1]=1 75 | data_prep_h$age[2]=110 76 | 77 | data_prep_h2=funModeling::prep_outliers(data = data_prep_h, input = c('age','resting_blood_pressure'), method = "hampel", type='stop') 78 | summary(data_prep_h$age);summary(data_prep_h2$age) # the min changed from 1 to 24, and the max from 110 to 86 79 | 80 | hampel_outlier(data_prep_h$age) # min=24 and max=86 81 | 82 | # the bottom remains the same at 94, and the top is adjusted to 174 (before it was 200) 83 | summary(heart_disease$resting_blood_pressure);summary(data_prep_h$resting_blood_pressure) 84 | hampel_outlier(heart_disease$resting_blood_pressure) 85 | 86 | ## testing set_na 87 | v_age=funModeling::prep_outliers(data = heart_disease$age, method = "hampel", type='set_na') 88 | head(heart_disease$age[order(heart_disease$age)]) 89 | head(v_age[order(v_age)]) 90 | 91 | 92 | summary(heart_disease$age);summary(v_age) # 1 NA 93 | hampel_outlier(heart_disease$age) ## the 29 is now NA 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /myTESTS/tests_cross_plot.R: -------------------------------------------------------------------------------- 1 | library(funModeling) 2 | 3 | ## normal ########################################## 4 | cross_plot(data=heart_disease, input="chest_pain", target="has_heart_disease") 5 | 6 | ## normal + auto_binning= T ########################################## 7 | # forcing hp to have fewer than 20 unique values 8 | mtcars=prep_outliers(mtcars, input = "hp", prob = 0.80) 9 | # binning 1 10 | cross_plot(data=mtcars, input="hp", target="vs", auto_binning = T) 11 | mtcars$hp_2=equal_freq(mtcars$hp, 5) 12 | cross_plot(data=mtcars, input="hp_2", target="vs", auto_binning = T) 13 | # not binning 14 | cross_plot(data=mtcars, input="hp", target="vs", auto_binning = F) 15 | 16 | ## uniq>20 - automatic auto binning 17 | cross_plot(data=mtcars, input="drat",
target="vs") 18 | ## uniq>20 - forcing not binning 19 | cross_plot(data=mtcars, input="drat", target="vs", auto_binning = F) 20 | 21 | 22 | ## forcing NA in target ########################################## 23 | heart_disease$has_heart_disease[1]=NA 24 | cross_plot(data=heart_disease, input="chest_pain", target="has_heart_disease") 25 | 26 | ## forcing 3 values in target ########################################## 27 | heart_disease$has_heart_disease=as.character(heart_disease$has_heart_disease) 28 | heart_disease$has_heart_disease2=heart_disease$has_heart_disease 29 | heart_disease$has_heart_disease2[1]="hello_world" 30 | cross_plot(data=heart_disease, input="chest_pain", target="has_heart_disease2") 31 | 32 | ## target as numeric ########################################## 33 | heart_disease$has_heart_disease_num=ifelse(heart_disease$has_heart_disease=="yes", 1, 0) 34 | library(Hmisc) 35 | describe(heart_disease$has_heart_disease_num) 36 | cross_plot(data=heart_disease, input="chest_pain", target="has_heart_disease_num") 37 | 38 | 39 | ## input missing, run for every variable ########################################## 40 | cross_plot(data=heart_disease, target="has_heart_disease") 41 | -------------------------------------------------------------------------------- /myTESTS/tests_plotar.R: -------------------------------------------------------------------------------- 1 | library(funModeling) 2 | 3 | ## 1 var, without output, histdens #################################### 4 | plotar(data=heart_disease, input="age", target="has_heart_disease", plot_type = "histdens") 5 | 6 | ## 1 var, without output, boxplot #################################### 7 | plotar(data=heart_disease, input="age", target="has_heart_disease", plot_type = "boxplot") 8 | 9 | 10 | ## 1 var, boxplot, no output #################################### 11 | # boxplot is not a good plot for this var due to the presence of lots of zeros 12 | plotar(data=heart_disease, input="num_vessels_flour", target="has_heart_disease", plot_type = "boxplot") 13 | 14 | ## Filtering zeros #################################### 15 | sub=subset(heart_disease, num_vessels_flour!=0) 16 | plotar(data=sub, input="num_vessels_flour", target="has_heart_disease", plot_type = "boxplot") 17 | 18 | 19 | #################################### 20 | # ALL vars, without output, boxplot 21 | plotar(data=heart_disease, target="has_heart_disease", plot_type = "boxplot") 22 | 23 | # ALL vars, without output, histdens 24 | plotar(data=heart_disease, target="has_heart_disease", plot_type = "histdens") 25 | 26 | ## target as numeric 27 | heart_disease$has_heart_disease_num=ifelse(heart_disease$has_heart_disease=="yes", 1, 0) 28 | library(Hmisc) 29 | describe(heart_disease$has_heart_disease_num) 30 | plotar(data=heart_disease, target="has_heart_disease_num", input="age", plot_type = "histdens") 31 | 32 | 33 | ############################################# 34 | -------------------------------------------------------------------------------- /vignettes/discre1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/discre1.png -------------------------------------------------------------------------------- /vignettes/dslb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/dslb.png
-------------------------------------------------------------------------------- /vignettes/funModeling_quickstart.R: -------------------------------------------------------------------------------- 1 | ## ---- message=FALSE, warning=FALSE-------------------------------------------- 2 | library(funModeling) 3 | 4 | status(heart_disease) 5 | 6 | ## ---- message=FALSE, warning=FALSE-------------------------------------------- 7 | library(funModeling) 8 | 9 | di=data_integrity(heart_disease) 10 | 11 | # returns a summary 12 | summary(di) 13 | 14 | # print all the metadata information 15 | print(di) 16 | 17 | ## ---- fig.height=3, fig.width=5----------------------------------------------- 18 | plot_num(heart_disease) 19 | 20 | ## ----------------------------------------------------------------------------- 21 | profiling_num(heart_disease) 22 | 23 | ## ----distribution1, message=FALSE, fig.height=3, fig.width=5, warning=FALSE---- 24 | library(dplyr) 25 | 26 | # Select only two variables for this example 27 | heart_disease_2=heart_disease %>% select(chest_pain, thal) 28 | 29 | # Frequency distribution 30 | freq(heart_disease_2) 31 | 32 | ## ----------------------------------------------------------------------------- 33 | correlation_table(heart_disease, "has_heart_disease") 34 | 35 | ## ----------------------------------------------------------------------------- 36 | var_rank_info(heart_disease, "has_heart_disease") 37 | 38 | ## ----profiling1, fig.height=4, fig.width=8------------------------------------ 39 | cross_plot(data=heart_disease, input=c("age", "oldpeak"), target="has_heart_disease") 40 | 41 | ## ----boxplot_analysis, fig.height=2, fig.width=4------------------------------ 42 | plotar(data=heart_disease, input = c("age", "oldpeak"), target="has_heart_disease", plot_type="boxplot") 43 | 44 | ## ----density_histogram, fig.height=2, fig.width=4----------------------------- 45 | plotar(data=mtcars, input = "gear", target="cyl", plot_type="histdens") 46 | 47 | ## ----------------------------------------------------------------------------- 48 | df_ca=categ_analysis(data = data_country, input = "country", target = "has_flu") 49 | 50 | head(df_ca) 51 | 52 | ## ----------------------------------------------------------------------------- 53 | # Step 1: Getting the thresholds for the desired variables: "max_heart_rate" and "oldpeak" 54 | d_bins=discretize_get_bins(data=heart_disease, input=c("max_heart_rate", "oldpeak"), n_bins=5) 55 | 56 | # Step 2: Applying the threshold to get the final processed data frame 57 | heart_disease_discretized=discretize_df(data=heart_disease, data_bins=d_bins, stringsAsFactors=T) 58 | 59 | ## ----------------------------------------------------------------------------- 60 | new_age=equal_freq(heart_disease$age, n_bins = 5) 61 | 62 | # checking results 63 | Hmisc::describe(new_age) 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | input=heart_disease$oldpeak 68 | target=heart_disease$has_heart_disease 69 | 70 | input2=discretize_rgr(input, target) 71 | 72 | # checking: 73 | summary(input2) 74 | 75 | ## ----------------------------------------------------------------------------- 76 | age_scaled=range01(heart_disease$oldpeak) 77 | 78 | # checking results 79 | summary(age_scaled) 80 | 81 | ## ----------------------------------------------------------------------------- 82 | tukey_outlier(heart_disease$resting_blood_pressure) 83 | 84 | ## 
----------------------------------------------------------------------------- 85 | hampel_outlier(heart_disease$resting_blood_pressure) 86 | 87 | ## ----------------------------------------------------------------------------- 88 | # Get threshold according to Hampel's method 89 | hampel_outlier(heart_disease$max_heart_rate) 90 | 91 | # Apply function to stop outliers at the threshold values 92 | data_prep=prep_outliers(data = heart_disease, input = c('max_heart_rate','resting_blood_pressure'), method = "hampel", type='stop') 93 | 94 | 95 | ## ---- echo=FALSE-------------------------------------------------------------- 96 | # Checking max and min value for 'max_heart_rate' before the transformation 97 | sprintf("Before transformation -> Min: %s; Max: %s", min(heart_disease$max_heart_rate), max(heart_disease$max_heart_rate)) 98 | 99 | # Apply function to stop outliers at the threshold values 100 | data_prep=prep_outliers(data = heart_disease, input = c('max_heart_rate','resting_blood_pressure'), method = "hampel", type='stop') 101 | 102 | # Checking the results, the maximum value is now 174.5 (the minimum remains the same) 103 | # Checking max and min value for 'max_heart_rate' before the transformation 104 | sprintf("After transformation -> Min: %s; Max: %s", min(data_prep$max_heart_rate), max(data_prep$max_heart_rate)) 105 | 106 | 107 | ## ----performance, fig.height=3, fig.width=7----------------------------------- 108 | # Create machine learning model and get its scores for positive case 109 | fit_glm=glm(has_heart_disease ~ age + oldpeak, data=heart_disease, family = binomial) 110 | heart_disease$score=predict(fit_glm, newdata=heart_disease, type='response') 111 | 112 | # Calculate performance metrics 113 | gain_lift(data=heart_disease, score='score', target='has_heart_disease') 114 | 115 | 116 | -------------------------------------------------------------------------------- /vignettes/img/data-science-live-book.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/img/data-science-live-book.png -------------------------------------------------------------------------------- /vignettes/img/funmodeling_logo_wh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/img/funmodeling_logo_wh.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-boxplot_analysis-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-boxplot_analysis-1.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-boxplot_analysis-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-boxplot_analysis-2.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-density_histogram-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-density_histogram-1.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-distribution1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-distribution1-1.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-distribution1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-distribution1-2.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-performance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-performance-1.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-profiling1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-profiling1-1.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-profiling1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-profiling1-2.png -------------------------------------------------------------------------------- /vignettes/man/figures/README-unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pablo14/funModeling/c04e72293d92c0864e6cdeb4cdc985bbea580187/vignettes/man/figures/README-unnamed-chunk-3-1.png --------------------------------------------------------------------------------