├── .Rbuildignore ├── R ├── sysdata.rda ├── zzz.R ├── coordinates_lister.R ├── test_for_geom_bar.R ├── labels_finder.R ├── geoms_lister.R ├── too_few_data.R ├── is_horizontal_barplot.R ├── is_pie_chart.R ├── aes_puller.R ├── test_for_histogram.R ├── labels_reader.R ├── too_many_dimensions.R ├── filled_barplot.R ├── heavy_background.R ├── mappings_lister.R ├── cozy_plot.R ├── histogram_bins_tester.R ├── does_it_need_geom_smooth.R ├── outlier_labels.R ├── scorer.R ├── scorer_bot.R └── metadata_reader.R ├── vizscorer.Rproj ├── man ├── coordinates_lister.Rd ├── geoms_lister.Rd ├── labels_reader.Rd ├── mappings_lister.Rd ├── labels_finder.Rd ├── test_for_geom_bar.Rd ├── is_pie_chart.Rd ├── aes_puller.Rd ├── is_horizontal_barplot.Rd ├── too_few_data.Rd ├── scorer.Rd ├── test_for_histogram.Rd ├── cozy_plot.Rd ├── heavy_background.Rd ├── too_many_dimensions.Rd ├── metadata_reader.Rd ├── histogram_bins_tester.Rd ├── outlier_labels.Rd ├── filled_barplot.Rd ├── does_it_need_geom_smooth.Rd └── scorer_bot.Rd ├── DESCRIPTION ├── NAMESPACE ├── .gitignore ├── LICENSE ├── inst └── extdata │ ├── comparison_db.csv │ ├── report.Rmd │ └── plot_advices.csv ├── README.md └── plot_report.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaCirilloAC/vizscorer/HEAD/R/sysdata.rda -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(libname, pkgname) { 2 | packageStartupMessage("vizscorer is currently under development and it may not be able to analyse your specific plot. 3 | Please look into package documentation for more details: https://github.com/AndreaCirilloAC/vizscorer") 4 | } -------------------------------------------------------------------------------- /R/coordinates_lister.R: -------------------------------------------------------------------------------- 1 | #' list coordinates layers in a ggplot object 2 | #' @param plot_object the ggplot object being analysed 3 | #' @return a vector storing all coordinates found within layers of a ggplot object 4 | #' @export 5 | coordinates_lister <- function(plot_object){ 6 | plot_object$coordinates %>% class() %>% as.vector() 7 | } 8 | -------------------------------------------------------------------------------- /R/test_for_geom_bar.R: -------------------------------------------------------------------------------- 1 | #' look for geom_bar layer on a plot 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @return TRUE if a geom_bar is found within ggplot object layers, FALSE in the opposite case. 5 | #' @export 6 | test_for_geom_bar <- function(plot_object,n_of_layers){ 7 | !is.na(match("GeomBar",geoms_lister(plot_object,n_of_layers))) 8 | } 9 | -------------------------------------------------------------------------------- /vizscorer.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | StripTrailingWhitespace: Yes 16 | 17 | BuildType: Package 18 | PackageUseDevtools: Yes 19 | PackageInstallArgs: --no-multiarch --with-keep.source 20 | -------------------------------------------------------------------------------- /R/labels_finder.R: -------------------------------------------------------------------------------- 1 | #' look for a specific label within a ggplot object 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param searched_label, among title, subtitle and caption 4 | #' @return TRUE if the searched label was found, FALSE in the opposite case 5 | #' @export 6 | labels_finder <- function(plot_object,searched_label){ 7 | labels_vector <- plot_object$labels %>% names() %>% as.vector() 8 | label_found <- if(is.na(match(searched_label,labels_vector))){FALSE}else{TRUE} 9 | 10 | } 11 | -------------------------------------------------------------------------------- /man/coordinates_lister.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/coordinates_lister.R 3 | \name{coordinates_lister} 4 | \alias{coordinates_lister} 5 | \title{list coordinates layers in a ggplot object} 6 | \usage{ 7 | coordinates_lister(plot_object) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | } 12 | \value{ 13 | a vector storing all coordinates found within layers of a ggplot object 14 | } 15 | \description{ 16 | list coordinates layers in a ggplot object 17 | } 18 | -------------------------------------------------------------------------------- /man/geoms_lister.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geoms_lister.R 3 | \name{geoms_lister} 4 | \alias{geoms_lister} 5 | \title{list geoms in a ggplot object} 6 | \usage{ 7 | geoms_lister(plot_object, n_of_layers) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | } 14 | \value{ 15 | a vector listing all geoms found within the given ggplot object 16 | } 17 | \description{ 18 | list geoms in a ggplot object 19 | } 20 | -------------------------------------------------------------------------------- /R/geoms_lister.R: -------------------------------------------------------------------------------- 1 | #' list geoms in a ggplot object 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @return a vector listing all geoms found within the given ggplot object 5 | #' @export 6 | geoms_lister <- function(plot_object, n_of_layers){ 7 | geoms_in_plot <- c() 8 | for (i in 1:n_of_layers) { 9 | 10 | class_list <- plot_object$layers[[i]]$geom %>% class() %>% as.vector() 11 | geom <- class_list[1] 12 | geoms_in_plot <- c(geoms_in_plot,geom) 13 | } 14 | return(geoms_in_plot) 15 | } 16 | -------------------------------------------------------------------------------- /man/labels_reader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/labels_reader.R 3 | \name{labels_reader} 4 | \alias{labels_reader} 5 | \title{look for special charaters in a plot labels} 6 | \usage{ 7 | labels_reader(plot_object) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | } 12 | \value{ 13 | a list storing test result and the list of special characters found within labels. 14 | } 15 | \description{ 16 | this function match the labels of the given ggplot object against a set of predefined special characters 17 | } 18 | -------------------------------------------------------------------------------- /man/mappings_lister.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mappings_lister.R 3 | \name{mappings_lister} 4 | \alias{mappings_lister} 5 | \title{storing aestetich variables in a dataframe} 6 | \usage{ 7 | mappings_lister(plot_object, n_of_layers) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | } 14 | \value{ 15 | a data frane storing for each aestetich the mapped variable 16 | } 17 | \description{ 18 | storing aestetich variables in a dataframe 19 | } 20 | -------------------------------------------------------------------------------- /man/labels_finder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/labels_finder.R 3 | \name{labels_finder} 4 | \alias{labels_finder} 5 | \title{look for a specific label within a ggplot object} 6 | \usage{ 7 | labels_finder(plot_object, searched_label) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{searched_label, }{among title, subtitle and caption} 13 | } 14 | \value{ 15 | TRUE if the searched label was found, FALSE in the opposite case 16 | } 17 | \description{ 18 | look for a specific label within a ggplot object 19 | } 20 | -------------------------------------------------------------------------------- /man/test_for_geom_bar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/test_for_geom_bar.R 3 | \name{test_for_geom_bar} 4 | \alias{test_for_geom_bar} 5 | \title{look for geom_bar layer on a plot} 6 | \usage{ 7 | test_for_geom_bar(plot_object, n_of_layers) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | } 14 | \value{ 15 | TRUE if a geom_bar is found within ggplot object layers, FALSE in the opposite case. 16 | } 17 | \description{ 18 | look for geom_bar layer on a plot 19 | } 20 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: vizscorer 2 | Type: Package 3 | Title: An intelligent advisor for your dataviz 4 | Version: 0.1.0 5 | Author: Andrea Cirillo 6 | Maintainer: Andrea Cirillo 7 | Description: vizscorer analyses yout ggplot plot object and provides you customised advices on how 8 | how to improve it. The analysis is performed applying machine learning to your plot metadata and results 9 | in a customised report. 10 | License: What license is it under? 11 | Encoding: UTF-8 12 | LazyData: true 13 | Depends: ggplot2 , cluster, dplyr , energy , caret , tidyr , knitr , revealjs 14 | RoxygenNote: 6.0.1 15 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(aes_puller) 4 | export(coordinates_lister) 5 | export(cozy_plot) 6 | export(does_it_need_geom_smooth) 7 | export(filled_barplot) 8 | export(geoms_lister) 9 | export(heavy_background) 10 | export(histogram_bins_tester) 11 | export(is_horizontal_barplot) 12 | export(is_pie_chart) 13 | export(labels_finder) 14 | export(labels_reader) 15 | export(mappings_lister) 16 | export(metadata_reader) 17 | export(outlier_labels) 18 | export(scorer) 19 | export(scorer_bot) 20 | export(test_for_geom_bar) 21 | export(test_for_histogram) 22 | export(too_few_data) 23 | export(too_many_dimensions) 24 | -------------------------------------------------------------------------------- /R/too_few_data.R: -------------------------------------------------------------------------------- 1 | #' look for overshooting in case of a plot for with too few data 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param data_threshold a custom threshold to define too many data 4 | #' @description this function checks for the presence of too few data to deserve a plot. Based on data visualization literature it is assumed that twenty data points can be considered a significant threshold 5 | #' @return TRUE in case too few data points are observed, FALSE in the opposite site. 6 | #' @export 7 | too_few_data <- function(plot_object,data_threshold) { 8 | if (plot_object$data %>% nrow() < data_threshold) { FALSE}else{TRUE} 9 | } 10 | -------------------------------------------------------------------------------- /R/is_horizontal_barplot.R: -------------------------------------------------------------------------------- 1 | #' a simple function to test for the presence of too many dimension for a 2D plotting space 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @return TRUE in case the plot object is a bar plot and it is horizontally layed, FALSE in the opposite case. 5 | #' @export 6 | is_horizontal_barplot <- function(plot_object,n_of_layers){ 7 | is_bar_plot <- test_for_geom_bar(plot_object,n_of_layers) 8 | is_flipped <- !is.na(match("CoordFlip" ,coordinates_lister(plot_object))) 9 | if(!is_bar_plot){return(FALSE)}else{ 10 | return(is_bar_plot & is_flipped)} 11 | } 12 | -------------------------------------------------------------------------------- /man/is_pie_chart.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/is_pie_chart.R 3 | \name{is_pie_chart} 4 | \alias{is_pie_chart} 5 | \title{test a ggplot object to understand if it is a pie chart} 6 | \usage{ 7 | is_pie_chart(plot_object, n_of_layers) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | } 14 | \value{ 15 | TRUE if the plot is a pie chart and FALSE in the opposite case 16 | } 17 | \description{ 18 | to understand if aplot is a pie chart this functions look for the presence of a \code{geom_bar} layer and a \code{coord_polar} layer. 19 | } 20 | -------------------------------------------------------------------------------- /man/aes_puller.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aes_puller.R 3 | \name{aes_puller} 4 | \alias{aes_puller} 5 | \title{retrieve a vector storing values of a specified aestetich} 6 | \usage{ 7 | aes_puller(plot_object, n_of_layers, required_aes) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | 14 | \item{required_aes}{the ggplot object aestetich required} 15 | } 16 | \value{ 17 | a vector containing values of the variable the required specified aestetich is mapped to 18 | } 19 | \description{ 20 | retrieve a vector storing values of a specified aestetich 21 | } 22 | -------------------------------------------------------------------------------- /man/is_horizontal_barplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/is_horizontal_barplot.R 3 | \name{is_horizontal_barplot} 4 | \alias{is_horizontal_barplot} 5 | \title{a simple function to test for the presence of too many dimension for a 2D plotting space} 6 | \usage{ 7 | is_horizontal_barplot(plot_object, n_of_layers) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | } 14 | \value{ 15 | TRUE in case the plot object is a bar plot and it is horizontally layed, FALSE in the opposite case. 16 | } 17 | \description{ 18 | a simple function to test for the presence of too many dimension for a 2D plotting space 19 | } 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 25 | .httr-oauth 26 | 27 | # knitr and R markdown default cache directories 28 | /*_cache/ 29 | /cache/ 30 | 31 | # Temporary files created by R markdown 32 | *.utf8.md 33 | *.knit.md 34 | 35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 36 | rsconnect/ 37 | .Rproj.user 38 | -------------------------------------------------------------------------------- /R/is_pie_chart.R: -------------------------------------------------------------------------------- 1 | #' test a ggplot object to understand if it is a pie chart 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @description to understand if aplot is a pie chart this functions look for the presence of a \code{geom_bar} layer and a \code{coord_polar} layer. 5 | #' @return TRUE if the plot is a pie chart and FALSE in the opposite case 6 | #' @export 7 | is_pie_chart <- function(plot_object, n_of_layers){ 8 | 9 | coordinates_class <- coordinates_lister(plot_object) 10 | 11 | # if we do not found any geom_bar we can be sure it is not a pie chart 12 | 13 | test_for_geom_bar(plot_object,n_of_layers)& !is.na(match("CoordPolar",coordinates_class)) 14 | 15 | } 16 | 17 | -------------------------------------------------------------------------------- /man/too_few_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/too_few_data.R 3 | \name{too_few_data} 4 | \alias{too_few_data} 5 | \title{look for overshooting in case of a plot for with too few data} 6 | \usage{ 7 | too_few_data(plot_object, data_threshold) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{data_threshold}{a custom threshold to define too many data} 13 | } 14 | \value{ 15 | TRUE in case too few data points are observed, FALSE in the opposite site. 16 | } 17 | \description{ 18 | this function checks for the presence of too few data to deserve a plot. Based on data visualization literature it is assumed that twenty data points can be considered a significant threshold 19 | } 20 | -------------------------------------------------------------------------------- /man/scorer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scorer.R 3 | \name{scorer} 4 | \alias{scorer} 5 | \title{scoring the effectivenes of ggplot object applying a gradient boosting model} 6 | \usage{ 7 | scorer(plot_metadata = NULL) 8 | } 9 | \arguments{ 10 | \item{plot_metadata}{a data frame storing ggplot object metadata, resulting from a previous call of \code{metadata_reader} function.} 11 | } 12 | \value{ 13 | a numeric value 14 | } 15 | \description{ 16 | \code{scorer} function handles the running of a gradient boosting pre-trained model on the metadata of a ggplot object. 17 | As a result a 0 to 1 score is produced, measuring the effectiveness of the related plot based on a set of rules drawn from data visualization theory. 18 | } 19 | -------------------------------------------------------------------------------- /man/test_for_histogram.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/test_for_histogram.R 3 | \name{test_for_histogram} 4 | \alias{test_for_histogram} 5 | \title{analyse a ggplot object to understand if it is an histogram} 6 | \usage{ 7 | test_for_histogram(plot_object, n_of_layers) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | } 14 | \value{ 15 | TRUE in case the plot is an histogram, FALSE in the opposite case 16 | } 17 | \description{ 18 | this function checks two conditions to understand if the provided ggplot object represent an histogram: 19 | - presence of a geom_bar layer 20 | - presence of bindwidth parameter among parameters of that layer 21 | } 22 | -------------------------------------------------------------------------------- /R/aes_puller.R: -------------------------------------------------------------------------------- 1 | #' retrieve a vector storing values of a specified aestetich 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @param required_aes the ggplot object aestetich required 5 | #' @return a vector containing values of the variable the required specified aestetich is mapped to 6 | #' @export 7 | aes_puller <- function(plot_object,n_of_layers, required_aes){ 8 | 9 | aes_db <- mappings_lister(plot_object , n_of_layers ) 10 | variable_name <- aes_db %>% filter(aes == required_aes) %>% select("variable") %>% pull() 11 | col_index <- match(variable_name,colnames(plot_object$data)) 12 | if(length(variable_name) == 0 ){return(NA)}else{ 13 | variable_vector <- plot_object$data[,col_index] 14 | 15 | return(variable_vector)} 16 | } 17 | -------------------------------------------------------------------------------- /R/test_for_histogram.R: -------------------------------------------------------------------------------- 1 | #' analyse a ggplot object to understand if it is an histogram 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @description this function checks two conditions to understand if the provided ggplot object represent an histogram: 5 | #' - presence of a geom_bar layer 6 | #' - presence of bindwidth parameter among parameters of that layer 7 | #' @return TRUE in case the plot is an histogram, FALSE in the opposite case 8 | #' @export 9 | test_for_histogram <- function(plot_object,n_of_layers){ 10 | bar_index <- match("GeomBar", geoms_lister(plot_object,n_of_layers)) 11 | bar_stat_params <- if(!is.na(bar_index )){plot_object$layers[[bar_index]]$stat_params %>% names()} 12 | test_histogram <- !is.na(bar_index) & !is.na(match("binwidth",bar_stat_params)) 13 | return(test_histogram) 14 | } 15 | -------------------------------------------------------------------------------- /man/cozy_plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cozy_plot.R 3 | \name{cozy_plot} 4 | \alias{cozy_plot} 5 | \title{checking for the presence of overplotting within a ggplot object.} 6 | \usage{ 7 | cozy_plot(plot_object, n_of_layers, overplotting_floor, aes_db) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | 14 | \item{overplotting_floor}{a custom threshold of distance among points to detect overplotting} 15 | 16 | \item{aes_db}{a data frame storing plot object aestetichs and variable mapped against them} 17 | } 18 | \value{ 19 | a list containing an overall test result with TRUE in presence of overplotting and the median distance observed between points 20 | } 21 | \description{ 22 | checking for the presence of overplotting within a ggplot object. 23 | } 24 | -------------------------------------------------------------------------------- /R/labels_reader.R: -------------------------------------------------------------------------------- 1 | #' look for special charaters in a plot labels 2 | #' @param plot_object the ggplot object being analysed 3 | #' @description this function match the labels of the given ggplot object against a set of predefined special characters 4 | #' @return a list storing test result and the list of special characters found within labels. 5 | #' @export 6 | labels_reader <- function(plot_object){ 7 | labels_text_vector <- plot_object$labels %>% unlist() %>% as.vector() 8 | special_character_vector <- c("&","/","!","?","^","@","#","$","¶","§", ":)", ".") 9 | 10 | match_vector <- pmatch(special_character_vector,labels_text_vector) 11 | 12 | special_characters_indexes <- match_vector[!is.na(match_vector)] 13 | 14 | there_are_special_characters <- if(length(special_characters_indexes) == 0){FALSE}else{TRUE} 15 | 16 | labels_with_special_characters <- labels_text_vector[special_characters_indexes] 17 | return(list(there_are_special_characters, labels_with_special_characters)) 18 | } 19 | -------------------------------------------------------------------------------- /man/heavy_background.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/heavy_background.R 3 | \name{heavy_background} 4 | \alias{heavy_background} 5 | \title{look for heavy background on a ggplot plot} 6 | \usage{ 7 | heavy_background(plot_object) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | } 12 | \value{ 13 | returns TRUE if the plot is depicted on an heavy background, FALSE if the background is white or transparent 14 | } 15 | \description{ 16 | this function tests a ggplot object to understand if it represents a plot with a background neither transparent nor white. 17 | Based on data visualization literature this kind of plots tend to reduce the **data to ink ratio** meaning the amount of data showed related to amount of ink employed. 18 | Lower values of this ratio are related to non-effective plots. 19 | } 20 | \references{ 21 | The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut 22 | } 23 | -------------------------------------------------------------------------------- /man/too_many_dimensions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/too_many_dimensions.R 3 | \name{too_many_dimensions} 4 | \alias{too_many_dimensions} 5 | \title{a simple function to test for the presence of too many dimension for a 2D plotting space} 6 | \usage{ 7 | too_many_dimensions(plot_object, n_of_layers) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | } 14 | \value{ 15 | TRUE if the plot contains too many dimension, FALSE in the opposite case. 16 | } 17 | \description{ 18 | this function checks for the presence of both point or line and the mapping of some variable on the \code{size} aestetich. 19 | This check is based on data visualization principle requesting that the number of dimension depicted should not be higher thant the number of dimension of the data. 20 | } 21 | \references{ 22 | The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut 23 | } 24 | -------------------------------------------------------------------------------- /man/metadata_reader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metadata_reader.R 3 | \name{metadata_reader} 4 | \alias{metadata_reader} 5 | \title{draw a data frame containing relevant metadata of a ggplot object} 6 | \usage{ 7 | metadata_reader(plot_object = NULL) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | } 12 | \value{ 13 | a list storing results and additional data related to each test performed. 14 | } 15 | \description{ 16 | \code{metadata_reader} function performs a number of test on the provided ggplot object to draw some intelligence about it. 17 | It tries to derive attributes ofthe plot relate to four data visualization theory areas: 18 | - the adequateness of labelling 19 | - the data density 20 | - the data to ink ratio 21 | - the readability of the plot 22 | The results of performed tests are stored in a structued list object, so to be provided as an input to a gradient boosting model. 23 | See \code{scorer} function documentation for more info about this subsequent step. 24 | } 25 | -------------------------------------------------------------------------------- /man/histogram_bins_tester.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/histogram_bins_tester.R 3 | \name{histogram_bins_tester} 4 | \alias{histogram_bins_tester} 5 | \title{compare the actual bins width against the Freedman Diaconis rule} 6 | \usage{ 7 | histogram_bins_tester(plot_object, n_of_layers, default_n_of_bins) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | 14 | \item{default_n_of_bins}{the default number of bins set by ggplot2. employed if no custom bin was set from the user within the ggplot object} 15 | } 16 | \value{ 17 | an object storing the general test result, the optimal binwidth based on Freedman Diaconis and the distance of the actual binwidth from this measure. 18 | TRUE is produced as a result if the actual width is not optimal. 19 | } 20 | \description{ 21 | this function computes the optimal bins width based on the Freedman Diaconis rule and compares it against the actual bin width set within the ggplot object. 22 | } 23 | -------------------------------------------------------------------------------- /man/outlier_labels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/outlier_labels.R 3 | \name{outlier_labels} 4 | \alias{outlier_labels} 5 | \title{check for the presence of outliers not labelled within a ggplot object} 6 | \usage{ 7 | outlier_labels(plot_object, n_of_layers, p_build, aes_db) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | 14 | \item{p_build}{an object obtained running \code{ggplot_build} on the ggplot object} 15 | 16 | \item{aes_db}{a data frame storing plot object aestetichs and variable mapped against them} 17 | } 18 | \value{ 19 | a list storing a TRUE in case there are no outliers or the detected outliers are labelled and FALSE in the other cases. 20 | } 21 | \description{ 22 | for x and y variable the Tukey outlier rule is applied to evaluate the presence of outliers. 23 | If any outlier is identified one more check is performed to retrieve among plot metadata the presence of some annotation related to that data. 24 | } 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Andrea Cirillo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /man/filled_barplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/filled_barplot.R 3 | \name{filled_barplot} 4 | \alias{filled_barplot} 5 | \title{test a ggplot object to understand if it represents a barplot with filled bars} 6 | \usage{ 7 | filled_barplot(plot_object, n_of_layers) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | 12 | \item{n_of_layers}{number of layers within the ggplot object} 13 | } 14 | \value{ 15 | returns TRUE when the barplot is composed of filled bars and FALSE in the opposite case. 16 | } 17 | \description{ 18 | this function tests a ggplot object to understand if it represents a barplot with filled bars. 19 | Based on data visualization literature this kind of plots tend to reduce the **data to ink ratio** meaning the amount of data showed related to amount of ink employed. 20 | Lower values of this ratio are related to non-effective plots. 21 | A control is performed to exclude we are simply looking at a pie chart based on an intermediate \code{geom_bar} layer 22 | } 23 | \references{ 24 | The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut 25 | } 26 | -------------------------------------------------------------------------------- /R/too_many_dimensions.R: -------------------------------------------------------------------------------- 1 | #' a simple function to test for the presence of too many dimension for a 2D plotting space 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @description this function checks for the presence of both point or line and the mapping of some variable on the \code{size} aestetich. 5 | #' This check is based on data visualization principle requesting that the number of dimension depicted should not be higher thant the number of dimension of the data. 6 | #' @references The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut 7 | #' @return TRUE if the plot contains too many dimension, FALSE in the opposite case. 8 | #' @export 9 | too_many_dimensions <- function(plot_object,n_of_layers){ 10 | # we check here if there is a combination of geom_point/ geom_line and mapping = size 11 | 12 | not_point_or_line <- match(c("GeomLine", "GeomPoint"),geoms_lister(plot_object,n_of_layers)) %>% 13 | sum(na.rm = TRUE) ==0 14 | not_mapped_on_size <- match("size", 15 | mappings_lister(plot_object,n_of_layers)$aes ) %>% is.na() 16 | return(!not_point_or_line & !not_mapped_on_size) 17 | 18 | } 19 | -------------------------------------------------------------------------------- /man/does_it_need_geom_smooth.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/does_it_need_geom_smooth.R 3 | \name{does_it_need_geom_smooth} 4 | \alias{does_it_need_geom_smooth} 5 | \title{study the relationship among variables to find out if a geom_smooth is needed within the plot} 6 | \usage{ 7 | does_it_need_geom_smooth(plot_object, n_of_layers, correlation_threshold, 8 | aes_db) 9 | } 10 | \arguments{ 11 | \item{plot_object}{the ggplot object being analysed} 12 | 13 | \item{n_of_layers}{number of layers within the ggplot object} 14 | 15 | \item{correlation_threshold}{a judgmentally specified threshold to test for the need of a trend line on the plot} 16 | 17 | \item{aes_db}{a data frame storing plot object aestetichs and variable mapped against them} 18 | } 19 | \value{ 20 | a list storing in two separate vectors the final result of the test and the observed distance correlation. 21 | the test outcome is TRUE in case a trend line is deemed necessary. 22 | } 23 | \description{ 24 | \code{does_it_need_geom_smooth} computes distance correlation among x and y aestetich to evaluate it against a judgmental threshold. 25 | if this threshold is breached and no \code{geom_smooth} is observed among ggplot objects layers a postive outcome is produced for the test. 26 | } 27 | -------------------------------------------------------------------------------- /man/scorer_bot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scorer_bot.R 3 | \name{scorer_bot} 4 | \alias{scorer_bot} 5 | \title{produce a customised report analysing the submitted ggplot object} 6 | \usage{ 7 | scorer_bot(plot_object = NULL) 8 | } 9 | \arguments{ 10 | \item{plot_object}{the ggplot object being analysed} 11 | } 12 | \value{ 13 | a reveljs deck of slide containing an evaluation of the effectiveness of ggplot objects and suggestion about how to improve it. 14 | } 15 | \description{ 16 | call to \code{scorer_bot} function produces the sequential calling of all function included within 17 | the \code{vizscorer} package, with the purpose to produce a customised report about the submitted ggplot object. The function handles call to \code{metadata_reader} function 18 | to draw metadata from the plot object in a format ready to be submitted to the scorer algorithm. within the next step those metadata are submitted to the \code{scorer} function which apply a gradient boosting pre-trained model to them. 19 | a final score from 0 to 1 is computed as a probability to be a good plot. 20 | To provide more relevant information to the user a set of advices are retrieved from a built-in knowledge base, selecting only those relevant to suboptimal areas of the plot, observed from metadata. 21 | Finally a revealjs deck of slides is programmatically created leveraging a quasi-real natural language generation algorithm. 22 | } 23 | -------------------------------------------------------------------------------- /R/filled_barplot.R: -------------------------------------------------------------------------------- 1 | #' test a ggplot object to understand if it represents a barplot with filled bars 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @description this function tests a ggplot object to understand if it represents a barplot with filled bars. 5 | #' Based on data visualization literature this kind of plots tend to reduce the **data to ink ratio** meaning the amount of data showed related to amount of ink employed. 6 | #' Lower values of this ratio are related to non-effective plots. 7 | #' A control is performed to exclude we are simply looking at a pie chart based on an intermediate \code{geom_bar} layer 8 | #' @references The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut 9 | #' @return returns TRUE when the barplot is composed of filled bars and FALSE in the opposite case. 10 | #' @export 11 | filled_barplot <- function(plot_object,n_of_layers){ 12 | 13 | is_geom_bar <- test_for_geom_bar(plot_object,n_of_layers) 14 | 15 | mappings_lister(plot_object,n_of_layers) %>% 16 | select(aes) %>% 17 | pull() %>% 18 | as.vector()-> aes_vector 19 | 20 | fill_aes_is_mapped <- !is.na(match("fill",aes_vector)) 21 | 22 | a_pie_chart <- is_pie_chart(plot_object, n_of_layers) 23 | 24 | if(is_geom_bar & 25 | fill_aes_is_mapped == FALSE & 26 | a_pie_chart==FALSE){ # we control for being a pie chart 27 | TRUE 28 | }else{if(is_geom_bar == FALSE){FALSE}#NA} 29 | else{FALSE}} 30 | } 31 | -------------------------------------------------------------------------------- /R/heavy_background.R: -------------------------------------------------------------------------------- 1 | #'look for heavy background on a ggplot plot 2 | #' @param plot_object the ggplot object being analysed 3 | #' @description this function tests a ggplot object to understand if it represents a plot with a background neither transparent nor white. 4 | #' Based on data visualization literature this kind of plots tend to reduce the **data to ink ratio** meaning the amount of data showed related to amount of ink employed. 5 | #' Lower values of this ratio are related to non-effective plots. 6 | #' @references The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut 7 | #' @return returns TRUE if the plot is depicted on an heavy background, FALSE if the background is white or transparent 8 | #' @export 9 | heavy_background <- function(plot_object){ 10 | # we check here if the default background is being used AND 11 | # if the default background is still the grey one 12 | 13 | default_theme <- theme_get() 14 | default_is_grey <- pmatch("grey",default_theme$panel.background$fill) 15 | background_fill <- plot_object$theme$panel.background$fill 16 | if ((length(as.character(plot_object$theme)) == 0 | is.null(background_fill)) & !is.na(default_is_grey)) { 17 | TRUE # the default theme is being used and this produces to a grey background 18 | }else{ 19 | if (background_fill %in% c(rgb(1,1,1),"#FFFFFF", "white", "transparent")) { # if the background is not white or transparent we say it is heavy, tertium non datur 20 | FALSE}else{ 21 | TRUE} 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /R/mappings_lister.R: -------------------------------------------------------------------------------- 1 | #' storing aestetich variables in a dataframe 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @return a data frane storing for each aestetich the mapped variable 5 | #' @export 6 | mappings_lister <- function(plot_object,n_of_layers){ 7 | 8 | #aes_db_raw just contains aes from within the ggplot() call, i.e. from the "zero" layer 9 | aes_db_raw <- data.frame("aes" = plot_object$mapping %>% unlist() %>% names(), 10 | "variable" = plot_object$mapping %>% unlist() %>% as.character(),stringsAsFactors = FALSE ) 11 | 12 | #we then add aes from other layers 13 | for(i in 1:n_of_layers){ 14 | aes_names_partial <- plot_object$layers[[i]]$mapping %>% unlist() %>% names() 15 | aes_values_partial_raw <- plot_object$layers[[i]]$mapping %>% unlist() %>% as.character() 16 | #removing ~ character to ensure proper matching with variable names 17 | aes_values_partial <- gsub("~","",x = aes_values_partial_raw) 18 | rbind(aes_db_raw, 19 | data.frame("aes" = aes_names_partial, 20 | "variable" = aes_values_partial, 21 | stringsAsFactors = FALSE)) %>% 22 | filter(variable != "x", variable != "y") -> aes_db_intermediate # we check for aestetichs inheritance 23 | } 24 | #in the end I add the mapping set into ggplot() 25 | aes_db_intermediate %>% 26 | mutate(variable = gsub("~","",variable)) -> aes_db 27 | return(aes_db) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /R/cozy_plot.R: -------------------------------------------------------------------------------- 1 | #' checking for the presence of overplotting within a ggplot object. 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @param overplotting_floor a custom threshold of distance among points to detect overplotting 5 | #' @param aes_db a data frame storing plot object aestetichs and variable mapped against them 6 | #' @return a list containing an overall test result with TRUE in presence of overplotting and the median distance observed between points 7 | #' @export 8 | cozy_plot <- function(plot_object, n_of_layers, overplotting_floor,aes_db){ 9 | 10 | if (is.na(match("y",aes_db$aes))){ #if there is no y we are probably "looking" at an histogram, for which no overplotting check is provided 11 | return(list(NA,100))}else{ 12 | # we look here for overplotting. To do this we compute the median euclidean distance as a measure 13 | # of plot density 14 | raw_x <- aes_puller(plot_object,n_of_layers, "x") 15 | if(mode(raw_x) == "list"){x_vector <- raw_x[,1] %>% pull}else{x_vector <- raw_x} 16 | if(!is.na(match("..density..",aes_db$variable) )){y_vector <- c()}else{ 17 | raw_y <- aes_puller(plot_object,n_of_layers, "y") 18 | if(mode(raw_y) == "list"){y_vector <- raw_y[,1] %>% pull}else{y_vector <- raw_y} 19 | } 20 | not_handled <- c("factor","character","Date") 21 | #check on variabe type 22 | if(class(x_vector) %in% not_handled | class(y_vector) %in% not_handled|!is.na(match("..density..",aes_db$variable)) ){return(list(NA,100))}else{ 23 | 24 | median_distance <- median(daisy(data.frame(x_vector,y_vector))) 25 | 26 | test_result <- median_distance < overplotting_floor 27 | 28 | return(list("test_result" = test_result, 29 | "median_distance" = median_distance)) 30 | } 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /inst/extdata/comparison_db.csv: -------------------------------------------------------------------------------- 1 | "","good","model" 2 | "1",0.144211620391848,"gbm" 3 | "2",0.218814729082962,"gbm" 4 | "3",0.139131253522651,"gbm" 5 | "4",0.208631105844797,"gbm" 6 | "5",0.179385610765719,"gbm" 7 | "6",0.120071844534823,"gbm" 8 | "7",0.164181484028644,"gbm" 9 | "8",0.205241426326928,"gbm" 10 | "9",0.156413287634618,"gbm" 11 | "10",0.638012252879337,"gbm" 12 | "11",0.22387265004053,"gbm" 13 | "12",0.638012252879337,"gbm" 14 | "13",0.186305201671998,"gbm" 15 | "14",0.638012252879337,"gbm" 16 | "15",0.638012252879337,"gbm" 17 | "16",0.194471806079098,"gbm" 18 | "17",0.129353075678487,"gbm" 19 | "18",0.638012252879337,"gbm" 20 | "19",0.638012252879337,"gbm" 21 | "20",0.239115140832344,"gbm" 22 | "21",0.144211620391848,"gbm" 23 | "22",0.638012252879337,"gbm" 24 | "23",0.638012252879337,"gbm" 25 | "24",0.200495699012896,"gbm" 26 | "25",0.638012252879337,"gbm" 27 | "26",0.171822731153773,"gbm" 28 | "27",0.638012252879337,"gbm" 29 | "28",0.189779089244373,"gbm" 30 | "29",0.184835736843535,"gbm" 31 | "30",0.638012252879337,"gbm" 32 | "31",0.638012252879337,"gbm" 33 | "32",0.236262189471113,"gbm" 34 | "33",0.120071844534823,"gbm" 35 | "34",0.200495699012896,"gbm" 36 | "35",0.146054675383698,"gbm" 37 | "36",0.166872480718325,"gbm" 38 | "37",0.156413287634618,"gbm" 39 | "38",0.22387265004053,"gbm" 40 | "39",0.638012252879337,"gbm" 41 | "40",0.182534545204342,"gbm" 42 | "41",0.147772112561917,"gbm" 43 | "42",0.208631105844797,"gbm" 44 | "43",0.144211620391848,"gbm" 45 | "44",0.144211620391848,"gbm" 46 | "45",0.179385610765719,"gbm" 47 | "46",0.155990810792557,"gbm" 48 | "47",0.239587909998247,"gbm" 49 | "48",0.163675931518928,"gbm" 50 | "49",0.226157482865321,"gbm" 51 | "50",0.171822731153773,"gbm" 52 | "51",0.120071844534823,"gbm" 53 | "52",0.208631105844797,"gbm" 54 | "53",0.22031846014868,"gbm" 55 | "54",0.183746501021698,"gbm" 56 | "55",0.171822731153773,"gbm" 57 | "56",0.183746501021698,"gbm" 58 | "57",0.638012252879337,"gbm" 59 | "58",0.208631105844797,"gbm" 60 | "59",0.200495699012896,"gbm" 61 | "60",0.211673659995752,"gbm" 62 | "61",0.171822731153773,"gbm" 63 | "62",0.22387265004053,"gbm" 64 | "63",0.194618727276444,"gbm" 65 | "64",0.22387265004053,"gbm" 66 | "65",0.184835736843535,"gbm" 67 | -------------------------------------------------------------------------------- /R/histogram_bins_tester.R: -------------------------------------------------------------------------------- 1 | #' compare the actual bins width against the Freedman Diaconis rule 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @param default_n_of_bins the default number of bins set by ggplot2. employed if no custom bin was set from the user within the ggplot object 5 | #' @description this function computes the optimal bins width based on the Freedman Diaconis rule and compares it against the actual bin width set within the ggplot object. 6 | #' @return an object storing the general test result, the optimal binwidth based on Freedman Diaconis and the distance of the actual binwidth from this measure. 7 | #' TRUE is produced as a result if the actual width is not optimal. 8 | #' @export 9 | histogram_bins_tester <- function(plot_object, n_of_layers, default_n_of_bins){ 10 | # first of all I look for a GeomBar layer and a binwidth parameter. the occurence of 11 | # both means we are "looking" at an histogram ( there is no separate geom) 12 | test_histogram <- test_for_histogram(plot_object,n_of_layers) 13 | # if the plot actually seems to be an histogram I retrieve di x variable and compute on it 14 | ## the optimal number of bins based on the Freedman Diaconis rule 15 | 16 | if(test_histogram){ 17 | variable_vector <- aes_puller(plot_object,n_of_layers,required_aes = "x") 18 | optimal_bw <- 2 * (IQR(variable_vector, na.rm = TRUE) / length(variable_vector)^(1/3)) 19 | bar_index <- match("GeomBar",geoms_lister(plot_object,n_of_layers)) 20 | bar_stat_params <- plot_object[[bar_index]]$stat_params 21 | actual_bw_index <- match("binwidth",bar_stat_params) 22 | actual_bw <- plot_object$layers[[bar_index]]$stat_params[[actual_bw_index]] 23 | 24 | if (is.null(actual_bw)){ 25 | actual_bw <- diff(range(variable_vector, na.rm = TRUE))/default_n_of_bins 26 | } # we handle here the common case of the user not changing the default setting for bins size 27 | 28 | # after retrieving or computing the actual binwidth I compute the distance from the optimum 29 | 30 | distance_from_optimum <- optimal_bw - actual_bw 31 | if(distance_from_optimum !=0){ 32 | optimization_data <- list("test" = TRUE,"optimal_bw" = optimal_bw, "distance_from_optimum" = distance_from_optimum) 33 | return(optimization_data) 34 | }else{return(list("test"=FALSE,"optimal_bw" =optimal_bw,"distance_from_optimum"=distance_from_optimum))} 35 | }else{list(FALSE,NA,100)} 36 | 37 | } 38 | -------------------------------------------------------------------------------- /R/does_it_need_geom_smooth.R: -------------------------------------------------------------------------------- 1 | #' study the relationship among variables to find out if a geom_smooth is needed within the plot 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @param correlation_threshold a judgmentally specified threshold to test for the need of a trend line on the plot 5 | #' @param aes_db a data frame storing plot object aestetichs and variable mapped against them 6 | #' @description \code{does_it_need_geom_smooth} computes distance correlation among x and y aestetich to evaluate it against a judgmental threshold. 7 | #' if this threshold is breached and no \code{geom_smooth} is observed among ggplot objects layers a postive outcome is produced for the test. 8 | #' @return a list storing in two separate vectors the final result of the test and the observed distance correlation. 9 | #' the test outcome is TRUE in case a trend line is deemed necessary. 10 | #' @export 11 | does_it_need_geom_smooth <- function(plot_object, n_of_layers,correlation_threshold,aes_db){ 12 | 13 | # first we check for correlations ( of any type) among points 14 | raw_x <- aes_puller(plot_object,n_of_layers, "x") 15 | if(mode(raw_x) == "list"){x_vector <- raw_x[,1] %>% pull}else{x_vector <- raw_x} 16 | 17 | #look for density plot currently not handled 18 | if(!is.na(match("..density..",aes_db$variable))){y_vector <- c()}else{ 19 | raw_y <- aes_puller(plot_object,n_of_layers, "y") 20 | if(mode(raw_y) == "list"){y_vector <- raw_y[,1] %>% pull}else{y_vector <- raw_y} 21 | } 22 | not_handled <- c("factor","character") 23 | #check on variabe type 24 | if(class(x_vector) %in% not_handled | class(y_vector) %in% not_handled | !is.na(match("..density..",aes_db$variable))){return(list(NA,0,NA)) 25 | } else if(!is.na(x_vector) & !is.na(y_vector)){ 26 | distance_correlation <- dcor(x_vector,y_vector) 27 | }else{distance_correlation <- 2} 28 | 29 | # if the level of correlation is relevant we look for the presence of geom_smooth 30 | if(abs(distance_correlation)>correlation_threshold & distance_correlation != 2){ 31 | geoms_vector <- geoms_lister(plot_object, n_of_layers) 32 | if(!is.na(match("GeomSmooth",geoms_vector))){return(list(FALSE,distance_correlation))}else{ 33 | return(list(TRUE,distance_correlation)) # if we have a relevant correlation and no geom smooth we suggest to add one 34 | } 35 | 36 | }else if(distance_correlation == 2){return(list(NA,0))} 37 | else{ 38 | return(list(FALSE,distance_correlation)) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vizscorer 2 | a clever bot to rate and improve your ggplot plot 3 | 4 | Vizscorer is an R package which tries to help users discover weaknesses in their ggplot plots. To do so it applies a mix of machine learning and natural language generation. 5 | 6 |
7 | infoviz 8 |
9 | 10 | Vizscorer basically: 11 | 12 | - looks at ggplot internals 13 | - analyzes them 14 | - produces a deck of slides showing an overall evaluation and detailed suggestions about how to improve the plot. 15 | 16 | See it in action below: 17 | 18 |
19 | infoviz 20 |
21 | 22 | Vizscorer continues the effort started with [paletteR](https://github.com/AndreaCirilloAC/paletter) to increase the level of quality of the average plot produced in companies, where there is no time to study data visualization theory. 23 | 24 | ## how to use it 25 | 26 | install vizscorer package: 27 | 28 | 29 | devtools::install_github("AndreaCirilloAC/vizscorer") 30 | 31 | 32 | call the scorer_bot on your ggplot plot: 33 | 34 | 35 | scorer_bot(plot_object = your_ggplot_plot) 36 | 37 | 38 | after some strange code populating your console a web browser will show up to display you a deck of slide containing: 39 | 40 | - an assessment of your plot 41 | - a set of customized suggestions about how to improve your plot, based on data visualization principles 42 | 43 | ### Disclaimer 44 | 45 | **vizscorer is currently in beta mode and every feedback and support from early adopters is more than welcome :)** 46 | 47 | The package currently does not support complex ggplot plots, like plots with data not provided within the *ggplot()* call but rather within the other layers. 48 | If any issue arises, please consider submit an issue within the *Issues* section or forking the repo and fix it by yourself! 49 | 50 | ### further area of development 51 | 52 | - increasing the number of plots employed for training the gradient boosting model (more about this in a forthcoming blog post) 53 | - introducing a “console version” of the report, more usable and less annoying in the long run 54 | - increasing the level of cleaverness of the bot to make it able to analyse more complex plots and provide even more useful suggestions 55 | -------------------------------------------------------------------------------- /inst/extdata/report.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Your Plot Meta Analysis" 3 | output: 4 | revealjs::revealjs_presentation: 5 | center: true 6 | transition: fade 7 | theme: serif 8 | --- 9 | 10 | 68 | 69 | Hi! I am you dataviz bot advisor_ 70 | 71 | --- 72 | 73 | I have looked into your plot and evaluated it against dataviz best practices and theoretical frameworks_ 74 | 75 | --- 76 | 77 | your plot looks __`r judgment`___ 78 | 79 | to be more precise your plot scored __`r round(score,2)`__ 80 | 81 | 82 | --- 83 | 84 | here it is a comparison between your score and the score obtained from plots included within the score estimation database. 85 | 86 | ```{r eval=TRUE, include=TRUE,echo=FALSE, fig.height=5,fig.width=7,5} 87 | 88 | score_plot 89 | 90 | ``` 91 | 92 | you can find more info about how my scoring algorithm works on my official webpage 93 | 94 | --- 95 | 96 | let me tell you a bit more about this_ 97 | 98 | --- 99 | 100 | as far as I can see the data viz area in which your plot scored worst was the __`r worst_area`___ 101 | 102 | --- 103 | 104 | here it is a summary of your plot performance for each area. 105 | 106 | `r kable(positive_ratio_show)` 107 | 108 | you can look into my documentation to learn which test were performed and how the scores were computed. 109 | you can also find more info about all on my official webpage 110 | 111 | --- 112 | 113 | is now time to learn how to make your plot more effective applying dataviz principles_ 114 | 115 | --- 116 | 117 | I will give you an advice on how to fix your plot for each test failed_ 118 | 119 | -------------------------------------------------------------------------------- /R/outlier_labels.R: -------------------------------------------------------------------------------- 1 | #' check for the presence of outliers not labelled within a ggplot object 2 | #' @param plot_object the ggplot object being analysed 3 | #' @param n_of_layers number of layers within the ggplot object 4 | #' @param p_build an object obtained running \code{ggplot_build} on the ggplot object 5 | #' @param aes_db a data frame storing plot object aestetichs and variable mapped against them 6 | #' @description for x and y variable the Tukey outlier rule is applied to evaluate the presence of outliers. 7 | #' If any outlier is identified one more check is performed to retrieve among plot metadata the presence of some annotation related to that data. 8 | #' @return a list storing a TRUE in case there are no outliers or the detected outliers are labelled and FALSE in the other cases. 9 | #' @export 10 | outlier_labels <- function(plot_object,n_of_layers,p_build,aes_db){ 11 | #TRUE as final result means either you don't have outliers or the ones you have are labeled. the relevant result here is 12 | # FALSE since it discriminates between a good and a bad plot. 13 | 14 | #draw_data 15 | raw_x <- aes_puller(plot_object,n_of_layers, "x") 16 | if(mode(raw_x) == "list"){x_vector <- raw_x[,1] %>% pull}else{x_vector <- as.numeric(raw_x)} 17 | if(!is.na(match("..density..",aes_db$variable) )){y_vector <- c(NA)}else{ 18 | raw_y <- aes_puller(plot_object,n_of_layers, "y") 19 | if(mode(raw_y) == "list"){y_vector <- raw_y[,1] %>% pull}else{y_vector <- as.numeric(raw_y)} 20 | } 21 | ############ x 22 | if(is.na(x_vector)|is.character(x_vector)){x_outliers_are_labelled <- NA}else{ 23 | # look for outliers 24 | x_outliers <- boxplot.stats(x_vector )$out 25 | 26 | 27 | if (length((x_outliers))>0){ #we preliminary check for outliers presence, to avoid unnecessary computations 28 | # look for geom_text 29 | text_index <- match("GeomText",geoms_lister(plot_object, n_of_layers )) 30 | # if text is found I look at data to see if we are looking the outliers are labeled 31 | if(is.na(text_index)){x_outliers_are_labelled <- FALSE}else if(text_index > 0){ 32 | 33 | text_x <- p_build$data[[text_index]]$x 34 | 35 | 36 | x_outliers_are_labelled <- if(length(x_outliers)==0){TRUE}else if(x_outliers%in% text_x){TRUE}else{FALSE} 37 | 38 | }}else{x_outliers_are_labelled <- TRUE}} 39 | 40 | ########### y 41 | 42 | if(is.na(y_vector)|is.character(x_vector)){ y_outliers_are_labelled <- NA}else{ 43 | 44 | # look for outliers 45 | 46 | y_outliers <- boxplot.stats(y_vector)$out 47 | 48 | if (length(y_outliers)>0){ #we preliminary check for outliers presence, to avoid unnecessary computations 49 | # look for geom_text 50 | text_index <- match("GeomText",geoms_lister(plot_object, n_of_layers )) 51 | # if text is found we look at data to see if in the plot we are looking at the outliers are labeled 52 | if(is.na(text_index)){y_outliers_are_labelled <- FALSE}else if(text_index > 0){ 53 | 54 | text_y <- p_build$data[[text_index]]$y 55 | 56 | y_outliers_are_labelled <- if(length(y_outliers)==0){TRUE}else if(y_outliers%in% text_y){TRUE}else{FALSE} 57 | 58 | }}else{y_outliers_are_labelled <- TRUE}} 59 | ############### 60 | general_result <- if(is.na(x_outliers_are_labelled) & is.na(y_outliers_are_labelled)){ 61 | TRUE} else{if(prod(x_outliers_are_labelled,y_outliers_are_labelled, na.rm = TRUE)== TRUE){TRUE}else{FALSE}} 62 | 63 | return(general_result) 64 | } 65 | -------------------------------------------------------------------------------- /R/scorer.R: -------------------------------------------------------------------------------- 1 | #'scoring the effectivenes of ggplot object applying a gradient boosting model 2 | #'@description \code{scorer} function handles the running of a gradient boosting pre-trained model on the metadata of a ggplot object. 3 | #'As a result a 0 to 1 score is produced, measuring the effectiveness of the related plot based on a set of rules drawn from data visualization theory. 4 | #'@param plot_metadata a data frame storing ggplot object metadata, resulting from a previous call of \code{metadata_reader} function. 5 | #'@return a numeric value 6 | #' @export 7 | scorer <- function(plot_metadata = NULL){ 8 | 9 | if(is.null(plot_metadata)){stop("you must provide a valid set of plot metadata to obtain a score")}else{ 10 | 11 | # take as input a data frame with one row, coming from metadata_reader 12 | 13 | # predict plot score employing the gradient boosting fitted on the classification db 14 | 15 | scoring_db_raw <- data.frame() 16 | colnames_vector <- c( 17 | 'pie_chart', 18 | 'number_of_layers', 19 | 'number_of_dimensions', 20 | 'width_of_bins', 21 | 'flipped_barplot', 22 | 'need_for_a_smooth', 23 | 'sufficient_number_of_data', 24 | 'overplotting', 25 | 'use_of_heavy_background', 26 | 'filled_barplot', 27 | 'presence_of_title', 28 | 'presence_of_subtitle', 29 | 'presence_of_caption', 30 | 'special_characters_in_label', 31 | 'outliers_not_labelled') 32 | 33 | plot_metadata <- plot_metadata %>% as.data.frame() 34 | 35 | result <- plot_metadata$test %>% as.character() 36 | row <- data.frame(t(c(result))) 37 | colnames(row) <- colnames_vector 38 | scoring_db_raw <- rbind(scoring_db_raw,row) 39 | colnames(scoring_db_raw) <- colnames_vector 40 | 41 | scoring_db_raw %>% 42 | mutate(pie_chart = as.logical(pie_chart), 43 | number_of_dimensions = as.logical(number_of_dimensions), 44 | number_of_layers = as.numeric(as.character(number_of_layers)), 45 | width_of_bins = as.numeric(as.character(width_of_bins)), 46 | flipped_barplot = as.logical(flipped_barplot), 47 | need_for_a_smooth = as.numeric(as.character(need_for_a_smooth)), 48 | sufficient_number_of_data = as.logical(sufficient_number_of_data), 49 | overplotting =as.numeric(as.character(overplotting)), 50 | use_of_heavy_background = as.logical(use_of_heavy_background), 51 | filled_barplot= as.logical(filled_barplot), 52 | presence_of_title = as.logical(presence_of_title), 53 | presence_of_subtitle = as.logical(presence_of_subtitle), 54 | presence_of_caption = as.logical(presence_of_caption), 55 | special_characters_in_label = as.logical(special_characters_in_label), 56 | outliers_not_labelled = as.logical(outliers_not_labelled))-> scoring_db 57 | 58 | predict(gbm_fit,newdata = scoring_db, probability = TRUE) 59 | probabilities <- extractProb(list(gbm = gbm_fit),unkX = scoring_db ) 60 | #return as output the probability of being a good plot, i.e. the final score 61 | 62 | return(probabilities$good) 63 | 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /inst/extdata/plot_advices.csv: -------------------------------------------------------------------------------- 1 | topic_label;advice;problem 2 | pie_chart;pie charts are rarely the better way to display data, since makes difficult to make precise comparison between categories. Why do not simply look for a well defined horizontal bar chart?;you are using a pie chart 3 | number_of_layers;you are trying to show too many elements at once, why do not split your plot into more plots or introduce some facet? Look at facet_wrap for further info;you are trying to show too many things at once 4 | number_of_dimensions;you are showing three dimensions on a two dimension support. This can give a quick comparison of values of your data ,but is not as well suited for accurate or precise lookup. it would be better to split it into two different plots, for instance employing facet_wrap;showing more than 2D on a 2D space is difficult 5 | number_of_bins;the histogram you specified is not the best possibile visualisation of your data. I have found that setting bins to a different width produces a better visualisation of your data structure. Look for the Freedman-Diaconis rule to understand how to set the optimal width of your bins;try to pick a better width for your bins 6 | flipped_barplot;humans are better able at comparing lengths when showed horizontally. Unless you are showing time-related information (i.e. time series over the x axis) try to flip your bars adding coord_flip() to your plot specification;let's flip your bars down! 7 | need_for_a_smooth;it seems a strong correlation is going on between your data. Try to help the reader of your plot spotting it adding geom_smooth() to your plot;you should highlight trends going on 8 | sufficient_number_of_data;your plot shows less then 20 data points. You would obtain a better result employing a table. You can find more info about this within one of masterpieces about data visualization: _The Visual Display of Quantitative Information_ by E. R. Tufte;too few points for a plot 9 | overplotting;you are incurring in __overplotting__: there are too many points to be displayed at once in an accurate way. you should consider introducing some faceting. If you rather want to maintain the current structure of the plot you could consider employing a different geom like geom_bin2d() able to better avoid overplotting;your plot is crowded due to overplotting 10 | use_of_heavy_background;you are employing the default ggplot theme. while it is generally fit for its purpose you should consider removing that grey background, which lowers your data to ink ratio. Please find more info about data ink ratio on the official info vis wiki page;try to remove the grey background 11 | filled_barplot;you are using monochromatic filled bars. that great amount of black neither show any additional data nor convey significant message. consider change it into a geom_point() or a geom_point_range();the filling of your bars is not adding information 12 | presence_of_title;your plot is missing a title. this makes harder for your reader to understand its content. try to add a title employing labs(title = 'relevant title');what is a book without a title? 13 | presence_of_subtitle; your plot is missing a subtitle. Subtitle are effective in providing additional relevant info about metrics and data showed in your plot.Try to add a plot employing labs(subtitle = 'your subtitle');let us understand your plot better with a subtitle 14 | presence_of_caption;your plot is missing a caption. While not mandatories , captions are used to provide info about sources and reference period of your plot. This is crucial to increase the level of graphical integrity of your plot;use caption to add integrity to your plot 15 | special_characters_in_label;your labels shows special characters. this makes them hard to read and reduce their effectiveness. try to solve this explicitly setting your labels trough labs(x = 'meaningful x axis' = ,y = 'meaningful y axis');special characters in labels lower readability 16 | outliers_not_labelled;I have found some outliers in your data ,and they do not seem to be explicitly labeled in your plot.To increase the level of meaningfulness you should consider labelling them;mark that outliers 17 | -------------------------------------------------------------------------------- /plot_report.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Your Plot Meta Analysis" 3 | output: 4 | revealjs::revealjs_presentation: 5 | center: true 6 | transition: fade 7 | theme: serif 8 | --- 9 | 10 | 68 | 69 | Hi! I am you dataviz bot advisor_ 70 | 71 | --- 72 | 73 | I have looked into your plot and evaluated it against dataviz best practices and theoretical frameworks_ 74 | 75 | --- 76 | 77 | your plot looks __`r judgment`___ 78 | 79 | to be more precise your plot scored __`r round(score,2)`__ 80 | 81 | 82 | --- 83 | 84 | here it is a comparison between your score and the score obtained from plots included within the score estimation database. 85 | 86 | ```{r eval=TRUE, include=TRUE,echo=FALSE, fig.height=5,fig.width=7,5} 87 | 88 | score_plot 89 | 90 | ``` 91 | 92 | you can find more info about how my scoring algorithm works on my official webpage 93 | 94 | --- 95 | 96 | let me tell you a bit more about this_ 97 | 98 | --- 99 | 100 | as far as I can see the data viz area in which your plot scored worst was the __`r worst_area`___ 101 | 102 | --- 103 | 104 | here it is a summary of your plot performance for each area. 105 | 106 | `r kable(positive_ratio_show)` 107 | 108 | you can look into my documentation to learn which test were performed and how the scores were computed. 109 | you can also find more info about all on my official webpage 110 | 111 | --- 112 | 113 | is now time to learn how to make your plot more effective applying dataviz principles_ 114 | 115 | --- 116 | 117 | I will give you an advice on how to fix your plot for each test failed_ 118 | 119 | 120 | --- 121 | 122 | __you should highlight trends going on__ 123 | 124 | it seems a strong correlation is going on between your data. Try to help the reader of your plot spotting it adding geom_smooth() to your plot_ 125 | 126 | --- 127 | 128 | __try to remove the grey background__ 129 | 130 | you are employing the default ggplot theme. while it is generally fit for its purpose you should consider removing that grey background, which lowers your data to ink ratio. Please find more info about data ink ratio on the official info vis wiki page_ 131 | 132 | --- 133 | 134 | __let us understand your plot better with a subtitle__ 135 | 136 | your plot is missing a subtitle. Subtitle are effective in providing additional relevant info about metrics and data showed in your plot.Try to add a plot employing labs(subtitle = 'your subtitle')_ 137 | 138 | --- 139 | 140 | __use caption to add integrity to your plot__ 141 | 142 | your plot is missing a caption. While not mandatories , captions are used to provide info about sources and reference period of your plot. This is crucial to increase the level of graphical integrity of your plot_ 143 | 144 | --- 145 | 146 | __mark that outliers__ 147 | 148 | I have found some outliers in your data ,and they do not seem to be explicitly labeled in your plot.To increase the level of meaningfulness you should consider labelling them_ 149 | 150 | --- 151 | Hope you will find this useful and will help you improve your plot 152 | 153 | Cheers, 154 | 155 | _your dataviz bot advisor_ 156 | 157 | -------------------------------------------------------------------------------- /R/scorer_bot.R: -------------------------------------------------------------------------------- 1 | #' produce a customised report analysing the submitted ggplot object 2 | #' @param plot_object the ggplot object being analysed 3 | #' @description call to \code{scorer_bot} function produces the sequential calling of all function included within 4 | #' the \code{vizscorer} package, with the purpose to produce a customised report about the submitted ggplot object. The function handles call to \code{metadata_reader} function 5 | #' to draw metadata from the plot object in a format ready to be submitted to the scorer algorithm. within the next step those metadata are submitted to the \code{scorer} function which apply a gradient boosting pre-trained model to them. 6 | #' a final score from 0 to 1 is computed as a probability to be a good plot. 7 | #' To provide more relevant information to the user a set of advices are retrieved from a built-in knowledge base, selecting only those relevant to suboptimal areas of the plot, observed from metadata. 8 | #' Finally a revealjs deck of slides is programmatically created leveraging a quasi-real natural language generation algorithm. 9 | #' @return a reveljs deck of slide containing an evaluation of the effectiveness of ggplot objects and suggestion about how to improve it. 10 | #' @export 11 | scorer_bot <- function(plot_object = NULL){ 12 | plot_metadata_raw <- metadata_reader(plot_object) %>% as.data.frame() 13 | plot_metadata <- data.frame(area_label = unlist(plot_metadata_raw$area_label), 14 | topic_label = unlist(plot_metadata_raw$topic_label), 15 | test = unlist(plot_metadata_raw$test), 16 | ispositive = unlist(plot_metadata_raw$ispositive)) 17 | 18 | score <- scorer(plot_metadata) 19 | score_db <- data.frame("good"=score,"count"=1) 20 | comparison_path <- system.file("extdata", "comparison_db.csv", package = "vizscorer") 21 | comparison_db <- read.csv(comparison_path, sep = ",", stringsAsFactors = FALSE) %>% 22 | select(good,model) 23 | 24 | comparison_db %>% 25 | ggplot(aes(x = good))+ 26 | geom_histogram(binwidth = 0.01,colour = "grey80",fill ="grey80")+ 27 | geom_point(data =score_db, 28 | aes(x=good,y=count), 29 | inherit.aes = FALSE,colour = "red",size=3)+ 30 | theme_minimal()+ 31 | xlab("score")+ 32 | labs(title="comparison among training plots and your plot")+ 33 | annotate("segment", x = score, xend = 0.4, y = 1, yend = 5,colour ="grey50")+ 34 | annotate(geom ="text", 35 | label = "you are here", 36 | x = 0.4,y=5.1,colour="grey20",size= 7) -> score_plot 37 | 38 | score_map <- data.frame(lower = c(0,0.11,0.41,0.71), 39 | upper = c(0.10,0.40,0.70,1), 40 | judgment = c("poorly", "improvable","good!", "great!")) 41 | 42 | score_map$score_match <- mapply(between, 43 | x = rep(score,nrow(score_map)), 44 | left = score_map$lower, 45 | right = score_map$upper, 46 | SIMPLIFY = TRUE) 47 | score_map %>% 48 | filter(score_match == TRUE) %>% 49 | select(judgment) %>% pull() %>% 50 | as.character() -> judgment 51 | 52 | # from metadata, leveraging area_label and ispositive attribute, we highlight which area of dataviz 53 | # shows problem for the plot 54 | # we will highlight those area to our user 55 | # brief test for histograms to filter out test note relevant for non-histogram plots 56 | is_histogram <- test_for_histogram(plot_object,plot_object$layers %>% length()) 57 | 58 | plot_metadata %>% 59 | {if (is_histogram == FALSE) filter(., topic_label != "number_of_bins" & topic_label != "flipped_barplot" ) else filter(.,topic_label !="stop")} %>% 60 | select(area_label,ispositive) %>% 61 | group_by(area_label,ispositive) %>% 62 | count() %>% 63 | spread(key = ispositive, value = n) %>% 64 | mutate(positive_ratio = `TRUE`/sum(`TRUE`,`FALSE`, na.rm = TRUE)) %>% 65 | mutate(area_ratio =round(positive_ratio,2)) %>% 66 | rename(passed = `TRUE`,failed = `FALSE`) %>% 67 | mutate(total = sum(passed , failed,na.rm = TRUE)) -> positive_ratio_db 68 | 69 | positive_ratio_db %>% 70 | select(area_label,failed,passed,area_ratio) -> positive_ratio_show 71 | 72 | positive_ratio_show[is.na(positive_ratio_show)] <- 0 73 | 74 | positive_ratio_db %>% 75 | filter(positive_ratio == min(.$positive_ratio)) %>% 76 | select(area_label) %>% 77 | pull() %>% 78 | as.character() -> worst_area # the worst area was the one in which the plot obtained ther worst positive rate, 79 | #i.e. number of positive results in test given the overall number of test for that area. 80 | 81 | # we then select each specific test showing problems, so to give specific advice for each of this elements. 82 | plot_metadata %>% 83 | {if (is_histogram == FALSE) filter(., topic_label != "number_of_bins" & topic_label != "flipped_barplot" ) else filter(.,topic_label !="stop")} %>% 84 | select(topic_label,ispositive) %>% 85 | filter(ispositive == FALSE) -> errors_db 86 | 87 | # merge errors_db with a db storing a suggestion for each possible error 88 | advices_path <- system.file("extdata", "plot_advices.csv", package = "vizscorer") 89 | advices_db <- read.csv(advices_path, sep = ";", stringsAsFactors = FALSE) 90 | 91 | errors_db %>% 92 | left_join(.,advices_db,by = "topic_label") -> teaching_db 93 | 94 | n_of_errors <- nrow(errors_db) 95 | report_path <- system.file("extdata", "report.Rmd", package = "vizscorer") 96 | file.copy(report_path, to = "plot_report.Rmd",overwrite = TRUE) 97 | write(" ", file = "plot_report.Rmd", append = TRUE) 98 | for (i in 1:n_of_errors){ 99 | 100 | 101 | write("---", file = "plot_report.Rmd", append = TRUE) 102 | write(" ", file = "plot_report.Rmd", append = TRUE) 103 | write(paste0("__",teaching_db[i,4],"__"), file = "plot_report.Rmd", append = TRUE) 104 | write(" " , file = "plot_report.Rmd", append = TRUE) 105 | write(paste0(teaching_db[i,3],"_",sep = ""),file = "plot_report.Rmd", append = TRUE) #<- il problema è in questa aggiunta 106 | write(" " , file = "plot_report.Rmd", append = TRUE) 107 | 108 | } 109 | 110 | write("---", file = "plot_report.Rmd", append = TRUE) 111 | write("Hope you will find this useful and will help you improve your plot", file = "plot_report.Rmd", append = TRUE) 112 | write(" " , file = "plot_report.Rmd", append = TRUE) 113 | write("Cheers,", file = "plot_report.Rmd", append = TRUE) 114 | write(" " , file = "plot_report.Rmd", append = TRUE) 115 | write("_your dataviz bot advisor_", file = "plot_report.Rmd", append = TRUE) 116 | write(" " , file = "plot_report.Rmd", append = TRUE) 117 | 118 | rmarkdown::render("plot_report.Rmd", revealjs_presentation()) 119 | system("open plot_report.html") 120 | } 121 | 122 | 123 | -------------------------------------------------------------------------------- /R/metadata_reader.R: -------------------------------------------------------------------------------- 1 | #' draw a data frame containing relevant metadata of a ggplot object 2 | #' @param plot_object the ggplot object being analysed 3 | #' @description \code{metadata_reader} function performs a number of test on the provided ggplot object to draw some intelligence about it. 4 | #' It tries to derive attributes ofthe plot relate to four data visualization theory areas: 5 | #' - the adequateness of labelling 6 | #' - the data density 7 | #' - the data to ink ratio 8 | #' - the readability of the plot 9 | #' The results of performed tests are stored in a structued list object, so to be provided as an input to a gradient boosting model. 10 | #' See \code{scorer} function documentation for more info about this subsequent step. 11 | #' @return a list storing results and additional data related to each test performed. 12 | #' @export 13 | metadata_reader <- function(plot_object = NULL){ 14 | 15 | # preliminary chech on arguments of the function and others 16 | 17 | if(is.null(plot_object)){ 18 | stop("you must provide a valid ggplot2 object", call = FALSE) 19 | } 20 | #blocking conditions: 21 | # data must e provided within ggplot() call 22 | if(is.null(plot_object$data[1])){ 23 | stop("current version only checks on data provided within ggplot call") 24 | } 25 | # we have to be sure x aestetich is not named "x" and the same for y. Otherwise the introduced check for 26 | # inherited aestetichs (cfr. mapplings_lister) would produce the removal of the right occurence of 27 | if(plot_object$mapping %>% 28 | unlist() %>% 29 | as.character() %>% 30 | match("y") %>% 31 | sum(na.rm = TRUE) != 0 | 32 | plot_object$mapping %>% unlist() %>% as.character() %>% match("y") %>% sum(na.rm = TRUE)!=0){ 33 | stop("it seems you have columns named 'x' or 'y' within the db you provided. This is not safe for some of the check performed. Go and change this, then come back and try again.") 34 | } 35 | n_of_layers <- plot_object$layers %>% length() 36 | # no overplotting check for histograms ( and generally where no y is provided) 37 | if(test_for_histogram(plot_object,n_of_layers) == TRUE){ 38 | message("no check for overplotting on histogram is currently provided") 39 | } 40 | #limitations 41 | aes_db <- mappings_lister(plot_object,n_of_layers) 42 | 43 | raw_x <- aes_puller(plot_object,n_of_layers, "x") 44 | 45 | if(mode(raw_x) == "list"){x_vector <- raw_x[,1] %>% pull}else{x_vector <- as.numeric(raw_x)} 46 | 47 | 48 | if(!is.na(match("..density..",aes_db$variable))){y_vector <- c()}else{ 49 | raw_y <- aes_puller(plot_object,n_of_layers, "y") 50 | if(mode(raw_y) == "list"){y_vector <- raw_y[,1] %>% pull}else{y_vector <- as.numeric(raw_y)} 51 | } 52 | not_handled <- c("factor","character") 53 | available_and_not_handled <- intersect( x = c(unique(c(class(x_vector),class(y_vector)))),y = not_handled) 54 | #check on variabe type 55 | if(class(x_vector) %in% not_handled | class(y_vector) %in% not_handled){ 56 | message(paste("you have variables of class ", 57 | paste(unique(c(class(x_vector),class(y_vector))),collapse = " and "), 58 | ". Not all checks are implemented for ", paste(available_and_not_handled, collapse = " and ")))} 59 | 60 | 61 | 62 | 63 | area_categories <- c("readability of the plot", 64 | "data density", 65 | "data to ink ratio", 66 | "adequateness of labeling") 67 | 68 | default_n_of_bins <- 30 # the default ggplot setting for the number of bins 69 | data_threshold <- 20 # following tufte we set 20 as threshold to suggest the useR to avoid graphs 70 | overplotting_floor <- if(!is.na(match("GeomLine",geoms_lister(plot_object ,n_of_layers)))){ 71 | 16 72 | } else{1} # judgmental, based on experience 73 | layers_tresholds <- 5 # judgmental, based on experience 74 | correlation_threshold <- .4 # judgmental, based on experience 75 | bins_distance <- 6 # judgmental, based on experience 76 | p_build <- ggplot_build(plot_object) 77 | 78 | # perform checks for each of the area and principle and assign a score 79 | 80 | 81 | 82 | # READABILITY OF THE PLOT 83 | 84 | ## is it a pie chart? 85 | 86 | pie_results <- list( area_label = area_categories[1], 87 | topic_label = "pie_chart", 88 | test = as.logical(is_pie_chart(plot_object, n_of_layers)), 89 | ispositive = as.logical(!is_pie_chart(plot_object, n_of_layers)), 90 | additional_data = list()) 91 | 92 | ## are there too many layers? 93 | 94 | layers_results <- list( 95 | area_label = area_categories[1], 96 | topic_label = "number_of_layers", 97 | test = n_of_layers,# here the number of layers 98 | ispositive = !(n_of_layers > layers_tresholds ), 99 | additional_data = n_of_layers) 100 | 101 | ## is the user showing more dimensions than the plot would allow to? 102 | 103 | dimension_results <- list( 104 | area_label = area_categories[1], 105 | topic_label = "number_of_dimensions", 106 | test = too_many_dimensions(plot_object,n_of_layers), 107 | ispositive = !too_many_dimensions(plot_object,n_of_layers), 108 | additional_data = list()) 109 | 110 | ## in case of geom_histogram study the optimal number of bins, employing the Freedman Diaconis rule 111 | 112 | bins_results <- list( 113 | area_label = area_categories[1], 114 | topic_label = "number_of_bins", 115 | test = histogram_bins_tester(plot_object,n_of_layers,default_n_of_bins)[[3]], #here the distance from the optimal number of bins 116 | ispositive = !(histogram_bins_tester(plot_object,n_of_layers,default_n_of_bins)[[3]]>bins_distance), 117 | additional_data = histogram_bins_tester(plot_object,n_of_layers,default_n_of_bins)[2:3] ) 118 | ## let's check if we are looking at a bar plot and if yes we check if it is flipped or not 119 | 120 | flipped_bar_results <- list( 121 | are_label = area_categories[1], 122 | topic_label = "flipped_barplot", 123 | test = is_horizontal_barplot(plot_object,n_of_layers),# TRUE here means we are looking at an horizontal barplot, which is good 124 | ispositive = is_horizontal_barplot(plot_object,n_of_layers), 125 | additional_data = list() 126 | ) 127 | 128 | # we look here for correlation between x and y and for the presence of geom_smooth lines. 129 | 130 | geom_smooth_results <- list( 131 | area_label = area_categories[1], 132 | topic_label = "need_for_a_smooth", 133 | test = does_it_need_geom_smooth(plot_object,n_of_layers,correlation_threshold,aes_db)[[2]],#we place here the distance correlation to better apply machine learning technique 134 | ispositive = !(does_it_need_geom_smooth(plot_object,n_of_layers,correlation_threshold,aes_db)[[2]]>correlation_threshold), 135 | additional_data = does_it_need_geom_smooth(plot_object,n_of_layers,correlation_threshold,aes_db)[[2]] 136 | 137 | ) 138 | 139 | # DATA DENSITY 140 | 141 | ## we check here for the user wasting his time developing a graph for less than 20 points to show 142 | 143 | n_data_results <- list( 144 | area_label = area_categories[2], 145 | topic_label = "sufficient_number_of_data", 146 | test = too_few_data(plot_object, data_threshold), #TRUE here means you have enough data 147 | ispositive = too_few_data(plot_object, data_threshold), 148 | additional_data = list ()) 149 | 150 | ## check here for overplotting 151 | 152 | overplotting_results <- list( 153 | area_label = area_categories[2], 154 | topic_label = "overplotting", 155 | test = cozy_plot(plot_object, n_of_layers,overplotting_floor,aes_db)[[2]], # the median distance here as a measure of overplotting 156 | ispositive = !(cozy_plot(plot_object, n_of_layers,overplotting_floor,aes_db)[[2]]