├── .Rbuildignore
├── R
    ├── sysdata.rda
    ├── zzz.R
    ├── coordinates_lister.R
    ├── test_for_geom_bar.R
    ├── labels_finder.R
    ├── geoms_lister.R
    ├── too_few_data.R
    ├── is_horizontal_barplot.R
    ├── is_pie_chart.R
    ├── aes_puller.R
    ├── test_for_histogram.R
    ├── labels_reader.R
    ├── too_many_dimensions.R
    ├── filled_barplot.R
    ├── heavy_background.R
    ├── mappings_lister.R
    ├── cozy_plot.R
    ├── histogram_bins_tester.R
    ├── does_it_need_geom_smooth.R
    ├── outlier_labels.R
    ├── scorer.R
    ├── scorer_bot.R
    └── metadata_reader.R
├── vizscorer.Rproj
├── man
    ├── coordinates_lister.Rd
    ├── geoms_lister.Rd
    ├── labels_reader.Rd
    ├── mappings_lister.Rd
    ├── labels_finder.Rd
    ├── test_for_geom_bar.Rd
    ├── is_pie_chart.Rd
    ├── aes_puller.Rd
    ├── is_horizontal_barplot.Rd
    ├── too_few_data.Rd
    ├── scorer.Rd
    ├── test_for_histogram.Rd
    ├── cozy_plot.Rd
    ├── heavy_background.Rd
    ├── too_many_dimensions.Rd
    ├── metadata_reader.Rd
    ├── histogram_bins_tester.Rd
    ├── outlier_labels.Rd
    ├── filled_barplot.Rd
    ├── does_it_need_geom_smooth.Rd
    └── scorer_bot.Rd
├── DESCRIPTION
├── NAMESPACE
├── .gitignore
├── LICENSE
├── inst
    └── extdata
    │   ├── comparison_db.csv
    │   ├── report.Rmd
    │   └── plot_advices.csv
├── README.md
└── plot_report.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaCirilloAC/vizscorer/HEAD/R/sysdata.rda


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .onLoad <- function(libname, pkgname) {
2 |   packageStartupMessage("vizscorer is currently under development and it may not be able to analyse your specific plot.
3 | Please look into package documentation for more details: https://github.com/AndreaCirilloAC/vizscorer")
4 | }


--------------------------------------------------------------------------------
/R/coordinates_lister.R:
--------------------------------------------------------------------------------
1 | #' list coordinates layers in a ggplot object
2 | #' @param plot_object the ggplot object being analysed
3 | #' @return a vector storing all coordinates found within layers of a ggplot object
4 | #' @export
5 | coordinates_lister <- function(plot_object){
6 |   plot_object$coordinates %>% class() %>% as.vector()
7 | }
8 | 


--------------------------------------------------------------------------------
/R/test_for_geom_bar.R:
--------------------------------------------------------------------------------
1 | #' look for geom_bar layer on a plot
2 | #' @param plot_object the ggplot object being analysed
3 | #' @param n_of_layers number of layers within the ggplot object
4 | #' @return TRUE if a geom_bar is found within ggplot object layers, FALSE in the opposite case.
5 | #' @export
6 | test_for_geom_bar <- function(plot_object,n_of_layers){
7 |   !is.na(match("GeomBar",geoms_lister(plot_object,n_of_layers)))
8 | }
9 | 


--------------------------------------------------------------------------------
/vizscorer.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | StripTrailingWhitespace: Yes
16 | 
17 | BuildType: Package
18 | PackageUseDevtools: Yes
19 | PackageInstallArgs: --no-multiarch --with-keep.source
20 | 


--------------------------------------------------------------------------------
/R/labels_finder.R:
--------------------------------------------------------------------------------
 1 | #' look for a specific label within a ggplot object
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param searched_label, among title, subtitle and caption
 4 | #' @return TRUE if the searched label was found, FALSE in the opposite case
 5 | #' @export
 6 | labels_finder <- function(plot_object,searched_label){
 7 |   labels_vector <- plot_object$labels %>% names() %>% as.vector()
 8 |   label_found <- if(is.na(match(searched_label,labels_vector))){FALSE}else{TRUE}
 9 | 
10 |   }
11 | 


--------------------------------------------------------------------------------
/man/coordinates_lister.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/coordinates_lister.R
 3 | \name{coordinates_lister}
 4 | \alias{coordinates_lister}
 5 | \title{list coordinates layers in a ggplot object}
 6 | \usage{
 7 | coordinates_lister(plot_object)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | }
12 | \value{
13 | a vector storing all coordinates found within layers of a ggplot object
14 | }
15 | \description{
16 | list coordinates layers in a ggplot object
17 | }
18 | 


--------------------------------------------------------------------------------
/man/geoms_lister.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geoms_lister.R
 3 | \name{geoms_lister}
 4 | \alias{geoms_lister}
 5 | \title{list geoms in a ggplot object}
 6 | \usage{
 7 | geoms_lister(plot_object, n_of_layers)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | }
14 | \value{
15 | a vector listing all geoms found within the given ggplot object
16 | }
17 | \description{
18 | list geoms in a ggplot object
19 | }
20 | 


--------------------------------------------------------------------------------
/R/geoms_lister.R:
--------------------------------------------------------------------------------
 1 | #' list geoms in a ggplot object
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @return a vector listing all geoms found within the given ggplot object
 5 | #' @export
 6 | geoms_lister <- function(plot_object, n_of_layers){
 7 |   geoms_in_plot <- c()
 8 |   for (i in 1:n_of_layers) {
 9 | 
10 |     class_list <- plot_object$layers[[i]]$geom %>% class() %>% as.vector()
11 |     geom <- class_list[1]
12 |     geoms_in_plot <- c(geoms_in_plot,geom)
13 |   }
14 |   return(geoms_in_plot)
15 | }
16 | 


--------------------------------------------------------------------------------
/man/labels_reader.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/labels_reader.R
 3 | \name{labels_reader}
 4 | \alias{labels_reader}
 5 | \title{look for special charaters in a plot labels}
 6 | \usage{
 7 | labels_reader(plot_object)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | }
12 | \value{
13 | a list storing test result and the list of special characters found within labels.
14 | }
15 | \description{
16 | this function match the labels of the given ggplot object against a set of predefined special characters
17 | }
18 | 


--------------------------------------------------------------------------------
/man/mappings_lister.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/mappings_lister.R
 3 | \name{mappings_lister}
 4 | \alias{mappings_lister}
 5 | \title{storing aestetich variables in a dataframe}
 6 | \usage{
 7 | mappings_lister(plot_object, n_of_layers)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | }
14 | \value{
15 | a data frane storing for each aestetich the mapped variable
16 | }
17 | \description{
18 | storing aestetich variables in a dataframe
19 | }
20 | 


--------------------------------------------------------------------------------
/man/labels_finder.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/labels_finder.R
 3 | \name{labels_finder}
 4 | \alias{labels_finder}
 5 | \title{look for a specific label within a ggplot object}
 6 | \usage{
 7 | labels_finder(plot_object, searched_label)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{searched_label, }{among title, subtitle and caption}
13 | }
14 | \value{
15 | TRUE if the searched label was found, FALSE in the opposite case
16 | }
17 | \description{
18 | look for a specific label within a ggplot object
19 | }
20 | 


--------------------------------------------------------------------------------
/man/test_for_geom_bar.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/test_for_geom_bar.R
 3 | \name{test_for_geom_bar}
 4 | \alias{test_for_geom_bar}
 5 | \title{look for geom_bar layer on a plot}
 6 | \usage{
 7 | test_for_geom_bar(plot_object, n_of_layers)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | }
14 | \value{
15 | TRUE if a geom_bar is found within ggplot object layers, FALSE in the opposite case.
16 | }
17 | \description{
18 | look for geom_bar layer on a plot
19 | }
20 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: vizscorer
 2 | Type: Package
 3 | Title: An intelligent advisor for your dataviz
 4 | Version: 0.1.0
 5 | Author: Andrea Cirillo
 6 | Maintainer: Andrea Cirillo <me@andreacirillo.com>
 7 | Description: vizscorer analyses yout ggplot plot object and provides you customised advices on how
 8 |         how to improve it. The analysis is performed applying machine learning to your plot metadata and results
 9 |         in a customised report.
10 | License: What license is it under?
11 | Encoding: UTF-8
12 | LazyData: true
13 | Depends: ggplot2 , cluster, dplyr , energy , caret , tidyr , knitr , revealjs
14 | RoxygenNote: 6.0.1
15 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(aes_puller)
 4 | export(coordinates_lister)
 5 | export(cozy_plot)
 6 | export(does_it_need_geom_smooth)
 7 | export(filled_barplot)
 8 | export(geoms_lister)
 9 | export(heavy_background)
10 | export(histogram_bins_tester)
11 | export(is_horizontal_barplot)
12 | export(is_pie_chart)
13 | export(labels_finder)
14 | export(labels_reader)
15 | export(mappings_lister)
16 | export(metadata_reader)
17 | export(outlier_labels)
18 | export(scorer)
19 | export(scorer_bot)
20 | export(test_for_geom_bar)
21 | export(test_for_histogram)
22 | export(too_few_data)
23 | export(too_many_dimensions)
24 | 


--------------------------------------------------------------------------------
/R/too_few_data.R:
--------------------------------------------------------------------------------
 1 | #' look for overshooting in case of a plot for with too few data
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param data_threshold a custom threshold to define too many data
 4 | #' @description this function checks for the presence of too few data to deserve a plot. Based on data visualization literature it is assumed that twenty data points can be considered a significant threshold
 5 | #' @return TRUE in case too few data points are observed, FALSE in the opposite site.
 6 | #' @export
 7 | too_few_data <- function(plot_object,data_threshold) {
 8 |   if (plot_object$data %>% nrow() < data_threshold) { FALSE}else{TRUE}
 9 | }
10 | 


--------------------------------------------------------------------------------
/R/is_horizontal_barplot.R:
--------------------------------------------------------------------------------
 1 | #' a simple function to test for the presence of too many dimension for a 2D plotting space
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @return TRUE in case the plot object is a bar plot and it is horizontally layed, FALSE in the opposite case.
 5 | #' @export
 6 | is_horizontal_barplot <- function(plot_object,n_of_layers){
 7 |   is_bar_plot <- test_for_geom_bar(plot_object,n_of_layers)
 8 |   is_flipped <- !is.na(match("CoordFlip" ,coordinates_lister(plot_object)))
 9 |   if(!is_bar_plot){return(FALSE)}else{
10 |   return(is_bar_plot & is_flipped)}
11 | }
12 | 


--------------------------------------------------------------------------------
/man/is_pie_chart.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/is_pie_chart.R
 3 | \name{is_pie_chart}
 4 | \alias{is_pie_chart}
 5 | \title{test a ggplot object to understand if it is a pie chart}
 6 | \usage{
 7 | is_pie_chart(plot_object, n_of_layers)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | }
14 | \value{
15 | TRUE if the plot is a pie chart and FALSE in the opposite case
16 | }
17 | \description{
18 | to understand if aplot is a pie chart this functions look for the presence of a \code{geom_bar} layer and a \code{coord_polar} layer.
19 | }
20 | 


--------------------------------------------------------------------------------
/man/aes_puller.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aes_puller.R
 3 | \name{aes_puller}
 4 | \alias{aes_puller}
 5 | \title{retrieve a vector storing values of a specified aestetich}
 6 | \usage{
 7 | aes_puller(plot_object, n_of_layers, required_aes)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | 
14 | \item{required_aes}{the ggplot object aestetich required}
15 | }
16 | \value{
17 | a vector containing values of the variable the required specified aestetich is mapped to
18 | }
19 | \description{
20 | retrieve a vector storing values of a specified aestetich
21 | }
22 | 


--------------------------------------------------------------------------------
/man/is_horizontal_barplot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/is_horizontal_barplot.R
 3 | \name{is_horizontal_barplot}
 4 | \alias{is_horizontal_barplot}
 5 | \title{a simple function to test for the presence of too many dimension for a 2D plotting space}
 6 | \usage{
 7 | is_horizontal_barplot(plot_object, n_of_layers)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | }
14 | \value{
15 | TRUE in case the plot object is a bar plot and it is horizontally layed, FALSE in the opposite case.
16 | }
17 | \description{
18 | a simple function to test for the presence of too many dimension for a 2D plotting space
19 | }
20 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # Example code in package build process
 9 | *-Ex.R
10 | 
11 | # Output files from R CMD build
12 | /*.tar.gz
13 | 
14 | # Output files from R CMD check
15 | /*.Rcheck/
16 | 
17 | # RStudio files
18 | .Rproj.user/
19 | 
20 | # produced vignettes
21 | vignettes/*.html
22 | vignettes/*.pdf
23 | 
24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
25 | .httr-oauth
26 | 
27 | # knitr and R markdown default cache directories
28 | /*_cache/
29 | /cache/
30 | 
31 | # Temporary files created by R markdown
32 | *.utf8.md
33 | *.knit.md
34 | 
35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
36 | rsconnect/
37 | .Rproj.user
38 | 


--------------------------------------------------------------------------------
/R/is_pie_chart.R:
--------------------------------------------------------------------------------
 1 | #' test a ggplot object to understand if it is a pie chart
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @description to understand if aplot is a pie chart this functions look for the presence of a \code{geom_bar} layer and a \code{coord_polar} layer.
 5 | #' @return TRUE if the plot is a pie chart and FALSE in the opposite case
 6 | #' @export
 7 | is_pie_chart <- function(plot_object, n_of_layers){
 8 | 
 9 |   coordinates_class <-   coordinates_lister(plot_object)
10 | 
11 |   # if we do not found any geom_bar we can be sure it is not a pie chart
12 | 
13 | test_for_geom_bar(plot_object,n_of_layers)& !is.na(match("CoordPolar",coordinates_class))
14 | 
15 |   }
16 | 
17 | 


--------------------------------------------------------------------------------
/man/too_few_data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/too_few_data.R
 3 | \name{too_few_data}
 4 | \alias{too_few_data}
 5 | \title{look for overshooting in case of a plot for with too few data}
 6 | \usage{
 7 | too_few_data(plot_object, data_threshold)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{data_threshold}{a custom threshold to define too many data}
13 | }
14 | \value{
15 | TRUE in case too few data points are observed, FALSE in the opposite site.
16 | }
17 | \description{
18 | this function checks for the presence of too few data to deserve a plot. Based on data visualization literature it is assumed that twenty data points can be considered a significant threshold
19 | }
20 | 


--------------------------------------------------------------------------------
/man/scorer.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scorer.R
 3 | \name{scorer}
 4 | \alias{scorer}
 5 | \title{scoring the effectivenes of ggplot object applying a gradient boosting model}
 6 | \usage{
 7 | scorer(plot_metadata = NULL)
 8 | }
 9 | \arguments{
10 | \item{plot_metadata}{a data frame storing ggplot object metadata, resulting from a previous call of \code{metadata_reader} function.}
11 | }
12 | \value{
13 | a numeric value
14 | }
15 | \description{
16 | \code{scorer} function handles the running of a gradient boosting pre-trained model on the metadata of a ggplot object.
17 | As a result a 0 to 1 score is produced, measuring the effectiveness of the related plot based on a set of rules drawn from data visualization theory.
18 | }
19 | 


--------------------------------------------------------------------------------
/man/test_for_histogram.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/test_for_histogram.R
 3 | \name{test_for_histogram}
 4 | \alias{test_for_histogram}
 5 | \title{analyse a ggplot object to understand if it is an histogram}
 6 | \usage{
 7 | test_for_histogram(plot_object, n_of_layers)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | }
14 | \value{
15 | TRUE in case the plot is an histogram, FALSE in the opposite case
16 | }
17 | \description{
18 | this function checks two conditions to understand if the provided ggplot object represent an histogram:
19 | - presence of a geom_bar layer
20 | - presence of bindwidth parameter among parameters of that layer
21 | }
22 | 


--------------------------------------------------------------------------------
/R/aes_puller.R:
--------------------------------------------------------------------------------
 1 | #' retrieve a vector storing values of a specified aestetich
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @param required_aes the ggplot object aestetich required
 5 | #' @return a vector containing values of the variable the required specified aestetich is mapped to
 6 | #' @export
 7 | aes_puller <- function(plot_object,n_of_layers, required_aes){
 8 | 
 9 |   aes_db     <- mappings_lister(plot_object , n_of_layers )
10 |   variable_name   <- aes_db %>% filter(aes == required_aes) %>% select("variable") %>% pull()
11 |   col_index <- match(variable_name,colnames(plot_object$data))
12 |   if(length(variable_name) == 0 ){return(NA)}else{
13 |   variable_vector <- plot_object$data[,col_index]
14 | 
15 |   return(variable_vector)}
16 | }
17 | 


--------------------------------------------------------------------------------
/R/test_for_histogram.R:
--------------------------------------------------------------------------------
 1 | #' analyse a ggplot object to understand if it is an histogram
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @description this function checks two conditions to understand if the provided ggplot object represent an histogram:
 5 | #' - presence of a geom_bar layer
 6 | #' - presence of bindwidth parameter among parameters of that layer
 7 | #' @return TRUE in case the plot is an histogram, FALSE in the opposite case
 8 | #' @export
 9 | test_for_histogram <- function(plot_object,n_of_layers){
10 |   bar_index <- match("GeomBar", geoms_lister(plot_object,n_of_layers))
11 |   bar_stat_params <- if(!is.na(bar_index )){plot_object$layers[[bar_index]]$stat_params %>% names()}
12 |   test_histogram <- !is.na(bar_index) & !is.na(match("binwidth",bar_stat_params))
13 |   return(test_histogram)
14 | }
15 | 


--------------------------------------------------------------------------------
/man/cozy_plot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cozy_plot.R
 3 | \name{cozy_plot}
 4 | \alias{cozy_plot}
 5 | \title{checking for the presence of overplotting within a ggplot object.}
 6 | \usage{
 7 | cozy_plot(plot_object, n_of_layers, overplotting_floor, aes_db)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | 
14 | \item{overplotting_floor}{a custom threshold of distance among points to detect overplotting}
15 | 
16 | \item{aes_db}{a data frame storing plot object aestetichs and variable mapped against them}
17 | }
18 | \value{
19 | a list containing an overall test result with TRUE in presence of overplotting and the median distance observed between points
20 | }
21 | \description{
22 | checking for the presence of overplotting within a ggplot object.
23 | }
24 | 


--------------------------------------------------------------------------------
/R/labels_reader.R:
--------------------------------------------------------------------------------
 1 | #' look for special charaters in a plot labels
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @description this function match the labels of the given ggplot object against a set of predefined special characters
 4 | #' @return a list storing test result and the list of special characters found within labels.
 5 | #' @export
 6 | labels_reader <- function(plot_object){
 7 |   labels_text_vector <- plot_object$labels %>% unlist() %>% as.vector()
 8 |   special_character_vector <- c("&","/","!","?","^","@","#","$","¶","§", ":)", ".")
 9 | 
10 |   match_vector <- pmatch(special_character_vector,labels_text_vector)
11 | 
12 |   special_characters_indexes <- match_vector[!is.na(match_vector)]
13 | 
14 |   there_are_special_characters <- if(length(special_characters_indexes) == 0){FALSE}else{TRUE}
15 | 
16 |   labels_with_special_characters <- labels_text_vector[special_characters_indexes]
17 |   return(list(there_are_special_characters, labels_with_special_characters))
18 | }
19 | 


--------------------------------------------------------------------------------
/man/heavy_background.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/heavy_background.R
 3 | \name{heavy_background}
 4 | \alias{heavy_background}
 5 | \title{look for heavy background on a ggplot plot}
 6 | \usage{
 7 | heavy_background(plot_object)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | }
12 | \value{
13 | returns TRUE if the plot is depicted on an heavy background, FALSE if the background is white or transparent
14 | }
15 | \description{
16 | this function tests a ggplot object to understand if it represents a plot with a background neither transparent nor white.
17 | Based on data visualization literature this kind of plots tend to reduce the **data to ink ratio** meaning the amount of data showed related to amount of ink employed.
18 | Lower values of this ratio are related to non-effective plots.
19 | }
20 | \references{
21 | The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut
22 | }
23 | 


--------------------------------------------------------------------------------
/man/too_many_dimensions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/too_many_dimensions.R
 3 | \name{too_many_dimensions}
 4 | \alias{too_many_dimensions}
 5 | \title{a simple function to test for the presence of too many dimension for a 2D plotting space}
 6 | \usage{
 7 | too_many_dimensions(plot_object, n_of_layers)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | }
14 | \value{
15 | TRUE if the plot contains too many dimension, FALSE in the opposite case.
16 | }
17 | \description{
18 | this function checks for the presence of both point or line and the mapping of some variable on the \code{size} aestetich.
19 | This check is based on data visualization principle requesting that the number of dimension depicted should not be higher thant the number of dimension of the data.
20 | }
21 | \references{
22 | The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut
23 | }
24 | 


--------------------------------------------------------------------------------
/man/metadata_reader.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/metadata_reader.R
 3 | \name{metadata_reader}
 4 | \alias{metadata_reader}
 5 | \title{draw a data frame containing relevant metadata of a ggplot object}
 6 | \usage{
 7 | metadata_reader(plot_object = NULL)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | }
12 | \value{
13 | a list storing results and additional data related to each test performed.
14 | }
15 | \description{
16 | \code{metadata_reader} function performs a number of test on the provided ggplot object to draw some intelligence about it.
17 | It tries to derive attributes ofthe plot relate to four data visualization theory areas:
18 | - the adequateness of labelling
19 | - the data density
20 | - the data to ink ratio
21 | - the readability of the plot
22 | The results of performed tests are stored in a structued list object, so to be provided as an input to a gradient boosting model.
23 | See \code{scorer} function documentation for more info about this subsequent step.
24 | }
25 | 


--------------------------------------------------------------------------------
/man/histogram_bins_tester.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/histogram_bins_tester.R
 3 | \name{histogram_bins_tester}
 4 | \alias{histogram_bins_tester}
 5 | \title{compare the actual bins width against the Freedman Diaconis rule}
 6 | \usage{
 7 | histogram_bins_tester(plot_object, n_of_layers, default_n_of_bins)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | 
14 | \item{default_n_of_bins}{the default number of bins set by ggplot2. employed if no custom bin was set from the user within the ggplot object}
15 | }
16 | \value{
17 | an object storing the general test result, the optimal binwidth based on Freedman Diaconis and the distance of the actual binwidth from this measure.
18 | TRUE is produced as a result if the actual width is not optimal.
19 | }
20 | \description{
21 | this function computes the optimal bins width based on the Freedman Diaconis rule and compares it against the actual bin width set within the ggplot object.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/outlier_labels.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/outlier_labels.R
 3 | \name{outlier_labels}
 4 | \alias{outlier_labels}
 5 | \title{check for the presence of outliers not labelled within a ggplot object}
 6 | \usage{
 7 | outlier_labels(plot_object, n_of_layers, p_build, aes_db)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | 
14 | \item{p_build}{an object obtained running \code{ggplot_build} on the ggplot object}
15 | 
16 | \item{aes_db}{a data frame storing plot object aestetichs and variable mapped against them}
17 | }
18 | \value{
19 | a list storing a TRUE in case there are no outliers or the detected outliers are labelled and FALSE in the other cases.
20 | }
21 | \description{
22 | for x and y variable the Tukey outlier rule is applied to evaluate the presence of outliers.
23 | If any outlier is identified one more check is performed to retrieve among plot metadata the presence of some annotation related to that data.
24 | }
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Andrea Cirillo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/man/filled_barplot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/filled_barplot.R
 3 | \name{filled_barplot}
 4 | \alias{filled_barplot}
 5 | \title{test a ggplot object to understand if it represents a barplot with filled bars}
 6 | \usage{
 7 | filled_barplot(plot_object, n_of_layers)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | 
12 | \item{n_of_layers}{number of layers within the ggplot object}
13 | }
14 | \value{
15 | returns TRUE when the barplot is composed of filled bars and FALSE in the opposite case.
16 | }
17 | \description{
18 | this function tests a ggplot object to understand if it represents a barplot with filled bars.
19 | Based on data visualization literature this kind of plots tend to reduce the **data to ink ratio** meaning the amount of data showed related to amount of ink employed.
20 | Lower values of this ratio are related to non-effective plots.
21 | A control is performed to exclude we are simply looking at a pie chart based on an intermediate \code{geom_bar} layer
22 | }
23 | \references{
24 | The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut
25 | }
26 | 


--------------------------------------------------------------------------------
/R/too_many_dimensions.R:
--------------------------------------------------------------------------------
 1 | #' a simple function to test for the presence of too many dimension for a 2D plotting space
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @description this function checks for the presence of both point or line and the mapping of some variable on the \code{size} aestetich.
 5 | #' This check is based on data visualization principle requesting that the number of dimension depicted should not be higher thant the number of dimension of the data.
 6 | #' @references The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut
 7 | #' @return TRUE if the plot contains too many dimension, FALSE in the opposite case.
 8 | #' @export
 9 | too_many_dimensions <- function(plot_object,n_of_layers){
10 | # we check here if there is a combination of geom_point/ geom_line and mapping = size
11 | 
12 | not_point_or_line <- match(c("GeomLine", "GeomPoint"),geoms_lister(plot_object,n_of_layers)) %>%
13 |   sum(na.rm = TRUE) ==0
14 | not_mapped_on_size <- match("size",
15 |                         mappings_lister(plot_object,n_of_layers)$aes ) %>% is.na()
16 | return(!not_point_or_line & !not_mapped_on_size)
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/man/does_it_need_geom_smooth.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/does_it_need_geom_smooth.R
 3 | \name{does_it_need_geom_smooth}
 4 | \alias{does_it_need_geom_smooth}
 5 | \title{study the relationship among variables to find out if a geom_smooth is needed within the plot}
 6 | \usage{
 7 | does_it_need_geom_smooth(plot_object, n_of_layers, correlation_threshold,
 8 |   aes_db)
 9 | }
10 | \arguments{
11 | \item{plot_object}{the ggplot object being analysed}
12 | 
13 | \item{n_of_layers}{number of layers within the ggplot object}
14 | 
15 | \item{correlation_threshold}{a judgmentally specified threshold to test for the need of a trend line on the plot}
16 | 
17 | \item{aes_db}{a data frame storing plot object aestetichs and variable mapped against them}
18 | }
19 | \value{
20 | a list storing in two separate vectors the final result of the test and the observed distance correlation.
21 | the test outcome is TRUE in case a trend line is deemed necessary.
22 | }
23 | \description{
24 | \code{does_it_need_geom_smooth} computes distance correlation among x and y aestetich to evaluate it against a judgmental threshold.
25 | if this threshold is breached and no \code{geom_smooth} is observed among ggplot objects layers a postive outcome is produced for the test.
26 | }
27 | 


--------------------------------------------------------------------------------
/man/scorer_bot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scorer_bot.R
 3 | \name{scorer_bot}
 4 | \alias{scorer_bot}
 5 | \title{produce a customised report analysing the submitted ggplot object}
 6 | \usage{
 7 | scorer_bot(plot_object = NULL)
 8 | }
 9 | \arguments{
10 | \item{plot_object}{the ggplot object being analysed}
11 | }
12 | \value{
13 | a reveljs deck of slide containing an evaluation of the effectiveness of ggplot objects and suggestion about how to improve it.
14 | }
15 | \description{
16 | call to \code{scorer_bot} function produces the sequential calling of all function included within
17 | the \code{vizscorer} package, with the purpose to produce a customised report about the submitted ggplot object. The function handles call to \code{metadata_reader} function
18 | to draw metadata from the plot object in a format ready to be submitted to the scorer algorithm. within the next step those metadata are submitted to the \code{scorer} function which apply a gradient boosting pre-trained model to them.
19 | a final score from 0 to 1 is computed as a probability to be a good plot.
20 | To provide more relevant information to the user a set of advices are retrieved from a built-in knowledge base, selecting only those relevant to suboptimal areas of the plot, observed from metadata.
21 | Finally a revealjs deck of slides is programmatically created leveraging a quasi-real natural language generation algorithm.
22 | }
23 | 


--------------------------------------------------------------------------------
/R/filled_barplot.R:
--------------------------------------------------------------------------------
 1 | #' test a ggplot object to understand if it represents a barplot with filled bars
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @description this function tests a ggplot object to understand if it represents a barplot with filled bars.
 5 | #' Based on data visualization literature this kind of plots tend to reduce the **data to ink ratio** meaning the amount of data showed related to amount of ink employed.
 6 | #' Lower values of this ratio are related to non-effective plots.
 7 | #' A control is performed to exclude we are simply looking at a pie chart based on an intermediate \code{geom_bar} layer
 8 | #' @references The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut
 9 | #' @return returns TRUE when the barplot is composed of filled bars and FALSE in the opposite case.
10 | #' @export
11 | filled_barplot <- function(plot_object,n_of_layers){
12 | 
13 | is_geom_bar <- test_for_geom_bar(plot_object,n_of_layers)
14 | 
15 | mappings_lister(plot_object,n_of_layers) %>%
16 |     select(aes) %>%
17 |     pull() %>%
18 |     as.vector()-> aes_vector
19 | 
20 | fill_aes_is_mapped <- !is.na(match("fill",aes_vector))
21 | 
22 | a_pie_chart <- is_pie_chart(plot_object, n_of_layers)
23 | 
24 | if(is_geom_bar &
25 |    fill_aes_is_mapped == FALSE &
26 |    a_pie_chart==FALSE){ # we control for being a pie chart
27 |   TRUE
28 | }else{if(is_geom_bar == FALSE){FALSE}#NA}
29 |   else{FALSE}}
30 | }
31 | 


--------------------------------------------------------------------------------
/R/heavy_background.R:
--------------------------------------------------------------------------------
 1 | #'look for heavy background on a ggplot plot
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @description this function tests a ggplot object to understand if it represents a plot with a background neither transparent nor white.
 4 | #' Based on data visualization literature this kind of plots tend to reduce the **data to ink ratio** meaning the amount of data showed related to amount of ink employed.
 5 | #' Lower values of this ratio are related to non-effective plots.
 6 | #' @references The Visual Display of Quantitative Information, E.R.Tufte, GRaphics Press . Cheshire, Connecticut
 7 | #' @return returns TRUE if the plot is depicted on an heavy background, FALSE if the background is white or transparent
 8 | #' @export
 9 | heavy_background <- function(plot_object){
10 |   # we check here if the default background is being used AND
11 |   # if the default background is still the grey one
12 | 
13 |   default_theme <- theme_get()
14 |   default_is_grey <- pmatch("grey",default_theme$panel.background$fill)
15 |   background_fill <- plot_object$theme$panel.background$fill
16 |   if ((length(as.character(plot_object$theme)) == 0 | is.null(background_fill))  & !is.na(default_is_grey)) {
17 |     TRUE # the default theme is being used and this produces to a grey background
18 |   }else{
19 |     if (background_fill %in% c(rgb(1,1,1),"#FFFFFF", "white", "transparent")) { # if the background is not white or transparent we say it is heavy, tertium non datur
20 |       FALSE}else{
21 |         TRUE}
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/R/mappings_lister.R:
--------------------------------------------------------------------------------
 1 | #' storing aestetich variables in a dataframe
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @return a data frane storing for each aestetich the mapped variable
 5 | #' @export
 6 | mappings_lister <- function(plot_object,n_of_layers){
 7 | 
 8 |   #aes_db_raw just contains aes from within the ggplot() call, i.e. from the "zero" layer
 9 |   aes_db_raw <- data.frame("aes" = plot_object$mapping %>% unlist() %>% names(),
10 |                                      "variable" = plot_object$mapping %>% unlist() %>% as.character(),stringsAsFactors = FALSE )
11 | 
12 |   #we then add aes from other layers
13 |   for(i in 1:n_of_layers){
14 |     aes_names_partial <- plot_object$layers[[i]]$mapping %>% unlist() %>% names()
15 |     aes_values_partial_raw <- plot_object$layers[[i]]$mapping %>% unlist() %>% as.character()
16 |     #removing ~ character to ensure proper matching with variable names
17 |     aes_values_partial <- gsub("~","",x = aes_values_partial_raw)
18 |     rbind(aes_db_raw,
19 |           data.frame("aes" = aes_names_partial,
20 |                      "variable" = aes_values_partial,
21 |                      stringsAsFactors = FALSE)) %>%
22 |       filter(variable != "x", variable != "y") -> aes_db_intermediate # we check for aestetichs inheritance
23 |     }
24 |   #in the end I add the mapping set into ggplot()
25 |   aes_db_intermediate %>%
26 |     mutate(variable = gsub("~","",variable)) -> aes_db
27 |  return(aes_db)
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/R/cozy_plot.R:
--------------------------------------------------------------------------------
 1 | #' checking for the presence of overplotting within a ggplot object.
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @param overplotting_floor a custom threshold of distance among points to detect overplotting
 5 | #' @param aes_db a data frame storing plot object aestetichs and variable mapped against them
 6 | #' @return a list containing an overall test result with TRUE in presence of overplotting and the median distance observed between points
 7 | #' @export
 8 | cozy_plot <- function(plot_object, n_of_layers, overplotting_floor,aes_db){
 9 | 
10 | if (is.na(match("y",aes_db$aes))){ #if there is no y we are probably "looking" at an histogram, for which no overplotting check is provided
11 |   return(list(NA,100))}else{
12 |   # we look here for overplotting. To do this we compute the median euclidean distance as a measure
13 |     # of plot density
14 |     raw_x <- aes_puller(plot_object,n_of_layers, "x")
15 |     if(mode(raw_x) == "list"){x_vector <- raw_x[,1] %>% pull}else{x_vector <- raw_x}
16 |     if(!is.na(match("..density..",aes_db$variable) )){y_vector <- c()}else{
17 |     raw_y <- aes_puller(plot_object,n_of_layers, "y")
18 |     if(mode(raw_y) == "list"){y_vector <- raw_y[,1] %>% pull}else{y_vector <- raw_y}
19 |     }
20 |     not_handled <- c("factor","character","Date")
21 |     #check on variabe type
22 |     if(class(x_vector) %in% not_handled | class(y_vector) %in% not_handled|!is.na(match("..density..",aes_db$variable)) ){return(list(NA,100))}else{
23 | 
24 |   median_distance <- median(daisy(data.frame(x_vector,y_vector)))
25 | 
26 |   test_result <- median_distance < overplotting_floor
27 | 
28 |   return(list("test_result" = test_result,
29 |               "median_distance" = median_distance))
30 |     }
31 |   }
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/inst/extdata/comparison_db.csv:
--------------------------------------------------------------------------------
 1 | "","good","model"
 2 | "1",0.144211620391848,"gbm"
 3 | "2",0.218814729082962,"gbm"
 4 | "3",0.139131253522651,"gbm"
 5 | "4",0.208631105844797,"gbm"
 6 | "5",0.179385610765719,"gbm"
 7 | "6",0.120071844534823,"gbm"
 8 | "7",0.164181484028644,"gbm"
 9 | "8",0.205241426326928,"gbm"
10 | "9",0.156413287634618,"gbm"
11 | "10",0.638012252879337,"gbm"
12 | "11",0.22387265004053,"gbm"
13 | "12",0.638012252879337,"gbm"
14 | "13",0.186305201671998,"gbm"
15 | "14",0.638012252879337,"gbm"
16 | "15",0.638012252879337,"gbm"
17 | "16",0.194471806079098,"gbm"
18 | "17",0.129353075678487,"gbm"
19 | "18",0.638012252879337,"gbm"
20 | "19",0.638012252879337,"gbm"
21 | "20",0.239115140832344,"gbm"
22 | "21",0.144211620391848,"gbm"
23 | "22",0.638012252879337,"gbm"
24 | "23",0.638012252879337,"gbm"
25 | "24",0.200495699012896,"gbm"
26 | "25",0.638012252879337,"gbm"
27 | "26",0.171822731153773,"gbm"
28 | "27",0.638012252879337,"gbm"
29 | "28",0.189779089244373,"gbm"
30 | "29",0.184835736843535,"gbm"
31 | "30",0.638012252879337,"gbm"
32 | "31",0.638012252879337,"gbm"
33 | "32",0.236262189471113,"gbm"
34 | "33",0.120071844534823,"gbm"
35 | "34",0.200495699012896,"gbm"
36 | "35",0.146054675383698,"gbm"
37 | "36",0.166872480718325,"gbm"
38 | "37",0.156413287634618,"gbm"
39 | "38",0.22387265004053,"gbm"
40 | "39",0.638012252879337,"gbm"
41 | "40",0.182534545204342,"gbm"
42 | "41",0.147772112561917,"gbm"
43 | "42",0.208631105844797,"gbm"
44 | "43",0.144211620391848,"gbm"
45 | "44",0.144211620391848,"gbm"
46 | "45",0.179385610765719,"gbm"
47 | "46",0.155990810792557,"gbm"
48 | "47",0.239587909998247,"gbm"
49 | "48",0.163675931518928,"gbm"
50 | "49",0.226157482865321,"gbm"
51 | "50",0.171822731153773,"gbm"
52 | "51",0.120071844534823,"gbm"
53 | "52",0.208631105844797,"gbm"
54 | "53",0.22031846014868,"gbm"
55 | "54",0.183746501021698,"gbm"
56 | "55",0.171822731153773,"gbm"
57 | "56",0.183746501021698,"gbm"
58 | "57",0.638012252879337,"gbm"
59 | "58",0.208631105844797,"gbm"
60 | "59",0.200495699012896,"gbm"
61 | "60",0.211673659995752,"gbm"
62 | "61",0.171822731153773,"gbm"
63 | "62",0.22387265004053,"gbm"
64 | "63",0.194618727276444,"gbm"
65 | "64",0.22387265004053,"gbm"
66 | "65",0.184835736843535,"gbm"
67 | 


--------------------------------------------------------------------------------
/R/histogram_bins_tester.R:
--------------------------------------------------------------------------------
 1 | #' compare the actual bins width against the Freedman Diaconis rule
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @param default_n_of_bins the default number of bins set by ggplot2. employed if no custom bin was set from the user within the ggplot object
 5 | #' @description this function computes the optimal bins width based on the Freedman Diaconis rule and compares it against the actual bin width set within the ggplot object.
 6 | #' @return an object storing the general test result, the optimal binwidth based on Freedman Diaconis and the distance of the actual binwidth from this measure.
 7 | #' TRUE is produced as a result if the actual width is not optimal.
 8 | #' @export
 9 | histogram_bins_tester <- function(plot_object, n_of_layers, default_n_of_bins){
10 | # first of all I look for a GeomBar layer and a binwidth parameter. the occurence of
11 |   # both means we are "looking" at an histogram ( there is no separate geom)
12 |  test_histogram <- test_for_histogram(plot_object,n_of_layers)
13 | # if the plot actually seems to be an histogram I retrieve di x variable and compute on it
14 |   ## the optimal number of bins based on the Freedman Diaconis rule
15 | 
16 | if(test_histogram){
17 | variable_vector <- aes_puller(plot_object,n_of_layers,required_aes = "x")
18 | optimal_bw       <-  2 * (IQR(variable_vector, na.rm = TRUE) / length(variable_vector)^(1/3))
19 | bar_index <- match("GeomBar",geoms_lister(plot_object,n_of_layers))
20 | bar_stat_params <- plot_object[[bar_index]]$stat_params
21 | actual_bw_index  <- match("binwidth",bar_stat_params)
22 | actual_bw          <- plot_object$layers[[bar_index]]$stat_params[[actual_bw_index]]
23 | 
24 | if (is.null(actual_bw)){
25 |   actual_bw <- diff(range(variable_vector, na.rm = TRUE))/default_n_of_bins
26 |   } # we handle here the common case of the user not changing the default setting for bins size
27 | 
28 | # after retrieving or computing the actual binwidth I compute the distance from the optimum
29 | 
30 | distance_from_optimum <- optimal_bw - actual_bw
31 | if(distance_from_optimum !=0){
32 | optimization_data <- list("test" = TRUE,"optimal_bw" = optimal_bw, "distance_from_optimum" = distance_from_optimum)
33 | return(optimization_data)
34 | }else{return(list("test"=FALSE,"optimal_bw" =optimal_bw,"distance_from_optimum"=distance_from_optimum))}
35 |   }else{list(FALSE,NA,100)}
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/R/does_it_need_geom_smooth.R:
--------------------------------------------------------------------------------
 1 | #' study the relationship among variables to find out if a geom_smooth is needed within the plot
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @param correlation_threshold a judgmentally specified threshold to test for the need of a trend line on the plot
 5 | #' @param aes_db a data frame storing plot object aestetichs and variable mapped against them
 6 | #' @description \code{does_it_need_geom_smooth} computes distance correlation among x and y aestetich to evaluate it against a judgmental threshold.
 7 | #' if this threshold is breached and no \code{geom_smooth} is observed among ggplot objects layers a postive outcome is produced for the test.
 8 | #' @return a list storing in two separate vectors the final result of the test and the observed distance correlation.
 9 | #' the test outcome is TRUE in case a trend line is deemed necessary.
10 | #' @export
11 | does_it_need_geom_smooth <- function(plot_object, n_of_layers,correlation_threshold,aes_db){
12 | 
13 |   # first we check for correlations ( of any type) among points
14 |   raw_x <- aes_puller(plot_object,n_of_layers, "x")
15 |   if(mode(raw_x) == "list"){x_vector <- raw_x[,1] %>% pull}else{x_vector <- raw_x}
16 | 
17 |   #look for density plot currently not handled
18 |   if(!is.na(match("..density..",aes_db$variable))){y_vector <- c()}else{
19 |   raw_y <- aes_puller(plot_object,n_of_layers, "y")
20 |   if(mode(raw_y) == "list"){y_vector <- raw_y[,1] %>% pull}else{y_vector <- raw_y}
21 |   }
22 |   not_handled <- c("factor","character")
23 |   #check on variabe type
24 |   if(class(x_vector) %in% not_handled | class(y_vector) %in% not_handled | !is.na(match("..density..",aes_db$variable))){return(list(NA,0,NA))
25 |     } else if(!is.na(x_vector) & !is.na(y_vector)){
26 |     distance_correlation <- dcor(x_vector,y_vector)
27 |   }else{distance_correlation <- 2}
28 | 
29 |   # if the level of correlation is relevant we look for the presence of geom_smooth
30 |   if(abs(distance_correlation)>correlation_threshold & distance_correlation != 2){
31 |     geoms_vector <- geoms_lister(plot_object, n_of_layers)
32 |     if(!is.na(match("GeomSmooth",geoms_vector))){return(list(FALSE,distance_correlation))}else{
33 |       return(list(TRUE,distance_correlation)) # if we have a relevant correlation and no geom smooth we suggest to add one
34 |     }
35 | 
36 |   }else if(distance_correlation == 2){return(list(NA,0))}
37 |   else{
38 |     return(list(FALSE,distance_correlation))
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # vizscorer
 2 | a clever bot to rate and improve your ggplot plot
 3 | 
 4 | Vizscorer is an R package which tries to help users discover weaknesses in their ggplot plots. To do so it applies a mix of machine learning and natural language generation.
 5 | 
 6 | <a href="http://www.andreacirillo.com/images/vizscorer_viz.001.png" ><figure class= "post_image" >
 7 | <img src="http://www.andreacirillo.com/images/vizscorer_viz.001.png"  alt="infoviz" style="width: 300px;"/>
 8 | </figure></a>
 9 | 
10 | Vizscorer basically:
11 | 
12 | - looks at ggplot internals 
13 | - analyzes them
14 | - produces a deck of slides showing an overall evaluation and detailed suggestions about how to improve the plot.
15 | 
16 | See it in action below:
17 | 
18 | <a href="http://www.andreacirillo.com/images/vizscorer_demo.gif" ><figure class= "post_image" >
19 | <img src="http://www.andreacirillo.com/images/vizscorer_demo.gif"  alt="infoviz" style="width: 300px;"/>
20 | </figure></a>
21 | 
22 | Vizscorer continues the effort started with [paletteR](https://github.com/AndreaCirilloAC/paletter) to increase the level of quality of the average plot produced in companies, where there is no time to study data visualization theory.
23 | 
24 | ## how to use it
25 | 
26 | install vizscorer package:
27 | 
28 | <code>
29 | devtools::install_github("AndreaCirilloAC/vizscorer")
30 | </code>
31 | 
32 | call the scorer_bot on your ggplot plot:
33 | 
34 | <code>
35 | scorer_bot(plot_object = your_ggplot_plot)
36 | </code>
37 | 
38 | after some strange code populating your console a web browser will show up to display you a deck of slide containing:
39 | 
40 | - an assessment of your plot
41 | - a set of customized suggestions about how to improve your plot, based on data visualization principles
42 | 
43 | ### Disclaimer
44 | 
45 | **vizscorer is currently in beta mode and every feedback and support from early adopters is more than welcome :)**
46 | 
47 | The package currently does not support complex ggplot plots, like plots with data not provided within the *ggplot()* call but rather within the other layers.
48 | If any issue arises, please consider submit an issue within the *Issues* section or forking the repo and fix it by yourself!
49 | 
50 | ### further area of development
51 | 
52 | - increasing the number of plots employed for training the gradient boosting model (more about this in a forthcoming blog post)
53 | - introducing a “console version” of the report, more usable and less annoying in the long run
54 | - increasing the level of cleaverness of the bot to make it able to analyse more complex plots and provide even more useful suggestions
55 | 


--------------------------------------------------------------------------------
/inst/extdata/report.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Your Plot Meta Analysis"
  3 | output:
  4 |   revealjs::revealjs_presentation:
  5 |     center: true
  6 |     transition: fade
  7 |     theme: serif
  8 | ---
  9 | <!-- custom CSS code for blinking cursor -->
 10 | <style type="text/css">
 11 |     .blinking-cursor {
 12 |   font-weight: 100;
 13 |   color: #2E3D48;
 14 |   -webkit-animation: 1s blink step-end infinite;
 15 |   -moz-animation: 1s blink step-end infinite;
 16 |   -ms-animation: 1s blink step-end infinite;
 17 |   -o-animation: 1s blink step-end infinite;
 18 |   animation: 1s blink step-end infinite;
 19 |   font-size: 100%
 20 | }
 21 | 
 22 | @keyframes "blink" {
 23 |   from, to {
 24 |     color: transparent;
 25 |   }
 26 |   50% {
 27 |     color: black;
 28 |   }
 29 | }
 30 | 
 31 | @-moz-keyframes blink {
 32 |   from, to {
 33 |     color: transparent;
 34 |   }
 35 |   50% {
 36 |     color: black;
 37 |   }
 38 | }
 39 | 
 40 | @-webkit-keyframes "blink" {
 41 |   from, to {
 42 |     color: transparent;
 43 |   }
 44 |   50% {
 45 |     color: black;
 46 |   }
 47 | }
 48 | 
 49 | @-ms-keyframes "blink" {
 50 |   from, to {
 51 |     color: transparent;
 52 |   }
 53 |   50% {
 54 |     color: black;
 55 |   }
 56 | }
 57 | 
 58 | @-o-keyframes "blink" {
 59 |   from, to {
 60 |     color: transparent;
 61 |   }
 62 |   50% {
 63 |     color: black;
 64 |   }
 65 | }
 66 | 
 67 | </style>
 68 | 
 69 | Hi! I am you dataviz bot advisor<span class=blinking-cursor>_</span>
 70 | 
 71 | ---
 72 | 
 73 | I have looked into your plot and evaluated it against dataviz best practices and theoretical frameworks<span class=blinking-cursor>_</span>
 74 | 
 75 | ---
 76 | 
 77 | your plot looks  __`r judgment`__<span class=blinking-cursor>_</span>
 78 | 
 79 | to be more precise your plot scored __`r round(score,2)`__
 80 | 
 81 | 
 82 | ---
 83 | 
 84 | here it is a comparison between your score and the score obtained from plots included within the score estimation database.
 85 | 
 86 | ```{r eval=TRUE, include=TRUE,echo=FALSE, fig.height=5,fig.width=7,5}
 87 | 
 88 | score_plot
 89 | 
 90 | ```
 91 | 
 92 | <small>you can find more info about how my scoring algorithm works on my <a hrefandreacirillo.com/viz_bot>official webpage</a></small>
 93 | 
 94 | ---
 95 | 
 96 | let me tell you a bit more about this<span class=blinking-cursor>_</span>
 97 | 
 98 | ---
 99 | 
100 | as far as I can see the data viz area in which your plot scored worst was the __`r worst_area`__<span class=blinking-cursor>_</span>
101 | 
102 | ---
103 | 
104 | here it is a summary of your plot performance for each area. 
105 | 
106 | `r kable(positive_ratio_show)`
107 | 
108 | you can look into my documentation to learn which test were performed and how the scores were computed.
109 | <small>you can also find more info about all on my <a hrefandreacirillo.com/viz_bot>official webpage</a></small>
110 | 
111 | ---
112 | 
113 | is now time to learn how to make your plot more effective applying dataviz principles<span class=blinking-cursor>_</span>
114 | 
115 | ---
116 | 
117 | I will give you an advice on how to fix your plot for each test failed<span class=blinking-cursor>_</span>
118 |  
119 | 


--------------------------------------------------------------------------------
/R/outlier_labels.R:
--------------------------------------------------------------------------------
 1 | #' check for the presence of outliers not labelled within a ggplot object
 2 | #' @param plot_object the ggplot object being analysed
 3 | #' @param n_of_layers number of layers within the ggplot object
 4 | #' @param p_build an object obtained running \code{ggplot_build} on the ggplot object
 5 | #' @param aes_db a data frame storing plot object aestetichs and variable mapped against them
 6 | #' @description for x and y variable the Tukey outlier rule is applied to evaluate the presence of outliers.
 7 | #' If any outlier is identified one more check is performed to retrieve among plot metadata the presence of some annotation related to that data.
 8 | #' @return a list storing a TRUE in case there are no outliers or the detected outliers are labelled and FALSE in the other cases.
 9 | #' @export
10 | outlier_labels <- function(plot_object,n_of_layers,p_build,aes_db){
11 |   #TRUE as final result means either you don't have outliers or the ones you have are labeled. the relevant result here is
12 |   # FALSE since it discriminates between a good and a bad plot.
13 | 
14 |   #draw_data
15 |   raw_x <- aes_puller(plot_object,n_of_layers, "x")
16 |   if(mode(raw_x) == "list"){x_vector <- raw_x[,1] %>% pull}else{x_vector <- as.numeric(raw_x)}
17 |   if(!is.na(match("..density..",aes_db$variable) )){y_vector <- c(NA)}else{
18 |   raw_y <- aes_puller(plot_object,n_of_layers, "y")
19 |   if(mode(raw_y) == "list"){y_vector <- raw_y[,1] %>% pull}else{y_vector <- as.numeric(raw_y)}
20 |   }
21 | ############ x
22 | if(is.na(x_vector)|is.character(x_vector)){x_outliers_are_labelled <- NA}else{
23 |   # look for outliers
24 |   x_outliers <- boxplot.stats(x_vector )$out
25 | 
26 | 
27 |   if (length((x_outliers))>0){ #we preliminary check for outliers presence, to avoid unnecessary computations
28 |     # look for geom_text
29 |     text_index <- match("GeomText",geoms_lister(plot_object, n_of_layers ))
30 |     # if text is found I look at data to see if we are looking the outliers are labeled
31 |     if(is.na(text_index)){x_outliers_are_labelled <- FALSE}else if(text_index > 0){
32 | 
33 |       text_x <- p_build$data[[text_index]]$x
34 | 
35 | 
36 |       x_outliers_are_labelled <- if(length(x_outliers)==0){TRUE}else if(x_outliers%in% text_x){TRUE}else{FALSE}
37 | 
38 |       }}else{x_outliers_are_labelled <-  TRUE}}
39 | 
40 | ########### y
41 | 
42 |   if(is.na(y_vector)|is.character(x_vector)){ y_outliers_are_labelled <- NA}else{
43 | 
44 |   # look for outliers
45 | 
46 |   y_outliers <- boxplot.stats(y_vector)$out
47 | 
48 |   if (length(y_outliers)>0){ #we preliminary check for outliers presence, to avoid unnecessary computations
49 |   # look for geom_text
50 |     text_index <- match("GeomText",geoms_lister(plot_object, n_of_layers ))
51 |     # if text is found we look at data to see if in the plot we are looking at the outliers are labeled
52 |    if(is.na(text_index)){y_outliers_are_labelled <- FALSE}else if(text_index > 0){
53 | 
54 |      text_y <- p_build$data[[text_index]]$y
55 | 
56 |      y_outliers_are_labelled <- if(length(y_outliers)==0){TRUE}else if(y_outliers%in% text_y){TRUE}else{FALSE}
57 | 
58 |    }}else{y_outliers_are_labelled <-  TRUE}}
59 | ###############
60 |      general_result <- if(is.na(x_outliers_are_labelled) & is.na(y_outliers_are_labelled)){
61 |        TRUE} else{if(prod(x_outliers_are_labelled,y_outliers_are_labelled, na.rm = TRUE)== TRUE){TRUE}else{FALSE}}
62 | 
63 |      return(general_result)
64 | }
65 | 


--------------------------------------------------------------------------------
/R/scorer.R:
--------------------------------------------------------------------------------
 1 | #'scoring the effectivenes of ggplot object applying a gradient boosting model
 2 | #'@description \code{scorer} function handles the running of a gradient boosting pre-trained model on the metadata of a ggplot object.
 3 | #'As a result a 0 to 1 score is produced, measuring the effectiveness of the related plot based on a set of rules drawn from data visualization theory.
 4 | #'@param plot_metadata a data frame storing ggplot object metadata, resulting from a previous call of \code{metadata_reader} function.
 5 | #'@return a numeric value
 6 | #' @export
 7 | scorer <- function(plot_metadata = NULL){
 8 | 
 9 |   if(is.null(plot_metadata)){stop("you must provide a valid set of plot metadata to obtain a score")}else{
10 | 
11 |     # take as input a data frame with one row, coming from metadata_reader
12 | 
13 |     # predict plot score employing the gradient boosting fitted on the classification db
14 | 
15 |     scoring_db_raw <- data.frame()
16 |     colnames_vector <- c(
17 |                           'pie_chart',
18 |                           'number_of_layers',
19 |                           'number_of_dimensions',
20 |                           'width_of_bins',
21 |                           'flipped_barplot',
22 |                           'need_for_a_smooth',
23 |                           'sufficient_number_of_data',
24 |                           'overplotting',
25 |                           'use_of_heavy_background',
26 |                           'filled_barplot',
27 |                           'presence_of_title',
28 |                           'presence_of_subtitle',
29 |                           'presence_of_caption',
30 |                           'special_characters_in_label',
31 |                           'outliers_not_labelled')
32 | 
33 |     plot_metadata <- plot_metadata %>% as.data.frame()
34 | 
35 |     result <- plot_metadata$test %>% as.character()
36 |     row <- data.frame(t(c(result)))
37 |     colnames(row) <- colnames_vector
38 |     scoring_db_raw <- rbind(scoring_db_raw,row)
39 |     colnames(scoring_db_raw) <- colnames_vector
40 | 
41 |     scoring_db_raw %>%
42 |       mutate(pie_chart = as.logical(pie_chart),
43 |              number_of_dimensions = as.logical(number_of_dimensions),
44 |              number_of_layers = as.numeric(as.character(number_of_layers)),
45 |              width_of_bins = as.numeric(as.character(width_of_bins)),
46 |              flipped_barplot = as.logical(flipped_barplot),
47 |              need_for_a_smooth = as.numeric(as.character(need_for_a_smooth)),
48 |              sufficient_number_of_data = as.logical(sufficient_number_of_data),
49 |              overplotting =as.numeric(as.character(overplotting)),
50 |              use_of_heavy_background = as.logical(use_of_heavy_background),
51 |              filled_barplot= as.logical(filled_barplot),
52 |              presence_of_title = as.logical(presence_of_title),
53 |              presence_of_subtitle = as.logical(presence_of_subtitle),
54 |              presence_of_caption = as.logical(presence_of_caption),
55 |              special_characters_in_label = as.logical(special_characters_in_label),
56 |              outliers_not_labelled = as.logical(outliers_not_labelled))-> scoring_db
57 | 
58 |     predict(gbm_fit,newdata = scoring_db, probability = TRUE)
59 |     probabilities <- extractProb(list(gbm = gbm_fit),unkX = scoring_db )
60 |     #return as output the probability of being a good plot, i.e. the final score
61 | 
62 |     return(probabilities$good)
63 | 
64 |   }
65 | }
66 | 


--------------------------------------------------------------------------------
/inst/extdata/plot_advices.csv:
--------------------------------------------------------------------------------
 1 | topic_label;advice;problem
 2 | pie_chart;pie charts are rarely the better way to display data, since makes difficult to make precise comparison between categories. Why do not simply look for a well defined horizontal bar chart?;you are using a pie chart
 3 | number_of_layers;you are trying to show too many elements at once, why do not split your plot into more plots or introduce some facet? Look at <code>facet_wrap</code> for further info;you are trying to show too many things at once
 4 | number_of_dimensions;you are showing three dimensions on a two dimension support. This can give a quick comparison of values of your data  ,but is not as well suited for accurate or precise lookup. it would be better to split it into two different plots, for instance employing <code>facet_wrap</code>;showing more than 2D on a 2D space is difficult
 5 | number_of_bins;the histogram you specified is not the best possibile visualisation of your data. I have found that setting bins to a different width produces a better visualisation of your data structure. Look for the  <a href='https://en.wikipedia.org/wiki/Freedman-Diaconis_rule'>Freedman-Diaconis rule</a>  to understand how to set the optimal width of your bins;try to pick a better width for your bins
 6 | flipped_barplot;humans are better able at comparing lengths when showed horizontally. Unless you are showing time-related information (i.e. time series over the x axis) try to flip your bars adding <code>coord_flip()</code> to your plot specification;let's flip your bars down!
 7 | need_for_a_smooth;it seems a strong correlation is going on between your data. Try to help the reader of your plot spotting it adding <code>geom_smooth()</code> to your plot;you should highlight trends going on
 8 | sufficient_number_of_data;your plot shows less then 20 data points. You would obtain a better result employing a table. You can find more info about this within one of masterpieces about data visualization: _The Visual Display of Quantitative Information_ by E. R. Tufte;too few points for a plot
 9 | overplotting;you are incurring in __overplotting__: there are too many points to be displayed at once in an accurate way. you should consider introducing some faceting. If you rather want to maintain the current structure of the plot you could consider employing a different geom like <code>geom_bin2d()</code> able to better avoid overplotting;your plot is crowded due to overplotting
10 | use_of_heavy_background;you are employing the default ggplot theme. while it is generally fit for its purpose you should consider removing that grey background, which lowers your data to ink ratio. Please find more info about data ink ratio on the <a href='https://infovis-wiki.net/wiki/Data-Ink_Ratio'>official info vis wiki page</a>;try to remove the grey background
11 | filled_barplot;you are using monochromatic filled bars. that great amount of black neither show any additional data nor convey significant message. consider change it into a <code>geom_point()</code> or a <code>geom_point_range()</code>;the filling of your bars is not adding information
12 | presence_of_title;your plot is missing a title. this makes harder for your reader to understand its content. try to add a title employing <code>labs(title = 'relevant title')</code>;what is a book without a title?
13 | presence_of_subtitle; your plot is missing a subtitle. Subtitle are effective in providing additional relevant info about metrics and data showed in your plot.Try to add a plot employing labs(subtitle  = 'your subtitle');let us understand your plot better with a subtitle
14 | presence_of_caption;your plot is missing a caption. While not mandatories , captions are used to provide info about sources and reference period of your plot. This is crucial to increase the level of graphical integrity of your plot;use caption to add integrity to your plot
15 | special_characters_in_label;your labels shows special characters. this makes them hard to read and reduce their effectiveness. try to solve this explicitly setting your labels trough labs(x  = 'meaningful x axis' = ,y  = 'meaningful y axis');special characters in labels lower readability
16 | outliers_not_labelled;I have found some outliers in your data  ,and they do not seem to be explicitly labeled in your plot.To increase the level of meaningfulness you should consider labelling them;mark that outliers
17 | 


--------------------------------------------------------------------------------
/plot_report.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Your Plot Meta Analysis"
  3 | output:
  4 |   revealjs::revealjs_presentation:
  5 |     center: true
  6 |     transition: fade
  7 |     theme: serif
  8 | ---
  9 | <!-- custom CSS code for blinking cursor -->
 10 | <style type="text/css">
 11 |     .blinking-cursor {
 12 |   font-weight: 100;
 13 |   color: #2E3D48;
 14 |   -webkit-animation: 1s blink step-end infinite;
 15 |   -moz-animation: 1s blink step-end infinite;
 16 |   -ms-animation: 1s blink step-end infinite;
 17 |   -o-animation: 1s blink step-end infinite;
 18 |   animation: 1s blink step-end infinite;
 19 |   font-size: 100%
 20 | }
 21 | 
 22 | @keyframes "blink" {
 23 |   from, to {
 24 |     color: transparent;
 25 |   }
 26 |   50% {
 27 |     color: black;
 28 |   }
 29 | }
 30 | 
 31 | @-moz-keyframes blink {
 32 |   from, to {
 33 |     color: transparent;
 34 |   }
 35 |   50% {
 36 |     color: black;
 37 |   }
 38 | }
 39 | 
 40 | @-webkit-keyframes "blink" {
 41 |   from, to {
 42 |     color: transparent;
 43 |   }
 44 |   50% {
 45 |     color: black;
 46 |   }
 47 | }
 48 | 
 49 | @-ms-keyframes "blink" {
 50 |   from, to {
 51 |     color: transparent;
 52 |   }
 53 |   50% {
 54 |     color: black;
 55 |   }
 56 | }
 57 | 
 58 | @-o-keyframes "blink" {
 59 |   from, to {
 60 |     color: transparent;
 61 |   }
 62 |   50% {
 63 |     color: black;
 64 |   }
 65 | }
 66 | 
 67 | </style>
 68 | 
 69 | Hi! I am you dataviz bot advisor<span class=blinking-cursor>_</span>
 70 | 
 71 | ---
 72 | 
 73 | I have looked into your plot and evaluated it against dataviz best practices and theoretical frameworks<span class=blinking-cursor>_</span>
 74 | 
 75 | ---
 76 | 
 77 | your plot looks  __`r judgment`__<span class=blinking-cursor>_</span>
 78 | 
 79 | to be more precise your plot scored __`r round(score,2)`__
 80 | 
 81 | 
 82 | ---
 83 | 
 84 | here it is a comparison between your score and the score obtained from plots included within the score estimation database.
 85 | 
 86 | ```{r eval=TRUE, include=TRUE,echo=FALSE, fig.height=5,fig.width=7,5}
 87 | 
 88 | score_plot
 89 | 
 90 | ```
 91 | 
 92 | <small>you can find more info about how my scoring algorithm works on my <a hrefandreacirillo.com/viz_bot>official webpage</a></small>
 93 | 
 94 | ---
 95 | 
 96 | let me tell you a bit more about this<span class=blinking-cursor>_</span>
 97 | 
 98 | ---
 99 | 
100 | as far as I can see the data viz area in which your plot scored worst was the __`r worst_area`__<span class=blinking-cursor>_</span>
101 | 
102 | ---
103 | 
104 | here it is a summary of your plot performance for each area. 
105 | 
106 | `r kable(positive_ratio_show)`
107 | 
108 | you can look into my documentation to learn which test were performed and how the scores were computed.
109 | <small>you can also find more info about all on my <a hrefandreacirillo.com/viz_bot>official webpage</a></small>
110 | 
111 | ---
112 | 
113 | is now time to learn how to make your plot more effective applying dataviz principles<span class=blinking-cursor>_</span>
114 | 
115 | ---
116 | 
117 | I will give you an advice on how to fix your plot for each test failed<span class=blinking-cursor>_</span>
118 |  
119 |  
120 | ---
121 |  
122 | __you should highlight trends going on__
123 |   
124 | it seems a strong correlation is going on between your data. Try to help the reader of your plot spotting it adding <code>geom_smooth()</code> to your plot<span class=blinking-cursor>_</span>
125 |   
126 | ---
127 |  
128 | __try to remove the grey background__
129 |   
130 | you are employing the default ggplot theme. while it is generally fit for its purpose you should consider removing that grey background, which lowers your data to ink ratio. Please find more info about data ink ratio on the <a href='https://infovis-wiki.net/wiki/Data-Ink_Ratio'>official info vis wiki page</a><span class=blinking-cursor>_</span>
131 |   
132 | ---
133 |  
134 | __let us understand your plot better with a subtitle__
135 |   
136 |  your plot is missing a subtitle. Subtitle are effective in providing additional relevant info about metrics and data showed in your plot.Try to add a plot employing labs(subtitle  = 'your subtitle')<span class=blinking-cursor>_</span>
137 |   
138 | ---
139 |  
140 | __use caption to add integrity to your plot__
141 |   
142 | your plot is missing a caption. While not mandatories , captions are used to provide info about sources and reference period of your plot. This is crucial to increase the level of graphical integrity of your plot<span class=blinking-cursor>_</span>
143 |   
144 | ---
145 |  
146 | __mark that outliers__
147 |   
148 | I have found some outliers in your data  ,and they do not seem to be explicitly labeled in your plot.To increase the level of meaningfulness you should consider labelling them<span class=blinking-cursor>_</span>
149 |   
150 | ---
151 | Hope you will find this useful and will help you improve your plot
152 |   
153 | Cheers,
154 |   
155 | _your dataviz bot advisor_
156 |   
157 | 


--------------------------------------------------------------------------------
/R/scorer_bot.R:
--------------------------------------------------------------------------------
  1 | #' produce a customised report analysing the submitted ggplot object
  2 | #' @param plot_object the ggplot object being analysed
  3 | #' @description call to \code{scorer_bot} function produces the sequential calling of all function included within
  4 | #' the \code{vizscorer} package, with the purpose to produce a customised report about the submitted ggplot object. The function handles call to \code{metadata_reader} function
  5 | #' to draw metadata from the plot object in a format ready to be submitted to the scorer algorithm. within the next step those metadata are submitted to the \code{scorer} function which apply a gradient boosting pre-trained model to them.
  6 | #' a final score from 0 to 1 is computed as a probability to be a good plot.
  7 | #' To provide more relevant information to the user a set of advices are retrieved from a built-in knowledge base, selecting only those relevant to suboptimal areas of the plot, observed from metadata.
  8 | #' Finally a revealjs deck of slides is programmatically created leveraging a quasi-real natural language generation algorithm.
  9 | #' @return a reveljs deck of slide containing an evaluation of the effectiveness of ggplot objects and suggestion about how to improve it.
 10 | #' @export
 11 | scorer_bot <- function(plot_object = NULL){
 12 |  plot_metadata_raw <- metadata_reader(plot_object) %>% as.data.frame()
 13 |  plot_metadata <- data.frame(area_label = unlist(plot_metadata_raw$area_label),
 14 |                              topic_label = unlist(plot_metadata_raw$topic_label),
 15 |                              test        = unlist(plot_metadata_raw$test),
 16 |                              ispositive = unlist(plot_metadata_raw$ispositive))
 17 | 
 18 |   score <- scorer(plot_metadata)
 19 |   score_db <- data.frame("good"=score,"count"=1)
 20 |   comparison_path <- system.file("extdata", "comparison_db.csv", package = "vizscorer")
 21 |   comparison_db <- read.csv(comparison_path, sep = ",", stringsAsFactors = FALSE) %>%
 22 |     select(good,model)
 23 | 
 24 |   comparison_db %>%
 25 |     ggplot(aes(x = good))+
 26 |     geom_histogram(binwidth = 0.01,colour = "grey80",fill ="grey80")+
 27 |     geom_point(data =score_db,
 28 |                aes(x=good,y=count),
 29 |                inherit.aes = FALSE,colour = "red",size=3)+
 30 |     theme_minimal()+
 31 |     xlab("score")+
 32 |     labs(title="comparison among training plots and your plot")+
 33 |     annotate("segment", x = score, xend = 0.4, y = 1, yend = 5,colour ="grey50")+
 34 |   annotate(geom ="text",
 35 |            label = "you are here",
 36 |            x = 0.4,y=5.1,colour="grey20",size= 7) -> score_plot
 37 | 
 38 |   score_map <- data.frame(lower = c(0,0.11,0.41,0.71),
 39 |                           upper = c(0.10,0.40,0.70,1),
 40 |                           judgment = c("poorly", "improvable","good!", "great!"))
 41 | 
 42 |   score_map$score_match <- mapply(between,
 43 |                                   x = rep(score,nrow(score_map)),
 44 |                                   left = score_map$lower,
 45 |                                   right = score_map$upper,
 46 |                                   SIMPLIFY = TRUE)
 47 |   score_map %>%
 48 |     filter(score_match == TRUE) %>%
 49 |     select(judgment) %>% pull() %>%
 50 |     as.character() -> judgment
 51 | 
 52 |   # from metadata, leveraging area_label and ispositive attribute, we highlight which area of dataviz
 53 |   # shows problem for the plot
 54 |   # we will highlight those area to our user
 55 |   # brief test for histograms to filter out test note relevant for non-histogram plots
 56 |   is_histogram <- test_for_histogram(plot_object,plot_object$layers %>% length())
 57 | 
 58 |   plot_metadata %>%
 59 |   {if (is_histogram == FALSE) filter(., topic_label != "number_of_bins" & topic_label != "flipped_barplot" ) else filter(.,topic_label !="stop")}  %>%
 60 |     select(area_label,ispositive) %>%
 61 |     group_by(area_label,ispositive) %>%
 62 |     count() %>%
 63 |     spread(key = ispositive, value = n) %>%
 64 |     mutate(positive_ratio = `TRUE`/sum(`TRUE`,`FALSE`, na.rm = TRUE)) %>%
 65 |     mutate(area_ratio     =round(positive_ratio,2)) %>%
 66 |     rename(passed = `TRUE`,failed = `FALSE`) %>%
 67 |     mutate(total = sum(passed , failed,na.rm = TRUE)) -> positive_ratio_db
 68 | 
 69 |   positive_ratio_db %>%
 70 |     select(area_label,failed,passed,area_ratio) -> positive_ratio_show
 71 | 
 72 |   positive_ratio_show[is.na(positive_ratio_show)] <- 0
 73 | 
 74 |   positive_ratio_db %>%
 75 |     filter(positive_ratio == min(.$positive_ratio)) %>%
 76 |     select(area_label) %>%
 77 |     pull() %>%
 78 |     as.character() -> worst_area # the worst area was the one in which the plot obtained ther worst positive rate,
 79 |   #i.e. number of positive results in test given the overall number of test for that area.
 80 | 
 81 |   # we then select each specific test showing problems, so to give specific advice for each of this elements.
 82 |   plot_metadata %>%
 83 |   {if (is_histogram == FALSE) filter(., topic_label != "number_of_bins" & topic_label != "flipped_barplot" ) else filter(.,topic_label !="stop")}  %>%
 84 |     select(topic_label,ispositive) %>%
 85 |     filter(ispositive == FALSE) -> errors_db
 86 | 
 87 |   # merge errors_db with a db storing a suggestion for each possible error
 88 |   advices_path <- system.file("extdata", "plot_advices.csv", package = "vizscorer")
 89 |   advices_db <- read.csv(advices_path, sep = ";", stringsAsFactors = FALSE)
 90 | 
 91 |   errors_db %>%
 92 |     left_join(.,advices_db,by = "topic_label") -> teaching_db
 93 | 
 94 | n_of_errors <- nrow(errors_db)
 95 | report_path <- system.file("extdata", "report.Rmd", package = "vizscorer")
 96 | file.copy(report_path, to = "plot_report.Rmd",overwrite = TRUE)
 97 | write(" ", file = "plot_report.Rmd", append = TRUE)
 98 | for (i in 1:n_of_errors){
 99 | 
100 | 
101 |   write("---", file = "plot_report.Rmd", append = TRUE)
102 |   write(" ", file = "plot_report.Rmd", append = TRUE)
103 |   write(paste0("__",teaching_db[i,4],"__"), file = "plot_report.Rmd", append = TRUE)
104 |   write("  " , file = "plot_report.Rmd", append = TRUE)
105 |   write(paste0(teaching_db[i,3],"<span class=blinking-cursor>_</span>",sep = ""),file = "plot_report.Rmd", append = TRUE) #<- il problema è in questa aggiunta
106 |   write("  " , file = "plot_report.Rmd", append = TRUE)
107 | 
108 | }
109 | 
110 |  write("---", file = "plot_report.Rmd", append = TRUE)
111 |  write("Hope you will find this useful and will help you improve your plot", file = "plot_report.Rmd", append = TRUE)
112 |  write("  " , file = "plot_report.Rmd", append = TRUE)
113 |  write("Cheers,", file = "plot_report.Rmd", append = TRUE)
114 |  write("  " , file = "plot_report.Rmd", append = TRUE)
115 |  write("_your dataviz bot advisor_", file = "plot_report.Rmd", append = TRUE)
116 |  write("  " , file = "plot_report.Rmd", append = TRUE)
117 | 
118 |   rmarkdown::render("plot_report.Rmd",  revealjs_presentation())
119 |   system("open plot_report.html")
120 | }
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/R/metadata_reader.R:
--------------------------------------------------------------------------------
  1 | #' draw a data frame containing relevant metadata of a ggplot object
  2 | #' @param plot_object the ggplot object being analysed
  3 | #' @description \code{metadata_reader} function performs a number of test on the provided ggplot object to draw some intelligence about it.
  4 | #' It tries to derive attributes ofthe plot relate to four data visualization theory areas:
  5 | #' - the adequateness of labelling
  6 | #' - the data density
  7 | #' - the data to ink ratio
  8 | #' - the readability of the plot
  9 | #' The results of performed tests are stored in a structued list object, so to be provided as an input to a gradient boosting model.
 10 | #' See \code{scorer} function documentation for more info about this subsequent step.
 11 | #' @return a list storing results and additional data related to each test performed.
 12 | #' @export
 13 | metadata_reader <- function(plot_object = NULL){
 14 | 
 15 |   # preliminary chech on arguments of the function and others
 16 | 
 17 |   if(is.null(plot_object)){
 18 |     stop("you must provide a valid ggplot2 object", call = FALSE)
 19 |   }
 20 |   #blocking conditions:
 21 |   # data must e provided within ggplot() call
 22 |   if(is.null(plot_object$data[1])){
 23 |     stop("current version only checks on data provided within ggplot call")
 24 |   }
 25 |   # we have to be sure x aestetich is not named "x" and the same for y. Otherwise the introduced check for
 26 |   # inherited aestetichs (cfr. mapplings_lister) would produce the removal of the right occurence of
 27 |   if(plot_object$mapping %>%
 28 |      unlist() %>%
 29 |      as.character() %>%
 30 |      match("y") %>%
 31 |      sum(na.rm = TRUE) != 0 |
 32 |      plot_object$mapping %>% unlist() %>% as.character() %>% match("y") %>% sum(na.rm = TRUE)!=0){
 33 |     stop("it seems you have columns named 'x' or 'y' within the db you provided. This is not safe for some of the check performed. Go and change this, then come back and try again.")
 34 |   }
 35 |   n_of_layers <-  plot_object$layers %>% length()
 36 |   # no overplotting check for histograms ( and generally where no y is provided)
 37 |     if(test_for_histogram(plot_object,n_of_layers) == TRUE){
 38 |       message("no check for overplotting on histogram is currently provided")
 39 |     }
 40 |   #limitations
 41 |   aes_db <- mappings_lister(plot_object,n_of_layers)
 42 | 
 43 |   raw_x <- aes_puller(plot_object,n_of_layers, "x")
 44 | 
 45 |   if(mode(raw_x) == "list"){x_vector <- raw_x[,1] %>% pull}else{x_vector <- as.numeric(raw_x)}
 46 | 
 47 | 
 48 |   if(!is.na(match("..density..",aes_db$variable))){y_vector <- c()}else{
 49 |   raw_y <- aes_puller(plot_object,n_of_layers, "y")
 50 |   if(mode(raw_y) == "list"){y_vector <- raw_y[,1] %>% pull}else{y_vector <- as.numeric(raw_y)}
 51 |   }
 52 |   not_handled <- c("factor","character")
 53 |   available_and_not_handled <- intersect( x = c(unique(c(class(x_vector),class(y_vector)))),y = not_handled)
 54 |   #check on variabe type
 55 |   if(class(x_vector) %in% not_handled | class(y_vector) %in% not_handled){
 56 |     message(paste("you have variables of class ",
 57 |                   paste(unique(c(class(x_vector),class(y_vector))),collapse = " and "),
 58 |                   ". Not all checks are implemented for ", paste(available_and_not_handled, collapse = " and ")))}
 59 | 
 60 | 
 61 | 
 62 | 
 63 | area_categories <- c("readability of the plot",
 64 |                      "data density",
 65 |                      "data to ink ratio",
 66 |                      "adequateness of labeling")
 67 | 
 68 | default_n_of_bins     <- 30 # the default ggplot setting for the number of bins
 69 | data_threshold        <- 20 # following tufte we set 20 as threshold to suggest the useR to avoid graphs
 70 | overplotting_floor    <- if(!is.na(match("GeomLine",geoms_lister(plot_object ,n_of_layers)))){
 71 |   16
 72 | } else{1} # judgmental, based on experience
 73 | layers_tresholds     <- 5   # judgmental, based on experience
 74 | correlation_threshold <- .4  # judgmental, based on experience
 75 | bins_distance         <- 6   # judgmental, based on experience
 76 | p_build               <- ggplot_build(plot_object)
 77 | 
 78 | # perform checks for each of the area and principle and assign a score
 79 | 
 80 | 
 81 | 
 82 | # READABILITY OF THE PLOT
 83 | 
 84 | ## is it a pie chart?
 85 | 
 86 | pie_results <- list(           area_label = area_categories[1],
 87 |                                topic_label = "pie_chart",
 88 |                                test = as.logical(is_pie_chart(plot_object, n_of_layers)),
 89 |                                ispositive = as.logical(!is_pie_chart(plot_object, n_of_layers)),
 90 |                                additional_data = list())
 91 | 
 92 | ## are there too many layers?
 93 | 
 94 | layers_results <- list(
 95 |                                area_label = area_categories[1],
 96 |                                topic_label = "number_of_layers",
 97 |                                test = n_of_layers,# here the number of layers
 98 |                                ispositive = !(n_of_layers > layers_tresholds ),
 99 |                                additional_data = n_of_layers)
100 | 
101 | ## is the user showing more dimensions than the plot would allow to?
102 | 
103 | dimension_results <- list(
104 |                                area_label = area_categories[1],
105 |                                topic_label = "number_of_dimensions",
106 |                                test = too_many_dimensions(plot_object,n_of_layers),
107 |                                ispositive = !too_many_dimensions(plot_object,n_of_layers),
108 |                                additional_data = list())
109 | 
110 | ## in case of geom_histogram study the optimal number of bins, employing the Freedman Diaconis rule
111 | 
112 | bins_results <- list(
113 |                                   area_label = area_categories[1],
114 |                                   topic_label = "number_of_bins",
115 |                                   test = histogram_bins_tester(plot_object,n_of_layers,default_n_of_bins)[[3]], #here the distance from the optimal number of bins
116 |                                   ispositive = !(histogram_bins_tester(plot_object,n_of_layers,default_n_of_bins)[[3]]>bins_distance),
117 |                                   additional_data = histogram_bins_tester(plot_object,n_of_layers,default_n_of_bins)[2:3] )
118 | ## let's check if we are looking at a bar plot and if yes we check if it is flipped or not
119 | 
120 | flipped_bar_results <- list(
121 |   are_label = area_categories[1],
122 |   topic_label = "flipped_barplot",
123 |   test = is_horizontal_barplot(plot_object,n_of_layers),# TRUE here means we are looking at an horizontal barplot, which is good
124 |   ispositive = is_horizontal_barplot(plot_object,n_of_layers),
125 |   additional_data = list()
126 | )
127 | 
128 | # we look here for correlation between x and y and for the presence of geom_smooth lines.
129 | 
130 | geom_smooth_results <- list(
131 |                               area_label = area_categories[1],
132 |                               topic_label = "need_for_a_smooth",
133 |                               test = does_it_need_geom_smooth(plot_object,n_of_layers,correlation_threshold,aes_db)[[2]],#we place here the distance correlation to better apply machine learning technique
134 |                               ispositive = !(does_it_need_geom_smooth(plot_object,n_of_layers,correlation_threshold,aes_db)[[2]]>correlation_threshold),
135 |                               additional_data = does_it_need_geom_smooth(plot_object,n_of_layers,correlation_threshold,aes_db)[[2]]
136 | 
137 | )
138 | 
139 | # DATA DENSITY
140 | 
141 | ## we check here for the user wasting his time developing a graph for less than 20 points to show
142 | 
143 | n_data_results <-  list(
144 |                                 area_label      = area_categories[2],
145 |                                 topic_label     = "sufficient_number_of_data",
146 |                                 test            = too_few_data(plot_object, data_threshold), #TRUE here means you have enough data
147 |                                 ispositive      = too_few_data(plot_object, data_threshold),
148 |                                 additional_data = list ())
149 | 
150 | ## check here for overplotting
151 | 
152 | overplotting_results <- list(
153 |                                area_label = area_categories[2],
154 |                                topic_label = "overplotting",
155 |                                test       = cozy_plot(plot_object, n_of_layers,overplotting_floor,aes_db)[[2]], # the median distance here as a measure of overplotting
156 |                                ispositive = !(cozy_plot(plot_object, n_of_layers,overplotting_floor,aes_db)[[2]]<overplotting_floor),# if distance is lower than overplotting floor we have a cozy plot
157 |                                additional_data = cozy_plot(plot_object, n_of_layers,overplotting_floor,aes_db)[[2]])
158 | 
159 | # DATA TO INK RATIO
160 | 
161 | ## check for the use of default theme and by that way the grey background
162 | 
163 | background_results <- list(
164 |                                area_label = area_categories[3],
165 |                                topic_label = "use_of_heavy_background",
166 |                                test = heavy_background(plot_object),
167 |                                ispositive = !heavy_background(plot_object),
168 |                                additional_data = list()) #TRUE here means we are looking at an heavy background, either being one set from the user or the default one ( if still grey)
169 | ## check for bar map with full, non white filling not mapped to any aes
170 | 
171 | filled_barplot_results <- list(
172 |                                area_label = area_categories[3],
173 |                                topic_label = "filled_barplot",
174 |                                test = filled_barplot(plot_object,n_of_layers),
175 |                                ispositive = !filled_barplot(plot_object,n_of_layers),
176 |                                additional_data = list()) #TRUE here means we are looking at an ink-wasting barplot filled with one meaningless colour
177 | 
178 | 
179 | # ADEQUATENESS OF LABELLING
180 | 
181 | ##check for presence of descriptive labels: title, subtitle and caption
182 | 
183 | title_results <- list(
184 |                                area_label = area_categories[4],
185 |                                topic_label = "presence_of_title",
186 |                                test = labels_finder(plot_object,"title"), # TRUE here means we found the label
187 |                                ispositive = labels_finder(plot_object,"title"),
188 |                                additional_data = list())
189 | 
190 | subtitle_results <- list(
191 |                                area_label = area_categories[4],
192 |                                topic_label = "presence_of_subtitle",
193 |                                test = labels_finder(plot_object,"subtitle"), # TRUE here means we found the label
194 |                                ispositive = labels_finder(plot_object,"subtitle"),
195 |                                additional_data = list())
196 | 
197 | caption_results <- list(
198 |                                area_label = area_categories[4],
199 |                                topic_label = "presence_of_caption",
200 |                                test = labels_finder(plot_object,"caption"), # TRUE here means we found the label
201 |                                ispositive = labels_finder(plot_object,"caption"),
202 |                                additional_data = list())
203 | 
204 | ## loook for special characters within labels
205 | 
206 | special_characters_results <- list(
207 |                                area_label = area_categories[4],
208 |                                topic_label = "special_characters_in_label",
209 |                                test = labels_reader(plot_object)[[1]],#TRUE here means we found special character
210 |                                ispositive = !(labels_reader(plot_object)[[1]]),
211 |                                additional_data = labels_reader(plot_object)[2])
212 | ## a polished check : we try to understand if there are outliers in data and if they are in some way labeled within the plot
213 | 
214 | outliers_results <- list(
215 |                                area_label = area_categories[4],
216 |                                topic_label = "outliers_not_labelled",
217 |                                test = outlier_labels(plot_object,n_of_layers,p_build,aes_db),#TRUE here means we have outliers not labelled
218 |                                ispositive = !(outlier_labels(plot_object,n_of_layers,p_build,aes_db)),
219 |                                additional_data = list())
220 | 
221 | check_results <- rbind(pie_results,
222 |                        layers_results,
223 |                        dimension_results,
224 |                        bins_results,
225 |                        flipped_bar_results,
226 |                        geom_smooth_results,
227 |                        n_data_results,
228 |                        overplotting_results,
229 |                        background_results,
230 |                        filled_barplot_results,
231 |                        title_results,
232 |                        subtitle_results,
233 |                        caption_results,
234 |                        special_characters_results,
235 |                        outliers_results)
236 | 
237 | return(check_results)}
238 | 


--------------------------------------------------------------------------------