├── .Rbuildignore ├── .gitattributes ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R ├── analysisPipelines_package.R ├── core-functions-batch.R ├── core-functions-meta-pipelines.R ├── core-functions.R ├── core-streaming-functions.R ├── r-batch-eda-utilities.R ├── r-helper-utilites-python.R ├── spark-structured-streaming-utilities.R ├── sysdata.rda └── zzz.R ├── README.md ├── analysisPipelines.Rproj ├── data-raw └── predefFunctions.R ├── inst ├── data-icon.png ├── logging.png ├── output-icon.png ├── param-icon.png ├── pipelineViz1.png ├── pipelineViz2.png ├── python-logo.png ├── python │ └── sampleFunctions.py ├── r-logo.png ├── report.Rmd ├── report1.png ├── report2.png ├── report3.png ├── spark-logo.png ├── spark-structured-streaming-logo.png └── styles.css ├── knit-vignettes ├── 2.Analysis_pipelines_for_working_with_Spark_DataFrames_for_one-time_batch_analyses.html ├── 3.Analysis_pipelines_for_working_with_Python_functions.html ├── 4.Interoperable_analysis_pipelines.html └── 5.Streaming_Analysis_Pipelines_for_working_with_Apache_Spark_Structured_Streaming.html ├── man ├── AnalysisPipeline-class.Rd ├── BaseAnalysisPipeline-class.Rd ├── CheckColumnType.Rd ├── MetaAnalysisPipeline-class.Rd ├── StreamingAnalysisPipeline-class.Rd ├── analysisPipelines.Rd ├── assessEngineSetUp.Rd ├── bivarPlots.Rd ├── castKafkaStreamAsString.Rd ├── checkSchema.Rd ├── checkSchemaMatch.Rd ├── computeEdges.Rd ├── convertKafkaValueFromJson.Rd ├── correlationMatPlot.Rd ├── createPipelineInstance.Rd ├── dot-analysisPipelinesEnvir.Rd ├── dot-getCache.Rd ├── dot-saveMetaPipeline.Rd ├── dot-setRegistry.Rd ├── dot-updateRegistry.Rd ├── dot-visualizeMetaPipeline.Rd ├── exportAsMetaPipeline.Rd ├── generateOutput.Rd ├── generateReport.Rd ├── genericPipelineException.Rd ├── getDatatype.Rd ├── getEndPoints.Rd ├── getFeaturesForPyClassification.Rd ├── getInput.Rd ├── getLoggerDetails.Rd ├── getOutputById.Rd ├── getPipeline.Rd ├── getPipelinePrototype.Rd ├── getRegistry.Rd ├── getResponse.Rd ├── getStartingPoints.Rd ├── getTargetForPyClassification.Rd ├── getTerm.Rd ├── getUpstreamDependencies.Rd ├── identifyTopLevelRecursively.Rd ├── identifyTopologicalLevels.Rd ├── ignoreCols.Rd ├── initDfBasedOnType.Rd ├── initialize-methods.Rd ├── initializeLoggers.Rd ├── isDependencyParam.Rd ├── loadMetaPipeline.Rd ├── loadPipeline.Rd ├── loadPredefinedFunctionRegistry.Rd ├── loadRegistry.Rd ├── multiVarOutlierPlot.Rd ├── outlierPlot.Rd ├── prepExecution.Rd ├── registerFunction.Rd ├── savePipeline.Rd ├── saveRegistry.Rd ├── setInput.Rd ├── setLoggerDetails.Rd ├── setPythonEnvir.Rd ├── setUpstreamDependencies.Rd ├── sparkRSessionCreateIfNotPresent.Rd ├── univarCatDistPlots.Rd ├── updateObject.Rd └── visualizePipeline.Rd └── vignettes ├── Analysis_pipelines_for_working_with_Python_functions.Rmd ├── Analysis_pipelines_for_working_with_R_dataframes.Rmd ├── Analysis_pipelines_for_working_with_sparkR.Rmd ├── Interoperable_Pipelines.Rmd ├── Meta_Pipelines.Rmd ├── Streaming_pipelines_for_working_Apache_Spark_Structured_Streaming.Rmd └── Using_pipelines_inside_shiny_widgets.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | data-raw/ 4 | inst/python/.ipynb_checkpoints 5 | metastore_db/ 6 | vignettes/*.R 7 | vignettes/*.html 8 | vignettes/*.RDS 9 | vignettes/metastore_db/ 10 | vignettes/*.out 11 | knit-vignettes/ 12 | 13 | -------------------------------------------------------------------------------- /.gitattributes: 
-------------------------------------------------------------------------------- 1 | *.* linguist-language=R -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | metastore_db/ 6 | inst/python/.ipynb_checkpoints/ 7 | .DS_Store 8 | vignettes/metastore_db/ 9 | vignettes/*.RDS 10 | vignettes/*.out 11 | vignettes/*.R 12 | vignettes/*.html 13 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: analysisPipelines 2 | Type: Package 3 | Date: 2020-06-12 4 | Title: Compose Interoperable Analysis Pipelines & Put Them in Production 5 | Version: 1.0.2 6 | Authors@R: c( 7 | person("Naren","Srinivasan", email = "naren1991@gmail.com", role = c("aut")), 8 | person("Zubin Dowlaty","", email = "Zubin.Dowlaty@mu-sigma.com", role = c("aut")), 9 | person("Sanjay","", email = "Sanjay@mu-sigma.com", role = c("ctb")), 10 | person("Neeratyoy","Mallik", email = "Neeratyoy.Mallik@mu-sigma.com", role = c("ctb")), 11 | person("Anoop S","", email = "Anoop.S@mu-sigma.com", role = c("ctb")), 12 | person("Mu Sigma, Inc.", email = "ird.experiencelab@mu-sigma.com", role = c("cre")) 13 | ) 14 | Description: Enables data scientists to compose pipelines of analysis which consist of data manipulation, exploratory analysis & reporting, as well as modeling steps. Data scientists can use tools of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python. 15 | Credits to Mu Sigma for supporting the development of the package. 16 | Note - To enable pipelines involving Spark tasks, the package uses the 'SparkR' package. 17 | The SparkR package needs to be installed to use Spark as an engine within a pipeline. SparkR is distributed natively with Apache Spark and is not distributed on CRAN. The SparkR version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly. 18 | To install SparkR from Github, run the following command if you know the Spark version: 'devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg')'. 19 | The other option is to install SparkR by running the following terminal commands if Spark has already been installed: '$ export SPARK_HOME=/path/to/spark/directory && cd $SPARK_HOME/R/lib/SparkR/ && R -e "devtools::install('.')"'. 
20 | Depends: R (>= 3.4.0), magrittr, pipeR, methods 21 | Imports: ggplot2, dplyr, futile.logger, RCurl, rlang (>= 0.3.0), proto, purrr 22 | Suggests: plotly, knitr, rmarkdown, parallel, visNetwork, rjson, DT, shiny, R.devices, corrplot, car, foreign 23 | Enhances: SparkR, reticulate 24 | BugReports: https://github.com/Mu-Sigma/analysis-pipelines/issues 25 | URL: https://github.com/Mu-Sigma/analysis-pipelines 26 | Encoding: UTF-8 27 | License: MIT 28 | LazyLoad: yes 29 | LazyData: yes 30 | RoxygenNote: 6.1.1 31 | VignetteBuilder: knitr 32 | Collate: 33 | 'analysisPipelines_package.R' 34 | 'core-functions.R' 35 | 'core-functions-batch.R' 36 | 'core-functions-meta-pipelines.R' 37 | 'core-streaming-functions.R' 38 | 'r-batch-eda-utilities.R' 39 | 'r-helper-utilites-python.R' 40 | 'spark-structured-streaming-utilities.R' 41 | 'zzz.R' 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (C) 2019 Mu Sigma Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge and/or publish of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE MU-SIGMA LABS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | Except as contained in this notice, the name of the Mu-Sigma Labs shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the Mu-Sigma Labs. 12 | 13 | Mu Sigma Labs is a trademark of Mu Sigma Business Solutions Private Limited. 
-------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(AnalysisPipeline) 4 | export(BaseAnalysisPipeline) 5 | export(CheckColumnType) 6 | export(MetaAnalysisPipeline) 7 | export(StreamingAnalysisPipeline) 8 | export(assessEngineSetUp) 9 | export(bivarPlots) 10 | export(castKafkaStreamAsString) 11 | export(convertKafkaValueFromJson) 12 | export(correlationMatPlot) 13 | export(createPipelineInstance) 14 | export(exportAsMetaPipeline) 15 | export(generateReport) 16 | export(genericPipelineException) 17 | export(getDatatype) 18 | export(getFeaturesForPyClassification) 19 | export(getInput) 20 | export(getLoggerDetails) 21 | export(getOutputById) 22 | export(getPipeline) 23 | export(getPipelinePrototype) 24 | export(getRegistry) 25 | export(getResponse) 26 | export(getTargetForPyClassification) 27 | export(getTerm) 28 | export(ignoreCols) 29 | export(isDependencyParam) 30 | export(loadMetaPipeline) 31 | export(loadPipeline) 32 | export(loadPredefinedFunctionRegistry) 33 | export(loadRegistry) 34 | export(multiVarOutlierPlot) 35 | export(outlierPlot) 36 | export(prepExecution) 37 | export(registerFunction) 38 | export(savePipeline) 39 | export(saveRegistry) 40 | export(setInput) 41 | export(setLoggerDetails) 42 | export(setPythonEnvir) 43 | export(sparkRSessionCreateIfNotPresent) 44 | export(univarCatDistPlots) 45 | export(updateObject) 46 | export(visualizePipeline) 47 | exportClasses(AnalysisPipeline) 48 | exportClasses(BaseAnalysisPipeline) 49 | exportClasses(MetaAnalysisPipeline) 50 | exportClasses(StreamingAnalysisPipeline) 51 | exportMethods(checkSchemaMatch) 52 | exportMethods(generateOutput) 53 | importFrom(graphics,image) 54 | importFrom(magrittr,"%>%") 55 | importFrom(methods,getClass) 56 | importFrom(methods,new) 57 | importFrom(methods,removeMethod) 58 | importFrom(methods,setClassUnion) 59 | importFrom(methods,setGeneric) 60 | importFrom(methods,setOldClass) 61 | importFrom(pipeR,"%>>%") 62 | importFrom(rlang,"!!") 63 | importFrom(rlang,.data) 64 | importFrom(stats,as.formula) 65 | importFrom(stats,lm) 66 | importFrom(stats,reorder) 67 | importFrom(stats,terms) 68 | importFrom(utils,installed.packages) 69 | importFrom(utils,read.csv) 70 | -------------------------------------------------------------------------------- /R/analysisPipelines_package.R: -------------------------------------------------------------------------------- 1 | #' analysisPipelines 2 | #' 3 | #' The package aims at enabling data scientists to compose pipelines of analysis which consist of data manipulation, 4 | #' exploratory analysis & reporting, as well as modeling steps. It also aims to enable data scientists to use tools 5 | #' of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python. 6 | #' Credits to Mu Sigma for supporting the development of the package. 7 | #' 8 | #' @note To enable pipelines involving Spark tasks, the package uses the 'SparkR' package. Using Spark as an engine requires the SparkR package to be installed. 9 | #' SparkR is distributed natively with Apache Spark and is not distributed on CRAN. The SparkR version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly. 
10 | #' To install from Github, run the following command, if you know the Spark version: 11 | #' \itemize{ 12 | #' \item devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg') 13 | #' } 14 | #' The other option is to install SparkR by running the following terminal commands if Spark has already been installed: 15 | #' \itemize{ 16 | #' \item $ export SPARK_HOME=/path/to/spark/directory 17 | #' \item $ cd $SPARK_HOME/R/lib/SparkR/ 18 | #' \item $ R -e "devtools::install('.')" 19 | #' } 20 | #' @docType package 21 | #' @name analysisPipelines 22 | NULL 23 | -------------------------------------------------------------------------------- /R/core-functions-meta-pipelines.R: -------------------------------------------------------------------------------- 1 | ################################################################################################## 2 | # Title: Meta pipelines 3 | # Author: Naren Srinivasan 4 | # Created on: Nov 20, 2018 5 | # Description: Functions/ Methods to define and use meta-pipelines 6 | ################################################################################################## 7 | 8 | # proto' is an S3 class whic is used as a slot, and hence it is defined in the environment 9 | setOldClass("proto") 10 | 11 | #' @name MetaAnalysisPipeline-class 12 | #' @rdname MetaAnalysisPipeline-class 13 | #' @title Class for creating and working with meta-pipelines 14 | #' @details This class works with the \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} classes, and allows the 15 | #' pipeline to be exported as meta-pipeline. A meta-pipeline is a construct, where the input dataset as well as the arguments 16 | #' to functions in the pipeline are not defined. Only the analysis flow and dependencies are stored. 17 | #' @slot pipeline A tibble which holds functions to be called in the pipeline 18 | #' @slot pipelinePrototype An object of class \code{proto} from the 'proto' package which maintains the prototype of the 19 | #' functions in the pipeline and their respective arguments 20 | #' @slot type A string defining whether it is a batch or streaming pipeline. Acceptable values are 'batch' & 'streaming' 21 | #' @family Package core functions 22 | #' @exportClass MetaAnalysisPipeline 23 | #' @export MetaAnalysisPipeline 24 | 25 | MetaAnalysisPipeline <- setClass("MetaAnalysisPipeline", 26 | slots = c( 27 | pipeline = "tbl", 28 | pipelinePrototype = "proto", 29 | type = "character" 30 | )) 31 | 32 | #' MetaAnalysisPipeline constructor 33 | #' @docType methods 34 | #' @rdname initialize-methods 35 | #' @title This is the constructor for the \link{MetaAnalysisPipeline} class 36 | #' @family Package core functions 37 | #' @keywords internal 38 | 39 | setMethod( 40 | f = "initialize", 41 | signature = "MetaAnalysisPipeline", 42 | definition = function(.Object, type = "batch") 43 | { 44 | tryCatch({ 45 | .Object@pipeline <- dplyr::tibble( 46 | id = character(), 47 | operation = character(), 48 | heading = character(), 49 | parameters = list(), 50 | outAsIn = logical(), 51 | storeOutput = F 52 | ) 53 | 54 | .Object@type <- "batch" 55 | 56 | return(.Object) 57 | }, error = function(e){ 58 | futile.logger::flog.error(e, name = "logger.base") 59 | stop() 60 | }) 61 | } 62 | ) 63 | 64 | #' @name exportAsMetaPipeline 65 | #' @rdname exportAsMetaPipeline 66 | #' @title Method to export a meta-pipeline 67 | #' @details This method exports a Pipeline object i.e. 
of the classes \code{AnalysisPipeline} or 68 | #' \code{StreamingAnalysisPipeline} as a meta-pipeline 69 | #' @param object A Pipeline object 70 | #' @return an object of class "\code{MetaAnalysisPipeline}" 71 | #' @family Package core functions 72 | #' @examples 73 | #' \dontrun{ 74 | #' #' pipelineObj <- AnalysisPipeline(input = iris) 75 | #' pipelineObj %>>% univarCatDistPlots(uniCol = "Species") %>>% 76 | #' exportAsMetaPipeline -> exportedMetaPipeline 77 | #' } 78 | #' @export 79 | setGeneric( 80 | name = "exportAsMetaPipeline", 81 | def = function(object){ 82 | standardGeneric("exportAsMetaPipeline") 83 | } 84 | ) 85 | 86 | .exportAsMetaPipeline <- function(object){ 87 | object %>>% setLoggerDetails(target = "none") -> object 88 | metaPipeline <- MetaAnalysisPipeline() 89 | pipelineProto <- proto::proto() 90 | if(class(object) == "AnalysisPipeline"){ 91 | metaPipeline@type <- "batch" 92 | }else if(class(object) == "StreamingAnalysisPipeline"){ 93 | metaPipeline@type <- "streaming" 94 | } 95 | 96 | if(nrow(object@pipelineExecutor$topologicalOrdering) == 0){ 97 | object %>>% prepExecution -> object 98 | } 99 | 100 | object@pipeline -> pipeline 101 | pipeline %>>% purrr::pmap(function(id, operation, heading, 102 | parameters, outAsIn, storeOutput, dependencies){ 103 | # fnName <- paste0("fn_", operation) 104 | fnName <- operation 105 | assign(x = fnName, value = proto::proto(), envir = pipelineProto) 106 | 107 | purrr::imap(parameters, function(p, np){ 108 | # n <- names(p) 109 | if(class(p) == "formula"){ 110 | if(analysisPipelines::isDependencyParam(p)){ 111 | n <- analysisPipelines::getResponse(p) 112 | p <- paste0("~", analysisPipelines::getTerm(p)) %>>% as.formula() 113 | } 114 | } 115 | assign(x = paste0(np), 116 | value = p, 117 | envir = pipelineProto[[fnName]]) 118 | return(NULL) 119 | }) 120 | return(NULL) 121 | }) 122 | metaPipeline@pipeline <- pipeline 123 | metaPipeline@pipelinePrototype <- pipelineProto 124 | return(metaPipeline) 125 | } 126 | 127 | #' @rdname exportAsMetaPipeline 128 | setMethod( 129 | f = "exportAsMetaPipeline", 130 | signature = "BaseAnalysisPipeline", 131 | definition = .exportAsMetaPipeline 132 | ) 133 | 134 | 135 | #' @name getPipelinePrototype 136 | #' @rdname getPipelinePrototype 137 | #' @title Obtain the prototype of the functions in the pipeline 138 | #' @param metaPipelineObj A \code{MetaAnalysisPipeline} object 139 | #' @details This method returns the prototype of functions in the pipeline and their respective arguments as \code{proto} object. 140 | #' Functions in the pipeline can be accessed easily by using the '$' operator, and within the functions the arguments can 141 | #' be accessed the same way. These can be accessed and set to new values. 
This pipeline prototype can then be passed to the 142 | #' \code{createPipelineInstance} method which will instantiate an executable pipeline with the inputs set in the prototype 143 | #' @return An object og class \code{proto} from the 'proto' package 144 | #' @family Package core functions 145 | #' @examples 146 | #' \dontrun{ 147 | #' pipelineObj <- AnalysisPipeline(input = iris) 148 | #' pipelineObj %>>% univarCatDistPlots(uniCol = "Species") %>>% 149 | #' exportAsMetaPipeline %>>% getPipelinePrototype 150 | #' } 151 | #' @export 152 | setGeneric( 153 | name = "getPipelinePrototype", 154 | def = function(metaPipelineObj){ 155 | standardGeneric("getPipelinePrototype") 156 | } 157 | ) 158 | 159 | .getPipelinePrototype <- function(metaPipelineObj){ 160 | return(metaPipelineObj@pipelinePrototype) 161 | } 162 | 163 | #' @rdname getPipelinePrototype 164 | setMethod( 165 | f = "getPipelinePrototype", 166 | signature = "MetaAnalysisPipeline", 167 | definition = .getPipelinePrototype 168 | ) 169 | 170 | 171 | #' @name createPipelineInstance 172 | #' @rdname createPipelineInstance 173 | #' @title Create a Pipeline object from a meta-pipeline 174 | #' @param metaPipelineObj A \code{MetaAnalysisPipeline} object 175 | #' @param newParams Either a nested named list containing all the functions in the pipeline, their arguments and 176 | #' corresponding values (OR) an object of class \code{proto} which is a pipeline prototype, with the new values of the arguments 177 | #' set. Refer the \code{getPipelinePrototype} method. 178 | #' @details This method instantiates a Pipeline object (both \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline}) from 179 | #' a meta-pipeline as well as an object containing the new set of values for the arguments of all the functions in the pipeline. 
180 | #' @return A Pipeline object 181 | #' @family Package core functions 182 | #' @examples 183 | #' \dontrun{ 184 | #' pipelineObj <- AnalysisPipeline(input = iris) 185 | #' pipelineObj %>>% univarCatDistPlots(uniCol = "Species") -> pipelineObj 186 | #' pipelineObj %>>% exportAsMetaPipeline -> exportedMetaPipeline 187 | #' exportedMetaPipeline %>>% 188 | #' createPipelineInstance(newParams = exportedMetaPipeline %>>% 189 | #' getPipelinePrototype) 190 | #' } 191 | #' @export 192 | setGeneric( 193 | name = "createPipelineInstance", 194 | def = function(metaPipelineObj, newParams){ 195 | standardGeneric("createPipelineInstance") 196 | } 197 | ) 198 | 199 | .createPipelineInstance <- function(metaPipelineObj, newParams){ 200 | 201 | if(metaPipelineObj@type == "batch"){ 202 | pipelineObj <- AnalysisPipeline() 203 | }else if(metaPipelineObj@type == "streaming"){ 204 | pipelineObj <- StreamingAnalysisPipeline() 205 | } 206 | 207 | pipelineObj@pipeline <- metaPipelineObj@pipeline 208 | 209 | newParamList <- newParams 210 | if(any(class(newParams) == "proto")){ 211 | names(newParams) %>>% grep(pattern = "^[.]", value = T, invert = T ) -> fnNames 212 | 213 | newParamList <- purrr::imap(fnNames, function(fn, nfn){ 214 | fnEnvir <- get(fn, envir = newParams) 215 | fnEnvir %>>% names %>>% grep(pattern = "^[.]", invert = T, value = T ) -> argNames 216 | params <- mget(x = argNames, envir = newParams[[fn]]) 217 | params <- purrr::imap(params, function(p, np){ 218 | if(class(p) == "formula"){ 219 | if(analysisPipelines::isDependencyParam(p)){ 220 | p <- paste(np, "~", analysisPipelines::getTerm(p)) %>>% as.formula 221 | } 222 | #TODO: Deal with normal formula parameters 223 | } 224 | return(p) 225 | }) 226 | return(params) 227 | }) 228 | names(newParamList) <- fnNames 229 | } 230 | 231 | # Match pipeline table order 232 | tblOrder <- match(pipelineObj@pipeline$operation, names(newParamList)) 233 | newParamList <- newParamList[tblOrder] 234 | 235 | #Match argument list orders 236 | newParamList <- purrr::imap(newParamList, function(params, fnName){ 237 | pipelineParams <- pipelineObj@pipeline %>>% dplyr::filter(.data$operation == fnName) 238 | pipelineParams <- unlist(pipelineParams$parameters, recursive = F) 239 | argOrder <- match(names(pipelineParams), names(params)) 240 | params <- params[argOrder] 241 | return(params) 242 | }) 243 | 244 | names(newParamList) <- NULL 245 | pipelineObj@pipeline %>>% dplyr::mutate(parameters = newParamList) -> pipelineObj@pipeline 246 | 247 | return(pipelineObj) 248 | } 249 | 250 | #' @rdname createPipelineInstance 251 | setMethod( 252 | f = "createPipelineInstance", 253 | signature = "MetaAnalysisPipeline", 254 | definition = .createPipelineInstance 255 | ) 256 | 257 | #' A method definition for visualizing meta-pipelines, called when the 'visualizePipeline' method is called against the 258 | #' \code{MetaAnalysisPipeline} signature 259 | #' @name .visualizeMetaPipeline 260 | #' @keywords internal 261 | .visualizeMetaPipeline <- function(object){ 262 | object %>>% createPipelineInstance(object@pipelinePrototype) -> sampleObj 263 | vis <- NULL 264 | sampleObj %>>% setLoggerDetails(target = "none") -> sampleObj 265 | sampleObj %>>% prepExecution -> sampleObj 266 | sampleObj %>>% visualizePipeline -> vis 267 | return(vis) 268 | } 269 | 270 | #' @rdname visualizePipeline 271 | setMethod( 272 | f = "visualizePipeline", 273 | signature = "MetaAnalysisPipeline", 274 | definition = .visualizeMetaPipeline 275 | ) 276 | 277 | 278 | #' A method definition for saving 
meta-pipelines, called when the 'savePipeline' method is called against the 279 | #' \code{MetaAnalysisPipeline} signature 280 | #' @name .saveMetaPipeline 281 | #' @keywords internal 282 | .saveMetaPipeline <- function(object, path){ 283 | tryCatch({ 284 | .registry <- getRegistry() 285 | listToBeSaved <- c("object", ".registry", getRegistry()$functionName, getRegistry()$exceptionHandlingFunction) 286 | save(list = listToBeSaved,file = path) 287 | futile.logger::flog.info("|| Registry saved successfully at path '%s' ||", path, 288 | name = "logger.base") 289 | },error = function(e){ 290 | futile.logger::flog.error(e, name = "logger.base") 291 | stop() 292 | }, warning = function(w){ 293 | futile.logger::flog.warn(w, name = "logger.base") 294 | }) 295 | } 296 | 297 | #' @rdname savePipeline 298 | setMethod( 299 | f = "savePipeline", 300 | signature = "MetaAnalysisPipeline", 301 | definition = .saveMetaPipeline 302 | ) 303 | 304 | #' @name loadMetaPipeline 305 | #' @title Load a meta-pipeline 306 | #' @param path the path at which the .Rds file containing the pipeline is located 307 | #' @details This function loads a meta-pipeline from a file system, and returns the meta-pipeline object, which can be assigned 308 | #' to an object in the environment. 309 | #' @details Note - When a meta-pipeline is loaded, the existing registry is overwritten with the registry saved with the 310 | #' meta-pipeline 311 | #' @return An \code{MetaAnalysisPipeline} object 312 | #' @family Package core functions 313 | #' @examples 314 | #' \dontrun{ 315 | #' loadMetaPipeline(path = "./metaPipeline.RDS") 316 | #' } 317 | #' @export 318 | loadMetaPipeline <- function(path){ 319 | tryCatch({ 320 | object <- NULL 321 | futile.logger::flog.warn("|| The existing registry will be overwritten with the registry from the RDS file ||", 322 | name = "logger.base") 323 | load(path, envir = environment()) 324 | functionNames = setdiff(ls(envir = environment()), c("path", "object", ".registry")) 325 | eval(parse(text = paste0(".setRegistry(.registry)"))) 326 | lapply(functionNames, function(x){ 327 | assign(x, get(x, environment()), globEnv) 328 | }) 329 | 330 | return(object) 331 | },error = function(e){ 332 | futile.logger::flog.error(e, name = "logger.base") 333 | stop() 334 | }) 335 | } 336 | 337 | 338 | -------------------------------------------------------------------------------- /R/core-streaming-functions.R: -------------------------------------------------------------------------------- 1 | ################################################################################################## 2 | # Title: Reusable pipelines for streaming analyses 3 | # Author: Naren Srinivasan 4 | # Created on: July 12, 2018 5 | # Description: An R package version - Currently supports Apache Spark Structured Streaming 6 | ################################################################################################## 7 | 8 | # TO DO 9 | # - Add schema checks 10 | # - Add ability to initialized without input and check for generate output if there is not input initialized 11 | # - Remove workingInput - DONE 12 | # - Test loadPipeline function 13 | 14 | #' @include core-functions.R 15 | NULL 16 | 17 | #' @name StreamingAnalysisPipeline-class 18 | #' @rdname StreamingAnalysisPipeline-class 19 | #' @title Class for constructing Analysis Pipelines for streaming analyeses 20 | #' @details Inherits the base class \link{BaseAnalysisPipeline} class which holds the metadata including the registry of available functions, 21 | #' the data on which 
the pipeline is to be applied, as well as the pipeline itself 22 | #' @details This class currently only supports Apache Spark Structured Streaming, implemented through the SparkR interface 23 | #' @slot input The input Spark DataFrame on which analysis is to be performed 24 | #' @slot originalSchemaDf Empty Spark DataFrame representing the schema of the input 25 | #' @family Package core functions for Streaming Analyses 26 | #' @include core-functions.R 27 | #' @exportClass StreamingAnalysisPipeline 28 | #' @export StreamingAnalysisPipeline 29 | 30 | 31 | StreamingAnalysisPipeline <- setClass("StreamingAnalysisPipeline", 32 | slots = c( 33 | input = "ANY", 34 | #Should be a SparkDataFrame, but unable to specify as SparkR is not distributed on CRAN 35 | originalSchemaDf = "ANY" 36 | ), contains = "BaseAnalysisPipeline") 37 | 38 | #' StreamingAnalysisPipeline constructor 39 | #' @docType methods 40 | #' @rdname initialize-methods 41 | #' @title Constructor for the \code{StreamingAnalysisPipeline} object 42 | #' @include core-functions.R 43 | #' @keywords internal 44 | 45 | setMethod( 46 | f = "initialize", 47 | signature = "StreamingAnalysisPipeline", 48 | definition = function(.Object,input) 49 | { 50 | .Object@input <- input 51 | 52 | ## Calling the parent constructor 53 | .Object <- methods::callNextMethod(.Object) 54 | return(.Object) 55 | } 56 | ) 57 | 58 | .checkSparkDataFrame <- function(obj){ 59 | if(class(obj) != "SparkDataFrame"){ 60 | futile.logger::flog.error("|| The input should be of class 'SparkDataFrame' from the 'SparkR' package ||", 61 | name = "logger.base") 62 | stop() 63 | } 64 | } 65 | 66 | .executeStream<- function(object){ 67 | 68 | tryCatch({ 69 | 70 | futile.logger::flog.info("|| Pipeline Execution STARTED ||" , name='logger.execution') 71 | 72 | outputCache <- .getCache() 73 | 74 | topOrder <- object@pipelineExecutor$topologicalOrdering 75 | dplyr::left_join(object@pipeline, getRegistry(), by = c("operation" = "functionName")) %>>% 76 | dplyr::left_join(object@pipelineExecutor$topologicalOrdering, by = c("id" = "id")) -> pipelineRegistryOrderingJoin 77 | 78 | batches <- unique(pipelineRegistryOrderingJoin$level) 79 | numBatches <- max(as.numeric(batches)) 80 | 81 | 82 | # Iterate across batches i.e. 
sets of independent functions 83 | lapply(batches, function(x, object, pipelineRegistryOrderingJoin, outputCache){ 84 | 85 | pipelineRegistryOrderingJoin %>>% dplyr::filter(.data$level == x) -> functionsInBatch 86 | 87 | ## Function execution in a stream 88 | lapply(functionsInBatch$id, function(y, object, functionsInBatch, outputCache){ 89 | 90 | functionsInBatch %>>% dplyr::filter(.data$id == y) %>>% as.list -> funcDetails 91 | 92 | futile.logger::flog.info("|| Function ID '%s' named '%s' STARTED on the '%s' engine ||", 93 | funcDetails$id, funcDetails$operation, funcDetails$engine, 94 | name='logger.func') 95 | 96 | 97 | # Set parameters 98 | 99 | params <- unlist(funcDetails$parameters, recursive = F) 100 | dep <- unique(unlist(funcDetails$dependencies, recursive = F)) 101 | depTerms <- paste0("f", dep) 102 | 103 | # Datasets passed as a formula are updated here 104 | 105 | params <- lapply(params, function(p, depTerms, outputCache){ 106 | if(class(p) == "formula"){ 107 | isDepParam <- analysisPipelines::isDependencyParam(p) 108 | if(isDepParam){ 109 | formulaTerm <- analysisPipelines::getTerm(p) 110 | argName <- analysisPipelines::getResponse(p) 111 | if(formulaTerm %in% depTerms){ 112 | 113 | ## Formula of previous function in pipeline 114 | actualParamObjectName <- paste0(formulaTerm, ".out") 115 | p <- get(actualParamObjectName, envir = outputCache) 116 | } 117 | } 118 | } 119 | 120 | return(p) 121 | }, depTerms, outputCache) 122 | 123 | # No type conversion for Streaming pipelines 124 | 125 | if(funcDetails$isDataFunction){ 126 | # Not passed as a formula 127 | if(any(class(params[[1]]) == "rlang_fake_data_pronoun")){ 128 | # Checking for outAsIn 129 | if(funcDetails$outAsIn && funcDetails$id != "1"){ 130 | dataOpFn <- paste0("f", as.numeric(funcDetails$id) - 1) 131 | actualDataObjectName <- paste0(dataOpFn, ".out") 132 | params[[1]] <- get(actualDataObjectName, envir = outputCache) 133 | }else{ 134 | # On original input 135 | params[[1]]<- object@input 136 | } 137 | } 138 | } 139 | 140 | #Call 141 | startFunc <- Sys.time() 142 | args <- params 143 | output <- tryCatch({do.call(what = funcDetails$operation, 144 | args = args)}, 145 | error = function(e){ 146 | futile.logger::flog.error("|| ERROR Occurred in Function ID '%s' named '%s'. EXITING PIPELINE EXECUTION. 
Calling Exception Function - '%s' ||", 147 | funcDetails$id, funcDetails$operation, funcDetails$exceptionHandlingFunction, 148 | name='logger.func') 149 | do.call(funcDetails$exceptionHandlingFunction, 150 | list(error = e)) 151 | 152 | }) 153 | 154 | endFunc <- Sys.time() 155 | funcExecTime <- endFunc - startFunc 156 | 157 | opName <- paste0("f", funcDetails$id, ".out") #eg: f1.out 158 | if(funcDetails$storeOutput){ 159 | assign(opName, value = output, envir = outputCache) 160 | }else{ 161 | #Check if there are dependent children 162 | fromList <- object@pipelineExecutor$dependencyLinks$from 163 | if(funcDetails$id %in% fromList){ 164 | assign(opName, value = output, envir = outputCache) 165 | } 166 | } 167 | 168 | 169 | futile.logger::flog.info("|| NEW MICRO_BATCH PROCESSED for Function ID '%s' named '%s' in %s seconds ||", 170 | funcDetails$id, funcDetails$operation, funcExecTime, 171 | name='logger.func') 172 | 173 | }, object, functionsInBatch, outputCache) 174 | 175 | }, object, pipelineRegistryOrderingJoin, outputCache) 176 | 177 | object@output <- mget(ls(outputCache), envir = outputCache) 178 | rm(list = ls(outputCache), envir = outputCache) 179 | 180 | return(object) 181 | },error = function(e){ 182 | futile.logger::flog.error(e, name = "logger.base") 183 | stop() 184 | }) 185 | } 186 | 187 | .generateStreamingOutput <- function(object){ 188 | tryCatch({ 189 | 190 | object %>>% initializeLoggers 191 | 192 | inputToExecute <- object@input 193 | 194 | if(class(inputToExecute) != "SparkDataFrame"){ 195 | m <- "This streaming pipeline has not been initialized with a SparkDataFrame. Please use the setInput() function to do so." 196 | futile.logger::flog.error(m, name = 'logger.pipeline') 197 | stop(m) 198 | } 199 | 200 | ## Check engine setup 201 | object %>>% assessEngineSetUp -> engineAssessment 202 | engineAssessment %>>% dplyr::filter(.data$requiredForPipeline == T) -> requiredEngines 203 | 204 | if(!all(requiredEngines$isSetup)){ 205 | m <- paste0("All engines required for the pipelines have not been configured. ", 206 | "Please use the analysisPipelines::assessEngine() function to check") 207 | futile.logger::flog.error(m, name = 'logger.engine.assessment') 208 | stop(m) 209 | } 210 | 211 | if(nrow(object@pipelineExecutor$topologicalOrdering) == 0){ 212 | object %>>% prepExecution -> object 213 | } 214 | 215 | object %>>% .executeStream -> object 216 | 217 | return(object) 218 | 219 | },error = function(e){ 220 | futile.logger::flog.error(e, name = "logger.base") 221 | stop() 222 | }) 223 | } 224 | 225 | #' @rdname generateOutput 226 | setMethod( 227 | f = "generateOutput", 228 | signature = "StreamingAnalysisPipeline", 229 | definition = .generateStreamingOutput 230 | ) 231 | -------------------------------------------------------------------------------- /R/r-helper-utilites-python.R: -------------------------------------------------------------------------------- 1 | #' @name setPythonEnvir 2 | #' @title Sets the python environment to be used 3 | #' @details Wrapper function over reticulate functions to set a python environment to be used 4 | #' @param type Type of python environment. 
Takes three possible vales - 'conda' for Anaconda environments, 5 | #' 'virtualenv' for Virtual environments, and 'python' to manually set the python path to use 6 | #' @param pathOrEnvirName Name of the environment for Anaconda and Virtual environments, 7 | #' or the Python path when type is 'python' 8 | #' @family R helper utilities for Python 9 | #' @examples 10 | #' \dontrun{ 11 | #' setPythonEnvir() 12 | #' } 13 | #' @export 14 | setPythonEnvir <- function(type = 'conda', pathOrEnvirName = 'base'){ 15 | tryCatch({ 16 | if(type == 'conda'){ 17 | reticulate::use_condaenv(pathOrEnvirName, required = T) 18 | futile.logger::flog.info("|| Using conda environment of name '%s' ||", pathOrEnvirName, 19 | name = "logger.base") 20 | }else if(type == 'virtualenv'){ 21 | reticulate::use_virtualenv(pathOrEnvirName, required = T) 22 | futile.logger::flog.info("|| Using virtual environment of name '%s' ||", pathOrEnvirName, 23 | name = "logger.base") 24 | }else if (type == 'python'){ 25 | reticulate::use_python(pathOrEnvirName, required = T) 26 | futile.logger::flog.info("|| Using python at path: '%s' ||", pathOrEnvirName, 27 | name = "logger.base") 28 | }else{ 29 | futile.logger::flog.error("|| Invalid type - Should be one of 'conda', 'virtualenv', or 'python' ||") 30 | } 31 | }, error = function(e){ 32 | futile.logger::flog.error("|| %s ||", e, name = 'logger.base') 33 | }) 34 | } 35 | 36 | 37 | #' @name getFeaturesForPyClassification 38 | #' @title Extracts selected columns from a data frame as a Python array 39 | #' @details Helper function, which when provided an R data frame and a set of column/ feature names, 40 | #' extracts them from the R data frame as a matrix and converts them to the equivalent Python array. 41 | #' @details Typically this function can be used when providing a feature matrix to a Python machine learning function 42 | #' @param dataset an R data frame 43 | #' @param featureNames Column names to be extracted from the R data frames. A character vector. 44 | #' @family R helper utilities for Python 45 | #' @examples 46 | #' \dontrun{ 47 | #' getFeaturesForPyClassification(dataset = iris, 48 | #' featureNames = c("Sepal.Length", "Sepal.Width")) 49 | #' } 50 | #' @export 51 | getFeaturesForPyClassification <- function(dataset, featureNames){ 52 | dataset %>% dplyr::select(!!featureNames) %>% as.matrix %>% reticulate::r_to_py() -> featureMatrix 53 | return(featureMatrix) 54 | } 55 | 56 | #' @name getTargetForPyClassification 57 | #' @title Extracts selected column from a data frame a binary class Python array 58 | #' @param dataset an R data frame 59 | #' @param targetVarName Name of the target variable for classification. Should be a categorical variable. 
60 | #' @param positiveClass Name of the class of the target variable which should be coded as '1' 61 | #' @details Helper function, which when provided an R dataframe and a binary categorical column, 62 | #' extracts it from the R data frame, converts it to 1/0 class coding, and converts it to a Python array 63 | #' @details Typically this function can be used to extract a target variable for a classifier to be provided to a 64 | #' Python machine learning function 65 | #' @family R helper utilities for Python 66 | #' @examples 67 | #' \dontrun{ 68 | #' getTargetForPyClassification(dataset = iris, 69 | #' targetVarName = "Species", positiveClass = "setosa") 70 | #' } 71 | #' @export 72 | getTargetForPyClassification <- function(dataset, targetVarName, positiveClass){ 73 | dataset %>% dplyr::mutate(target = ifelse(!!rlang::sym(targetVarName) == !!(positiveClass) , 1, 0)) %>% dplyr::select(target) %>% 74 | as.list() %>% unlist -> targetList 75 | names(targetList) <- NULL 76 | targetList %>% as.factor %>% reticulate::r_to_py() -> target 77 | return(target) 78 | } 79 | 80 | -------------------------------------------------------------------------------- /R/spark-structured-streaming-utilities.R: -------------------------------------------------------------------------------- 1 | ###################################################################################################### 2 | # Title: Utility functions for working with Spark through R 3 | # Author: Naren Srinivasan, Anoop S 4 | # Created on: August 24, 2018 5 | # Description: Functions to work with Spark, incuding Structured Streaming 6 | ###################################################################################################### 7 | 8 | 9 | #' @name sparkRSessionCreateIfNotPresent 10 | #' @title Connect to a Spark session 11 | #' @details Loads the SparkR package and intializes a Spark session from R 12 | #' @param ... Arguments to sparkR.session 13 | #' @family Spark utilities 14 | #' @examples 15 | #' \dontrun{ 16 | #' sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/" 17 | #' sparkMaster <- "local[1]" 18 | #' sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1") 19 | #' sparkRSessionCreateIfNotPresent(master = sparkMaster, 20 | #' sparkPackages = sparkPackages) 21 | #' } 22 | #' @export 23 | 24 | sparkRSessionCreateIfNotPresent <- function(...){ 25 | 26 | if(Sys.getenv("SPARK_HOME") == "" && sparkHome == ""){ 27 | stop("SPARK_HOME environment variable is not set on the system, and sparkHome argument is empty") 28 | } 29 | 30 | if(!("SparkR" %in% installed.packages())){ 31 | stop("SparkR package not installed. Please install from the $SPARK_HOME folder") 32 | } 33 | 34 | if(sparkHome == ""){ 35 | .libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths())) 36 | sparkHome <- Sys.getenv("SPARK_HOME") 37 | }else{ 38 | .libPaths(c(file.path(sparkHome, "R", "lib"), .libPaths())) 39 | } 40 | 41 | SparkR::sparkR.session(...) 
42 | } 43 | 44 | #' @name castKafkaStreamAsString 45 | #' @title Connect to a Spark session 46 | #' @details Takes in a Structured Stream from Kafka created from \code{read.stream(source = 'kafka', ...)} and returns 47 | #' a Structured Streaming DataFrame where the \code{key} and \code{value} from the Kafka stream are cast to string 48 | #' @param streamObj Spark Structured Streaming DataFrame returned by \code{read.stream} function with \code{source = 'kafka'} 49 | #' @return Updated Spark Structured Streaming DataFrame with key, value, topic and timestamp from the Kafka stream 50 | #' @family Spark utilities 51 | #' @export 52 | 53 | castKafkaStreamAsString <- function(streamObj){ 54 | streamObj <- SparkR::selectExpr(streamObj, "CAST(key AS STRING)", "CAST(value AS STRING)","topic","timestamp") 55 | return(streamObj) 56 | } 57 | 58 | #' @name convertKafkaValueFromJson 59 | #' @title Connect to a Spark session 60 | #' @details Takes in a Structured Stream from Kafka created from \code{read.stream(source = 'kafka', ...)} and returns 61 | #' a Structured Streaming DataFrame where the \code{key} and \code{value} from the Kafka stream are cast to string 62 | #' @param streamObj Spark Structured Streaming DataFrame which is returned by the \code{castKafkaStreamAsString} function 63 | #' @param schema A structType object created from SparkR specifying the schema of the json data present in the \code{value} 64 | #' attribute of the incoming Kafka stream 65 | #' @return Spark Structured Streaming DataFrame with the json data in the \code{value} attribute of the Kafka stream parsed 66 | #' into a DataFrame format 67 | #' @family Spark utilities 68 | #' @export 69 | 70 | convertKafkaValueFromJson <- function(streamObj, schema){ 71 | streamObj <- SparkR::select(streamObj, SparkR::from_json(streamObj$value, 72 | schema = schema)) 73 | return(streamObj) 74 | } 75 | 76 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/R/sysdata.rda -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | 2 | .onAttach <- function(libName, pkgName){ 3 | loadPredefinedFunctionRegistry() 4 | } 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | 1. [An overview of the package](#overview) 3 | 2. [Usage](#Usage) 4 | 3. [Features](#Features) 5 | 6 | # An overview of the package 7 | 8 | In a typical data science workflow there are multiple steps involved from data aggregation, cleaning, exploratory analysis, modeling and so on. As the data science community matures, we are seeing that there are a variety of languages which provide better capabilities for specific steps in the data science workflow. *R* is typically used for data transformations, statistical models, and visualizations, while *Python* provides more robust functions for machine learning. In addition to this, *Spark* provides an environment to process high volume data - both as one-time/ batch or as streams. 
9 | 10 | The job of today's data scientist is changing from one where they are married to a specific tool or language, to one where they are using all these tools for their specialized purposes. The key problem then becomes one of translation between these tools for seamless analysis. Additionally, in the work of a data scientist, there is a need to perform the same task repeatedly, as well as put certain analysis flows (or) pipelines into production to work on new data periodically, or work on streaming data. 11 | 12 | Recently, interfaces for using these various tools have been published. In terms of R packages, the *reticulate* package provides an interface to Python, and the *SparkR* and *sparklyr* packages provide an interface to Spark. 13 | 14 | The *analysisPipelines* package uses these interfaces to enable *Interoperable Pipelines* i.e. the ability to compose and execute a reusable data science pipeline which can contain functions to be executed in an *R* environment, in a *Python* environment or in a *Spark* environment. These pipelines can be saved and loaded, to enable batch operation as datasets get updated with new data. 15 | 16 | The goal of the *analysisPipelines* package is to make the job of the data scientist easier and help them compose pipelines of analysis which consist of data manipulation, exploratory analysis & reporting, as well as modeling steps. The idea is for data scientists to use tools of their choice through an *R* interface, using this package. 17 | Essentially, it allows data scientists to: 18 | 19 | * Compose **reusable, interoperable** pipelines in a flexible manner 20 | * Leverage available utility functions for performing different analytical operations 21 | * Put these pipelines into production in order to execute repeatedly 22 | * Generate analysis reports by executing these pipelines 23 | 24 | ## Types of pipelines 25 | 26 | This package supports both *batch/ repeated* pipelines and *streaming pipelines.* 27 | 28 | For *batch* pipelines, the vision is to enable interoperable pipelines which execute efficiently with functions in *R*, *Spark* and *Python*. 29 | 30 | For *streaming* pipelines, the package allows for streaming analyses through *Apache Spark Structured Streaming.* 31 | 32 | ## Classes and implementation 33 | 34 | The *analysisPipelines* package uses S4 classes and methods to implement all the core functionality. The fundamental class exposed in this package is the *BaseAnalysisPipeline* class, on which most of the core functions are implemented. The user, however, interacts with the *AnalysisPipeline* and *StreamingAnalysisPipeline* classes for batch and streaming analysis respectively. 35 | 36 | ## Pipelining semantics 37 | 38 | The package stays true to the *tidyverse* pipelining style, which also fits nicely into the idea of creating pipelines. The core mechanism in the package is to instantiate a pipeline with data and then pipeline required functions to the object itself. 39 | 40 | The package allows the use of both the *magrittr* pipe **(%>%)** and the *pipeR* pipe **(%>>%)**. 41 | 42 | ## Supported engines 43 | 44 | As of this version, the package supports functions executed on *R*, or *Spark* through the SparkR interface, as well as Python functions run through *reticulate*, for batch pipelines. It also supports *Apache Spark Structured Streaming* pipelines for streaming analyses.
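Before composing pipelines that span engines, the respective engines need to be configured in the R session. A minimal sketch using the package's own helpers is shown below; the Spark master, Kafka package version, conda environment name and Python script are illustrative placeholders that depend on your local installation (see the engine-specific vignettes for tested examples).

```r
# Spark (batch or structured streaming) via the SparkR interface
sparkRSessionCreateIfNotPresent(master = "local[1]",
                                sparkPackages = c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1"))

# Python via reticulate - point to a conda env, virtualenv, or python binary
setPythonEnvir(type = "conda", pathOrEnvirName = "base")
reticulate::source_python(system.file("python", "sampleFunctions.py",
                                      package = "analysisPipelines"))

# Check whether the engines required by a pipeline have been set up
pipelineObj <- AnalysisPipeline(input = iris)
pipelineObj %>>% assessEngineSetUp
```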
45 | 46 | ## Bug reports and feature requests 47 | 48 | * Bug reports/ Feature requests and other thoughts can be raised [here](https://github.com/Mu-Sigma/analysis-pipelines/issues) 49 | 50 | ## Available vignettes 51 | 52 | This package contains 7 vignettes: 53 | 54 | 1. **Analysis pipelines - Core functionality and working with R data frames and functions** - This is the main vignette describing the package's core functionality, and explaining this through **batch** pipelines in just **R** 55 | 2. **Analysis pipelines for working with Spark DataFrames for one-time/ batch analyses** - This vignette describes creating **batch** pipelines to execute solely in a *Spark* environment 56 | 3. **Analysis pipelines for working with Python functions** - This vignette describes creating **batch** pipelines to execute solely in a *Python* environment 57 | 4. **Interoperable analysis pipelines** - This vignette describes creating and executing **batch** pipelines which are composed of functions executing across *supported engines* 58 | 5. **Streaming Analysis Pipelines for working with Apache Spark Structured Streaming** - This vignette describes setting up streaming pipelines on *Apache Spark Structured Streaming* 59 | 6. **Using pipelines inside Shiny widgets or apps** - A brief vignette which illustrates an example of using a pipeline inside a shiny widget with reactive elements and changing data 60 | 7. **An introduction to meta-pipelines** - This vignette illustrates the use of meta-pipelines 61 | 62 | When the package is installed and loaded, vignettes 1 & 7 have all the chunks evaluated. Other vignettes require specific Python and Spark configurations and hence all chunks are not evaluated as part of the package. However, an evaluated version of vignettes 2-5 can be found in the `knit-vignettes` folder in the Github project. Vignette 6 is a shiny document which can be run. 63 | 64 | # Usage 65 | 66 | ## Loading the package 67 | 68 | ```r 69 | library(analysisPipelines) 70 | ``` 71 | 72 | ## Creating an analysisPipeline object 73 | 74 | An object of class *AnalysisPipeline* can be created like so: 75 | 76 | ```{r creating object, warning=F} 77 | obj <- AnalysisPipeline(input = iris) 78 | class(obj) 79 | ``` 80 | 81 | While initializing the object, an input dataframe can be provided on which the pipeline should work, either by providing the filePath to a *.csv* file through the *filePath* argument, or by providing R dataframe available in the session, through the *input* argument 82 | 83 | The *AnalysisPipeline* object has a set of getters, for retrieving various slots containing data and metadata required for pipeline execution. The most basic of them is the *getInput* method which retrieves the input dataframe with which the object has been initialized. If not initialized with a dataframe, the *setInput* method can be used to do so. 84 | 85 | ```{r printing object contents, warning=F} 86 | obj %>>% getInput %>>% str 87 | getRegistry() 88 | ``` 89 | 90 | The *getRegistry* function retrieves the set of functions and their metadata available for pipelining. Any *AnalysisPipeline* object comes with a set of pre-registered functions which can be used **out-of-the-box**. Of course, the user can register her own functions, to be used in the pipeline. We will explore this later on. 91 | 92 | There are two types of functions which can be pipelined: 93 | 94 | * **Data functions** - These functions necessarily take their **first** argument as a dataframe. 
These are functions focused on performing operations on data. Specifically, the nomenclature *data functions* is used for those functions which work on the input dataframe set to the pipeline object, and perform some transformation or analysis on them. They help form the main *path* in a pipeline, constituting a linear flow from the input. 95 | * **Non-data functions** - These are auxiliary helper functions which are required in a pipeline, which may or may not operate on data. However, the *key* difference is that these functions do not operate on the **input (or some direct transformation of it)**. In essence, they help form auxiliary paths in the pipeline, which eventually merge into the main path. 96 | 97 | Both pre-registered and user-defined functions work with the *AnalysisPipeline* object in the same way i.e. regardless of who writes the function, they follow the same semantics. 98 | 99 | ## Creating a simple pipeline 100 | 101 | We'll now take a look at creating a simple pipeline, with some of the pre-registered functions available in the registry. We pipeline the *univarCatDistPlots* function (available as a pre-registered utility function, which generates a chart showing the distribution of a categorical variable in a dataset), by simply using the *pipe* or *double pipe* operator, and providing the required additional parameters apart from the *data* on which it needs to operate, as we have already initialized the *AnalysisPipeline* object with the data. 102 | 103 | Note that unless assigned to the same or another object, the pipeline does not get stored. 104 | 105 | We can access the details of the pipeline as a tibble through the `getPipeline` method. 106 | 107 | ```{r pipe demo 1, warning=F} 108 | # Running univariate categorical distribution plot on the constructed object 109 | # ?analysisPipelines::univarCatDistPlots 110 | obj1 <- obj %>>% univarCatDistPlots(uniCol = "Species", priColor = "blue", optionalPlots = 0, storeOutput = T) 111 | obj1 %>>% getPipeline 112 | ``` 113 | 114 | # Features 115 | 116 | ## User-defined functions 117 | 118 | ### Registering your own function 119 | 120 | You can register your own *data* or *non-data* functions by calling `registerFunction`. This adds the user-defined function to the registry. The registry is maintained by the package and, once registered, functions can be used across pipeline objects. The registry can be viewed by calling the `getRegistry` function. 121 | 122 | ```r 123 | # Currently registered functions 124 | getRegistry() 125 | ``` 126 | 127 | 128 | In order to register a function, first the function must be defined in the Global environment, before calling `registerFunction`.
129 | 130 | ```r 131 | bivariatePlots <- function(dataset, select_var_name_1, select_var_name_2, 132 | priColor = "blue", secColor='black') { 133 | x=dataset[, select_var_name_1] 134 | y=dataset[, select_var_name_2] 135 | bivarPlot <- ggplot2::ggplot(dataset, ggplot2::aes(x,y)) + 136 | ggplot2::geom_point(color=priColor,alpha=0.7) + 137 | ggplot2::geom_smooth(method = lm,color=secColor) + 138 | ggplot2::xlab(select_var_name_1) + 139 | ggplot2::ylab(select_var_name_2) + 140 | ggplot2::theme_bw() + 141 | ggplot2::ggtitle(paste('Bivariate plot for', select_var_name_1, 142 | 'and', select_var_name_2, sep=' ')) + 143 | ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 10), 144 | axis.text = ggplot2::element_text(size=10), 145 | axis.title=ggplot2::element_text(size=10)) 146 | return(bivarPlot) 147 | } 148 | 149 | registerFunction(functionName = "bivariatePlots", heading = "Bivariate Analysis") 150 | ``` 151 | 152 | ### Adding the newly registered function to a pipeline 153 | 154 | Now the newly registered user-defined function can be used as part of the pipeline, exactly as described before. For example, we add it to a pipeline which already contains some functions. The function then gets added to the end of the pipeline. 155 | 156 | ```{r register function 2, warning=F} 157 | # Chaining the user-defined function to the object's pipeline where it was registered 158 | obj2 <- obj2 %>>% 159 | bivariatePlots(select_var_name_1 = 'Sepal.Length', select_var_name_2 = 'Sepal.Width', 160 | priColor = "blue", secColor = "black") 161 | 162 | # Printing the updated pipeline 163 | obj2 %>>% getPipeline 164 | ``` 165 | 166 | ## Complex pipelines and formula semantics 167 | 168 | In addition to simple linear pipelines, more complex pipelines can also be defined. There are cases when the outputs of previous functions in the pipeline need to be passed as inputs to arbitrary parameters of subsequent functions. 169 | 170 | The package defines certain *formula* semantics to accomplish this. We take the example of two simple user-defined functions, which simply return the color of the graph and the column on which the graph should be plotted respectively, in order to illustrate how this works. 171 | 172 | Preceding outputs can be passed to subsequent functions simply by specifying a **formula** of the form '~f*id*' against the argument to which the output is to be passed. The ID represents the ID of the function in the pipeline. For example, to pass the output of the function with ID '1' as an argument to a parameter of a subsequent function, the formula '~f1' is passed to that corresponding argument. 173 | 174 | ```r 175 | obj %>>% getColor(color = "blue") %>>% getColumnName(columnName = "Sepal.Length") %>>% 176 | univarCatDistPlots(uniCol = "Species", priColor = ~f1, optionalPlots = 0, storeOutput = T) %>>% 177 | outlierPlot(method = "iqr", columnName = ~f2, cutoffValue = 0.01, priColor = ~f1, optionalPlots = 0) -> complexPipeline 178 | 179 | complexPipeline %>>% getPipeline 180 | complexPipeline %>>% generateOutput -> op 181 | op %>>% getOutputById("4") 182 | ``` 183 | 184 | ## Interoperable pipelines 185 | 186 | **Interoperable pipelines** containing functions operating on different engines such as R, Spark and Python can be configured and executed through the **analysisPipelines** package. Currently, the package supports interoperable pipelines containing *R* and *Spark* batch functions.
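As an illustration of the idea, an R + Spark pipeline might be composed as sketched below. This is a sketch, not a tested example: the `engine` argument value is assumed from the interoperable-pipelines and Spark vignettes (check `?registerFunction` for the exact registration arguments), and it relies on the package's automatic conversion of the input between engines described under 'Type conversions' below.

```r
sparkRSessionCreateIfNotPresent(master = "local[1]")

# A Spark step: aggregates a SparkDataFrame using SparkR and collects the result
summarizeBySpecies <- function(df) {
  SparkR::collect(SparkR::count(SparkR::groupBy(df, "Species")))
}
registerFunction(functionName = "summarizeBySpecies", heading = "Species counts",
                 engine = "spark") # engine label assumed from the vignettes

# An R step operating on the output of the Spark step
plotCounts <- function(counts) {
  ggplot2::ggplot(counts, ggplot2::aes(Species, count)) + ggplot2::geom_col()
}
registerFunction(functionName = "plotCounts", heading = "Count plot")

AnalysisPipeline(input = iris) %>>%
  summarizeBySpecies() %>>%
  plotCounts(outAsIn = T, storeOutput = T) %>>%
  generateOutput -> interopOp
interopOp %>>% getOutputById("2")
```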
187 | 188 | Pipeline Visualization 1 189 | 190 | ## Pipeline visualization 191 | 192 | Pipelines can be visualized as directed graphs through the `visualizePipeline` method, providing information about the engines being used, function dependencies, and so on. 193 | 194 | Pipeline Visualization 2 195 | 196 | 197 | ## Report generation 198 | 199 | Outputs generated from pipelines can easily be exported to formatted HTML reports through the `generateReport` method, showcasing the results, the pipeline that generated them, as well as a peek at the data. 200 | 201 | Report 1 202 |
203 | 204 | Report 2 205 |
206 | Report 3 207 | 208 | ## Meta-pipelines 209 | 210 | The meta-pipeline construct allows users to export a pipeline they have created for a particular use case as a general analysis flow which can be reused with a different dataset and a different set of parameters. A pipeline is one where the data can change (while retaining the same schema) but the parameters of the functions stay fixed. A meta-pipeline retains only the analysis flow, function dependencies and so on; the specific parameters for each of the functions can be set afresh for a new use case. 211 | 212 | The objective of a meta-pipeline is to define and execute reusable analysis flows. They can be used to: 213 | * Document best practices for a particular problem 214 | * Templatize analyses for particular situations 215 | 216 | Meta-pipelines are created by exporting them from pipelines, and new pipelines can be instantiated from a meta-pipeline, with an easy-to-use method to set the new values of parameters (see the sketch at the end of this overview). 217 | 218 | ## Execution 219 | 220 | The 'analysisPipelines' package internally converts the pipeline defined by the user into a **directed graph** which captures the dependencies of each function in the pipeline on the data, other arguments, as well as the outputs of other functions. 221 | 222 | ### Topological sort and ordering 223 | 224 | When it is required to generate the output, the pipeline is first *prepped* by performing a **topological sort** of the directed graph, identifying *sets (or) batches* of independent functions and a sequence of *batches* for execution. A later release of the package will allow for parallel execution of these independent functions. 225 | 226 | ### Memory management & garbage cleaning 227 | 228 | Memory is managed efficiently by storing only those outputs which the user has explicitly specified, and by holding intermediate outputs required by subsequent functions **only until** they are needed for processing. Garbage cleaning is performed after the execution of each *batch* in order to manage memory effectively. 229 | 230 | ### Type conversions 231 | 232 | In the case of *interoperable pipelines* executing across multiple engines such as *R, Spark and Python*, type conversions between data types in the different engines are **minimized** by identifying the optimal number of type conversions before execution starts. 233 | 234 | ## Logging & Execution times 235 | 236 | The package provides logging capabilities for the execution of pipelines, as you might have noted when the output was generated in the sections above. By default, logs are written to the *console*, but alternatively the user can specify an output file to which the logs need to be written, through the `setLoggerDetails` function (a sketch is included at the end of this overview). 237 | 238 | Logs capture errors, as well as provide information on the steps being performed, execution times and so on. 239 | 240 | Logging 241 | 242 | ## Custom exception-handling 243 | 244 | By default, when a function is registered, a generic exception-handling function, which captures the R error message in case of an error, is registered against each function in the registry. The user can define a custom exception-handling function by defining it and providing it at the time of registration. The function should take one argument, which is the error object; a sketch is shown immediately below.
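To make the custom exception handler concrete, here is a minimal sketch. The handler simply needs to accept the error object; the name of the `registerFunction` argument through which it is supplied (shown here as `exceptionFunction`) is an assumption, so verify it against `?registerFunction`.

```r
# A custom exception handler: takes the error object as its single argument
customPipelineException <- function(error) {
  message(paste("Pipeline step failed:", conditionMessage(error)))
  stop(error)
}

# A simple user-defined function to register with the custom handler
columnSummary <- function(dataset, columnName) {
  summary(dataset[[columnName]])
}

# The name of the exception-handling argument is an assumption -
# check ?registerFunction for the exact signature
registerFunction(functionName = "columnSummary", heading = "Column Summary",
                 exceptionFunction = customPipelineException)
```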
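The meta-pipeline workflow described above can be sketched as follows, using the `exportAsMetaPipeline`, `getPipelinePrototype` and `createPipelineInstance` methods. The exact structure of the prototype object depends on the functions in the pipeline, so inspect it (for example with `str()`) before setting new argument values; treat this as a sketch of the flow rather than a drop-in recipe.

```r
# Build a pipeline for one use case
pipelineObj <- AnalysisPipeline(input = iris)
pipelineObj %>>% univarCatDistPlots(uniCol = "Species") -> pipelineObj

# Export only the analysis flow (data and parameter values are dropped)
pipelineObj %>>% exportAsMetaPipeline -> metaPipelineObj

# Obtain the pipeline prototype, set new argument values on it as required,
# and instantiate a fresh pipeline for the new use case
proto <- metaPipelineObj %>>% getPipelinePrototype
metaPipelineObj %>>% createPipelineInstance(newParams = proto) -> newPipelineObj

# Initialize the new pipeline with a (schema-compatible) dataset
newPipelineObj %>>% setInput(input = iris) -> newPipelineObj
```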
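Similarly, a minimal sketch of redirecting execution logs to a file is given below. The argument names `target` and `targetFile` are assumptions based on the function's purpose; consult `?setLoggerDetails` for the exact interface.

```r
# Sketch: write execution logs to a file instead of the console
# (argument names are assumptions - see ?setLoggerDetails)
pipelineObj <- pipelineObj %>>% setLoggerDetails(target = "file",
                                                 targetFile = "pipelineExecution.out")

# Inspect the current logger configuration, then execute the pipeline
pipelineObj %>>% getLoggerDetails
pipelineObj %>>% generateOutput -> outputObj
```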
245 | 246 | 247 | 248 | -------------------------------------------------------------------------------- /analysisPipelines.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageBuildArgs: --resave-data 22 | PackageCheckArgs: --as-cran 23 | PackageRoxygenize: rd,collate,namespace,vignette 24 | -------------------------------------------------------------------------------- /data-raw/predefFunctions.R: -------------------------------------------------------------------------------- 1 | ################################################################################################## 2 | # Title: Predefined functions as part of package 3 | # Version: 18.08.01 4 | # Created on: August 23, 2018 5 | # Description: Reproducible code to generate list of predefined functions 6 | ################################################################################################## 7 | 8 | ################################################################################################## 9 | # Working with batch pipelines - data frames in R, Spark or Python 10 | ################################################################################################## 11 | 12 | ################################################################################################## 13 | # EDA 14 | ################################################################################################## 15 | .batchPredefFunctions <- data.frame(functionName = c("univarCatDistPlots"), 16 | heading = c("Univariate Distribution Categorical"), 17 | engine = c("r"), 18 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), 19 | isDataFunction = TRUE, 20 | firstArgClass = "", 21 | stringsAsFactors = F) 22 | 23 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "outlierPlot", 24 | heading = "Univariate Outlier", 25 | # outAsIn = FALSE, 26 | engine = "r", 27 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), 28 | isDataFunction = TRUE, 29 | firstArgClass = "") -> .batchPredefFunctions 30 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "multiVarOutlierPlot", 31 | heading = "Multivariate Outlier", 32 | engine = "r", 33 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), 34 | isDataFunction = T, 35 | firstArgClass = "") -> .batchPredefFunctions 36 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "ignoreCols", 37 | heading = "Ignore Columns", 38 | engine = "r", 39 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), 40 | isDataFunction = TRUE, 41 | firstArgClass = "") -> .batchPredefFunctions 42 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "getFeaturesForPyClassification", 43 | heading = "", 44 | engine = "r", 45 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), 46 | isDataFunction = T, 47 | firstArgClass = "") -> .batchPredefFunctions 48 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "getTargetForPyClassification", 49 | heading = "", 50 | engine = "r", 51 | 
exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), 52 | isDataFunction = TRUE, 53 | firstArgClass = "") -> .batchPredefFunctions 54 | 55 | ################################################################################################## 56 | 57 | ################################################################################################## 58 | # Working with Streaming pipelines - Currently supports Apache Spark Structured Streaming 59 | ################################################################################################## 60 | 61 | ################################################################################################## 62 | # Kafka Streams as input 63 | ################################################################################################## 64 | 65 | .streamingPredefFunctions <- data.frame(functionName = c("castKafkaStreamAsString"), 66 | heading = c("Cast Kafka stream to a string"), 67 | engine = c("spark-structured-streaming"), 68 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), 69 | isDataFunction = TRUE, 70 | firstArgClass = "", 71 | stringsAsFactors = F) 72 | 73 | .streamingPredefFunctions %>>% dplyr::add_row(functionName = "convertKafkaValueFromJson", 74 | heading = "Convert Kafka Value from JSON", 75 | engine = c("spark-structured-streaming"), 76 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), 77 | isDataFunction = TRUE, 78 | firstArgClass = "" 79 | ) -> .streamingPredefFunctions 80 | 81 | 82 | devtools::use_data(.batchPredefFunctions, .streamingPredefFunctions, internal = TRUE, overwrite = T) 83 | -------------------------------------------------------------------------------- /inst/data-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/data-icon.png -------------------------------------------------------------------------------- /inst/logging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/logging.png -------------------------------------------------------------------------------- /inst/output-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/output-icon.png -------------------------------------------------------------------------------- /inst/param-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/param-icon.png -------------------------------------------------------------------------------- /inst/pipelineViz1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/pipelineViz1.png -------------------------------------------------------------------------------- /inst/pipelineViz2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/pipelineViz2.png 
-------------------------------------------------------------------------------- /inst/python-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/python-logo.png -------------------------------------------------------------------------------- /inst/python/sampleFunctions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn import datasets 3 | from sklearn import metrics 4 | from sklearn.tree import DecisionTreeClassifier 5 | 6 | def getColMeans(df): 7 | meanList = [] 8 | for x in df.columns: 9 | meanList.append(df[x].mean()) 10 | return meanList 11 | 12 | def decisionTreeTrainAndTest(data, target, newData): 13 | model = DecisionTreeClassifier() 14 | model.fit(data, target) 15 | testPred = model.predict(newData) 16 | return testPred 17 | 18 | -------------------------------------------------------------------------------- /inst/r-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/r-logo.png -------------------------------------------------------------------------------- /inst/report.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analysis Pipeline Results" 3 | 4 | subtitle: '`r format(Sys.Date(), "%B %d, %Y")`' 5 | 6 | output: html_document 7 | 8 | params: 9 | obj: r! analysisPipelines::AnalysisPipeline() 10 | --- 11 | 12 | ## Pipeline Visualization 13 | 14 | ```{r echo=FALSE, warning=FALSE, comment=FALSE, message=FALSE, results='asis', fig.width = 12, out.width = '100%'} 15 | obj <- params$obj 16 | input <- obj@input 17 | pipelineDetails <-obj@pipeline 18 | output <- obj@output 19 | 20 | 21 | analysisPipelines::visualizePipeline(obj) 22 | ``` 23 | 24 | 25 | ## Quick Peek 26 | ```{r quick peek,echo=FALSE,warning=FALSE,results='asis', fig.width = 12, out.width = '100%'} 27 | DT::datatable(head(input),options = list(scrollX = T, scrollY = T)) 28 | ``` 29 | 30 | 31 | ```{r, echo =FALSE,warnings=FALSE,results='asis'} 32 | 33 | knitString <- "" 34 | 35 | storedOps <- pipelineDetails %>>% dplyr::filter(storeOutput == T) 36 | 37 | for(i in storedOps$id){ 38 | opTable <- storedOps %>>% dplyr::filter(id == i) 39 | obj%>>% getOutputById(i) -> op 40 | eval(parse(text = paste0("op_", i, " <- op"))) 41 | knit_expanded <- paste0( 42 | "\n```{r chunk",i,",results='asis', fig.width = 12, out.width = '100%', echo=FALSE} 43 | \n\n 44 | cat('## ",opTable$heading," \n') 45 | \n\n 46 | op_", i, " 47 | \n``` 48 | \n\n" 49 | ) 50 | knitString <- paste0(knitString, knit_expanded) 51 | } 52 | 53 | ``` 54 | 55 | 56 | `r paste(knitr::knit(text = knitString), collapse = '\n')` 57 | 58 | -------------------------------------------------------------------------------- /inst/report1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/report1.png -------------------------------------------------------------------------------- /inst/report2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/report2.png 
-------------------------------------------------------------------------------- /inst/report3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/report3.png -------------------------------------------------------------------------------- /inst/spark-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/spark-logo.png -------------------------------------------------------------------------------- /inst/spark-structured-streaming-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/spark-structured-streaming-logo.png -------------------------------------------------------------------------------- /inst/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | font-size: 14px; 4 | } 5 | 6 | td { 7 | padding-left: 10px; 8 | width: 50px; 9 | align:left; 10 | } 11 | 12 | table{ 13 | width: 100% 14 | } 15 | 16 | /*h1{ 17 | text-align:center; 18 | }*/ 19 | .level2{ 20 | width:77vw; 21 | margin-left: 20vw; 22 | } 23 | .fluid-row{ 24 | position: relative; 25 | background-color: #162f47; 26 | font-weight: bold; 27 | color: white; 28 | width:98.8vw; 29 | left: -15px; 30 | padding: 0.3cm 0.5cm 0.5cm 0.3cm; 31 | z-index :100; 32 | box-shadow: 0px 2px 4px 0px rgba(0, 0, 0, 0.80); 33 | } 34 | h3{ 35 | font-family: Arial, Helvetica, sans-serif; 36 | 37 | } 38 | #hideGridMsg{ 39 | display:none; 40 | } 41 | .main-container{ 42 | max-width:100vw !important; 43 | } 44 | h1{ 45 | font-family: Arial, Helvetica, sans-serif; 46 | margin-left:2%; 47 | margin-top: 0px !important; 48 | margin-bottom: -15px !important; 49 | font-size: 34px !important; 50 | } 51 | .author{ 52 | font-family: Arial, Helvetica, sans-serif; 53 | margin-left:2%; 54 | font-size: 16px; 55 | } 56 | h2{ 57 | position: relative; 58 | left: -1%; 59 | bottom: 3px; 60 | border-bottom: 2px solid #444444; 61 | font-size: 24px; 62 | font-family: Arial, Helvetica, sans-serif; 63 | font-weight: 600; 64 | color:#444444; 65 | } 66 | .subtitle{ 67 | margin-left: 2%; 68 | font-size: 20px; 69 | } 70 | body{ 71 | font-family: Arial, Helvetica, sans-serif; 72 | overflow-x: hidden; 73 | } 74 | head{ 75 | font-family: Arial, Helvetica, sans-serif; 76 | } 77 | .mulogo{ 78 | position: relative; 79 | float: right; 80 | right: -20vw; 81 | margin-top: -115px; 82 | z-index:100; 83 | } 84 | .client{ 85 | position: relative; 86 | float: right; 87 | right:-27vw; 88 | height: 90px; 89 | width: 90px; 90 | margin-top: -115px; 91 | z-index:100; 92 | } 93 | 94 | 95 | 96 | h3{ 97 | 98 | } 99 | 100 | /* 101 | h4{ 102 | text-align:center; 103 | font-style:normal; 104 | } 105 | .date, .author { 106 | font-style: italic; 107 | } 108 | */ 109 | 110 | tr:nth-child(even) {background-color: #f2f2f2} 111 | tr:hover {background-color: #f5f5f5} 112 | 113 | th { 114 | background-color: #ffffff; 115 | color: black; 116 | padding-left: 10px; 117 | width: 100px; 118 | } 119 | 120 | #var_dist_table{ 121 | width: 100px; 122 | table-layout:fixed; 123 | } 124 | 125 | #post_trellis, #post_time_series, 
#post_pkg_details,#post_peek,#post_summ,#post_miss,#post_num_var,#post_cat_var,#post_bi_var,#post_prob_outlier,#post_corr_net,#post_corr_mat, #post_univar, #post_cluster, #post_factAnalysis,#post_cat_summ,#post_univar, #post_trellis,#post_factAnalysis,#post_cluster{ 126 | font-weight: bold; 127 | box-shadow: 5px 5px 5px #888888; 128 | } 129 | 130 | /* 131 | #post_num_var, #post_cat_var, #post_bi_var, #post_miss{ 132 | position: absolute; right: 10%; 133 | position: relative; right: 10%; 134 | } 135 | 136 | #package_details, #post_pkg_details, #post_peek, #post_summ{ 137 | position: absolute; left: 300%; 138 | position: relative; top: 10%; 139 | } 140 | */ 141 | 142 | #quickFilters{ 143 | margin-bottom:10px; 144 | } 145 | .tocify{ 146 | 147 | width:21vw !important; 148 | margin-left: -20px; 149 | background-color: white; 150 | border-color: white; 151 | margin: 150px 0px 20px 0px !important; 152 | border-radius: 1px; 153 | z-index: 1000; 154 | top:40px; 155 | max-width: 19vw !important; 156 | } 157 | .tocify-item{ 158 | 159 | padding: 10px 10px 10px 20px !important; 160 | background-color: #424C55 !important; 161 | border-bottom: 1px solid #454545 !important; 162 | font-size: 14px; 163 | color:#d8d8d8 !important; 164 | border-radius: 1px !important; 165 | transition-property: all; 166 | -moz-transition-property: all; 167 | -webkit-transition-property: all; 168 | -o-transition-property: all; 169 | 170 | transition-duration: 250ms; 171 | -webkit-transition-duration: 250ms; 172 | 173 | } 174 | .tocify-item-hover { 175 | background: #F00; 176 | color: #FFF; 177 | } 178 | 179 | 180 | .tocify-subheader .tocify-item { 181 | padding-left: 45px !important; 182 | } 183 | .tocify-item:hover{ 184 | background-color: #d8d8d8 !important; 185 | 186 | 187 | border-left: 5px solid #2d6396 !important; 188 | text-decoration: none !important; 189 | color:#2d6396 !important; 190 | 191 | } 192 | 193 | 194 | .toc-content{ 195 | position: absolute; 196 | } 197 | 198 | .tocify-extend-page{ 199 | height: 0px !important; 200 | } 201 | 202 | .list-group-item.active, .list-group-item.active:focus, .list-group-item.active:hover .list-group-item:hover{ 203 | background-color: #ffffff !important; 204 | font-weight: bold !important; 205 | 206 | border-left: 5px solid #2d6396 !important; 207 | text-decoration: none !important; 208 | color:#2d6396 !important; 209 | 210 | } 211 | 212 | .row-fluid{ 213 | margin-left:-26px; 214 | } 215 | 216 | html,body{ 217 | height: 100%; 218 | } 219 | .fixedToc{ 220 | top:-100px; 221 | } 222 | -------------------------------------------------------------------------------- /man/AnalysisPipeline-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-batch.R 3 | \docType{class} 4 | \name{AnalysisPipeline-class} 5 | \alias{AnalysisPipeline-class} 6 | \alias{AnalysisPipeline} 7 | \title{Class for constructing Analysis Pipelines for batch/ one-time analyeses} 8 | \description{ 9 | Class for constructing Analysis Pipelines for batch/ one-time analyeses 10 | } 11 | \details{ 12 | Inherits the base class \link{BaseAnalysisPipeline} class which holds the metadata including the registry of available functions, 13 | the data on which the pipeline is to be applied, as well as the pipeline itself 14 | 15 | Additionally, this class is meant to be used for batch/ one-time processing. 
Contains additional slots to 16 | hold the data frame to be used for the pipeline and associated schema 17 | } 18 | \section{Slots}{ 19 | 20 | \describe{ 21 | \item{\code{input}}{The input dataset on which analysis is to be performed} 22 | 23 | \item{\code{originalSchemaDf}}{Empty data frame representing the schema of the input} 24 | }} 25 | 26 | \seealso{ 27 | Other Package core functions for batch/one-time analyses: \code{\link{checkSchema}}, 28 | \code{\link{generateReport}}, 29 | \code{\link{initialize,BaseAnalysisPipeline-method}} 30 | } 31 | \concept{Package core functions for batch/one-time analyses} 32 | -------------------------------------------------------------------------------- /man/BaseAnalysisPipeline-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{class} 4 | \name{BaseAnalysisPipeline-class} 5 | \alias{BaseAnalysisPipeline-class} 6 | \alias{BaseAnalysisPipeline} 7 | \title{Base class for \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} objects} 8 | \description{ 9 | Base class for \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} objects 10 | } 11 | \details{ 12 | The class which holds the metadata including the registry of available functions, 13 | the data on which the pipeline is to be applied, as well as the pipeline itself, and serves 14 | as the base class for various types of Pipeline objects such as Batch and Streaming. 15 | 16 | This base class which contains the slots related to the registry, pipeline and output can be extended 17 | to create custom class for specific scenarios if required. 18 | 19 | In the documentation, objects of classes which are subclasses of this class are referred to as 'Pipeline' objects 20 | } 21 | \section{Slots}{ 22 | 23 | \describe{ 24 | \item{\code{pipeline}}{A tibble which holds functions to be called} 25 | 26 | \item{\code{pipelineExecutor}}{A list containing details of the execution, such as topological ordering of functions to be executed, 27 | dependency map of functions, as well as logger configuration} 28 | 29 | \item{\code{output}}{A list which holds all the functions output} 30 | }} 31 | 32 | \seealso{ 33 | Other Package core functions: \code{\link{MetaAnalysisPipeline-class}}, 34 | \code{\link{assessEngineSetUp}}, 35 | \code{\link{checkSchemaMatch}}, 36 | \code{\link{createPipelineInstance}}, 37 | \code{\link{exportAsMetaPipeline}}, 38 | \code{\link{generateOutput}}, 39 | \code{\link{genericPipelineException}}, 40 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 41 | \code{\link{getOutputById}}, 42 | \code{\link{getPipelinePrototype}}, 43 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 44 | \code{\link{initDfBasedOnType}}, 45 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 46 | \code{\link{loadMetaPipeline}}, 47 | \code{\link{loadPipeline}}, 48 | \code{\link{loadPredefinedFunctionRegistry}}, 49 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 50 | \code{\link{registerFunction}}, 51 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 52 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 53 | \code{\link{updateObject}}, 54 | \code{\link{visualizePipeline}} 55 | } 56 | \concept{Package core functions} 57 | -------------------------------------------------------------------------------- /man/CheckColumnType.Rd: -------------------------------------------------------------------------------- 1 | % 
Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-batch-eda-utilities.R 3 | \name{CheckColumnType} 4 | \alias{CheckColumnType} 5 | \title{Check for type of column} 6 | \usage{ 7 | CheckColumnType(dataVector) 8 | } 9 | \arguments{ 10 | \item{dataVector}{a data vector of a column} 11 | } 12 | \value{ 13 | column Type 14 | } 15 | \description{ 16 | Check for type of column 17 | } 18 | \details{ 19 | Checking for type of columns in the datavector 20 | } 21 | \examples{ 22 | CheckColumnType(iris$Sepal.Length) 23 | } 24 | \seealso{ 25 | Other Package EDA Utilites functions: \code{\link{bivarPlots}}, 26 | \code{\link{correlationMatPlot}}, 27 | \code{\link{getDatatype}}, \code{\link{ignoreCols}}, 28 | \code{\link{multiVarOutlierPlot}}, 29 | \code{\link{outlierPlot}}, 30 | \code{\link{univarCatDistPlots}} 31 | } 32 | \concept{Package EDA Utilites functions} 33 | -------------------------------------------------------------------------------- /man/MetaAnalysisPipeline-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-meta-pipelines.R 3 | \docType{class} 4 | \name{MetaAnalysisPipeline-class} 5 | \alias{MetaAnalysisPipeline-class} 6 | \alias{MetaAnalysisPipeline} 7 | \title{Class for creating and working with meta-pipelines} 8 | \description{ 9 | Class for creating and working with meta-pipelines 10 | } 11 | \details{ 12 | This class works with the \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} classes, and allows the 13 | pipeline to be exported as meta-pipeline. A meta-pipeline is a construct, where the input dataset as well as the arguments 14 | to functions in the pipeline are not defined. Only the analysis flow and dependencies are stored. 15 | } 16 | \section{Slots}{ 17 | 18 | \describe{ 19 | \item{\code{pipeline}}{A tibble which holds functions to be called in the pipeline} 20 | 21 | \item{\code{pipelinePrototype}}{An object of class \code{proto} from the 'proto' package which maintains the prototype of the 22 | functions in the pipeline and their respective arguments} 23 | 24 | \item{\code{type}}{A string defining whether it is a batch or streaming pipeline. 
Acceptable values are 'batch' & 'streaming'} 25 | }} 26 | 27 | \seealso{ 28 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 29 | \code{\link{assessEngineSetUp}}, 30 | \code{\link{checkSchemaMatch}}, 31 | \code{\link{createPipelineInstance}}, 32 | \code{\link{exportAsMetaPipeline}}, 33 | \code{\link{generateOutput}}, 34 | \code{\link{genericPipelineException}}, 35 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 36 | \code{\link{getOutputById}}, 37 | \code{\link{getPipelinePrototype}}, 38 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 39 | \code{\link{initDfBasedOnType}}, 40 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 41 | \code{\link{loadMetaPipeline}}, 42 | \code{\link{loadPipeline}}, 43 | \code{\link{loadPredefinedFunctionRegistry}}, 44 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 45 | \code{\link{registerFunction}}, 46 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 47 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 48 | \code{\link{updateObject}}, 49 | \code{\link{visualizePipeline}} 50 | } 51 | \concept{Package core functions} 52 | -------------------------------------------------------------------------------- /man/StreamingAnalysisPipeline-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-streaming-functions.R 3 | \docType{class} 4 | \name{StreamingAnalysisPipeline-class} 5 | \alias{StreamingAnalysisPipeline-class} 6 | \alias{StreamingAnalysisPipeline} 7 | \title{Class for constructing Analysis Pipelines for streaming analyeses} 8 | \description{ 9 | Class for constructing Analysis Pipelines for streaming analyeses 10 | } 11 | \details{ 12 | Inherits the base class \link{BaseAnalysisPipeline} class which holds the metadata including the registry of available functions, 13 | the data on which the pipeline is to be applied, as well as the pipeline itself 14 | 15 | This class currently only supports Apache Spark Structured Streaming, implemented through the SparkR interface 16 | } 17 | \section{Slots}{ 18 | 19 | \describe{ 20 | \item{\code{input}}{The input Spark DataFrame on which analysis is to be performed} 21 | 22 | \item{\code{originalSchemaDf}}{Empty Spark DataFrame representing the schema of the input} 23 | }} 24 | 25 | \concept{Package core functions for Streaming Analyses} 26 | -------------------------------------------------------------------------------- /man/analysisPipelines.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analysisPipelines_package.R 3 | \docType{package} 4 | \name{analysisPipelines} 5 | \alias{analysisPipelines} 6 | \alias{analysisPipelines-package} 7 | \title{analysisPipelines} 8 | \description{ 9 | The package aims at enabling data scientists to compose pipelines of analysis which consist of data manipulation, 10 | exploratory analysis & reporting, as well as modeling steps. It also aims to enable data scientists to use tools 11 | of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python. 12 | Credits to Mu Sigma for supporting the development of the package. 13 | } 14 | \note{ 15 | To enable pipelines involving Spark tasks, the package uses the 'SparkR' package. Using Spark as an engine requires the SparkR package to be installed. 
16 | SparkR is distributed natively with Apache Spark and is not distributed on CRAN. The SparkR version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly. 17 | To install from Github, run the following command, if you know the Spark version: 18 | \itemize{ 19 | \item devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg') 20 | } 21 | The other option is to install SparkR by running the following terminal commands if Spark has already been installed: 22 | \itemize{ 23 | \item $ export SPARK_HOME=/path/to/spark/directory 24 | \item $ cd $SPARK_HOME/R/lib/SparkR/ 25 | \item $ R -e "devtools::install('.')" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/assessEngineSetUp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{assessEngineSetUp} 5 | \alias{assessEngineSetUp} 6 | \alias{assessEngineSetUp,BaseAnalysisPipeline-method} 7 | \title{Assesses engine (R, Spark, Python, Spark Structured Streaming) set up} 8 | \usage{ 9 | assessEngineSetUp(object) 10 | 11 | \S4method{assessEngineSetUp}{BaseAnalysisPipeline}(object) 12 | } 13 | \arguments{ 14 | \item{object}{A Pipeline object} 15 | } 16 | \value{ 17 | Tibble containing the details of available engines, whether they are required for a pipeline, a logical value 18 | reporting whether the engine has been set up, and comments. 19 | } 20 | \description{ 21 | Assesses engine (R, Spark, Python, Spark Structured Streaming) set up 22 | } 23 | \details{ 24 | Assesses whether engines required for executing functions in an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} 25 | object have been set up 26 | 27 | This method is implemented on the base class as it is a shared functionality across Pipeline objects 28 | } 29 | \examples{ 30 | \dontrun{ 31 | library(analysisPipelines) 32 | pipelineObj <- AnalysisPipeline(input = iris) 33 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species", priColor = "blue", 34 | optionalPlots = 0) \%>>\% assessEngineSetUp 35 | } 36 | } 37 | \seealso{ 38 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 39 | \code{\link{MetaAnalysisPipeline-class}}, 40 | \code{\link{checkSchemaMatch}}, 41 | \code{\link{createPipelineInstance}}, 42 | \code{\link{exportAsMetaPipeline}}, 43 | \code{\link{generateOutput}}, 44 | \code{\link{genericPipelineException}}, 45 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 46 | \code{\link{getOutputById}}, 47 | \code{\link{getPipelinePrototype}}, 48 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 49 | \code{\link{initDfBasedOnType}}, 50 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 51 | \code{\link{loadMetaPipeline}}, 52 | \code{\link{loadPipeline}}, 53 | \code{\link{loadPredefinedFunctionRegistry}}, 54 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 55 | \code{\link{registerFunction}}, 56 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 57 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 58 | \code{\link{updateObject}}, 59 | \code{\link{visualizePipeline}} 60 | } 61 | \concept{Package core functions} 62 | -------------------------------------------------------------------------------- /man/bivarPlots.Rd: -------------------------------------------------------------------------------- 
1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-batch-eda-utilities.R 3 | \name{bivarPlots} 4 | \alias{bivarPlots} 5 | \title{Bi-Variate Plot} 6 | \usage{ 7 | bivarPlots(dataset, select_var_name_1, select_var_name_2, 8 | priColor = "blue", secColor = "black") 9 | } 10 | \arguments{ 11 | \item{dataset}{the dataframe that needs to be loaded} 12 | 13 | \item{select_var_name_1}{the name of first column on which the plot needs to be generated} 14 | 15 | \item{select_var_name_2}{the name of second column on which the plot needs to be generated} 16 | 17 | \item{priColor}{the primary color for the plots} 18 | 19 | \item{secColor}{A secondary color for the plots} 20 | } 21 | \value{ 22 | Bivariate plot 23 | } 24 | \description{ 25 | Bi-Variate Plot 26 | } 27 | \details{ 28 | A bivariate distribution graph on the selected columns from the dataframe.Selected two columns are on two axis' and a plot is generated 29 | } 30 | \examples{ 31 | bivarPlots(dataset = iris, select_var_name_1 = "Sepal.Length", 32 | select_var_name_2 = "Sepal.Width") 33 | } 34 | \seealso{ 35 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}}, 36 | \code{\link{correlationMatPlot}}, 37 | \code{\link{getDatatype}}, \code{\link{ignoreCols}}, 38 | \code{\link{multiVarOutlierPlot}}, 39 | \code{\link{outlierPlot}}, 40 | \code{\link{univarCatDistPlots}} 41 | } 42 | \concept{Package EDA Utilites functions} 43 | -------------------------------------------------------------------------------- /man/castKafkaStreamAsString.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spark-structured-streaming-utilities.R 3 | \name{castKafkaStreamAsString} 4 | \alias{castKafkaStreamAsString} 5 | \title{Connect to a Spark session} 6 | \usage{ 7 | castKafkaStreamAsString(streamObj) 8 | } 9 | \arguments{ 10 | \item{streamObj}{Spark Structured Streaming DataFrame returned by \code{read.stream} function with \code{source = 'kafka'}} 11 | } 12 | \value{ 13 | Updated Spark Structured Streaming DataFrame with key, value, topic and timestamp from the Kafka stream 14 | } 15 | \description{ 16 | Connect to a Spark session 17 | } 18 | \details{ 19 | Takes in a Structured Stream from Kafka created from \code{read.stream(source = 'kafka', ...)} and returns 20 | a Structured Streaming DataFrame where the \code{key} and \code{value} from the Kafka stream are cast to string 21 | } 22 | \seealso{ 23 | Other Spark utilities: \code{\link{convertKafkaValueFromJson}}, 24 | \code{\link{sparkRSessionCreateIfNotPresent}} 25 | } 26 | \concept{Spark utilities} 27 | -------------------------------------------------------------------------------- /man/checkSchema.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-batch.R 3 | \name{checkSchema} 4 | \alias{checkSchema} 5 | \title{Compare the schemas of two dataframes} 6 | \usage{ 7 | checkSchema(dfOld, dfNew) 8 | } 9 | \arguments{ 10 | \item{dfOld}{Old dataframe} 11 | 12 | \item{dfNew}{New dataframe} 13 | } 14 | \value{ 15 | Returns a list with details on added columns, removed columns, comparison between column classes, and a logical 16 | whether the schema has remained the same from the old dataframe to the new one 17 | } 18 | \description{ 19 | Compare the schemas of two dataframes 20 | } 21 | \details{ 22 | Compares the 
schemas of two dataframes, providing information on added and removed columns in the new dataframe 23 | as compared to the old 24 | } 25 | \seealso{ 26 | Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline-class}}, 27 | \code{\link{generateReport}}, 28 | \code{\link{initialize,BaseAnalysisPipeline-method}} 29 | } 30 | \concept{Package core functions for batch/one-time analyses} 31 | \keyword{internal} 32 | -------------------------------------------------------------------------------- /man/checkSchemaMatch.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R, R/core-functions-batch.R 3 | \docType{methods} 4 | \name{checkSchemaMatch} 5 | \alias{checkSchemaMatch} 6 | \alias{checkSchemaMatch,AnalysisPipeline-method} 7 | \title{Checks the schema of the input to a Pipeline object against the original} 8 | \usage{ 9 | checkSchemaMatch(object, newData) 10 | 11 | \S4method{checkSchemaMatch}{AnalysisPipeline}(object, newData) 12 | } 13 | \arguments{ 14 | \item{object}{A Pipeline object} 15 | 16 | \item{newData}{The newData that the pipeline is to be initialized with} 17 | } 18 | \value{ 19 | Returns a list with details on added columns, removed columns, comparison between column classes, and a logical 20 | whether the schema has remained the same from the old dataframe to the new one 21 | } 22 | \description{ 23 | Checks the schema of the input to a Pipeline object against the original 24 | } 25 | \details{ 26 | Checks the schema of the new data frame that the pipeline is to be initialized with against 27 | the original schema that the pipeline was saved with. Provides a detailed comparison 28 | } 29 | \seealso{ 30 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 31 | \code{\link{MetaAnalysisPipeline-class}}, 32 | \code{\link{assessEngineSetUp}}, 33 | \code{\link{createPipelineInstance}}, 34 | \code{\link{exportAsMetaPipeline}}, 35 | \code{\link{generateOutput}}, 36 | \code{\link{genericPipelineException}}, 37 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 38 | \code{\link{getOutputById}}, 39 | \code{\link{getPipelinePrototype}}, 40 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 41 | \code{\link{initDfBasedOnType}}, 42 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 43 | \code{\link{loadMetaPipeline}}, 44 | \code{\link{loadPipeline}}, 45 | \code{\link{loadPredefinedFunctionRegistry}}, 46 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 47 | \code{\link{registerFunction}}, 48 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 49 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 50 | \code{\link{updateObject}}, 51 | \code{\link{visualizePipeline}} 52 | } 53 | \concept{Package core functions} 54 | -------------------------------------------------------------------------------- /man/computeEdges.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{computeEdges} 4 | \alias{computeEdges} 5 | \title{Computes edges (dependencies) in a pipeline given the joined tibble of the pipeline and registry} 6 | \usage{ 7 | computeEdges(pipelineRegistryJoin) 8 | } 9 | \description{ 10 | Computes edges (dependencies) in a pipeline given the joined tibble of the pipeline and registry 11 | } 12 | \keyword{internal} 13 | 
-------------------------------------------------------------------------------- /man/convertKafkaValueFromJson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spark-structured-streaming-utilities.R 3 | \name{convertKafkaValueFromJson} 4 | \alias{convertKafkaValueFromJson} 5 | \title{Connect to a Spark session} 6 | \usage{ 7 | convertKafkaValueFromJson(streamObj, schema) 8 | } 9 | \arguments{ 10 | \item{streamObj}{Spark Structured Streaming DataFrame which is returned by the \code{castKafkaStreamAsString} function} 11 | 12 | \item{schema}{A structType object created from SparkR specifying the schema of the json data present in the \code{value} 13 | attribute of the incoming Kafka stream} 14 | } 15 | \value{ 16 | Spark Structured Streaming DataFrame with the json data in the \code{value} attribute of the Kafka stream parsed 17 | into a DataFrame format 18 | } 19 | \description{ 20 | Connect to a Spark session 21 | } 22 | \details{ 23 | Takes in a Structured Stream from Kafka created from \code{read.stream(source = 'kafka', ...)} and returns 24 | a Structured Streaming DataFrame where the \code{key} and \code{value} from the Kafka stream are cast to string 25 | } 26 | \seealso{ 27 | Other Spark utilities: \code{\link{castKafkaStreamAsString}}, 28 | \code{\link{sparkRSessionCreateIfNotPresent}} 29 | } 30 | \concept{Spark utilities} 31 | -------------------------------------------------------------------------------- /man/correlationMatPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-batch-eda-utilities.R 3 | \name{correlationMatPlot} 4 | \alias{correlationMatPlot} 5 | \title{Correlation Matrix Plot} 6 | \usage{ 7 | correlationMatPlot(dataset, methodused = "everything") 8 | } 9 | \arguments{ 10 | \item{dataset}{the dataset that needs to be loaded} 11 | 12 | \item{methodused}{methods to be used for computing correlation} 13 | } 14 | \value{ 15 | Correlation Matrix graph 16 | } 17 | \description{ 18 | A correlation matrix is created and plotted across all the columns in the dataset 19 | } 20 | \examples{ 21 | correlationMatPlot(dataset = iris) 22 | } 23 | \seealso{ 24 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}}, 25 | \code{\link{bivarPlots}}, \code{\link{getDatatype}}, 26 | \code{\link{ignoreCols}}, 27 | \code{\link{multiVarOutlierPlot}}, 28 | \code{\link{outlierPlot}}, 29 | \code{\link{univarCatDistPlots}} 30 | } 31 | \concept{Package EDA Utilites functions} 32 | -------------------------------------------------------------------------------- /man/createPipelineInstance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-meta-pipelines.R 3 | \docType{methods} 4 | \name{createPipelineInstance} 5 | \alias{createPipelineInstance} 6 | \alias{createPipelineInstance,MetaAnalysisPipeline-method} 7 | \title{Create a Pipeline object from a meta-pipeline} 8 | \usage{ 9 | createPipelineInstance(metaPipelineObj, newParams) 10 | 11 | \S4method{createPipelineInstance}{MetaAnalysisPipeline}(metaPipelineObj, 12 | newParams) 13 | } 14 | \arguments{ 15 | \item{metaPipelineObj}{A \code{MetaAnalysisPipeline} object} 16 | 17 | \item{newParams}{Either a nested named list containing all the functions in the 
pipeline, their arguments and 18 | corresponding values (OR) an object of class \code{proto} which is a pipeline prototype, with the new values of the arguments 19 | set. Refer the \code{getPipelinePrototype} method.} 20 | } 21 | \value{ 22 | A Pipeline object 23 | } 24 | \description{ 25 | Create a Pipeline object from a meta-pipeline 26 | } 27 | \details{ 28 | This method instantiates a Pipeline object (both \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline}) from 29 | a meta-pipeline as well as an object containing the new set of values for the arguments of all the functions in the pipeline. 30 | } 31 | \examples{ 32 | \dontrun{ 33 | pipelineObj <- AnalysisPipeline(input = iris) 34 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species") -> pipelineObj 35 | pipelineObj \%>>\% exportAsMetaPipeline -> exportedMetaPipeline 36 | exportedMetaPipeline \%>>\% 37 | createPipelineInstance(newParams = exportedMetaPipeline \%>>\% 38 | getPipelinePrototype) 39 | } 40 | } 41 | \seealso{ 42 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 43 | \code{\link{MetaAnalysisPipeline-class}}, 44 | \code{\link{assessEngineSetUp}}, 45 | \code{\link{checkSchemaMatch}}, 46 | \code{\link{exportAsMetaPipeline}}, 47 | \code{\link{generateOutput}}, 48 | \code{\link{genericPipelineException}}, 49 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 50 | \code{\link{getOutputById}}, 51 | \code{\link{getPipelinePrototype}}, 52 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 53 | \code{\link{initDfBasedOnType}}, 54 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 55 | \code{\link{loadMetaPipeline}}, 56 | \code{\link{loadPipeline}}, 57 | \code{\link{loadPredefinedFunctionRegistry}}, 58 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 59 | \code{\link{registerFunction}}, 60 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 61 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 62 | \code{\link{updateObject}}, 63 | \code{\link{visualizePipeline}} 64 | } 65 | \concept{Package core functions} 66 | -------------------------------------------------------------------------------- /man/dot-analysisPipelinesEnvir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{data} 4 | \name{.analysisPipelinesEnvir} 5 | \alias{.analysisPipelinesEnvir} 6 | \title{This section defines the environment which the package uses for maintaining the registry and an outputCache} 7 | \format{An object of class \code{environment} of length 2.} 8 | \usage{ 9 | .analysisPipelinesEnvir 10 | } 11 | \description{ 12 | This section defines the environment which the package uses for maintaining the registry and an outputCache 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /man/dot-getCache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{.getCache} 4 | \alias{.getCache} 5 | \title{This is an internal function which returns the cache from the package namespace} 6 | \usage{ 7 | .getCache() 8 | } 9 | \description{ 10 | This is an internal function which returns the cache from the package namespace 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- 
/man/dot-saveMetaPipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-meta-pipelines.R 3 | \name{.saveMetaPipeline} 4 | \alias{.saveMetaPipeline} 5 | \title{A method definition for saving meta-pipelines, called when the 'savePipeline' method is called against the 6 | \code{MetaAnalysisPipeline} signature} 7 | \usage{ 8 | .saveMetaPipeline(object, path) 9 | } 10 | \description{ 11 | A method definition for saving meta-pipelines, called when the 'savePipeline' method is called against the 12 | \code{MetaAnalysisPipeline} signature 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /man/dot-setRegistry.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{.setRegistry} 4 | \alias{.setRegistry} 5 | \title{Internal function used to set the registry object in case of loading pipelines or meta-pipelines} 6 | \usage{ 7 | .setRegistry(.registry) 8 | } 9 | \description{ 10 | Internal function used to set the registry object in case of loading pipelines or meta-pipelines 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/dot-updateRegistry.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{.updateRegistry} 4 | \alias{.updateRegistry} 5 | \title{This is an internal function used to update the registry, in order to override existing function registrations} 6 | \usage{ 7 | .updateRegistry(functionName, heading = "", engine = "r", 8 | exceptionHandlingFunction = as.character(substitute(genericPipelineException)), 9 | userDefined = F, isDataFunction = T, firstArgClass = "") 10 | } 11 | \description{ 12 | This is an internal function used to update the registry, in order to override existing function registrations 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /man/dot-visualizeMetaPipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-meta-pipelines.R 3 | \name{.visualizeMetaPipeline} 4 | \alias{.visualizeMetaPipeline} 5 | \title{A method definition for visualizing meta-pipelines, called when the 'visualizePipeline' method is called against the 6 | \code{MetaAnalysisPipeline} signature} 7 | \usage{ 8 | .visualizeMetaPipeline(object) 9 | } 10 | \description{ 11 | A method definition for visualizing meta-pipelines, called when the 'visualizePipeline' method is called against the 12 | \code{MetaAnalysisPipeline} signature 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /man/exportAsMetaPipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-meta-pipelines.R 3 | \docType{methods} 4 | \name{exportAsMetaPipeline} 5 | \alias{exportAsMetaPipeline} 6 | \alias{exportAsMetaPipeline,BaseAnalysisPipeline-method} 7 | \title{Method to export a 
meta-pipeline} 8 | \usage{ 9 | exportAsMetaPipeline(object) 10 | 11 | \S4method{exportAsMetaPipeline}{BaseAnalysisPipeline}(object) 12 | } 13 | \arguments{ 14 | \item{object}{A Pipeline object} 15 | } 16 | \value{ 17 | an object of class "\code{MetaAnalysisPipeline}" 18 | } 19 | \description{ 20 | Method to export a meta-pipeline 21 | } 22 | \details{ 23 | This method exports a Pipeline object i.e. of the classes \code{AnalysisPipeline} or 24 | \code{StreamingAnalysisPipeline} as a meta-pipeline 25 | } 26 | \examples{ 27 | \dontrun{ 28 | #' pipelineObj <- AnalysisPipeline(input = iris) 29 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species") \%>>\% 30 | exportAsMetaPipeline -> exportedMetaPipeline 31 | } 32 | } 33 | \seealso{ 34 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 35 | \code{\link{MetaAnalysisPipeline-class}}, 36 | \code{\link{assessEngineSetUp}}, 37 | \code{\link{checkSchemaMatch}}, 38 | \code{\link{createPipelineInstance}}, 39 | \code{\link{generateOutput}}, 40 | \code{\link{genericPipelineException}}, 41 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 42 | \code{\link{getOutputById}}, 43 | \code{\link{getPipelinePrototype}}, 44 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 45 | \code{\link{initDfBasedOnType}}, 46 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 47 | \code{\link{loadMetaPipeline}}, 48 | \code{\link{loadPipeline}}, 49 | \code{\link{loadPredefinedFunctionRegistry}}, 50 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 51 | \code{\link{registerFunction}}, 52 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 53 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 54 | \code{\link{updateObject}}, 55 | \code{\link{visualizePipeline}} 56 | } 57 | \concept{Package core functions} 58 | -------------------------------------------------------------------------------- /man/generateOutput.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R, R/core-functions-batch.R, 3 | % R/core-streaming-functions.R 4 | \docType{methods} 5 | \name{generateOutput} 6 | \alias{generateOutput} 7 | \alias{generateOutput,AnalysisPipeline-method} 8 | \alias{generateOutput,StreamingAnalysisPipeline-method} 9 | \title{Generate a list of outputs from Pipeline objects} 10 | \usage{ 11 | generateOutput(object) 12 | 13 | \S4method{generateOutput}{AnalysisPipeline}(object) 14 | 15 | \S4method{generateOutput}{StreamingAnalysisPipeline}(object) 16 | } 17 | \arguments{ 18 | \item{object}{object that contains input, pipeline, registry and output} 19 | } 20 | \value{ 21 | Updated Pipeline object with the outputs at each step stored in the \code{output} slot. 
22 | 23 | Specific outputs can be obtained by using the \link{getOutputById} function 24 | } 25 | \description{ 26 | Generate a list of outputs from Pipeline objects 27 | } 28 | \details{ 29 | \code{generateOutput} is a generic function that is implemented for various types of pipeline objects 30 | such as \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} 31 | 32 | The sequence of operations stored in the pipeline object 33 | are run and outputs generated, stored in a list 34 | } 35 | \seealso{ 36 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 37 | \code{\link{MetaAnalysisPipeline-class}}, 38 | \code{\link{assessEngineSetUp}}, 39 | \code{\link{checkSchemaMatch}}, 40 | \code{\link{createPipelineInstance}}, 41 | \code{\link{exportAsMetaPipeline}}, 42 | \code{\link{genericPipelineException}}, 43 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 44 | \code{\link{getOutputById}}, 45 | \code{\link{getPipelinePrototype}}, 46 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 47 | \code{\link{initDfBasedOnType}}, 48 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 49 | \code{\link{loadMetaPipeline}}, 50 | \code{\link{loadPipeline}}, 51 | \code{\link{loadPredefinedFunctionRegistry}}, 52 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 53 | \code{\link{registerFunction}}, 54 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 55 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 56 | \code{\link{updateObject}}, 57 | \code{\link{visualizePipeline}} 58 | } 59 | \concept{Package core functions} 60 | -------------------------------------------------------------------------------- /man/generateReport.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-batch.R 3 | \docType{methods} 4 | \name{generateReport} 5 | \alias{generateReport} 6 | \alias{generateReport,AnalysisPipeline,character-method} 7 | \title{Generate a HTML report from an \code{AnalysisPipeline} object} 8 | \usage{ 9 | generateReport(object, path) 10 | 11 | \S4method{generateReport}{AnalysisPipeline,character}(object, path = ".") 12 | } 13 | \arguments{ 14 | \item{object}{object that contains input, pipeline, registry and output} 15 | 16 | \item{path}{path on the file system, where the generated html report should be stored} 17 | } 18 | \value{ 19 | Updated \code{AnalysisPipeline} object 20 | } 21 | \description{ 22 | Generate a HTML report from an \code{AnalysisPipeline} object 23 | } 24 | \details{ 25 | The sequence of operations stored in the \code{AnalysisPipeline} object are run, outputs generated, 26 | and a HTML report is generated with outputs in the same sequence as the pipeline created by the user 27 | } 28 | \examples{ 29 | \dontrun{ 30 | pipelineObj <- AnalysisPipeline(input = iris) 31 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species", storeOutput = T) -> pipelineObj 32 | pipelineObj \%>>\% generateReport(path = ".") 33 | } 34 | } 35 | \seealso{ 36 | Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline-class}}, 37 | \code{\link{checkSchema}}, 38 | \code{\link{initialize,BaseAnalysisPipeline-method}} 39 | } 40 | \concept{Package core functions for batch/one-time analyses} 41 | -------------------------------------------------------------------------------- /man/genericPipelineException.Rd: -------------------------------------------------------------------------------- 1 | % 
Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{genericPipelineException} 4 | \alias{genericPipelineException} 5 | \title{Default exception for pipeline functions} 6 | \usage{ 7 | genericPipelineException(error) 8 | } 9 | \arguments{ 10 | \item{error}{Error encountered during the execution of a particular pipeline function} 11 | } 12 | \description{ 13 | Default exception for pipeline functions 14 | } 15 | \details{ 16 | This function defines the default function which will be called in case of an exception occurring while 17 | executing any of the pipeline functions. When a function is registered, a custom function to deal with exceptions 18 | incurred during the call of the function being registered can be passed by the user. If passed, the custom function 19 | will be called instead of this function 20 | } 21 | \seealso{ 22 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 23 | \code{\link{MetaAnalysisPipeline-class}}, 24 | \code{\link{assessEngineSetUp}}, 25 | \code{\link{checkSchemaMatch}}, 26 | \code{\link{createPipelineInstance}}, 27 | \code{\link{exportAsMetaPipeline}}, 28 | \code{\link{generateOutput}}, \code{\link{getInput}}, 29 | \code{\link{getLoggerDetails}}, 30 | \code{\link{getOutputById}}, 31 | \code{\link{getPipelinePrototype}}, 32 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 33 | \code{\link{initDfBasedOnType}}, 34 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 35 | \code{\link{loadMetaPipeline}}, 36 | \code{\link{loadPipeline}}, 37 | \code{\link{loadPredefinedFunctionRegistry}}, 38 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 39 | \code{\link{registerFunction}}, 40 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 41 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 42 | \code{\link{updateObject}}, 43 | \code{\link{visualizePipeline}} 44 | } 45 | \concept{Package core functions} 46 | -------------------------------------------------------------------------------- /man/getDatatype.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-batch-eda-utilities.R 3 | \name{getDatatype} 4 | \alias{getDatatype} 5 | \title{Get Data Type} 6 | \usage{ 7 | getDatatype(dataset) 8 | } 9 | \arguments{ 10 | \item{dataset}{a dataset which needs to be loaded} 11 | } 12 | \value{ 13 | list with \code{numeric_cols} and \code{cat_cols} 14 | } 15 | \description{ 16 | Get Data Type 17 | } 18 | \details{ 19 | Based on the datatype, the columns are separated into categorical and numerical columns 20 | } 21 | \examples{ 22 | getDatatype(iris) 23 | } 24 | \seealso{ 25 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}}, 26 | \code{\link{bivarPlots}}, 27 | \code{\link{correlationMatPlot}}, 28 | \code{\link{ignoreCols}}, 29 | \code{\link{multiVarOutlierPlot}}, 30 | \code{\link{outlierPlot}}, 31 | \code{\link{univarCatDistPlots}} 32 | } 33 | \concept{Package EDA Utilites functions} 34 | -------------------------------------------------------------------------------- /man/getEndPoints.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{getEndPoints} 4 | \alias{getEndPoints} 5 | \title{Obtains end nodes in a graph given nodes and edges} 6 | \usage{ 7 | getEndPoints(nodes, edgeDf) 8 | } 9 |
\description{ 10 | Obtains end nodes in a graph given nodes and edges 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/getFeaturesForPyClassification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-helper-utilites-python.R 3 | \name{getFeaturesForPyClassification} 4 | \alias{getFeaturesForPyClassification} 5 | \title{Extracts selected columns from a data frame as a Python array} 6 | \usage{ 7 | getFeaturesForPyClassification(dataset, featureNames) 8 | } 9 | \arguments{ 10 | \item{dataset}{an R data frame} 11 | 12 | \item{featureNames}{Column names to be extracted from the R data frames. A character vector.} 13 | } 14 | \description{ 15 | Extracts selected columns from a data frame as a Python array 16 | } 17 | \details{ 18 | Helper function, which when provided an R data frame and a set of column/ feature names, 19 | extracts them from the R data frame as a matrix and converts them to the equivalent Python array. 20 | 21 | Typically this function can be used when providing a feature matrix to a Python machine learning function 22 | } 23 | \examples{ 24 | \dontrun{ 25 | getFeaturesForPyClassification(dataset = iris, 26 | featureNames = c("Sepal.Length", "Sepal.Width")) 27 | } 28 | } 29 | \seealso{ 30 | Other R helper utilities for Python: \code{\link{getTargetForPyClassification}}, 31 | \code{\link{setPythonEnvir}} 32 | } 33 | \concept{R helper utilities for Python} 34 | -------------------------------------------------------------------------------- /man/getInput.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{getInput} 5 | \alias{getInput} 6 | \alias{getInput,BaseAnalysisPipeline-method} 7 | \title{Obtains the initializedInput} 8 | \usage{ 9 | getInput(object) 10 | 11 | \S4method{getInput}{BaseAnalysisPipeline}(object) 12 | } 13 | \arguments{ 14 | \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} 15 | } 16 | \value{ 17 | Dataframe for an \code{AnalysisPipeline} & SparkDataFrame for a \code{StreamingAnalysisPipeline} 18 | } 19 | \description{ 20 | Obtains the initializedInput 21 | } 22 | \details{ 23 | Obtains the input from the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object 24 | 25 | This method is implemented on the base class as it is a shared functionality types of Analysis Pipelines 26 | which extend this class 27 | } 28 | \examples{ 29 | library(analysisPipelines) 30 | pipelineObj <- AnalysisPipeline(input = iris) 31 | pipelineObj \%>>\% getInput 32 | } 33 | \seealso{ 34 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 35 | \code{\link{MetaAnalysisPipeline-class}}, 36 | \code{\link{assessEngineSetUp}}, 37 | \code{\link{checkSchemaMatch}}, 38 | \code{\link{createPipelineInstance}}, 39 | \code{\link{exportAsMetaPipeline}}, 40 | \code{\link{generateOutput}}, 41 | \code{\link{genericPipelineException}}, 42 | \code{\link{getLoggerDetails}}, 43 | \code{\link{getOutputById}}, 44 | \code{\link{getPipelinePrototype}}, 45 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 46 | \code{\link{initDfBasedOnType}}, 47 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 48 | \code{\link{loadMetaPipeline}}, 49 | \code{\link{loadPipeline}}, 50 | 
\code{\link{loadPredefinedFunctionRegistry}}, 51 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 52 | \code{\link{registerFunction}}, 53 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 54 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 55 | \code{\link{updateObject}}, 56 | \code{\link{visualizePipeline}} 57 | } 58 | \concept{Package core functions} 59 | -------------------------------------------------------------------------------- /man/getLoggerDetails.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{getLoggerDetails} 5 | \alias{getLoggerDetails} 6 | \alias{getLoggerDetails,BaseAnalysisPipeline-method} 7 | \title{Obtains the logger configuration for the pipeline} 8 | \usage{ 9 | getLoggerDetails(object) 10 | 11 | \S4method{getLoggerDetails}{BaseAnalysisPipeline}(object) 12 | } 13 | \arguments{ 14 | \item{object}{A Pipeline object} 15 | } 16 | \value{ 17 | Logger configuration as a list 18 | } 19 | \description{ 20 | Obtains the logger configuration for the pipeline 21 | } 22 | \details{ 23 | This function obtains the logger configuration for the pipeline. 24 | } 25 | \examples{ 26 | library(analysisPipelines) 27 | pipelineObj <- AnalysisPipeline(input = iris) 28 | pipelineObj \%>>\% getLoggerDetails 29 | } 30 | \seealso{ 31 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 32 | \code{\link{MetaAnalysisPipeline-class}}, 33 | \code{\link{assessEngineSetUp}}, 34 | \code{\link{checkSchemaMatch}}, 35 | \code{\link{createPipelineInstance}}, 36 | \code{\link{exportAsMetaPipeline}}, 37 | \code{\link{generateOutput}}, 38 | \code{\link{genericPipelineException}}, 39 | \code{\link{getInput}}, \code{\link{getOutputById}}, 40 | \code{\link{getPipelinePrototype}}, 41 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 42 | \code{\link{initDfBasedOnType}}, 43 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 44 | \code{\link{loadMetaPipeline}}, 45 | \code{\link{loadPipeline}}, 46 | \code{\link{loadPredefinedFunctionRegistry}}, 47 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 48 | \code{\link{registerFunction}}, 49 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 50 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 51 | \code{\link{updateObject}}, 52 | \code{\link{visualizePipeline}} 53 | } 54 | \concept{Package core functions} 55 | -------------------------------------------------------------------------------- /man/getOutputById.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{getOutputById} 5 | \alias{getOutputById} 6 | \alias{getOutputById,BaseAnalysisPipeline-method} 7 | \title{Obtains a specific output} 8 | \usage{ 9 | getOutputById(object, reqId, includeCall = F) 10 | 11 | \S4method{getOutputById}{BaseAnalysisPipeline}(object, reqId, 12 | includeCall = F) 13 | } 14 | \arguments{ 15 | \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} 16 | 17 | \item{reqId}{The position of the function for which the output is desired in the sequence of operations in the pipeline.} 18 | 19 | \item{includeCall}{Logical which defines whether the call used to generate the output should be returned. 
By default, this is FALSE} 20 | } 21 | \value{ 22 | If includeCall = F, the output object generated by the function is returned 23 | 24 | If includeCall = T, it is a list containing two elements 25 | - call: tibble with 1 row containing the function call for the output desired 26 | - output: output generated 27 | } 28 | \description{ 29 | Obtains a specific output 30 | } 31 | \details{ 32 | Obtains a specific output from the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by passing the position 33 | of the function for which the output is desired, in the sequence of operations in the pipeline. This can be obtained by passing the number 34 | under the 'id' column in the pipeline table corresponding to the required function 35 | 36 | This method is implemented on the base class as it is a functionality shared across the types 37 | of Analysis Pipelines which extend this class 38 | } 39 | \examples{ 40 | \dontrun{ 41 | library(analysisPipelines) 42 | pipelineObj <- AnalysisPipeline(input = iris) 43 | getNumRows <- function(dataset){ 44 | return(nrow(dataset)) 45 | } 46 | registerFunction("getNumRows") 47 | pipelineObj \%>>\% getNumRows(storeOutput = TRUE) -> pipelineObj 48 | pipelineObj \%>>\% generateOutput \%>>\% getOutputById("1") 49 | } 50 | } 51 | \seealso{ 52 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 53 | \code{\link{MetaAnalysisPipeline-class}}, 54 | \code{\link{assessEngineSetUp}}, 55 | \code{\link{checkSchemaMatch}}, 56 | \code{\link{createPipelineInstance}}, 57 | \code{\link{exportAsMetaPipeline}}, 58 | \code{\link{generateOutput}}, 59 | \code{\link{genericPipelineException}}, 60 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 61 | \code{\link{getPipelinePrototype}}, 62 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 63 | \code{\link{initDfBasedOnType}}, 64 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 65 | \code{\link{loadMetaPipeline}}, 66 | \code{\link{loadPipeline}}, 67 | \code{\link{loadPredefinedFunctionRegistry}}, 68 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 69 | \code{\link{registerFunction}}, 70 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 71 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 72 | \code{\link{updateObject}}, 73 | \code{\link{visualizePipeline}} 74 | } 75 | \concept{Package core functions} 76 | -------------------------------------------------------------------------------- /man/getPipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{getPipeline} 5 | \alias{getPipeline} 6 | \alias{getPipeline,BaseAnalysisPipeline-method} 7 | \title{Obtain the pipeline} 8 | \usage{ 9 | getPipeline(object) 10 | 11 | \S4method{getPipeline}{BaseAnalysisPipeline}(object) 12 | } 13 | \arguments{ 14 | \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} 15 | } 16 | \value{ 17 | Tibble describing the pipeline 18 | } 19 | \description{ 20 | Obtain the pipeline 21 | } 22 | \details{ 23 | Obtains the pipeline from the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object as a tibble 24 | 25 | This method is implemented on the base class as it is a functionality shared across the types 26 | of Analysis Pipelines which extend this class 27 | } 28 | \examples{ 29 | \dontrun{ 30 | library(analysisPipelines) 31 | pipelineObj <- AnalysisPipeline(input = iris) 32 | getNumRows <-
function(dataset){ 33 | return(nrow(dataset)) 34 | } 35 | registerFunction("getNumRows") 36 | pipelineObj \%>>\% getNumRows \%>>\% getPipeline 37 | } 38 | } 39 | \seealso{ 40 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 41 | \code{\link{MetaAnalysisPipeline-class}}, 42 | \code{\link{assessEngineSetUp}}, 43 | \code{\link{checkSchemaMatch}}, 44 | \code{\link{createPipelineInstance}}, 45 | \code{\link{exportAsMetaPipeline}}, 46 | \code{\link{generateOutput}}, 47 | \code{\link{genericPipelineException}}, 48 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 49 | \code{\link{getOutputById}}, 50 | \code{\link{getPipelinePrototype}}, 51 | \code{\link{getRegistry}}, 52 | \code{\link{initDfBasedOnType}}, 53 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 54 | \code{\link{loadMetaPipeline}}, 55 | \code{\link{loadPipeline}}, 56 | \code{\link{loadPredefinedFunctionRegistry}}, 57 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 58 | \code{\link{registerFunction}}, 59 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 60 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 61 | \code{\link{updateObject}}, 62 | \code{\link{visualizePipeline}} 63 | } 64 | \concept{Package core functions} 65 | -------------------------------------------------------------------------------- /man/getPipelinePrototype.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-meta-pipelines.R 3 | \docType{methods} 4 | \name{getPipelinePrototype} 5 | \alias{getPipelinePrototype} 6 | \alias{getPipelinePrototype,MetaAnalysisPipeline-method} 7 | \title{Obtain the prototype of the functions in the pipeline} 8 | \usage{ 9 | getPipelinePrototype(metaPipelineObj) 10 | 11 | \S4method{getPipelinePrototype}{MetaAnalysisPipeline}(metaPipelineObj) 12 | } 13 | \arguments{ 14 | \item{metaPipelineObj}{A \code{MetaAnalysisPipeline} object} 15 | } 16 | \value{ 17 | An object og class \code{proto} from the 'proto' package 18 | } 19 | \description{ 20 | Obtain the prototype of the functions in the pipeline 21 | } 22 | \details{ 23 | This method returns the prototype of functions in the pipeline and their respective arguments as \code{proto} object. 24 | Functions in the pipeline can be accessed easily by using the '$' operator, and within the functions the arguments can 25 | be accessed the same way. These can be accessed and set to new values. 
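As a brief illustrative sketch of that '$' access pattern (the element name univarCatDistPlots_1 is an assumption about how the generated prototype names its steps, not a guaranteed name):

pipelineProto <- pipelineObj \%>>\% exportAsMetaPipeline \%>>\% getPipelinePrototype
pipelineProto$univarCatDistPlots_1$uniCol               # read the current value (element name assumed)
pipelineProto$univarCatDistPlots_1$uniCol <- "Species"  # set a new value on the prototype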
This pipeline prototype can then be passed to the 26 | \code{createPipelineInstance} method which will instantiate an executable pipeline with the inputs set in the prototype 27 | } 28 | \examples{ 29 | \dontrun{ 30 | pipelineObj <- AnalysisPipeline(input = iris) 31 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species") \%>>\% 32 | exportAsMetaPipeline \%>>\% getPipelinePrototype 33 | } 34 | } 35 | \seealso{ 36 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 37 | \code{\link{MetaAnalysisPipeline-class}}, 38 | \code{\link{assessEngineSetUp}}, 39 | \code{\link{checkSchemaMatch}}, 40 | \code{\link{createPipelineInstance}}, 41 | \code{\link{exportAsMetaPipeline}}, 42 | \code{\link{generateOutput}}, 43 | \code{\link{genericPipelineException}}, 44 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 45 | \code{\link{getOutputById}}, \code{\link{getPipeline}}, 46 | \code{\link{getRegistry}}, 47 | \code{\link{initDfBasedOnType}}, 48 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 49 | \code{\link{loadMetaPipeline}}, 50 | \code{\link{loadPipeline}}, 51 | \code{\link{loadPredefinedFunctionRegistry}}, 52 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 53 | \code{\link{registerFunction}}, 54 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 55 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 56 | \code{\link{updateObject}}, 57 | \code{\link{visualizePipeline}} 58 | } 59 | \concept{Package core functions} 60 | -------------------------------------------------------------------------------- /man/getRegistry.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{getRegistry} 4 | \alias{getRegistry} 5 | \title{Obtains the function registry} 6 | \usage{ 7 | getRegistry() 8 | } 9 | \value{ 10 | Tibble describing the registry 11 | } 12 | \description{ 13 | Obtains the function registry 14 | } 15 | \details{ 16 | Obtains the function registry as a tibble, including both predefined and user defined functions 17 | } 18 | \examples{ 19 | getRegistry() 20 | } 21 | \seealso{ 22 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 23 | \code{\link{MetaAnalysisPipeline-class}}, 24 | \code{\link{assessEngineSetUp}}, 25 | \code{\link{checkSchemaMatch}}, 26 | \code{\link{createPipelineInstance}}, 27 | \code{\link{exportAsMetaPipeline}}, 28 | \code{\link{generateOutput}}, 29 | \code{\link{genericPipelineException}}, 30 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 31 | \code{\link{getOutputById}}, 32 | \code{\link{getPipelinePrototype}}, 33 | \code{\link{getPipeline}}, 34 | \code{\link{initDfBasedOnType}}, 35 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 36 | \code{\link{loadMetaPipeline}}, 37 | \code{\link{loadPipeline}}, 38 | \code{\link{loadPredefinedFunctionRegistry}}, 39 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 40 | \code{\link{registerFunction}}, 41 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 42 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 43 | \code{\link{updateObject}}, 44 | \code{\link{visualizePipeline}} 45 | } 46 | \concept{Package core functions} 47 | -------------------------------------------------------------------------------- /man/getResponse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation 
in R/core-functions.R 3 | \name{getResponse} 4 | \alias{getResponse} 5 | \title{Obtains the response term from the formula} 6 | \usage{ 7 | getResponse(f) 8 | } 9 | \arguments{ 10 | \item{f}{formula from which term is to be extracted.} 11 | } 12 | \value{ 13 | The response variable in the formula as a string 14 | } 15 | \description{ 16 | Obtains the response term from the formula 17 | } 18 | \details{ 19 | This is a helper function to extract the response variable from a formula 20 | } 21 | \examples{ 22 | library(analysisPipelines) 23 | getResponse(y ~ x1 + x2) 24 | } 25 | -------------------------------------------------------------------------------- /man/getStartingPoints.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{getStartingPoints} 4 | \alias{getStartingPoints} 5 | \title{Obtains starting nodes in a graph given nodes and edges} 6 | \usage{ 7 | getStartingPoints(nodes, edgeDf) 8 | } 9 | \description{ 10 | Obtains starting nodes in a graph given nodes and edges 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/getTargetForPyClassification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-helper-utilites-python.R 3 | \name{getTargetForPyClassification} 4 | \alias{getTargetForPyClassification} 5 | \title{Extracts selected column from a data frame a binary class Python array} 6 | \usage{ 7 | getTargetForPyClassification(dataset, targetVarName, positiveClass) 8 | } 9 | \arguments{ 10 | \item{dataset}{an R data frame} 11 | 12 | \item{targetVarName}{Name of the target variable for classification. 
Should be a categorical variable.} 13 | 14 | \item{positiveClass}{Name of the class of the target variable which should be coded as '1'} 15 | } 16 | \description{ 17 | Extracts selected column from a data frame a binary class Python array 18 | } 19 | \details{ 20 | Helper function, which when provided an R dataframe and a binary categorical column, 21 | extracts it from the R data frame, converts it to 1/0 class coding, and converts it to a Python array 22 | 23 | Typically this function can be used to extract a target variable for a classifier to be provided to a 24 | Python machine learning function 25 | } 26 | \examples{ 27 | \dontrun{ 28 | getTargetForPyClassification(dataset = iris, 29 | targetVarName = "Species", positiveClass = "setosa") 30 | } 31 | } 32 | \seealso{ 33 | Other R helper utilities for Python: \code{\link{getFeaturesForPyClassification}}, 34 | \code{\link{setPythonEnvir}} 35 | } 36 | \concept{R helper utilities for Python} 37 | -------------------------------------------------------------------------------- /man/getTerm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{getTerm} 4 | \alias{getTerm} 5 | \title{Obtains the dependency term from the formula} 6 | \usage{ 7 | getTerm(f) 8 | } 9 | \arguments{ 10 | \item{f}{formula from which term is to be extracted.} 11 | } 12 | \value{ 13 | String with the terms 14 | } 15 | \description{ 16 | Obtains the dependency term from the formula 17 | } 18 | \details{ 19 | This is a helper function to extract the terms from a formula 20 | } 21 | \examples{ 22 | library(analysisPipelines) 23 | getTerm(y ~ x) 24 | } 25 | -------------------------------------------------------------------------------- /man/getUpstreamDependencies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{getUpstreamDependencies} 4 | \alias{getUpstreamDependencies} 5 | \title{Obtains upstream dependencies for \code{AnalysisPipeline} objects} 6 | \usage{ 7 | getUpstreamDependencies(row) 8 | } 9 | \description{ 10 | Obtains upstream dependencies for \code{AnalysisPipeline} objects 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/identifyTopLevelRecursively.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{identifyTopLevelRecursively} 4 | \alias{identifyTopLevelRecursively} 5 | \title{Recursive function to identify the toplogical levels of the functions in a pipeline} 6 | \usage{ 7 | identifyTopLevelRecursively(input = list(topDf = dplyr::tibble(), nodes = 8 | c(), edgeDf = dplyr::tibble(), level = 1)) 9 | } 10 | \description{ 11 | Recursive function to identify the toplogical levels of the functions in a pipeline 12 | } 13 | \keyword{internal} 14 | -------------------------------------------------------------------------------- /man/identifyTopologicalLevels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{identifyTopologicalLevels} 4 | \alias{identifyTopologicalLevels} 5 | \title{Identifies the topological levels 
of the functions in a pipeline} 6 | \usage{ 7 | identifyTopologicalLevels(nodes = c(), edgeDf = dplyr::tibble(), 8 | topDf = dplyr::tibble(id = character(), level = character()), 9 | level = 1) 10 | } 11 | \description{ 12 | Identifies the topological levels of the functions in a pipeline 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /man/ignoreCols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-batch-eda-utilities.R 3 | \name{ignoreCols} 4 | \alias{ignoreCols} 5 | \title{Ignores the columns in the loaded dataframe object} 6 | \usage{ 7 | ignoreCols(data, columns) 8 | } 9 | \arguments{ 10 | \item{data}{the dataframe object that needs to be loaded} 11 | 12 | \item{columns}{the names of columns to be ignored from dataframe object} 13 | } 14 | \value{ 15 | Updated dataframe object 16 | } 17 | \description{ 18 | Ignores the columns in the loaded dataframe object 19 | } 20 | \details{ 21 | The columns selected are removed from the object 22 | } 23 | \examples{ 24 | ignoreCols(data = iris, columns = "Species") 25 | } 26 | \seealso{ 27 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}}, 28 | \code{\link{bivarPlots}}, 29 | \code{\link{correlationMatPlot}}, 30 | \code{\link{getDatatype}}, 31 | \code{\link{multiVarOutlierPlot}}, 32 | \code{\link{outlierPlot}}, 33 | \code{\link{univarCatDistPlots}} 34 | } 35 | \concept{Package EDA Utilites functions} 36 | -------------------------------------------------------------------------------- /man/initDfBasedOnType.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{initDfBasedOnType} 4 | \alias{initDfBasedOnType} 5 | \title{initializes the \code{AnalysisPipeline} object with the input based on the provided type} 6 | \usage{ 7 | initDfBasedOnType(input, filePath) 8 | } 9 | \arguments{ 10 | \item{input}{Input dataframe} 11 | 12 | \item{filePath}{File path where the .csv file is stored} 13 | } 14 | \value{ 15 | \code{AnalysisPipeline} object initialized with input 16 | } 17 | \description{ 18 | initializes the \code{AnalysisPipeline} object with the input based on the provided type 19 | } 20 | \details{ 21 | Transforms provided inputs into R data frame regardless of the input provided, be it Spark DataFrames 22 | or Python data frames 23 | } 24 | \seealso{ 25 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 26 | \code{\link{MetaAnalysisPipeline-class}}, 27 | \code{\link{assessEngineSetUp}}, 28 | \code{\link{checkSchemaMatch}}, 29 | \code{\link{createPipelineInstance}}, 30 | \code{\link{exportAsMetaPipeline}}, 31 | \code{\link{generateOutput}}, 32 | \code{\link{genericPipelineException}}, 33 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 34 | \code{\link{getOutputById}}, 35 | \code{\link{getPipelinePrototype}}, 36 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 37 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 38 | \code{\link{loadMetaPipeline}}, 39 | \code{\link{loadPipeline}}, 40 | \code{\link{loadPredefinedFunctionRegistry}}, 41 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 42 | \code{\link{registerFunction}}, 43 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 44 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 45 | 
\code{\link{updateObject}}, 46 | \code{\link{visualizePipeline}} 47 | } 48 | \concept{Package core functions} 49 | \keyword{internal} 50 | -------------------------------------------------------------------------------- /man/initialize-methods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R, R/core-functions-batch.R, 3 | % R/core-functions-meta-pipelines.R, R/core-streaming-functions.R 4 | \docType{methods} 5 | \name{initialize,BaseAnalysisPipeline-method} 6 | \alias{initialize,BaseAnalysisPipeline-method} 7 | \alias{initialize,AnalysisPipeline-method} 8 | \alias{initialize,MetaAnalysisPipeline-method} 9 | \alias{initialize,StreamingAnalysisPipeline-method} 10 | \title{This is the constructor for the \link{BaseAnalysisPipeline} class} 11 | \usage{ 12 | \S4method{initialize}{BaseAnalysisPipeline}(.Object) 13 | 14 | \S4method{initialize}{AnalysisPipeline}(.Object, ..., 15 | input = data.frame(), filePath = "") 16 | 17 | \S4method{initialize}{MetaAnalysisPipeline}(.Object, type = "batch") 18 | 19 | \S4method{initialize}{StreamingAnalysisPipeline}(.Object, input) 20 | } 21 | \description{ 22 | BaseAnalysisPipeline constructor 23 | 24 | AnalysisPipeline constructor 25 | 26 | MetaAnalysisPipeline constructor 27 | 28 | StreamingAnalysisPipeline constructor 29 | } 30 | \seealso{ 31 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 32 | \code{\link{MetaAnalysisPipeline-class}}, 33 | \code{\link{assessEngineSetUp}}, 34 | \code{\link{checkSchemaMatch}}, 35 | \code{\link{createPipelineInstance}}, 36 | \code{\link{exportAsMetaPipeline}}, 37 | \code{\link{generateOutput}}, 38 | \code{\link{genericPipelineException}}, 39 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 40 | \code{\link{getOutputById}}, 41 | \code{\link{getPipelinePrototype}}, 42 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 43 | \code{\link{initDfBasedOnType}}, 44 | \code{\link{loadMetaPipeline}}, 45 | \code{\link{loadPipeline}}, 46 | \code{\link{loadPredefinedFunctionRegistry}}, 47 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 48 | \code{\link{registerFunction}}, 49 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 50 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 51 | \code{\link{updateObject}}, 52 | \code{\link{visualizePipeline}} 53 | 54 | Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline-class}}, 55 | \code{\link{checkSchema}}, \code{\link{generateReport}} 56 | 57 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 58 | \code{\link{MetaAnalysisPipeline-class}}, 59 | \code{\link{assessEngineSetUp}}, 60 | \code{\link{checkSchemaMatch}}, 61 | \code{\link{createPipelineInstance}}, 62 | \code{\link{exportAsMetaPipeline}}, 63 | \code{\link{generateOutput}}, 64 | \code{\link{genericPipelineException}}, 65 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 66 | \code{\link{getOutputById}}, 67 | \code{\link{getPipelinePrototype}}, 68 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 69 | \code{\link{initDfBasedOnType}}, 70 | \code{\link{loadMetaPipeline}}, 71 | \code{\link{loadPipeline}}, 72 | \code{\link{loadPredefinedFunctionRegistry}}, 73 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 74 | \code{\link{registerFunction}}, 75 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 76 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 77 | 
\code{\link{updateObject}}, 78 | \code{\link{visualizePipeline}} 79 | } 80 | \concept{Package core functions} 81 | \concept{Package core functions for batch/one-time analyses} 82 | \keyword{internal} 83 | -------------------------------------------------------------------------------- /man/initializeLoggers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{initializeLoggers} 4 | \alias{initializeLoggers} 5 | \title{intializes the loggers with the required appenders and layout based on the provided configuration} 6 | \usage{ 7 | initializeLoggers(object) 8 | } 9 | \description{ 10 | intializes the loggers with the required appenders and layout based on the provided configuration 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/isDependencyParam.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{isDependencyParam} 4 | \alias{isDependencyParam} 5 | \title{Checks if the parameter is the dependency parameter} 6 | \usage{ 7 | isDependencyParam(f) 8 | } 9 | \arguments{ 10 | \item{f}{formula from which term is to be extracted.} 11 | } 12 | \value{ 13 | Logical as to whether it is a dependency parameter 14 | } 15 | \description{ 16 | Checks if the parameter is the dependency parameter 17 | } 18 | \details{ 19 | This is a helper function to check if the formula provided is a dependency parameter, 20 | as per the package's formula semantics, capturing function dependencies 21 | } 22 | \examples{ 23 | library(analysisPipelines) 24 | isDependencyParam(~f1) 25 | } 26 | -------------------------------------------------------------------------------- /man/loadMetaPipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions-meta-pipelines.R 3 | \name{loadMetaPipeline} 4 | \alias{loadMetaPipeline} 5 | \title{Load a meta-pipeline} 6 | \usage{ 7 | loadMetaPipeline(path) 8 | } 9 | \arguments{ 10 | \item{path}{the path at which the .Rds file containing the pipeline is located} 11 | } 12 | \value{ 13 | An \code{MetaAnalysisPipeline} object 14 | } 15 | \description{ 16 | Load a meta-pipeline 17 | } 18 | \details{ 19 | This function loads a meta-pipeline from a file system, and returns the meta-pipeline object, which can be assigned 20 | to an object in the environment. 
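A hedged sketch of the intended round trip (the file name is illustrative; savePipeline has a MetaAnalysisPipeline method, see savePipeline.Rd):

exportedMetaPipeline \%>>\% savePipeline(path = "./metaPipeline.RDS")  # save the meta-pipeline
metaPipelineObj <- loadMetaPipeline(path = "./metaPipeline.RDS")      # later, or in a fresh session
pipelineProto <- getPipelinePrototype(metaPipelineObj)                # then set inputs and instantiate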
21 | 22 | Note - When a meta-pipeline is loaded, the existing registry is overwritten with the registry saved with the 23 | meta-pipeline 24 | } 25 | \examples{ 26 | \dontrun{ 27 | loadMetaPipeline(path = "./metaPipeline.RDS") 28 | } 29 | } 30 | \seealso{ 31 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 32 | \code{\link{MetaAnalysisPipeline-class}}, 33 | \code{\link{assessEngineSetUp}}, 34 | \code{\link{checkSchemaMatch}}, 35 | \code{\link{createPipelineInstance}}, 36 | \code{\link{exportAsMetaPipeline}}, 37 | \code{\link{generateOutput}}, 38 | \code{\link{genericPipelineException}}, 39 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 40 | \code{\link{getOutputById}}, 41 | \code{\link{getPipelinePrototype}}, 42 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 43 | \code{\link{initDfBasedOnType}}, 44 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 45 | \code{\link{loadPipeline}}, 46 | \code{\link{loadPredefinedFunctionRegistry}}, 47 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 48 | \code{\link{registerFunction}}, 49 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 50 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 51 | \code{\link{updateObject}}, 52 | \code{\link{visualizePipeline}} 53 | } 54 | \concept{Package core functions} 55 | -------------------------------------------------------------------------------- /man/loadPipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{loadPipeline} 4 | \alias{loadPipeline} 5 | \title{Loads the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object from the file system} 6 | \usage{ 7 | loadPipeline(path, input = data.frame(), filePath = "") 8 | } 9 | \arguments{ 10 | \item{path}{the path at which the .Rds file containing the pipeline is located} 11 | 12 | \item{input}{(optional) data frame with which the pipeline object should be initialized} 13 | 14 | \item{filePath}{(optional) path where a dataset in .CSV format is present which is to be loaded} 15 | } 16 | \value{ 17 | An \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object, optionally initialized with the data frame provided 18 | } 19 | \description{ 20 | Loads the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object from the file system 21 | } 22 | \details{ 23 | The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object is loaded from the file system 24 | based on the path specified. 25 | 26 | Optionally, the \code{input} parameter can be provided to 27 | initialize the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object with an R data frame 28 | or Streaming Spark DataFrame (in case of \code{StreamingAnalysisPipeline} object) present in the R session. 29 | 30 | Another option is to specify a filePath where the input dataset is present (in a .CSV format) 31 | and the object will be initialized with this data frame. The \code{filePath} parameter takes precedence over the 32 | \code{input} parameter.
This is applicable only from \code{AnalysisPipeline} objects 33 | 34 | Note - When a pipeline is loaded, the existing registry is overwritten with the registry saved with the 35 | pipeline 36 | } 37 | \examples{ 38 | \dontrun{ 39 | library(analysisPipelines) 40 | loadPipeline(path = "./pipeline.RDS") 41 | } 42 | } 43 | \seealso{ 44 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 45 | \code{\link{MetaAnalysisPipeline-class}}, 46 | \code{\link{assessEngineSetUp}}, 47 | \code{\link{checkSchemaMatch}}, 48 | \code{\link{createPipelineInstance}}, 49 | \code{\link{exportAsMetaPipeline}}, 50 | \code{\link{generateOutput}}, 51 | \code{\link{genericPipelineException}}, 52 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 53 | \code{\link{getOutputById}}, 54 | \code{\link{getPipelinePrototype}}, 55 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 56 | \code{\link{initDfBasedOnType}}, 57 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 58 | \code{\link{loadMetaPipeline}}, 59 | \code{\link{loadPredefinedFunctionRegistry}}, 60 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 61 | \code{\link{registerFunction}}, 62 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 63 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 64 | \code{\link{updateObject}}, 65 | \code{\link{visualizePipeline}} 66 | } 67 | \concept{Package core functions} 68 | -------------------------------------------------------------------------------- /man/loadPredefinedFunctionRegistry.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{loadPredefinedFunctionRegistry} 4 | \alias{loadPredefinedFunctionRegistry} 5 | \title{Loading the registry of predefined functions} 6 | \usage{ 7 | loadPredefinedFunctionRegistry() 8 | } 9 | \description{ 10 | Loading the registry of predefined functions 11 | } 12 | \details{ 13 | Loads the registry of predefined functions 14 | } 15 | \examples{ 16 | \dontrun{ 17 | library(analysisPipelines) 18 | loadPredefinedFunctionRegistry() 19 | } 20 | } 21 | \seealso{ 22 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 23 | \code{\link{MetaAnalysisPipeline-class}}, 24 | \code{\link{assessEngineSetUp}}, 25 | \code{\link{checkSchemaMatch}}, 26 | \code{\link{createPipelineInstance}}, 27 | \code{\link{exportAsMetaPipeline}}, 28 | \code{\link{generateOutput}}, 29 | \code{\link{genericPipelineException}}, 30 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 31 | \code{\link{getOutputById}}, 32 | \code{\link{getPipelinePrototype}}, 33 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 34 | \code{\link{initDfBasedOnType}}, 35 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 36 | \code{\link{loadMetaPipeline}}, 37 | \code{\link{loadPipeline}}, \code{\link{loadRegistry}}, 38 | \code{\link{prepExecution}}, 39 | \code{\link{registerFunction}}, 40 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 41 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 42 | \code{\link{updateObject}}, 43 | \code{\link{visualizePipeline}} 44 | } 45 | \concept{Package core functions} 46 | -------------------------------------------------------------------------------- /man/loadRegistry.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | 
\name{loadRegistry} 4 | \alias{loadRegistry} 5 | \title{Loads a function registry from a file} 6 | \usage{ 7 | loadRegistry(path) 8 | } 9 | \arguments{ 10 | \item{path}{path on the file system, where the registry is to be loaded from} 11 | } 12 | \description{ 13 | Loads a function registry from a file 14 | } 15 | \details{ 16 | This function loads a function registry and associated function definition stored in an RDS file into the 17 | environment. The existing registry is overwritten with the newly loaded registry 18 | } 19 | \examples{ 20 | \dontrun{ 21 | library(analysisPipelines) 22 | loadRegistry(path = "./registry.RDS") 23 | } 24 | } 25 | \seealso{ 26 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 27 | \code{\link{MetaAnalysisPipeline-class}}, 28 | \code{\link{assessEngineSetUp}}, 29 | \code{\link{checkSchemaMatch}}, 30 | \code{\link{createPipelineInstance}}, 31 | \code{\link{exportAsMetaPipeline}}, 32 | \code{\link{generateOutput}}, 33 | \code{\link{genericPipelineException}}, 34 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 35 | \code{\link{getOutputById}}, 36 | \code{\link{getPipelinePrototype}}, 37 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 38 | \code{\link{initDfBasedOnType}}, 39 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 40 | \code{\link{loadMetaPipeline}}, 41 | \code{\link{loadPipeline}}, 42 | \code{\link{loadPredefinedFunctionRegistry}}, 43 | \code{\link{prepExecution}}, 44 | \code{\link{registerFunction}}, 45 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 46 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 47 | \code{\link{updateObject}}, 48 | \code{\link{visualizePipeline}} 49 | } 50 | \concept{Package core functions} 51 | -------------------------------------------------------------------------------- /man/multiVarOutlierPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-batch-eda-utilities.R 3 | \name{multiVarOutlierPlot} 4 | \alias{multiVarOutlierPlot} 5 | \title{Multi-Variate Outlier Plot} 6 | \usage{ 7 | multiVarOutlierPlot(data, depCol, indepCol, sizeCol, priColor = "blue", 8 | optionalPlots = 0, cutoffValue = 0.05) 9 | } 10 | \arguments{ 11 | \item{data}{the dataframe that needs to be loaded} 12 | 13 | \item{depCol}{the name of column which is to be identified as dependent column} 14 | 15 | \item{indepCol}{the name of an independent column} 16 | 17 | \item{sizeCol}{the name of column used to define the size of point in plots} 18 | 19 | \item{priColor}{the primary color for the plots} 20 | 21 | \item{optionalPlots}{A Flag for optional plots} 22 | 23 | \item{cutoffValue}{A p-alue cutoff for detecting outliers} 24 | } 25 | \value{ 26 | Outliers plot 27 | } 28 | \description{ 29 | Multi-Variate Outlier Plot 30 | } 31 | \details{ 32 | Multivaraite outlier plot using the selected columns from the dataframe 33 | } 34 | \examples{ 35 | \dontrun{ 36 | multiVarOutlierPlot(data = iris, depCol = "Sepal.Length", 37 | indepCol = "Sepal.Width", sizeCol = "Petal.Length") 38 | } 39 | } 40 | \seealso{ 41 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}}, 42 | \code{\link{bivarPlots}}, 43 | \code{\link{correlationMatPlot}}, 44 | \code{\link{getDatatype}}, \code{\link{ignoreCols}}, 45 | \code{\link{outlierPlot}}, 46 | \code{\link{univarCatDistPlots}} 47 | } 48 | \concept{Package EDA Utilites functions} 49 | 
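Since multiVarOutlierPlot is one of the packaged EDA utilities, it can also be added as a pipeline step; a brief sketch, assuming the predefined function registry is available (see loadPredefinedFunctionRegistry):

library(analysisPipelines)
loadPredefinedFunctionRegistry()
AnalysisPipeline(input = iris) \%>>\%
  multiVarOutlierPlot(depCol = "Sepal.Length", indepCol = "Sepal.Width",
                      sizeCol = "Petal.Length", storeOutput = TRUE) \%>>\%
  generateOutput -> outlierPipelineObj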
-------------------------------------------------------------------------------- /man/outlierPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-batch-eda-utilities.R 3 | \name{outlierPlot} 4 | \alias{outlierPlot} 5 | \title{Outlier detection plot} 6 | \usage{ 7 | outlierPlot(data, method = "iqr", columnName, cutoffValue = 0.05, 8 | priColor = "blue", optionalPlots = 0) 9 | } 10 | \arguments{ 11 | \item{data}{the dataframe that needs to be loaded} 12 | 13 | \item{method}{the method on which outliers are to be identified} 14 | 15 | \item{columnName}{the name of column for which the outliers are identified} 16 | 17 | \item{cutoffValue}{the cut off value to define the threshold for outliers} 18 | 19 | \item{priColor}{the primary color for the plots} 20 | 21 | \item{optionalPlots}{A Flag for optional plots} 22 | } 23 | \value{ 24 | Outliers plot object 25 | } 26 | \description{ 27 | Outlier detection plot 28 | } 29 | \details{ 30 | Outliers are identified on the selected column of the dataframe 31 | } 32 | \examples{ 33 | \dontrun{ 34 | outlierPlot(data = iris, columnName = "Sepal.Length") 35 | } 36 | } 37 | \seealso{ 38 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}}, 39 | \code{\link{bivarPlots}}, 40 | \code{\link{correlationMatPlot}}, 41 | \code{\link{getDatatype}}, \code{\link{ignoreCols}}, 42 | \code{\link{multiVarOutlierPlot}}, 43 | \code{\link{univarCatDistPlots}} 44 | } 45 | \concept{Package EDA Utilites functions} 46 | -------------------------------------------------------------------------------- /man/prepExecution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{prepExecution} 5 | \alias{prepExecution} 6 | \alias{prepExecution,BaseAnalysisPipeline-method} 7 | \title{Prepare the pipeline for execution} 8 | \usage{ 9 | prepExecution(object) 10 | 11 | \S4method{prepExecution}{BaseAnalysisPipeline}(object) 12 | } 13 | \arguments{ 14 | \item{object}{A Pipeline object} 15 | } 16 | \value{ 17 | Updated \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object 18 | } 19 | \description{ 20 | Prepare the pipeline for execution 21 | } 22 | \details{ 23 | The pipeline is prepared for execution by identifying the graph of the pipeline, its topological ordering, 24 | and its dependency map 25 | } 26 | \examples{ 27 | \dontrun{ 28 | library(analysisPipelines) 29 | pipelineObj <- AnalysisPipeline(input = iris) 30 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species", 31 | priColor = "blue", optionalPlots = 0, storeOutput = T) \%>>\% 32 | prepExecution -> pipelineObj 33 | } 34 | } 35 | \seealso{ 36 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 37 | \code{\link{MetaAnalysisPipeline-class}}, 38 | \code{\link{assessEngineSetUp}}, 39 | \code{\link{checkSchemaMatch}}, 40 | \code{\link{createPipelineInstance}}, 41 | \code{\link{exportAsMetaPipeline}}, 42 | \code{\link{generateOutput}}, 43 | \code{\link{genericPipelineException}}, 44 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 45 | \code{\link{getOutputById}}, 46 | \code{\link{getPipelinePrototype}}, 47 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 48 | \code{\link{initDfBasedOnType}}, 49 |
\code{\link{initialize,BaseAnalysisPipeline-method}}, 50 | \code{\link{loadMetaPipeline}}, 51 | \code{\link{loadPipeline}}, 52 | \code{\link{loadPredefinedFunctionRegistry}}, 53 | \code{\link{loadRegistry}}, 54 | \code{\link{registerFunction}}, 55 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 56 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 57 | \code{\link{updateObject}}, 58 | \code{\link{visualizePipeline}} 59 | } 60 | \concept{Package core functions} 61 | -------------------------------------------------------------------------------- /man/registerFunction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{registerFunction} 4 | \alias{registerFunction} 5 | \title{Register a user-defined function to be used with a \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} 6 | \usage{ 7 | registerFunction(functionName, heading = "", functionType = "batch", 8 | engine = "r", 9 | exceptionFunction = as.character(substitute(genericPipelineException)), 10 | isDataFunction = T, firstArgClass = "", loadPipeline = F, 11 | userDefined = T) 12 | } 13 | \arguments{ 14 | \item{functionName}{name of function to be registered} 15 | 16 | \item{heading}{heading of that section in report} 17 | 18 | \item{functionType}{type of function - 'batch' for \code{AnalysisPipeline} objects, 'streaming' for \code{StreamingAnalysisPipeline} objects} 19 | 20 | \item{engine}{specifies which engine the function is to be run on. Available engines include "r", "spark", and "python"} 21 | 22 | \item{exceptionFunction}{R object corresponding to the exception function} 23 | 24 | \item{isDataFunction}{logical parameter which defines whether the function to be registered operates on data i.e. the first parameter is a dataframe} 25 | 26 | \item{firstArgClass}{character string with the class of the first argument to the function, if it is a non-data function} 27 | 28 | \item{loadPipeline}{logical parameter to see if function is being used in loadPipeline or not. This is for internal working} 29 | 30 | \item{userDefined}{logical parameter defining whether the function is user defined. By default, set to true} 31 | } 32 | \description{ 33 | Register a user-defined function to be used with a \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object 34 | } 35 | \details{ 36 | The specified operation along with the heading and engine details is stored in the registry, after which it can be added to a pipeline. 37 | 38 | If the function already exists in the registry, registration will be skipped. In order to change the definition, the function needs 39 | to be reassigned in the Global Environment and then the \code{registerFunction} called again. 
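For the non-data-function path described above, a hedged sketch (the helper function itself is invented purely for illustration; the parameters match the documented signature):

# First argument is a character string rather than a data frame
makeHeading <- function(text, level){ paste(strrep("#", level), text) }
registerFunction("makeHeading", isDataFunction = FALSE, firstArgClass = "character")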
40 | } 41 | \examples{ 42 | \dontrun{ 43 | library(analysisPipelines) 44 | getNumRows <- function(dataset){ 45 | return(nrow(dataset)) 46 | } 47 | 48 | registerFunction("getNumRows") 49 | } 50 | } 51 | \seealso{ 52 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 53 | \code{\link{MetaAnalysisPipeline-class}}, 54 | \code{\link{assessEngineSetUp}}, 55 | \code{\link{checkSchemaMatch}}, 56 | \code{\link{createPipelineInstance}}, 57 | \code{\link{exportAsMetaPipeline}}, 58 | \code{\link{generateOutput}}, 59 | \code{\link{genericPipelineException}}, 60 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 61 | \code{\link{getOutputById}}, 62 | \code{\link{getPipelinePrototype}}, 63 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 64 | \code{\link{initDfBasedOnType}}, 65 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 66 | \code{\link{loadMetaPipeline}}, 67 | \code{\link{loadPipeline}}, 68 | \code{\link{loadPredefinedFunctionRegistry}}, 69 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 70 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 71 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 72 | \code{\link{updateObject}}, 73 | \code{\link{visualizePipeline}} 74 | } 75 | \concept{Package core functions} 76 | -------------------------------------------------------------------------------- /man/savePipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R, 3 | % R/core-functions-meta-pipelines.R 4 | \docType{methods} 5 | \name{savePipeline} 6 | \alias{savePipeline} 7 | \alias{savePipeline,BaseAnalysisPipeline-method} 8 | \alias{savePipeline,MetaAnalysisPipeline-method} 9 | \title{Saves the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object to the file system without outputs} 10 | \usage{ 11 | savePipeline(object, path) 12 | 13 | \S4method{savePipeline}{BaseAnalysisPipeline}(object, path) 14 | 15 | \S4method{savePipeline}{MetaAnalysisPipeline}(object, path) 16 | } 17 | \arguments{ 18 | \item{object}{object that contains input, pipeline, registry and output} 19 | 20 | \item{path}{the path at which the .Rda file containing the pipeline should be stored, along with the name of the file including 21 | a .Rda extension} 22 | } 23 | \value{ 24 | Does not return a value 25 | } 26 | \description{ 27 | Saves the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object to the file system without outputs 28 | } 29 | \details{ 30 | The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object is saved to the file system in the paths specified 31 | 32 | This method is implemented on the base class as it is a shared functionality types of Analysis Pipelines 33 | which extend this class 34 | } 35 | \examples{ 36 | \dontrun{ 37 | library(analysisPipelines) 38 | pipelineObj <- AnalysisPipeline(input = iris) 39 | pipelineObj \%>>\% savePipeline(path = "./test.RDS") 40 | } 41 | } 42 | \seealso{ 43 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 44 | \code{\link{MetaAnalysisPipeline-class}}, 45 | \code{\link{assessEngineSetUp}}, 46 | \code{\link{checkSchemaMatch}}, 47 | \code{\link{createPipelineInstance}}, 48 | \code{\link{exportAsMetaPipeline}}, 49 | \code{\link{generateOutput}}, 50 | \code{\link{genericPipelineException}}, 51 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 52 | \code{\link{getOutputById}}, 53 | 
\code{\link{getPipelinePrototype}}, 54 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 55 | \code{\link{initDfBasedOnType}}, 56 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 57 | \code{\link{loadMetaPipeline}}, 58 | \code{\link{loadPipeline}}, 59 | \code{\link{loadPredefinedFunctionRegistry}}, 60 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 61 | \code{\link{registerFunction}}, 62 | \code{\link{saveRegistry}}, \code{\link{setInput}}, 63 | \code{\link{setLoggerDetails}}, 64 | \code{\link{updateObject}}, 65 | \code{\link{visualizePipeline}} 66 | } 67 | \concept{Package core functions} 68 | -------------------------------------------------------------------------------- /man/saveRegistry.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{saveRegistry} 4 | \alias{saveRegistry} 5 | \title{Saves the registry to the file system} 6 | \usage{ 7 | saveRegistry(path) 8 | } 9 | \arguments{ 10 | \item{path}{path on the file system, where the registry is to be saved to} 11 | } 12 | \description{ 13 | Saves the registry to the file system 14 | } 15 | \details{ 16 | This function saves the existing function registry and associated function definition loaded in the 17 | environment into a file. 18 | } 19 | \examples{ 20 | \dontrun{ 21 | library(analysisPipelines) 22 | saveRegistry(path = "./registry.RDS") 23 | } 24 | } 25 | \seealso{ 26 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 27 | \code{\link{MetaAnalysisPipeline-class}}, 28 | \code{\link{assessEngineSetUp}}, 29 | \code{\link{checkSchemaMatch}}, 30 | \code{\link{createPipelineInstance}}, 31 | \code{\link{exportAsMetaPipeline}}, 32 | \code{\link{generateOutput}}, 33 | \code{\link{genericPipelineException}}, 34 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 35 | \code{\link{getOutputById}}, 36 | \code{\link{getPipelinePrototype}}, 37 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 38 | \code{\link{initDfBasedOnType}}, 39 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 40 | \code{\link{loadMetaPipeline}}, 41 | \code{\link{loadPipeline}}, 42 | \code{\link{loadPredefinedFunctionRegistry}}, 43 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 44 | \code{\link{registerFunction}}, 45 | \code{\link{savePipeline}}, \code{\link{setInput}}, 46 | \code{\link{setLoggerDetails}}, 47 | \code{\link{updateObject}}, 48 | \code{\link{visualizePipeline}} 49 | } 50 | \concept{Package core functions} 51 | -------------------------------------------------------------------------------- /man/setInput.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{setInput} 5 | \alias{setInput} 6 | \alias{setInput,BaseAnalysisPipeline-method} 7 | \title{Sets the input for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} 8 | \usage{ 9 | setInput(object, input, filePath = "") 10 | 11 | \S4method{setInput}{BaseAnalysisPipeline}(object, input, filePath = "") 12 | } 13 | \arguments{ 14 | \item{object}{object that contains input, pipeline, registry and output} 15 | 16 | \item{input}{the input data frame} 17 | 18 | \item{filePath}{path to the file which needs to be read (currently supports .csv files)} 19 | } 20 | \value{ 21 | Updated \code{AnalysisPipeline} 
\code{StreamingAnalysisPipeline} object 22 | } 23 | \description{ 24 | Sets the input for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object 25 | } 26 | \details{ 27 | Assigns the input to the pipeline for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object 28 | 29 | This method is implemented on the base class as it is a shared functionality types of Analysis Pipelines 30 | which extend this class 31 | } 32 | \examples{ 33 | library(analysisPipelines) 34 | pipelineObj <- AnalysisPipeline() 35 | pipelineObj \%>>\% setInput(input = iris) -> pipelineObj 36 | } 37 | \seealso{ 38 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 39 | \code{\link{MetaAnalysisPipeline-class}}, 40 | \code{\link{assessEngineSetUp}}, 41 | \code{\link{checkSchemaMatch}}, 42 | \code{\link{createPipelineInstance}}, 43 | \code{\link{exportAsMetaPipeline}}, 44 | \code{\link{generateOutput}}, 45 | \code{\link{genericPipelineException}}, 46 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 47 | \code{\link{getOutputById}}, 48 | \code{\link{getPipelinePrototype}}, 49 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 50 | \code{\link{initDfBasedOnType}}, 51 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 52 | \code{\link{loadMetaPipeline}}, 53 | \code{\link{loadPipeline}}, 54 | \code{\link{loadPredefinedFunctionRegistry}}, 55 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 56 | \code{\link{registerFunction}}, 57 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 58 | \code{\link{setLoggerDetails}}, 59 | \code{\link{updateObject}}, 60 | \code{\link{visualizePipeline}} 61 | } 62 | \concept{Package core functions} 63 | -------------------------------------------------------------------------------- /man/setLoggerDetails.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{setLoggerDetails} 5 | \alias{setLoggerDetails} 6 | \alias{setLoggerDetails,BaseAnalysisPipeline-method} 7 | \title{Sets the logger configuration for the pipeline} 8 | \usage{ 9 | setLoggerDetails(object, target = "console", 10 | targetFile = "pipelineExecution.out", layout = "layout.simple") 11 | 12 | \S4method{setLoggerDetails}{BaseAnalysisPipeline}(object, 13 | target = "console", targetFile = "pipelineExecution.out", 14 | layout = "layout.simple") 15 | } 16 | \arguments{ 17 | \item{object}{A Pipeline object} 18 | 19 | \item{target}{A string value. 'console' for appending to console, 'file' for appending to a file, or 'console&file' for both} 20 | 21 | \item{targetFile}{File name of the log file in case the target is 'file'} 22 | 23 | \item{layout}{Specify the layout according to 'futile.logger' package convention} 24 | } 25 | \description{ 26 | Sets the logger configuration for the pipeline 27 | } 28 | \details{ 29 | This function sets the logger configuration for the pipeline. 
30 | } 31 | \examples{ 32 | library(analysisPipelines) 33 | pipelineObj <- AnalysisPipeline(input = iris) 34 | pipelineObj \%>>\% setLoggerDetails(target = "file", 35 | targetFile = "pipeline.out") -> pipelineObj 36 | } 37 | \seealso{ 38 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 39 | \code{\link{MetaAnalysisPipeline-class}}, 40 | \code{\link{assessEngineSetUp}}, 41 | \code{\link{checkSchemaMatch}}, 42 | \code{\link{createPipelineInstance}}, 43 | \code{\link{exportAsMetaPipeline}}, 44 | \code{\link{generateOutput}}, 45 | \code{\link{genericPipelineException}}, 46 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 47 | \code{\link{getOutputById}}, 48 | \code{\link{getPipelinePrototype}}, 49 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 50 | \code{\link{initDfBasedOnType}}, 51 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 52 | \code{\link{loadMetaPipeline}}, 53 | \code{\link{loadPipeline}}, 54 | \code{\link{loadPredefinedFunctionRegistry}}, 55 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 56 | \code{\link{registerFunction}}, 57 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 58 | \code{\link{setInput}}, \code{\link{updateObject}}, 59 | \code{\link{visualizePipeline}} 60 | } 61 | \concept{Package core functions} 62 | -------------------------------------------------------------------------------- /man/setPythonEnvir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-helper-utilites-python.R 3 | \name{setPythonEnvir} 4 | \alias{setPythonEnvir} 5 | \title{Sets the python environment to be used} 6 | \usage{ 7 | setPythonEnvir(type = "conda", pathOrEnvirName = "base") 8 | } 9 | \arguments{ 10 | \item{type}{Type of python environment. 
Takes three possible values - 'conda' for Anaconda environments, 11 | 'virtualenv' for Virtual environments, and 'python' to manually set the python path to use} 12 | 13 | \item{pathOrEnvirName}{Name of the environment for Anaconda and Virtual environments, 14 | or the Python path when type is 'python'} 15 | } 16 | \description{ 17 | Sets the python environment to be used 18 | } 19 | \details{ 20 | Wrapper function over reticulate functions to set a python environment to be used 21 | } 22 | \examples{ 23 | \dontrun{ 24 | setPythonEnvir() 25 | } 26 | } 27 | \seealso{ 28 | Other R helper utilities for Python: \code{\link{getFeaturesForPyClassification}}, 29 | \code{\link{getTargetForPyClassification}} 30 | } 31 | \concept{R helper utilities for Python} 32 | -------------------------------------------------------------------------------- /man/setUpstreamDependencies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \name{setUpstreamDependencies} 4 | \alias{setUpstreamDependencies} 5 | \title{Sets upstream dependencies for the entire pipeline} 6 | \usage{ 7 | setUpstreamDependencies(pipeline) 8 | } 9 | \description{ 10 | Sets upstream dependencies for the entire pipeline 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/sparkRSessionCreateIfNotPresent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spark-structured-streaming-utilities.R 3 | \name{sparkRSessionCreateIfNotPresent} 4 | \alias{sparkRSessionCreateIfNotPresent} 5 | \title{Connect to a Spark session} 6 | \usage{ 7 | sparkRSessionCreateIfNotPresent(...)
8 | } 9 | \arguments{ 10 | \item{...}{Arguments to sparkR.session} 11 | } 12 | \description{ 13 | Connect to a Spark session 14 | } 15 | \details{ 16 | Loads the SparkR package and initializes a Spark session from R 17 | } 18 | \examples{ 19 | \dontrun{ 20 | sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/" 21 | sparkMaster <- "local[1]" 22 | sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1") 23 | sparkRSessionCreateIfNotPresent(master = sparkMaster, 24 | sparkPackages = sparkPackages) 25 | } 26 | } 27 | \seealso{ 28 | Other Spark utilities: \code{\link{castKafkaStreamAsString}}, 29 | \code{\link{convertKafkaValueFromJson}} 30 | } 31 | \concept{Spark utilities} 32 | -------------------------------------------------------------------------------- /man/univarCatDistPlots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/r-batch-eda-utilities.R 3 | \name{univarCatDistPlots} 4 | \alias{univarCatDistPlots} 5 | \title{Univariate Categoric Distribution} 6 | \usage{ 7 | univarCatDistPlots(data, uniCol, priColor = "blue", optionalPlots = 0) 8 | } 9 | \arguments{ 10 | \item{data}{the dataset where the column on which the plot is to be generated is present} 11 | 12 | \item{uniCol}{the name of the column on which the plot needs to be generated} 13 | 14 | \item{priColor}{the primary color for the plots} 15 | 16 | \item{optionalPlots}{A Flag for optional plots} 17 | } 18 | \value{ 19 | A univariate categoric distribution plot 20 | } 21 | \description{ 22 | Univariate Categoric Distribution 23 | } 24 | \details{ 25 | A univariate distribution graph on the selected categorical columns from the dataframe 26 | } 27 | \examples{ 28 | univarCatDistPlots(data = iris, uniCol = "Species") 29 | } 30 | \seealso{ 31 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}}, 32 | \code{\link{bivarPlots}}, 33 | \code{\link{correlationMatPlot}}, 34 | \code{\link{getDatatype}}, \code{\link{ignoreCols}}, 35 | \code{\link{multiVarOutlierPlot}}, 36 | \code{\link{outlierPlot}} 37 | } 38 | \concept{Package EDA Utilites functions} 39 | -------------------------------------------------------------------------------- /man/updateObject.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R 3 | \docType{methods} 4 | \name{updateObject} 5 | \alias{updateObject} 6 | \alias{updateObject,BaseAnalysisPipeline-method} 7 | \title{Update the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by adding an operation to the pipeline} 8 | \usage{ 9 | updateObject(object, operation, heading = "", parameters, outAsIn = F, 10 | storeOutput = F) 11 | 12 | \S4method{updateObject}{BaseAnalysisPipeline}(object, operation, 13 | heading = "", parameters, outAsIn = F, storeOutput = F) 14 | } 15 | \arguments{ 16 | \item{object}{object that contains input, pipeline, registry and output} 17 | 18 | \item{operation}{function name to be updated in tibble} 19 | 20 | \item{heading}{heading of that section in report} 21 | 22 | \item{parameters}{parameters passed to that function} 23 | 24 | \item{outAsIn}{whether to use original input or output from previous function} 25 | 26 | \item{storeOutput}{whether the output of this operation is to be stored} 27 | } 28 | \value{ 29 | Updated \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object 30 | } 31
| \description{ 32 | Update the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by adding an operation to the pipeline 33 | } 34 | \details{ 35 | The specified operation along with the heading and parameters is updated in the pipeline slot 36 | of the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object, where the sequence of operations 37 | to be performed is stored 38 | 39 | This method is implemented on the base class as it is a shared functionality types of Analysis Pipelines 40 | which extend this class 41 | } 42 | \seealso{ 43 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 44 | \code{\link{MetaAnalysisPipeline-class}}, 45 | \code{\link{assessEngineSetUp}}, 46 | \code{\link{checkSchemaMatch}}, 47 | \code{\link{createPipelineInstance}}, 48 | \code{\link{exportAsMetaPipeline}}, 49 | \code{\link{generateOutput}}, 50 | \code{\link{genericPipelineException}}, 51 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 52 | \code{\link{getOutputById}}, 53 | \code{\link{getPipelinePrototype}}, 54 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 55 | \code{\link{initDfBasedOnType}}, 56 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 57 | \code{\link{loadMetaPipeline}}, 58 | \code{\link{loadPipeline}}, 59 | \code{\link{loadPredefinedFunctionRegistry}}, 60 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 61 | \code{\link{registerFunction}}, 62 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 63 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 64 | \code{\link{visualizePipeline}} 65 | } 66 | \concept{Package core functions} 67 | -------------------------------------------------------------------------------- /man/visualizePipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/core-functions.R, 3 | % R/core-functions-meta-pipelines.R 4 | \docType{methods} 5 | \name{visualizePipeline} 6 | \alias{visualizePipeline} 7 | \alias{visualizePipeline,BaseAnalysisPipeline-method} 8 | \alias{visualizePipeline,MetaAnalysisPipeline-method} 9 | \title{Visualizes the pipeline as a graph} 10 | \usage{ 11 | visualizePipeline(object) 12 | 13 | \S4method{visualizePipeline}{BaseAnalysisPipeline}(object) 14 | 15 | \S4method{visualizePipeline}{MetaAnalysisPipeline}(object) 16 | } 17 | \arguments{ 18 | \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} 19 | } 20 | \value{ 21 | A graph object which can be printed (or) plotted to visualize the pipeline 22 | } 23 | \description{ 24 | Visualizes the pipeline as a graph 25 | } 26 | \details{ 27 | Indicates dependencies amongst functions as well as functions for which output 28 | needs to be stored 29 | } 30 | \examples{ 31 | \dontrun{ 32 | library(analysisPipelines) 33 | pipelineObj <- AnalysisPipeline(input = iris) 34 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species", 35 | priColor = "blue", optionalPlots = 0, storeOutput = T) \%>>\% 36 | visualizePipeline 37 | } 38 | } 39 | \seealso{ 40 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, 41 | \code{\link{MetaAnalysisPipeline-class}}, 42 | \code{\link{assessEngineSetUp}}, 43 | \code{\link{checkSchemaMatch}}, 44 | \code{\link{createPipelineInstance}}, 45 | \code{\link{exportAsMetaPipeline}}, 46 | \code{\link{generateOutput}}, 47 | \code{\link{genericPipelineException}}, 48 | \code{\link{getInput}}, \code{\link{getLoggerDetails}}, 49 | 
\code{\link{getOutputById}}, 50 | \code{\link{getPipelinePrototype}}, 51 | \code{\link{getPipeline}}, \code{\link{getRegistry}}, 52 | \code{\link{initDfBasedOnType}}, 53 | \code{\link{initialize,BaseAnalysisPipeline-method}}, 54 | \code{\link{loadMetaPipeline}}, 55 | \code{\link{loadPipeline}}, 56 | \code{\link{loadPredefinedFunctionRegistry}}, 57 | \code{\link{loadRegistry}}, \code{\link{prepExecution}}, 58 | \code{\link{registerFunction}}, 59 | \code{\link{savePipeline}}, \code{\link{saveRegistry}}, 60 | \code{\link{setInput}}, \code{\link{setLoggerDetails}}, 61 | \code{\link{updateObject}} 62 | } 63 | \concept{Package core functions} 64 | -------------------------------------------------------------------------------- /vignettes/Analysis_pipelines_for_working_with_Python_functions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analysis pipelines for working with Python functions" 3 | author: "Naren Srinivasan" 4 | date: "11/27/2018" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | fig_width: 8 9 | vignette: > 10 | %\VignetteIndexEntry{Analysis pipelines for working with Python functions} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | # Introduction 15 | 16 | *Python* has grown exponentially over the past few years in terms of usage for data science, and specifically machine learning. It provides an extensive set of modules for executing various machine learning tasks. The *reticulate* R package provides a mechanism for interoperability between R and Python. It provides direct translation between equivalent commonly used object types, as well as functions. 17 | 18 | The *analysisPipelines* package uses the *reticulate* package under the hood, and provides a consistent high-level interface for the data scientist, as discussed in other vignettes. 19 | 20 | The vignette describes defining and executing *Python*-only pipelines using the *analysisPipelines* package. 21 | 22 | # Important Note 23 | 24 | The functionality of adding Python functions to the pipeline is enabled under the hood by the *reticulate* package. As the *reticulate* package itself is in its early stages of development and usage, some things might not work as expected. Additionally, for reticulating *Python* code itself in R MarkDown chunks (as opposed to sourcing Python files) **RStudio 1.2** is required, though it is still in Preview phase, as of the time of writing this vignette. 25 | 26 | On a separate note, there is a slight difference between how *SparkR* and *reticulate* are designed. SparkR provides wrappers to Spark functions and stays true to the conventions and classes used in *Apache Spark*, with the main type conversion offered being that on a data frame. *reticulate* is different in the sense that its aim is to provide interoperability, and provides type conversion between a wide range of object types between R and Python. 27 | 28 | The biggest difference is in terms of functions - in SparkR, functions written in Scala, etc. in a Python session cannot be accessed from an R session. However, using *reticulate* user-defined functions written in Python and sourced, can be accessed as objects in an R session. This allows greater flexibility, to write custom functions in Python, source the file, and then call those functions from R. This difference in design is important to understand, in order to construct functions which can then be used to compose pipelines. 
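As a minimal illustration of this difference (a sketch only - it assumes *reticulate* is installed with a working Python runtime available, and the function `add_one` is purely hypothetical), a Python function can be defined or sourced and then called directly from R as an ordinary object:

```{r eval = FALSE}
# A minimal sketch, assuming reticulate and a Python installation are available
library(reticulate)

# Define a trivial Python function; in practice, functions would be sourced from a .py file
reticulate::py_run_string("def add_one(x): return x + 1")

# The Python function is now accessible from R as an object and can be called directly
reticulate::py$add_one(41)
```

This is the mechanism the package relies on when Python functions are registered to a pipeline, as shown later in this vignette.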
29 | ```{r} 30 | knitr::opts_chunk$set( 31 | eval = FALSE 32 | ) 33 | ``` 34 | 35 | 36 | # Setup 37 | 38 | The *analysisPipelines* package provides a couple of helper functions in R, making it easier to interact with the Python environment. One of them sets the Python environment to be used, which we do like so: 39 | 40 | ```{r} 41 | 42 | library(analysisPipelines) 43 | 44 | analysisPipelines::setPythonEnvir('python', '/Users/naren/anaconda3/bin/python') 45 | os <- reticulate::import("os") 46 | numpy <- reticulate::import("numpy") 47 | pandas <- reticulate::import("pandas") 48 | sklearn <- reticulate::import("sklearn") 49 | 50 | reticulate::source_python(system.file("python/sampleFunctions.py", package = "analysisPipelines")) 51 | 52 | reticulate::py_config() 53 | ``` 54 | 55 | # Registering Python functions 56 | 57 | Python functions which have been sourced through *reticulate* are available as references in the R environment and can be directly registered as part of the pipeline, through the usual mechanism. 58 | 59 | For non-R engines, such as Spark and Python, a suffix with the engine name is added to the function name on registration. So, functions with this suffix need to be used when pipelining to an *Analysis Pipeline* object. The engine is added as a suffix for better readability. A suffix is used (as opposed to a prefix) to enable easier auto-completes. 60 | 61 | The *analysisPipelines* package creates wrapper methods which contain the *argument* signature of the Python function. This allows the user to know what arguments need to be passed. Normal *reticulate* imports have a `...` signature. 62 | 63 | In our Python sample function file, we have a function called `decisionTreeTrainAndTest` which was sourced. We register this function: 64 | 65 | ```{r} 66 | registerFunction('decisionTreeTrainAndTest', engine = "python", isDataFunction = F, firstArgClass = "numpy.ndarray") 67 | getRegistry() 68 | ``` 69 | 70 | # Defining pipelines 71 | 72 | Pipelines are defined and executed as usual. Regardless of the engine being used, the high-level interface remains the same.
73 | 74 | ```{r} 75 | data("iris") 76 | trainSample <- sample(1:150, size = 100) 77 | train <- iris[trainSample,] 78 | test <- iris[-trainSample,] #%>>% getFeaturesForPyClassification(featureNames = colnames(iris)[-ncol(iris)]) 79 | obj <- AnalysisPipeline(input = train) 80 | 81 | obj %>>% getFeaturesForPyClassification(featureNames = colnames(train)[-ncol(train)]) %>>% 82 | getTargetForPyClassification(targetVarName = "Species", positiveClass = "setosa") %>>% 83 | getFeaturesForPyClassification(dataset = test, featureNames = colnames(test)[-ncol(test)]) %>>% 84 | decisionTreeTrainAndTest_py(data = ~f1, target = ~f2, newData = ~f3, storeOutput = T) -> objDecisionTree 85 | 86 | objDecisionTree %>>% assessEngineSetUp 87 | objDecisionTree %>>% visualizePipeline 88 | ``` 89 | 90 | # Execution 91 | 92 | ```{r} 93 | objDecisionTree %>>% generateOutput -> op 94 | #op %>>% generateReport("~/Desktop") 95 | op %>>% getOutputById("4") 96 | ``` 97 | 98 | -------------------------------------------------------------------------------- /vignettes/Analysis_pipelines_for_working_with_sparkR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analysis pipelines for working with Spark DataFrames for one-time/ batch analyses" 3 | author: "Naren S, Anoop S" 4 | date: "11/13/2018" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | fig_width: 8 9 | vignette: > 10 | %\VignetteIndexEntry{Analysis pipelines for working with Spark DataFrames for batch analyses} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | # Introduction 15 | 16 | *Apache Spark* can be leveraged to process large volumes of distributed data that are typically impossible to process on standalone R servers. The vignette describes defining and executing *Spark*-only pipelines using the *analysisPipelines* package. 17 | 18 | # Important Note 19 | 20 | Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN. The *SparkR* version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly. 21 | 22 | To install from Github, run the following command, if you know the Spark version: 23 | 24 | ```{r eval = F} 25 | devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg') 26 | ``` 27 | 28 | The other option is to install SparkR by running the following *terminal* commands if Spark has already been installed. 29 | 30 | ```{bash eval = F} 31 | $ export SPARK_HOME=/path/to/spark/directory 32 | $ cd $SPARK_HOME/R/lib/SparkR/ 33 | $ R -e "devtools::install('.')" 34 | ``` 35 | 36 | 37 | # Initialize libraries 38 | 39 | * Load the *analysisPipelines* and *SparkR* libraries 40 | * Check if the SPARK_HOME environment variable is set to Spark installation folder. Else, define it using `sys.setenv()` function. 41 | ```{r} 42 | knitr::opts_chunk$set( 43 | eval = FALSE 44 | ) 45 | ``` 46 | 47 | ```{r, include=FALSE} 48 | 49 | library(ggplot2) 50 | library(analysisPipelines) 51 | library(SparkR) 52 | 53 | ## Define these variables as per the configuration of your machine. This is just an example. 
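# Note: 'local[1]' runs Spark locally with a single worker thread; point sparkHome at your local Spark installation
# The Spark package coordinate below corresponds to the Kafka connector built for Spark 2.3.1 and Scala 2.11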
54 | sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/" 55 | sparkMaster <- "local[1]" 56 | sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1") 57 | # Set spark home variable if not present 58 | if(Sys.getenv("SPARK_HOME") == "") { 59 | Sys.setenv(SPARK_HOME = sparkHome) 60 | } 61 | ``` 62 | 63 | # Connect to Spark cluster 64 | 65 | * Define the Spark master URL 66 | * Specify dependency packages if any during Spark connection. Example: `sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")` 67 | * Connect to the cluster using the package's `sparkRSessionCreateIfNotPresent` function 68 | 69 | ```{r} 70 | sparkRSessionCreateIfNotPresent(master = sparkMaster, sparkPackages = sparkPackages) 71 | ``` 72 | 73 | # Read data from csv and initialize pipeline object 74 | 75 | Spark can connect to datasources like Hive, Kafka. Besides, it can also read parquet, json and csv files. In this example we will read a csv file. 76 | 77 | ```{r} 78 | inputDataset <- iris 79 | 80 | # Replacing '.' in column names with '_' as SparkR is not able to deal with '.' in column names 81 | colnames(inputDataset) <- gsub(".", "_", colnames(inputDataset), fixed = T) 82 | 83 | pipelineObj <- AnalysisPipeline(input = iris) 84 | ``` 85 | 86 | # User-defined Spark functions 87 | The example below shows a few functions to perform simple aggregations. 88 | 89 | ```{r} 90 | meanByGroup <- function(inputDataset, groupByColumn, colToSummarize) { 91 | groupSummary <- SparkR::summarize( SparkR::groupBy(inputDataset,inputDataset[[groupByColumn]]), 92 | avg = SparkR::mean(inputDataset[[colToSummarize]])) 93 | return(groupSummary) 94 | } 95 | ``` 96 | 97 | # Registering user-defined functions to the pipeline object 98 | 99 | Each user-defined function needs to be registered to the pipeline object. For non-R engines, such as Spark and Python, a suffix with the engine name is added to the function name on registration. So, functions with this suffix need to be used when pipelining to an *Analysis Pipeline* object. The engine is added as a suffix for better readability. A suffix is used (as opposed to a prefix) to enable easier auto-completes. 100 | 101 | Post registration, the function can be used to construct a pipeline. A pipeline is a set of multiple functions called in a particular sequence. 102 | 103 | ```{r} 104 | # Register user-defined functions 105 | registerFunction("meanByGroup", "Mean By Group", 106 | engine = "spark") 107 | 108 | # List all registered functions 109 | getRegistry() 110 | 111 | # Define pipeline from list of registered functions 112 | pipelineObj %>% meanByGroup_spark(groupByColumn = "Species", colToSummarize = "Sepal_Length", storeOutput = T) %>% 113 | meanByGroup_spark(groupByColumn = "Species", colToSummarize = "Petal_Length", storeOutput = T) -> pipelineObj 114 | 115 | pipelineObj %>>% getPipeline 116 | pipelineObj %>>% visualizePipeline 117 | ``` 118 | 119 | # Running the pipeline and generating an output 120 | 121 | The pipeline is run by calling the `generateOutput()` function. 
A particular output in the sequence on evaluations can be accessed by calling the `getOutputById` function 122 | 123 | 124 | ```{r fig.width=6, fig.height=3} 125 | pipelineObj %>% generateOutput -> pipelineObj 126 | 127 | sepalLengthBySpecies <- pipelineObj %>>% getOutputById(1) 128 | sepalLengthBySpeciesDf <- as.data.frame(sepalLengthBySpecies) 129 | DT::datatable(head(sepalLengthBySpeciesDf),options = list(scrollX = T, scrollY = T)) 130 | 131 | petalLengthBySpecies <- pipelineObj %>>% getOutputById(2) 132 | petalLengthBySpeciesDf <- as.data.frame(petalLengthBySpecies) 133 | DT::datatable(head(petalLengthBySpeciesDf),options = list(scrollX = T, scrollY = T)) 134 | ``` 135 | 136 | # Supplementary Note 137 | 138 | The *analysisPipelines* package internally uses the *SparkR* package to interface with *Spark*. *SparkR* masks many typical data manipulation and processing functions from *base* as well as packages like *dplyr*. Therefore, ensure you use function scoping when calling a function. 139 | -------------------------------------------------------------------------------- /vignettes/Interoperable_Pipelines.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Interoperable analysis pipelines" 3 | author: "Naren Srinivasan" 4 | date: "11/13/2018" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | fig_width: 8 9 | vignette: > 10 | %\VignetteIndexEntry{Interoperable analysis pipelines} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | # Introduction 16 | 17 | This vignette explains how **interoperable pipelines** containing functions operating on different engines such as R, Spark and Python can be configured and executed through the **analysisPipelines** package. Currently, the package supports interoperable pipelines containing R and Spark batch functions. 18 | 19 | If the package is new to you, it is recommended that you go through the *Analysis pipelines - Core functionality and working with R data frames and functions* vignette first. 20 | 21 | # Important Note 22 | 23 | Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN. 24 | 25 | ```{r echo = FALSE} 26 | library(analysisPipelines) 27 | knitr::opts_chunk$set( 28 | eval = FALSE 29 | ) 30 | ``` 31 | 32 | # An example of an interoperable pipeline 33 | 34 | In this vignette we demonstrate an interoperable pipeline built using the **analysisPipelines** package, which contains a couple of filtering/ aggregation functions performed in *Spark*, which is then subsequently visualized through *R* functions using *ggplot2* 35 | 36 | ## Initializing a Spark connection from R and loading the data 37 | 38 | We initialize a Spark session using the `sparkRSessionCreateIfNotPresent` helper function in the **analysisPipelines** package, which internally uses *SparkR*. We then read the data into the Spark session using functions in the SparkR package. In this case we read a *.csv* file, though SparkR can work with multiple other data sources 39 | 40 | ```{r} 41 | 42 | ## Define these variables as per the configuration of your machine. This is just an example. 
43 | sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/" 44 | sparkMaster <- "local[1]" 45 | sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1") 46 | 47 | sparkRSessionCreateIfNotPresent(sparkHome = sparkHome, master = sparkMaster, sparkPackages = sparkPackages) 48 | 49 | inputDataset <- iris 50 | 51 | # Replacing '.' in column names with '_' as SparkR is not able to deal with '.' in column names 52 | colnames(inputDataset) <- gsub(".", "_", colnames(inputDataset), fixed = T) 53 | ``` 54 | 55 | 56 | ##Initializing Python connection 57 | 58 | ```{r} 59 | 60 | ## Define these variables as per the configuration of your machine. This is just an example. 61 | 62 | analysisPipelines::setPythonEnvir('python', '/Users/naren/anaconda3/bin/python') 63 | os <- reticulate::import("os") 64 | numpy <- reticulate::import("numpy") 65 | pandas <- reticulate::import("pandas") 66 | sklearn <- reticulate::import("sklearn") 67 | 68 | reticulate::source_python(system.file("python/sampleFunctions.py", package = "analysisPipelines")) 69 | 70 | reticulate::py_config() 71 | ``` 72 | 73 | 74 | ## Creating an analysisPipeline object 75 | 76 | We then initialize an *AnalysisPipeline*, with the input dataset 77 | 78 | ```{r} 79 | pipelineObj <- AnalysisPipeline(input = inputDataset) 80 | ``` 81 | 82 | 83 | ## Registering functions to work in the Spark environment 84 | 85 | In order to manipulate the data in the Spark environment, we define our own functions using SparkR interface functions. We then **register** these functions with the **AnalysisPipeline** object, so that they can be used in constructing a pipeline. 86 | 87 | The `getRegistry` function lists all the registered functions, along with details such as which **engine** they should run on. 88 | 89 | ```{r} 90 | getSchema <- function(inputDataset) { 91 | sparkSchema <- SparkR::schema(inputDataset) 92 | return(sparkSchema) 93 | } 94 | 95 | filterData <- function(inputDataset, condition) { 96 | filteredData <- SparkR::filter(inputDataset, condition) 97 | return(filteredData) 98 | } 99 | 100 | registerFunction(functionName = "getSchema", engine = "spark") 101 | registerFunction(functionName = "filterData", engine = "spark") 102 | 103 | 104 | getRegistry() 105 | ``` 106 | 107 | ## Registering R functions 108 | 109 | Similar to the Spark functions, we register some user-defined functions in R. In this case to plot a bivariate plot using *ggplot2*. 
110 | 111 | ```{r} 112 | 113 | rBivarPlots <- function(dataset, select_var_name_1, select_var_name_2, priColor = "blue", secColor= "black") { 114 | 115 | numeric_cols <- unlist(getDatatype(dataset)['numeric_cols']) 116 | cat_cols <- unlist(getDatatype(dataset)['cat_cols']) 117 | 118 | if (select_var_name_1 %in% numeric_cols && select_var_name_2 %in% numeric_cols) { 119 | x = dataset[, select_var_name_1] 120 | y = dataset[, select_var_name_2] 121 | bivarPlot <- 122 | ggplot2::ggplot(dataset, ggplot2::aes(x, y)) + 123 | ggplot2::geom_point(color = priColor, alpha = 0.7) + 124 | ggplot2::geom_smooth(method = lm, color = secColor) + 125 | ggplot2::xlab(select_var_name_1) + 126 | ggplot2::ylab(select_var_name_2) + ggplot2::theme_bw() + 127 | ggplot2::ggtitle(paste( 128 | 'Bivariate plot for', 129 | select_var_name_1, 130 | 'and', 131 | select_var_name_2, 132 | sep = ' ' 133 | )) + 134 | ggplot2::theme( 135 | plot.title = ggplot2::element_text(hjust = 0.5, size = 10), 136 | axis.text = ggplot2::element_text(size = 10), 137 | axis.title = ggplot2::element_text(size = 10) 138 | ) 139 | 140 | 141 | 142 | } else if (select_var_name_1 %in% cat_cols && 143 | select_var_name_2 %in% cat_cols) { 144 | new_df <- dataset %>% dplyr::group_by_(.dots=c(select_var_name_1,select_var_name_2)) %>% dplyr::summarise(n = dplyr::n()) 145 | colfunc <- grDevices::colorRampPalette(c(priColor, "white" , secColor)) 146 | colorvar <- length(unique(new_df[[select_var_name_2]])) 147 | a=as.vector(as.character(unique(new_df[[select_var_name_1]]))) 148 | y=new_df[[select_var_name_1]] 149 | label=new_df[[select_var_name_2]] 150 | bivarPlot <-ggplot2::ggplot(new_df, ggplot2::aes(x = y, y= n, fill = label)) + 151 | ggplot2::geom_bar(position = "dodge", stat = "identity",alpha=0.9) + 152 | ggplot2::guides(fill=ggplot2::guide_legend(title=select_var_name_2)) + 153 | ggplot2::coord_flip()+ 154 | ggplot2::xlab(select_var_name_1) + 155 | ggplot2::ylab("count") + ggplot2::theme_bw() + 156 | ggplot2::ggtitle(paste('Bivariate plot for',select_var_name_1,'and',select_var_name_2,sep=' '))+ 157 | ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 10),axis.text = ggplot2::element_text(size=10), 158 | axis.title=ggplot2::element_text(size=10),legend.position="bottom",axis.text.x=ggplot2::element_text(angle=45, hjust=1))+ ggplot2::scale_fill_manual(values = colfunc(colorvar)) 159 | 160 | 161 | } else { 162 | cols <- c(select_var_name_1, select_var_name_2) 163 | cat_col <- cols[which(cols %in% cat_cols)] 164 | num_col <- cols[which(cols %in% numeric_cols)] 165 | a = as.vector(as.character(unique(dataset[[cat_col]]))) 166 | y = dataset[[cat_col]] 167 | x = dataset[[num_col]] 168 | bivarPlot <- 169 | ggplot2::ggplot(dataset, ggplot2::aes(x = y, y = x)) + 170 | ggplot2::geom_point(color = priColor, alpha = 0.7) + 171 | ggplot2::coord_flip() + 172 | ggplot2::xlab(cat_col) + 173 | ggplot2::ylab(num_col) + ggplot2::theme_bw() + 174 | ggplot2::ggtitle(paste( 175 | 'Bivariate plot for', 176 | select_var_name_1, 177 | 'and', 178 | select_var_name_2, 179 | sep = ' ' 180 | )) + 181 | ggplot2::theme( 182 | plot.title = ggplot2::element_text(hjust = 0.5, size = 10), 183 | axis.text = ggplot2::element_text(size = 10), 184 | axis.title = ggplot2::element_text(size = 10) 185 | ) 186 | } 187 | 188 | return(bivarPlot) 189 | } 190 | 191 | registerFunction(functionName = "rBivarPlots", engine = "r", heading = "Bivariate analysis") 192 | 193 | getRegistry() 194 | ``` 195 | ## Registering Python functions 196 | 197 | ```{r} 198 | 
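# 'decisionTreeTrainAndTest' was sourced earlier from the package's sample Python file via reticulate::source_python()
# On registration the engine suffix is appended, so it is used in pipelines as decisionTreeTrainAndTest_py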
registerFunction("decisionTreeTrainAndTest", engine = "python", isDataFunction = F, firstArgClass = "numpy.ndarray") 199 | getRegistry() 200 | ``` 201 | 202 | 203 | ## Interoperable pipeline containing R, Spark and Python functions 204 | 205 | * Here we consider a typical use case of performing data filtering/ aggregations and so on and Spark, and then using R to visualize, and Python to run a Machine learning model 206 | 207 | We first visualize the data without filtering: 208 | 209 | 210 | ```{r} 211 | 212 | pipelineObj %>>% rBivarPlots(select_var_name_1 = "Sepal_Length", select_var_name_2 = "Sepal_Width", 213 | priColor = "blue", secColor = "green", storeOutput = T) -> vizWithoutFilterPipeline 214 | vizWithoutFilterPipeline %>>% getPipeline 215 | vizWithoutFilterPipeline %>>% assessEngineSetUp 216 | vizWithoutFilterPipeline %>>% generateOutput -> opWithoutFilter 217 | opWithoutFilter %>>% getOutputById(1) 218 | ``` 219 | 220 | We then perform filtering on one of the variables in Spark, before visualizing in R 221 | 222 | ```{r} 223 | pipelineObj %>>% filterData_spark(condition = "Species == 'setosa'") %>>% 224 | rBivarPlots(select_var_name_1 = "Sepal_Length", select_var_name_2 = "Sepal_Width", 225 | priColor = "blue", secColor = "green", outAsIn = T, storeOutput = T) -> singleFilterPipeline 226 | singleFilterPipeline %>>% visualizePipeline 227 | 228 | singleFilterPipeline %>>% generateOutput -> opWithFilter 229 | opWithFilter %>>% getOutputById(2) 230 | ``` 231 | 232 | Finally, we show a case, where sequential filtering steps are performed in Spark, before visualizing in R, and running a decision tree model in Python. 233 | 234 | Note, that in this case, `getTargetForPyClassifcation` and `getTargetForPyClassification` have been registered as *data* functions. Type conversions between R, Spark and Python for data functions are performed automatically by the package. 235 | 236 | ```{r} 237 | pipelineObj %>>% filterData_spark(condition = "Species == 'setosa' or Species == 'virginica'") %>>% 238 | filterData_spark(condition = "Petal_Length > 3.7", outAsIn = T) %>>% 239 | rBivarPlots(select_var_name_1 = "Sepal_Length", select_var_name_2 = "Sepal_Width", 240 | priColor = "blue", secColor = "green", outAsIn = T, storeOutput = T) %>>% 241 | getFeaturesForPyClassification(dataset = ~f2, featureNames = c("Sepal_Length", 242 | "Sepal_Width", 243 | "Petal_Length")) %>>% 244 | getTargetForPyClassification(dataset = ~f2, targetVarName = "Species", positiveClass = "setosa") %>>% 245 | decisionTreeTrainAndTest_py(data = ~f4, target = ~f5, newData = ~f4, storeOutput = T) -> twoFilterPipeline 246 | 247 | twoFilterPipeline %>>% visualizePipeline 248 | 249 | twoFilterPipeline %>>% generateOutput -> opWith2Filters 250 | opWith2Filters %>>% getOutputById(3) 251 | opWith2Filters %>>% getOutputById(6) 252 | 253 | ``` 254 | 255 | # Supplementary Note 256 | 257 | The *analysisPipelines* package internally uses the *SparkR* package to interface with *Spark*. *SparkR* masks many typical data manipulation and processing functions from *base* as well as packages like *dplyr*. Therefore, ensure you use function scoping when calling a function. 
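For instance, the sketch below (illustrative only - it assumes a SparkDataFrame named `sparkDf` already exists in the current Spark session) shows how explicit namespacing keeps the different `filter` implementations apart:

```{r eval = FALSE}
# A minimal sketch, assuming 'sparkDf' is an existing SparkDataFrame in the current session
rSubset     <- dplyr::filter(iris, Species == "setosa")              # filters a local R data frame
sparkSubset <- SparkR::filter(sparkDf, sparkDf$Species == "setosa")  # filters a Spark DataFrame
```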
258 | -------------------------------------------------------------------------------- /vignettes/Meta_Pipelines.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "An introduction to meta-pipelines" 3 | author: "Naren Srinivasan" 4 | date: "11/19/2018" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | fig_width: 8 9 | vignette: > 10 | %\VignetteIndexEntry{Meta-pipelines} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | ```{r message=F, results='hide'} 16 | knitr::opts_chunk$set( 17 | eval = TRUE 18 | ) 19 | library(analysisPipelines) 20 | ``` 21 | 22 | # Introduction 23 | 24 | The **meta-pipeline** construct is one which allows users to export pipelines they have created for a particular use case to a general analysis flow which can be used for a different dataset and different set of parameters. A *pipeline* is one where the data can change, though retaining the same schema, and the same set of parameters for the functions. A *meta-pipeline* is one where only the analysis flow, function dependencies and so on are retained. The specific parameters for each of the functions can be set differently for a new use case. 25 | 26 | The objective of a meta-pipeline is to define and execute reusable analysis flows. They can be used to: 27 | 28 | * Document best practices for a particular problem 29 | * Templatize analyses for particular situations 30 | 31 | # Using meta-pipelines 32 | 33 | ## Creating a meta-pipeline 34 | 35 | Through this package, *meta-pipelines* can be created by exporting an already created *pipeline* to a *meta-pipeline*. The export retains the following items: 36 | 37 | * Function definitions 38 | * Flow of the functions and the dependencies (specified through formula semantics) 39 | * The registry from which the pipeline is defined 40 | 41 | In the example below, we first create a pipeline, similar to the one described in the other vignettes. 42 | 43 | ```{r} 44 | pipeline <- AnalysisPipeline(input = iris) 45 | getColor <- function(color){ 46 | return(color) 47 | } 48 | 49 | getColumnName <-function(columnName){ 50 | return(columnName) 51 | } 52 | 53 | registerFunction(functionName = "getColor", isDataFunction = F, firstArgClass = "character") 54 | registerFunction(functionName = "getColumnName", isDataFunction = F, firstArgClass = "character") 55 | 56 | getRegistry() 57 | ``` 58 | 59 | We then generate an output from the pipeline, just to validate that the pipeline works properly. Of course, to define a meta-pipeline generation of output is not required. 60 | 61 | ```{r} 62 | pipeline %>>% getColor(color = "blue") %>>% getColumnName(columnName = "Sepal.Length") %>>% 63 | univarCatDistPlots(uniCol = "Species", priColor = ~f1, optionalPlots = 0, storeOutput = T) %>>% 64 | outlierPlot(method = "iqr", columnName = ~f2, cutoffValue = 0.01, priColor = ~f1 , optionalPlots = 0) -> complexPipeline 65 | 66 | complexPipeline %>>% getPipeline 67 | complexPipeline %>>% prepExecution -> complexPipeline 68 | 69 | complexPipeline %>>% generateOutput -> op 70 | op %>>% getOutputById("3") 71 | 72 | ``` 73 | 74 | ## Exporting and reusing for a different case 75 | 76 | Once a pipeline has been created, be it a batch or a streaming pipeline, it can be exported using the `exportAsMetaPipeline` method. This returns an object of class `MetaAnalysisPipeline` which stores the required information. 
77 | 78 | The meta-pipeline can be *visualized* similar to a normal pipeline object by calling the `visualizePipeline` method on the `MetaAnalysisPipeline` object. 79 | 80 | ```{r} 81 | 82 | complexPipeline %>>% exportAsMetaPipeline -> complexMetaPipeline 83 | 84 | # complexMetaPipeline %>>% visualizePipeline 85 | ``` 86 | 87 | ## Setting the new parameters 88 | 89 | The next part of using the meta-pipeline is creating another pipeline with a different set of parameters. For this purpose, the user can first export the *pipeline prototype* which basically contains the set of functions used in the pipeline and their respective arguments. 90 | 91 | The pipeline prototype is exported as an object of class `proto` from the 'proto' package, which is a thin skin over environments, with usability advantages such as using methods like `names` to get the names of objects contained in it, as well as using the '$' operator to refer to specific objects. The aim of using this class is to provide an easy-to-use interface to set the new values of the arguments. 92 | 93 | The pipeline prototype has a nested structure. The first level is a list of objects which represent the list of functions in the pipeline. A specific function can just be referred to through its name. The second level, is the list of arguments for each of those functions (again referred by the usual name). 94 | 95 | The new values of the parameters can simply be set by using the '$' operator to refer to the values. The exported pipeline prototype by default contains the values of the parameters defined in the original pipeline. Therefore, the user can simply change some of the values as required or for all of the parameters. 96 | 97 | In the following example, we reconfigure the pipeline for use with the 'iris' dataset. 98 | 99 | ```{r} 100 | pipelineProto <- getPipelinePrototype(complexMetaPipeline) 101 | str(pipelineProto) 102 | 103 | #Setting new parameters on ToothGrowth dataset 104 | pipelineProto$getColor$color<- "green" 105 | pipelineProto$getColumnName$columnName<- "len" 106 | pipelineProto$univarCatDistPlots$uniCol <- "supp" 107 | 108 | #complexMetaPipeline %>>% visualizePipeline 109 | ``` 110 | 111 | ## Execution 112 | 113 | Now once the parameters have been set, a new pipeline object (which is executable) can be created by calling the `createPipelineInstance` method, and passing the meta-pipeline object and the pipeline prototype. This creates a pipeline object with the usual properties. 114 | 115 | We set the input of the pipeline object to the `iris` dataset and then execute to generate the output. 116 | 117 | ```{r} 118 | complexMetaPipeline %>>% createPipelineInstance(pipelineProto) -> newPipelineObj 119 | 120 | newPipelineObj %>>% setInput(input = ToothGrowth) -> newPipelineObj 121 | 122 | newPipelineObj %>>% generateOutput %>>% getOutputById("3") 123 | ``` 124 | 125 | 126 | # Saving and loading meta-pipelines 127 | 128 | Similar to pipelines, meta-pipelines can be saved and loaded using the `savePipeline` method and the `loadMetaPipeline` function. As with pipelines, when a meta-pipeline is loaded, it overwrites the existing registry with the registry stored with the meta-pipeline. 
129 | 130 | ```{r eval = FALSE} 131 | 132 | complexMetaPipeline %>>% savePipeline("metaPipeline.RDS") 133 | 134 | #Checking if registry is updated 135 | getC <- function(color){ 136 | return(color) 137 | } 138 | 139 | getCol <-function(columnName){ 140 | return(columnName) 141 | } 142 | 143 | registerFunction(functionName = "getC", isDataFunction = F, firstArgClass = "character") 144 | registerFunction(functionName = "getCol", isDataFunction = F, firstArgClass = "character") 145 | 146 | getRegistry() 147 | loadMetaPipeline(path = "metaPipeline.RDS") -> loadedMetaPipeline 148 | getRegistry() 149 | 150 | pipelineProtoLoaded <- getPipelinePrototype(loadedMetaPipeline) 151 | str(pipelineProtoLoaded) 152 | 153 | pipelineProtoLoaded$getColor$color<- "green" 154 | pipelineProtoLoaded$getColumnName$columnName<- "Sepal.Length" 155 | pipelineProtoLoaded$univarCatDistPlots$uniCol <- "Species" 156 | 157 | loadedMetaPipeline %>>% createPipelineInstance(pipelineProtoLoaded) -> newPipelineObjLoaded 158 | 159 | newPipelineObjLoaded %>>% setInput(input = iris) %>>% 160 | generateOutput %>>% getOutputById("3") 161 | ``` 162 | 163 | -------------------------------------------------------------------------------- /vignettes/Streaming_pipelines_for_working_Apache_Spark_Structured_Streaming.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Streaming Analysis Pipelines for working with Apache Spark Structured Streaming" 3 | author: "Naren Srinivasan, Anoop S" 4 | date: "9/11/2018" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | fig_width: 8 9 | vignette: > 10 | %\VignetteIndexEntry{Streaming Analysis Pipelines for working with Apache Spark Structured Streaming} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | # Introduction 16 | 17 | The vignette aims to show examples of using SparkR as an interface to run streaming Spark jobs through R - using the analysisPipelines package. The major use case is that of implementing a pipeline using SparkR dataframes for streaming data. 18 | 19 | # Important Note 20 | 21 | Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN. The *SparkR* version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly. 22 | 23 | To install from Github, run the following command, if you know the Spark version: 24 | 25 | ```{r eval = F} 26 | devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg') 27 | ``` 28 | 29 | The other option is to install SparkR by running the following *terminal* commands if Spark has already been installed. 30 | 31 | ```{bash eval = F} 32 | $ export SPARK_HOME=/path/to/spark/directory 33 | $ cd $SPARK_HOME/R/lib/SparkR/ 34 | $ R -e "devtools::install('.')" 35 | ``` 36 | 37 | # Initialize libraries 38 | 39 | * Initialize the analysisPipelines and SparkR libraries 40 | * Ensure you have a local installation of Spark and SparkR package is installed 41 | * Check if the SPARK_HOME environment variable is set to Spark installation folder. Else, define it using `sys.setenv()` function. 42 | 43 | ```{r} 44 | knitr::opts_chunk$set( 45 | eval = FALSE 46 | ) 47 | ``` 48 | 49 | 50 | ```{r, include=FALSE} 51 | library(analysisPipelines) 52 | library(SparkR) 53 | 54 | ## Define these variables as per the configuration of your machine. The below example is just illustrative. 
55 | 56 | sparkHome <- "/path/to/spark/directory/" 57 | sparkMaster <- "local[1]" 58 | sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1") 59 | # Set spark home variable if not present 60 | if(Sys.getenv("SPARK_HOME") == "") { 61 | Sys.setenv(SPARK_HOME = sparkHome) 62 | } 63 | ``` 64 | 65 | # Connect to Spark cluster 66 | * Define the Spark master URL 67 | * Specify dependency packages if any during Spark connection. Example: `sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")` 68 | * Connect to the cluster using the package's `sparkRSessionCreateIfNotPresent` function 69 | 70 | ```{r, results='hide'} 71 | sparkRSessionCreateIfNotPresent(master = sparkMaster, sparkPackages = sparkPackages) 72 | ``` 73 | 74 | 75 | # Streaming Analysis Pipelines using Apache Spark Structured Streaming 76 | 77 | This example illustrates usage of pipelines for a streaming application. In this use case streaming data is read from Kafka, aggregations are performed and the output is written to the console. 78 | 79 | ## Read stream from Kafka 80 | 81 | Read streaming data from Kafka. 82 | 83 | ```{r} 84 | ## Define these variables as per the configuration of your machine. The below example is just illustrative. 85 | 86 | kafkaBootstrapServers <- "192.168.0.256:9092,192.168.0.257:9092,192.168.0.258:9092" 87 | consumerTopic <- "topic1" 88 | streamObj <- read.stream(source = "kafka", kafka.bootstrap.servers = kafkaBootstrapServers, subscribe = consumerTopic, startingOffsets="earliest") 89 | printSchema(streamObj) 90 | ``` 91 | 92 | ## User-defined Spark functions 93 | 94 | Users can define their own functions and use it as a part of the pipeline. These functions range from data prep, aggregations, casting data to suitable write stream format, etc. 
95 | 96 | ```{r} 97 | 98 | # Function to convert datatype json struct to columns 99 | convertStructToDf <- function(streamObj) { 100 | streamObj <- SparkR::select(streamObj,list(getField(streamObj$`jsontostructs(value)`,"bannerId"), 101 | getField(streamObj$`jsontostructs(value)`,"mobile"), 102 | getField(streamObj$`jsontostructs(value)`,"homeAppliance"), 103 | getField(streamObj$`jsontostructs(value)`,"gamingConsole"), 104 | getField(streamObj$`jsontostructs(value)`,"accessories"), 105 | getField(streamObj$`jsontostructs(value)`,"brand"), 106 | getField(streamObj$`jsontostructs(value)`,"previousPrice"), 107 | getField(streamObj$`jsontostructs(value)`,"currentPrice"), 108 | getField(streamObj$`jsontostructs(value)`,"discount"), 109 | getField(streamObj$`jsontostructs(value)`,"emi"), 110 | getField(streamObj$`jsontostructs(value)`,"crossSale"), 111 | getField(streamObj$`jsontostructs(value)`,"customerId"), 112 | getField(streamObj$`jsontostructs(value)`,"ts"), 113 | getField(streamObj$`jsontostructs(value)`,"click"), 114 | getField(streamObj$`jsontostructs(value)`,"conversion"), 115 | getField(streamObj$`jsontostructs(value)`,"age"), 116 | getField(streamObj$`jsontostructs(value)`,"income"), 117 | getField(streamObj$`jsontostructs(value)`,"maritalStatus"), 118 | getField(streamObj$`jsontostructs(value)`,"segment"))) 119 | colnames(streamObj) <- c("bannerId","mobile","homeAppliance","gamingConsole","accessories","brand","previousPrice","currentPrice", 120 | "discount","emi","crossSale","customerId","ts","click","conversion","age","income","maritalStatus","segment") 121 | return(streamObj) 122 | } 123 | 124 | # Function to cast columns as string, integer, etc 125 | castDfColumns <- function(streamObj) { 126 | streamObj <- SparkR::selectExpr(streamObj, "bannerId","mobile","homeAppliance","gamingConsole","accessories","brand", 127 | "CAST(previousPrice as INTEGER)","CAST(currentPrice as INTEGER)","CAST(discount as INTEGER)","emi", 128 | "crossSale","customerId","ts","CAST(click as INTEGER)","CAST(conversion as INTEGER)", 129 | "CAST(age as INTEGER)","CAST(income as INTEGER)","maritalStatus","segment") 130 | streamObj$ts <- SparkR::to_timestamp(streamObj$ts,"yyyy-MM-dd HH:mm:ss") 131 | return (streamObj) 132 | } 133 | 134 | # Function to convert datatype json struct to columns 135 | convertDfToKafkaKeyValuePairs <- function (streamObj, kafkaKey) { 136 | streamObj <- SparkR::toJSON(streamObj) 137 | streamObj$key <- kafkaKey 138 | return(streamObj) 139 | } 140 | 141 | # Function to summarize click stream data 142 | globalUiMetrics <- function (streamObj) { 143 | ## Aggregation query 144 | streamObj <- SparkR::summarize(SparkR::groupBy(streamObj,streamObj$bannerId), 145 | impressions=count(streamObj$customerId), 146 | clicks=sum(streamObj$click), 147 | conversions=sum(streamObj$conversion)) 148 | SparkR::colnames(streamObj) <- c("banner_id","impressions","clicks","conversions") 149 | return (streamObj) 150 | } 151 | 152 | ``` 153 | 154 | ## Define pipeline object, register user-defined functions to the pipeline object 155 | 156 | In order to use pipelines, a pipeline object needs to be defined. Notice the Spark pipelines are defined using the `readInputSpark` function. 157 | 158 | Each user-defined function needs to be registered to the pipeline object. Post registration, the function can be used to construct a pipeline. A pipeline can be a pipeline of multiple functions called in a particular sequence. 
159 | 160 | ```{r} 161 | # Define pipeline object 162 | pipelineObj <- analysisPipelines::StreamingAnalysisPipeline(input = streamObj) 163 | 164 | consumerDataSchema <- structType(structField("bannerId", "string"), 165 | structField("mobile", "string"), 166 | structField("homeAppliance", "string"), 167 | structField("gamingConsole", "string"), 168 | structField("accessories", "string"), 169 | structField("brand", "string"), 170 | structField("previousPrice", "string"), 171 | structField("currentPrice", "string"), 172 | structField("discount", "string"), 173 | structField("emi", "string"), 174 | structField("crossSale", "string"), 175 | structField("customerId", "string"), 176 | structField("ts", "string"), 177 | structField("click", "string"), 178 | structField("conversion", "string"), 179 | structField("age", "string"), 180 | structField("income", "string"), 181 | structField("maritalStatus", "string"), 182 | structField("segment", "string")) 183 | 184 | # Register user-defined functions 185 | registerFunction("convertStructToDf", "", functionType = "streaming", engine = "spark-structured-streaming") 186 | registerFunction("castDfColumns", "", functionType = "streaming", engine = "spark-structured-streaming") 187 | registerFunction("convertDfToKafkaKeyValuePairs", "", functionType = "streaming", engine = "spark-structured-streaming") 188 | 189 | getRegistry() 190 | 191 | # Define pipeline 192 | # Do data prep 193 | pipelineObj %>% castKafkaStreamAsString_sparkSS() %>% 194 | convertKafkaValueFromJson_sparkSS(schema = consumerDataSchema, outAsIn = T) %>% convertStructToDf_sparkSS(outAsIn = T) %>% castDfColumns_sparkSS(outAsIn = T, storeOutput = T) -> pipelineObj 195 | 196 | pipelineObj %>>% getPipeline 197 | pipelineObj %>>% visualizePipeline 198 | ``` 199 | 200 | ## Running the pipeline and generating an output 201 | 202 | The pipeline is run by calling the `generateOutput()` function. The `output` attribute of the pipeline object contains the resultant Spark dataframe(s). 203 | 204 | In this example the Spark DataFrames are converted to R dataframes to help understand the result. 205 | 206 | 207 | ```{r} 208 | 209 | ## Run pipeline 210 | pipelineObj %>% generateOutput() -> pipelineObj 211 | 212 | ## Write to output stream 213 | streamObj <- pipelineObj %>>% getOutputById("4") 214 | streamObj 215 | ``` 216 | 217 | # Supplementary note 218 | 219 | Currently, streaming pipelines have the limitation that they are able to execute only linear flows as this constrained by *Apache Spark Structured Streaming.* Non-linear flows can be defined but might throw execution errors in runtime. Also, streaming pipelines can be implemented using only 1 engine i.e. 
*Apache Spark Structured Streaming.* 220 | -------------------------------------------------------------------------------- /vignettes/Using_pipelines_inside_shiny_widgets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using pipelines inside Shiny widgets or apps" 3 | author: "Naren Srinivasan" 4 | date: "11/14/2018" 5 | runtime: shiny-prerendered 6 | output: 7 | rmarkdown::html_vignette: 8 | fig_width: 8 9 | vignette: > 10 | %\VignetteIndexEntry{Using pipelines inside Shiny widgets or apps} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | ```{r setup, include=FALSE, echo=FALSE, message=FALSE} 16 | library(analysisPipelines) 17 | library(shiny) 18 | knitr::opts_chunk$set( 19 | eval = FALSE 20 | ) 21 | ``` 22 | 23 | # Pipelines in shiny apps 24 | Pipelines can be used as part of Shiny widgets or apps. In the following example, we define a simple pipeline which generates a chart, and use that to power a shiny widget. 25 | 26 | In this example, we emulate streaming dataset using the `shiny::reactivePoll` function and randomly sampling from an existing sample dataset in the package. 27 | 28 | 29 | 30 | ```{r data, echo=T, results='hide'} 31 | data("iris") 32 | shinyPipeline <- AnalysisPipeline() 33 | shinyPipeline %>>% setLoggerDetails(target = "none") 34 | shinyPipeline %>>% univarCatDistPlots(uniCol = "Species", priColor = "blue", optionalPlots = 0, storeOutput = T) -> shinyPipeline 35 | ``` 36 | 37 | 38 | We then use the pipeline within the `shiny::renderPlot` function, and set the sampled data to execute the pipeline, and generate the chart. Since the data keeps changing due to our reactive poll, the expression within the `shiny::renderPlot` function keeps getting called in the reactive context. 39 | 40 | 41 | ```{r shiny, context="server", message=FALSE, warning=FALSE, echo =TRUE, results='asis'} 42 | sampled_data <- shiny::reactivePoll(intervalMillis = 2000, 43 | session = NULL, 44 | checkFunc = function() return(base::sample(1:100, 1)), 45 | valueFunc = function() return(iris[sample(1:nrow(iris), 100),])) 46 | ``` 47 | 48 | ```{r} 49 | shiny::renderPlot(height = 400, { 50 | sampled_data <- sampled_data() 51 | shinyPipeline %>>% setInput(input = sampled_data) -> shinyPipeline 52 | shinyPipeline %>>% generateOutput %>>% getOutputById("1") 53 | }) 54 | ``` 55 | 56 | --------------------------------------------------------------------------------