├── .Rbuildignore
├── .gitattributes
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
│   ├── analysisPipelines_package.R
│   ├── core-functions-batch.R
│   ├── core-functions-meta-pipelines.R
│   ├── core-functions.R
│   ├── core-streaming-functions.R
│   ├── r-batch-eda-utilities.R
│   ├── r-helper-utilites-python.R
│   ├── spark-structured-streaming-utilities.R
│   ├── sysdata.rda
│   └── zzz.R
├── README.md
├── analysisPipelines.Rproj
├── data-raw
│   └── predefFunctions.R
├── inst
│   ├── data-icon.png
│   ├── logging.png
│   ├── output-icon.png
│   ├── param-icon.png
│   ├── pipelineViz1.png
│   ├── pipelineViz2.png
│   ├── python-logo.png
│   ├── python
│   │   └── sampleFunctions.py
│   ├── r-logo.png
│   ├── report.Rmd
│   ├── report1.png
│   ├── report2.png
│   ├── report3.png
│   ├── spark-logo.png
│   ├── spark-structured-streaming-logo.png
│   └── styles.css
├── knit-vignettes
│   ├── 2.Analysis_pipelines_for_working_with_Spark_DataFrames_for_one-time_batch_analyses.html
│   ├── 3.Analysis_pipelines_for_working_with_Python_functions.html
│   ├── 4.Interoperable_analysis_pipelines.html
│   └── 5.Streaming_Analysis_Pipelines_for_working_with_Apache_Spark_Structured_Streaming.html
├── man
│   ├── AnalysisPipeline-class.Rd
│   ├── BaseAnalysisPipeline-class.Rd
│   ├── CheckColumnType.Rd
│   ├── MetaAnalysisPipeline-class.Rd
│   ├── StreamingAnalysisPipeline-class.Rd
│   ├── analysisPipelines.Rd
│   ├── assessEngineSetUp.Rd
│   ├── bivarPlots.Rd
│   ├── castKafkaStreamAsString.Rd
│   ├── checkSchema.Rd
│   ├── checkSchemaMatch.Rd
│   ├── computeEdges.Rd
│   ├── convertKafkaValueFromJson.Rd
│   ├── correlationMatPlot.Rd
│   ├── createPipelineInstance.Rd
│   ├── dot-analysisPipelinesEnvir.Rd
│   ├── dot-getCache.Rd
│   ├── dot-saveMetaPipeline.Rd
│   ├── dot-setRegistry.Rd
│   ├── dot-updateRegistry.Rd
│   ├── dot-visualizeMetaPipeline.Rd
│   ├── exportAsMetaPipeline.Rd
│   ├── generateOutput.Rd
│   ├── generateReport.Rd
│   ├── genericPipelineException.Rd
│   ├── getDatatype.Rd
│   ├── getEndPoints.Rd
│   ├── getFeaturesForPyClassification.Rd
│   ├── getInput.Rd
│   ├── getLoggerDetails.Rd
│   ├── getOutputById.Rd
│   ├── getPipeline.Rd
│   ├── getPipelinePrototype.Rd
│   ├── getRegistry.Rd
│   ├── getResponse.Rd
│   ├── getStartingPoints.Rd
│   ├── getTargetForPyClassification.Rd
│   ├── getTerm.Rd
│   ├── getUpstreamDependencies.Rd
│   ├── identifyTopLevelRecursively.Rd
│   ├── identifyTopologicalLevels.Rd
│   ├── ignoreCols.Rd
│   ├── initDfBasedOnType.Rd
│   ├── initialize-methods.Rd
│   ├── initializeLoggers.Rd
│   ├── isDependencyParam.Rd
│   ├── loadMetaPipeline.Rd
│   ├── loadPipeline.Rd
│   ├── loadPredefinedFunctionRegistry.Rd
│   ├── loadRegistry.Rd
│   ├── multiVarOutlierPlot.Rd
│   ├── outlierPlot.Rd
│   ├── prepExecution.Rd
│   ├── registerFunction.Rd
│   ├── savePipeline.Rd
│   ├── saveRegistry.Rd
│   ├── setInput.Rd
│   ├── setLoggerDetails.Rd
│   ├── setPythonEnvir.Rd
│   ├── setUpstreamDependencies.Rd
│   ├── sparkRSessionCreateIfNotPresent.Rd
│   ├── univarCatDistPlots.Rd
│   ├── updateObject.Rd
│   └── visualizePipeline.Rd
└── vignettes
    ├── Analysis_pipelines_for_working_with_Python_functions.Rmd
    ├── Analysis_pipelines_for_working_with_R_dataframes.Rmd
    ├── Analysis_pipelines_for_working_with_sparkR.Rmd
    ├── Interoperable_Pipelines.Rmd
    ├── Meta_Pipelines.Rmd
    ├── Streaming_pipelines_for_working_Apache_Spark_Structured_Streaming.Rmd
    └── Using_pipelines_inside_shiny_widgets.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | data-raw/
4 | inst/python/.ipynb_checkpoints
5 | metastore_db/
6 | vignettes/*.R
7 | vignettes/*.html
8 | vignettes/*.RDS
9 | vignettes/metastore_db/
10 | vignettes/*.out
11 | knit-vignettes/
12 |
13 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.* linguist-language=R
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | metastore_db/
6 | inst/python/.ipynb_checkpoints/
7 | .DS_Store
8 | vignettes/metastore_db/
9 | vignettes/*.RDS
10 | vignettes/*.out
11 | vignettes/*.R
12 | vignettes/*.html
13 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: analysisPipelines
2 | Type: Package
3 | Date: 2020-06-12
4 | Title: Compose Interoperable Analysis Pipelines & Put Them in Production
5 | Version: 1.0.2
6 | Authors@R: c(
7 | person("Naren","Srinivasan", email = "naren1991@gmail.com", role = c("aut")),
8 | person("Zubin Dowlaty","", email = "Zubin.Dowlaty@mu-sigma.com", role = c("aut")),
9 | person("Sanjay","", email = "Sanjay@mu-sigma.com", role = c("ctb")),
10 | person("Neeratyoy","Mallik", email = "Neeratyoy.Mallik@mu-sigma.com", role = c("ctb")),
11 | person("Anoop S","", email = "Anoop.S@mu-sigma.com", role = c("ctb")),
12 | person("Mu Sigma, Inc.", email = "ird.experiencelab@mu-sigma.com", role = c("cre"))
13 | )
14 | Description: Enables data scientists to compose pipelines of analysis which consist of data manipulation, exploratory analysis & reporting, as well as modeling steps. Data scientists can use tools of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python.
15 | Credits to Mu Sigma for supporting the development of the package.
16 | Note - To enable pipelines involving Spark tasks, the package uses the 'SparkR' package.
17 | The SparkR package needs to be installed to use Spark as an engine within a pipeline. SparkR is distributed natively with Apache Spark and is not distributed on CRAN. The SparkR version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly.
18 | To install SparkR from Github, run the following command if you know the Spark version: 'devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg')'.
19 | The other option is to install SparkR by running the following terminal commands if Spark has already been installed: '$ export SPARK_HOME=/path/to/spark/directory && cd $SPARK_HOME/R/lib/SparkR/ && R -e "devtools::install('.')"'.
20 | Depends: R (>= 3.4.0), magrittr, pipeR, methods
21 | Imports: ggplot2, dplyr, futile.logger, RCurl, rlang (>= 0.3.0), proto, purrr
22 | Suggests: plotly, knitr, rmarkdown, parallel, visNetwork, rjson, DT, shiny, R.devices, corrplot, car, foreign
23 | Enhances: SparkR, reticulate
24 | BugReports: https://github.com/Mu-Sigma/analysis-pipelines/issues
25 | URL: https://github.com/Mu-Sigma/analysis-pipelines
26 | Encoding: UTF-8
27 | License: MIT
28 | LazyLoad: yes
29 | LazyData: yes
30 | RoxygenNote: 6.1.1
31 | VignetteBuilder: knitr
32 | Collate:
33 | 'analysisPipelines_package.R'
34 | 'core-functions.R'
35 | 'core-functions-batch.R'
36 | 'core-functions-meta-pipelines.R'
37 | 'core-streaming-functions.R'
38 | 'r-batch-eda-utilities.R'
39 | 'r-helper-utilites-python.R'
40 | 'spark-structured-streaming-utilities.R'
41 | 'zzz.R'
42 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (C) 2019 Mu Sigma Labs
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge and/or publish of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE MU-SIGMA LABS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
11 | Except as contained in this notice, the name of the Mu-Sigma Labs shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the Mu-Sigma Labs.
12 |
13 | Mu Sigma Labs is a trademark of Mu Sigma Business Solutions Private Limited.
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(AnalysisPipeline)
4 | export(BaseAnalysisPipeline)
5 | export(CheckColumnType)
6 | export(MetaAnalysisPipeline)
7 | export(StreamingAnalysisPipeline)
8 | export(assessEngineSetUp)
9 | export(bivarPlots)
10 | export(castKafkaStreamAsString)
11 | export(convertKafkaValueFromJson)
12 | export(correlationMatPlot)
13 | export(createPipelineInstance)
14 | export(exportAsMetaPipeline)
15 | export(generateReport)
16 | export(genericPipelineException)
17 | export(getDatatype)
18 | export(getFeaturesForPyClassification)
19 | export(getInput)
20 | export(getLoggerDetails)
21 | export(getOutputById)
22 | export(getPipeline)
23 | export(getPipelinePrototype)
24 | export(getRegistry)
25 | export(getResponse)
26 | export(getTargetForPyClassification)
27 | export(getTerm)
28 | export(ignoreCols)
29 | export(isDependencyParam)
30 | export(loadMetaPipeline)
31 | export(loadPipeline)
32 | export(loadPredefinedFunctionRegistry)
33 | export(loadRegistry)
34 | export(multiVarOutlierPlot)
35 | export(outlierPlot)
36 | export(prepExecution)
37 | export(registerFunction)
38 | export(savePipeline)
39 | export(saveRegistry)
40 | export(setInput)
41 | export(setLoggerDetails)
42 | export(setPythonEnvir)
43 | export(sparkRSessionCreateIfNotPresent)
44 | export(univarCatDistPlots)
45 | export(updateObject)
46 | export(visualizePipeline)
47 | exportClasses(AnalysisPipeline)
48 | exportClasses(BaseAnalysisPipeline)
49 | exportClasses(MetaAnalysisPipeline)
50 | exportClasses(StreamingAnalysisPipeline)
51 | exportMethods(checkSchemaMatch)
52 | exportMethods(generateOutput)
53 | importFrom(graphics,image)
54 | importFrom(magrittr,"%>%")
55 | importFrom(methods,getClass)
56 | importFrom(methods,new)
57 | importFrom(methods,removeMethod)
58 | importFrom(methods,setClassUnion)
59 | importFrom(methods,setGeneric)
60 | importFrom(methods,setOldClass)
61 | importFrom(pipeR,"%>>%")
62 | importFrom(rlang,"!!")
63 | importFrom(rlang,.data)
64 | importFrom(stats,as.formula)
65 | importFrom(stats,lm)
66 | importFrom(stats,reorder)
67 | importFrom(stats,terms)
68 | importFrom(utils,installed.packages)
69 | importFrom(utils,read.csv)
70 |
--------------------------------------------------------------------------------
/R/analysisPipelines_package.R:
--------------------------------------------------------------------------------
1 | #' analysisPipelines
2 | #'
3 | #' The package aims at enabling data scientists to compose pipelines of analysis which consist of data manipulation,
4 | #' exploratory analysis & reporting, as well as modeling steps. It also aims to enable data scientists to use tools
5 | #' of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python.
6 | #' Credits to Mu Sigma for supporting the development of the package.
7 | #'
8 | #' @note To enable pipelines involving Spark tasks, the package uses the 'SparkR' package. Using Spark as an engine requires the SparkR package to be installed.
9 | #' SparkR is distributed natively with Apache Spark and is not distributed on CRAN. The SparkR version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly.
10 | #' To install from Github, run the following command, if you know the Spark version:
11 | #' \itemize{
12 | #' \item devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg')
13 | #' }
14 | #' The other option is to install SparkR by running the following terminal commands if Spark has already been installed:
15 | #' \itemize{
16 | #' \item $ export SPARK_HOME=/path/to/spark/directory
17 | #' \item $ cd $SPARK_HOME/R/lib/SparkR/
18 | #' \item $ R -e "devtools::install('.')"
19 | #' }
20 | #' @docType package
21 | #' @name analysisPipelines
22 | NULL
23 |
--------------------------------------------------------------------------------
/R/core-functions-meta-pipelines.R:
--------------------------------------------------------------------------------
1 | ##################################################################################################
2 | # Title: Meta pipelines
3 | # Author: Naren Srinivasan
4 | # Created on: Nov 20, 2018
5 | # Description: Functions/ Methods to define and use meta-pipelines
6 | ##################################################################################################
7 |
8 | # 'proto' is an S3 class which is used as a slot, and hence it is defined in the environment
9 | setOldClass("proto")
10 |
11 | #' @name MetaAnalysisPipeline-class
12 | #' @rdname MetaAnalysisPipeline-class
13 | #' @title Class for creating and working with meta-pipelines
14 | #' @details This class works with the \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} classes, and allows the
15 | #' pipeline to be exported as meta-pipeline. A meta-pipeline is a construct, where the input dataset as well as the arguments
16 | #' to functions in the pipeline are not defined. Only the analysis flow and dependencies are stored.
17 | #' @slot pipeline A tibble which holds functions to be called in the pipeline
18 | #' @slot pipelinePrototype An object of class \code{proto} from the 'proto' package which maintains the prototype of the
19 | #' functions in the pipeline and their respective arguments
20 | #' @slot type A string defining whether it is a batch or streaming pipeline. Acceptable values are 'batch' & 'streaming'
21 | #' @family Package core functions
22 | #' @exportClass MetaAnalysisPipeline
23 | #' @export MetaAnalysisPipeline
24 |
25 | MetaAnalysisPipeline <- setClass("MetaAnalysisPipeline",
26 | slots = c(
27 | pipeline = "tbl",
28 | pipelinePrototype = "proto",
29 | type = "character"
30 | ))
31 |
32 | #' MetaAnalysisPipeline constructor
33 | #' @docType methods
34 | #' @rdname initialize-methods
35 | #' @title This is the constructor for the \link{MetaAnalysisPipeline} class
36 | #' @family Package core functions
37 | #' @keywords internal
38 |
39 | setMethod(
40 | f = "initialize",
41 | signature = "MetaAnalysisPipeline",
42 | definition = function(.Object, type = "batch")
43 | {
44 | tryCatch({
45 | .Object@pipeline <- dplyr::tibble(
46 | id = character(),
47 | operation = character(),
48 | heading = character(),
49 | parameters = list(),
50 | outAsIn = logical(),
51 | storeOutput = F
52 | )
53 |
54 | .Object@type <- type
55 |
56 | return(.Object)
57 | }, error = function(e){
58 | futile.logger::flog.error(e, name = "logger.base")
59 | stop()
60 | })
61 | }
62 | )
63 |
64 | #' @name exportAsMetaPipeline
65 | #' @rdname exportAsMetaPipeline
66 | #' @title Method to export a meta-pipeline
67 | #' @details This method exports a Pipeline object i.e. of the classes \code{AnalysisPipeline} or
68 | #' \code{StreamingAnalysisPipeline} as a meta-pipeline
69 | #' @param object A Pipeline object
70 | #' @return an object of class "\code{MetaAnalysisPipeline}"
71 | #' @family Package core functions
72 | #' @examples
73 | #' \dontrun{
74 | #' pipelineObj <- AnalysisPipeline(input = iris)
75 | #' pipelineObj %>>% univarCatDistPlots(uniCol = "Species") %>>%
76 | #' exportAsMetaPipeline -> exportedMetaPipeline
77 | #' }
78 | #' @export
79 | setGeneric(
80 | name = "exportAsMetaPipeline",
81 | def = function(object){
82 | standardGeneric("exportAsMetaPipeline")
83 | }
84 | )
85 |
86 | .exportAsMetaPipeline <- function(object){
87 | object %>>% setLoggerDetails(target = "none") -> object
88 | metaPipeline <- MetaAnalysisPipeline()
89 | pipelineProto <- proto::proto()
90 | if(class(object) == "AnalysisPipeline"){
91 | metaPipeline@type <- "batch"
92 | }else if(class(object) == "StreamingAnalysisPipeline"){
93 | metaPipeline@type <- "streaming"
94 | }
95 |
96 | if(nrow(object@pipelineExecutor$topologicalOrdering) == 0){
97 | object %>>% prepExecution -> object
98 | }
99 |
100 | object@pipeline -> pipeline
101 | pipeline %>>% purrr::pmap(function(id, operation, heading,
102 | parameters, outAsIn, storeOutput, dependencies){
103 | # fnName <- paste0("fn_", operation)
104 | fnName <- operation
105 | assign(x = fnName, value = proto::proto(), envir = pipelineProto)
106 |
107 | purrr::imap(parameters, function(p, np){
108 | # n <- names(p)
109 | if(class(p) == "formula"){
110 | if(analysisPipelines::isDependencyParam(p)){
111 | n <- analysisPipelines::getResponse(p)
112 | p <- paste0("~", analysisPipelines::getTerm(p)) %>>% as.formula()
113 | }
114 | }
115 | assign(x = paste0(np),
116 | value = p,
117 | envir = pipelineProto[[fnName]])
118 | return(NULL)
119 | })
120 | return(NULL)
121 | })
122 | metaPipeline@pipeline <- pipeline
123 | metaPipeline@pipelinePrototype <- pipelineProto
124 | return(metaPipeline)
125 | }
126 |
127 | #' @rdname exportAsMetaPipeline
128 | setMethod(
129 | f = "exportAsMetaPipeline",
130 | signature = "BaseAnalysisPipeline",
131 | definition = .exportAsMetaPipeline
132 | )
133 |
134 |
135 | #' @name getPipelinePrototype
136 | #' @rdname getPipelinePrototype
137 | #' @title Obtain the prototype of the functions in the pipeline
138 | #' @param metaPipelineObj A \code{MetaAnalysisPipeline} object
139 | #' @details This method returns the prototype of functions in the pipeline and their respective arguments as \code{proto} object.
140 | #' Functions in the pipeline can be accessed easily by using the '$' operator, and within the functions the arguments can
141 | #' be accessed the same way. These can be accessed and set to new values. This pipeline prototype can then be passed to the
142 | #' \code{createPipelineInstance} method which will instantiate an executable pipeline with the inputs set in the prototype
143 | #' @return An object of class \code{proto} from the 'proto' package
144 | #' @family Package core functions
145 | #' @examples
146 | #' \dontrun{
147 | #' pipelineObj <- AnalysisPipeline(input = iris)
148 | #' pipelineObj %>>% univarCatDistPlots(uniCol = "Species") %>>%
149 | #' exportAsMetaPipeline %>>% getPipelinePrototype
150 | #' }
151 | #' @export
152 | setGeneric(
153 | name = "getPipelinePrototype",
154 | def = function(metaPipelineObj){
155 | standardGeneric("getPipelinePrototype")
156 | }
157 | )
158 |
159 | .getPipelinePrototype <- function(metaPipelineObj){
160 | return(metaPipelineObj@pipelinePrototype)
161 | }
162 |
163 | #' @rdname getPipelinePrototype
164 | setMethod(
165 | f = "getPipelinePrototype",
166 | signature = "MetaAnalysisPipeline",
167 | definition = .getPipelinePrototype
168 | )
169 |
170 |
171 | #' @name createPipelineInstance
172 | #' @rdname createPipelineInstance
173 | #' @title Create a Pipeline object from a meta-pipeline
174 | #' @param metaPipelineObj A \code{MetaAnalysisPipeline} object
175 | #' @param newParams Either a nested named list containing all the functions in the pipeline, their arguments and
176 | #' corresponding values (OR) an object of class \code{proto} which is a pipeline prototype, with the new values of the arguments
177 | #' set. Refer the \code{getPipelinePrototype} method.
178 | #' @details This method instantiates a Pipeline object (both \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline}) from
179 | #' a meta-pipeline as well as an object containing the new set of values for the arguments of all the functions in the pipeline.
180 | #' @return A Pipeline object
181 | #' @family Package core functions
182 | #' @examples
183 | #' \dontrun{
184 | #' pipelineObj <- AnalysisPipeline(input = iris)
185 | #' pipelineObj %>>% univarCatDistPlots(uniCol = "Species") -> pipelineObj
186 | #' pipelineObj %>>% exportAsMetaPipeline -> exportedMetaPipeline
187 | #' exportedMetaPipeline %>>%
188 | #' createPipelineInstance(newParams = exportedMetaPipeline %>>%
189 | #' getPipelinePrototype)
190 | #' }
191 | #' @export
192 | setGeneric(
193 | name = "createPipelineInstance",
194 | def = function(metaPipelineObj, newParams){
195 | standardGeneric("createPipelineInstance")
196 | }
197 | )
198 |
199 | .createPipelineInstance <- function(metaPipelineObj, newParams){
200 |
201 | if(metaPipelineObj@type == "batch"){
202 | pipelineObj <- AnalysisPipeline()
203 | }else if(metaPipelineObj@type == "streaming"){
204 | pipelineObj <- StreamingAnalysisPipeline()
205 | }
206 |
207 | pipelineObj@pipeline <- metaPipelineObj@pipeline
208 |
209 | newParamList <- newParams
210 | if(any(class(newParams) == "proto")){
211 | names(newParams) %>>% grep(pattern = "^[.]", value = T, invert = T ) -> fnNames
212 |
213 | newParamList <- purrr::imap(fnNames, function(fn, nfn){
214 | fnEnvir <- get(fn, envir = newParams)
215 | fnEnvir %>>% names %>>% grep(pattern = "^[.]", invert = T, value = T ) -> argNames
216 | params <- mget(x = argNames, envir = newParams[[fn]])
217 | params <- purrr::imap(params, function(p, np){
218 | if(class(p) == "formula"){
219 | if(analysisPipelines::isDependencyParam(p)){
220 | p <- paste(np, "~", analysisPipelines::getTerm(p)) %>>% as.formula
221 | }
222 | #TODO: Deal with normal formula parameters
223 | }
224 | return(p)
225 | })
226 | return(params)
227 | })
228 | names(newParamList) <- fnNames
229 | }
230 |
231 | # Match pipeline table order
232 | tblOrder <- match(pipelineObj@pipeline$operation, names(newParamList))
233 | newParamList <- newParamList[tblOrder]
234 |
235 | #Match argument list orders
236 | newParamList <- purrr::imap(newParamList, function(params, fnName){
237 | pipelineParams <- pipelineObj@pipeline %>>% dplyr::filter(.data$operation == fnName)
238 | pipelineParams <- unlist(pipelineParams$parameters, recursive = F)
239 | argOrder <- match(names(pipelineParams), names(params))
240 | params <- params[argOrder]
241 | return(params)
242 | })
243 |
244 | names(newParamList) <- NULL
245 | pipelineObj@pipeline %>>% dplyr::mutate(parameters = newParamList) -> pipelineObj@pipeline
246 |
247 | return(pipelineObj)
248 | }
249 |
250 | #' @rdname createPipelineInstance
251 | setMethod(
252 | f = "createPipelineInstance",
253 | signature = "MetaAnalysisPipeline",
254 | definition = .createPipelineInstance
255 | )
256 |
257 | #' A method definition for visualizing meta-pipelines, called when the 'visualizePipeline' method is called against the
258 | #' \code{MetaAnalysisPipeline} signature
259 | #' @name .visualizeMetaPipeline
260 | #' @keywords internal
261 | .visualizeMetaPipeline <- function(object){
262 | object %>>% createPipelineInstance(object@pipelinePrototype) -> sampleObj
263 | vis <- NULL
264 | sampleObj %>>% setLoggerDetails(target = "none") -> sampleObj
265 | sampleObj %>>% prepExecution -> sampleObj
266 | sampleObj %>>% visualizePipeline -> vis
267 | return(vis)
268 | }
269 |
270 | #' @rdname visualizePipeline
271 | setMethod(
272 | f = "visualizePipeline",
273 | signature = "MetaAnalysisPipeline",
274 | definition = .visualizeMetaPipeline
275 | )
276 |
277 |
278 | #' A method definition for saving meta-pipelines, called when the 'savePipeline' method is called against the
279 | #' \code{MetaAnalysisPipeline} signature
280 | #' @name .saveMetaPipeline
281 | #' @keywords internal
282 | .saveMetaPipeline <- function(object, path){
283 | tryCatch({
284 | .registry <- getRegistry()
285 | listToBeSaved <- c("object", ".registry", getRegistry()$functionName, getRegistry()$exceptionHandlingFunction)
286 | save(list = listToBeSaved,file = path)
287 | futile.logger::flog.info("|| Meta-pipeline and registry saved successfully at path '%s' ||", path,
288 | name = "logger.base")
289 | },error = function(e){
290 | futile.logger::flog.error(e, name = "logger.base")
291 | stop()
292 | }, warning = function(w){
293 | futile.logger::flog.warn(w, name = "logger.base")
294 | })
295 | }
296 |
297 | #' @rdname savePipeline
298 | setMethod(
299 | f = "savePipeline",
300 | signature = "MetaAnalysisPipeline",
301 | definition = .saveMetaPipeline
302 | )
303 |
304 | #' @name loadMetaPipeline
305 | #' @title Load a meta-pipeline
306 | #' @param path the path at which the .Rds file containing the pipeline is located
307 | #' @details This function loads a meta-pipeline from a file system, and returns the meta-pipeline object, which can be assigned
308 | #' to an object in the environment.
309 | #' @details Note - When a meta-pipeline is loaded, the existing registry is overwritten with the registry saved with the
310 | #' meta-pipeline
311 | #' @return An \code{MetaAnalysisPipeline} object
312 | #' @family Package core functions
313 | #' @examples
314 | #' \dontrun{
315 | #' loadMetaPipeline(path = "./metaPipeline.RDS")
316 | #' }
317 | #' @export
318 | loadMetaPipeline <- function(path){
319 | tryCatch({
320 | object <- NULL
321 | futile.logger::flog.warn("|| The existing registry will be overwritten with the registry from the RDS file ||",
322 | name = "logger.base")
323 | load(path, envir = environment())
324 | functionNames = setdiff(ls(envir = environment()), c("path", "object", ".registry"))
325 | eval(parse(text = paste0(".setRegistry(.registry)")))
326 | lapply(functionNames, function(x){
327 | assign(x, get(x, environment()), globEnv)
328 | })
329 |
330 | return(object)
331 | },error = function(e){
332 | futile.logger::flog.error(e, name = "logger.base")
333 | stop()
334 | })
335 | }
336 |
337 |
338 |
--------------------------------------------------------------------------------
/R/core-streaming-functions.R:
--------------------------------------------------------------------------------
1 | ##################################################################################################
2 | # Title: Reusable pipelines for streaming analyses
3 | # Author: Naren Srinivasan
4 | # Created on: July 12, 2018
5 | # Description: An R package version - Currently supports Apache Spark Structured Streaming
6 | ##################################################################################################
7 |
8 | # TO DO
9 | # - Add schema checks
10 | # - Add ability to initialize without input, and check in generateOutput if no input has been initialized
11 | # - Remove workingInput - DONE
12 | # - Test loadPipeline function
13 |
14 | #' @include core-functions.R
15 | NULL
16 |
17 | #' @name StreamingAnalysisPipeline-class
18 | #' @rdname StreamingAnalysisPipeline-class
19 | #' @title Class for constructing Analysis Pipelines for streaming analyses
20 | #' @details Inherits the base class \link{BaseAnalysisPipeline} class which holds the metadata including the registry of available functions,
21 | #' the data on which the pipeline is to be applied, as well as the pipeline itself
22 | #' @details This class currently only supports Apache Spark Structured Streaming, implemented through the SparkR interface
23 | #' @slot input The input Spark DataFrame on which analysis is to be performed
24 | #' @slot originalSchemaDf Empty Spark DataFrame representing the schema of the input
25 | #' @family Package core functions for Streaming Analyses
26 | #' @include core-functions.R
27 | #' @exportClass StreamingAnalysisPipeline
28 | #' @export StreamingAnalysisPipeline
29 |
30 |
31 | StreamingAnalysisPipeline <- setClass("StreamingAnalysisPipeline",
32 | slots = c(
33 | input = "ANY",
34 | #Should be a SparkDataFrame, but unable to specify as SparkR is not distributed on CRAN
35 | originalSchemaDf = "ANY"
36 | ), contains = "BaseAnalysisPipeline")
37 |
38 | #' StreamingAnalysisPipeline constructor
39 | #' @docType methods
40 | #' @rdname initialize-methods
41 | #' @title Constructor for the \code{StreamingAnalysisPipeline} object
42 | #' @include core-functions.R
43 | #' @keywords internal
44 |
45 | setMethod(
46 | f = "initialize",
47 | signature = "StreamingAnalysisPipeline",
48 | definition = function(.Object,input)
49 | {
50 | .Object@input <- input
51 |
52 | ## Calling the parent constructor
53 | .Object <- methods::callNextMethod(.Object)
54 | return(.Object)
55 | }
56 | )
57 |
58 | .checkSparkDataFrame <- function(obj){
59 | if(class(obj) != "SparkDataFrame"){
60 | futile.logger::flog.error("|| The input should be of class 'SparkDataFrame' from the 'SparkR' package ||",
61 | name = "logger.base")
62 | stop()
63 | }
64 | }
65 |
66 | .executeStream<- function(object){
67 |
68 | tryCatch({
69 |
70 | futile.logger::flog.info("|| Pipeline Execution STARTED ||" , name='logger.execution')
71 |
72 | outputCache <- .getCache()
73 |
74 | topOrder <- object@pipelineExecutor$topologicalOrdering
75 | dplyr::left_join(object@pipeline, getRegistry(), by = c("operation" = "functionName")) %>>%
76 | dplyr::left_join(object@pipelineExecutor$topologicalOrdering, by = c("id" = "id")) -> pipelineRegistryOrderingJoin
77 |
78 | batches <- unique(pipelineRegistryOrderingJoin$level)
79 | numBatches <- max(as.numeric(batches))
80 |
81 |
82 | # Iterate across batches i.e. sets of independent functions
83 | lapply(batches, function(x, object, pipelineRegistryOrderingJoin, outputCache){
84 |
85 | pipelineRegistryOrderingJoin %>>% dplyr::filter(.data$level == x) -> functionsInBatch
86 |
87 | ## Function execution in a stream
88 | lapply(functionsInBatch$id, function(y, object, functionsInBatch, outputCache){
89 |
90 | functionsInBatch %>>% dplyr::filter(.data$id == y) %>>% as.list -> funcDetails
91 |
92 | futile.logger::flog.info("|| Function ID '%s' named '%s' STARTED on the '%s' engine ||",
93 | funcDetails$id, funcDetails$operation, funcDetails$engine,
94 | name='logger.func')
95 |
96 |
97 | # Set parameters
98 |
99 | params <- unlist(funcDetails$parameters, recursive = F)
100 | dep <- unique(unlist(funcDetails$dependencies, recursive = F))
101 | depTerms <- paste0("f", dep)
102 |
103 | # Datasets passed as a formula are updated here
104 |
105 | params <- lapply(params, function(p, depTerms, outputCache){
106 | if(class(p) == "formula"){
107 | isDepParam <- analysisPipelines::isDependencyParam(p)
108 | if(isDepParam){
109 | formulaTerm <- analysisPipelines::getTerm(p)
110 | argName <- analysisPipelines::getResponse(p)
111 | if(formulaTerm %in% depTerms){
112 |
113 | ## Formula of previous function in pipeline
114 | actualParamObjectName <- paste0(formulaTerm, ".out")
115 | p <- get(actualParamObjectName, envir = outputCache)
116 | }
117 | }
118 | }
119 |
120 | return(p)
121 | }, depTerms, outputCache)
122 |
123 | # No type conversion for Streaming pipelines
124 |
125 | if(funcDetails$isDataFunction){
126 | # Not passed as a formula
127 | if(any(class(params[[1]]) == "rlang_fake_data_pronoun")){
128 | # Checking for outAsIn
129 | if(funcDetails$outAsIn && funcDetails$id != "1"){
130 | dataOpFn <- paste0("f", as.numeric(funcDetails$id) - 1)
131 | actualDataObjectName <- paste0(dataOpFn, ".out")
132 | params[[1]] <- get(actualDataObjectName, envir = outputCache)
133 | }else{
134 | # On original input
135 | params[[1]]<- object@input
136 | }
137 | }
138 | }
139 |
140 | #Call
141 | startFunc <- Sys.time()
142 | args <- params
143 | output <- tryCatch({do.call(what = funcDetails$operation,
144 | args = args)},
145 | error = function(e){
146 | futile.logger::flog.error("|| ERROR Occurred in Function ID '%s' named '%s'. EXITING PIPELINE EXECUTION. Calling Exception Function - '%s' ||",
147 | funcDetails$id, funcDetails$operation, funcDetails$exceptionHandlingFunction,
148 | name='logger.func')
149 | do.call(funcDetails$exceptionHandlingFunction,
150 | list(error = e))
151 |
152 | })
153 |
154 | endFunc <- Sys.time()
155 | funcExecTime <- endFunc - startFunc
156 |
157 | opName <- paste0("f", funcDetails$id, ".out") #eg: f1.out
158 | if(funcDetails$storeOutput){
159 | assign(opName, value = output, envir = outputCache)
160 | }else{
161 | #Check if there are dependent children
162 | fromList <- object@pipelineExecutor$dependencyLinks$from
163 | if(funcDetails$id %in% fromList){
164 | assign(opName, value = output, envir = outputCache)
165 | }
166 | }
167 |
168 |
169 | futile.logger::flog.info("|| NEW MICRO_BATCH PROCESSED for Function ID '%s' named '%s' in %s seconds ||",
170 | funcDetails$id, funcDetails$operation, funcExecTime,
171 | name='logger.func')
172 |
173 | }, object, functionsInBatch, outputCache)
174 |
175 | }, object, pipelineRegistryOrderingJoin, outputCache)
176 |
177 | object@output <- mget(ls(outputCache), envir = outputCache)
178 | rm(list = ls(outputCache), envir = outputCache)
179 |
180 | return(object)
181 | },error = function(e){
182 | futile.logger::flog.error(e, name = "logger.base")
183 | stop()
184 | })
185 | }
186 |
187 | .generateStreamingOutput <- function(object){
188 | tryCatch({
189 |
190 | object %>>% initializeLoggers
191 |
192 | inputToExecute <- object@input
193 |
194 | if(class(inputToExecute) != "SparkDataFrame"){
195 | m <- "This streaming pipeline has not been initialized with a SparkDataFrame. Please use the setInput() function to do so."
196 | futile.logger::flog.error(m, name = 'logger.pipeline')
197 | stop(m)
198 | }
199 |
200 | ## Check engine setup
201 | object %>>% assessEngineSetUp -> engineAssessment
202 | engineAssessment %>>% dplyr::filter(.data$requiredForPipeline == T) -> requiredEngines
203 |
204 | if(!all(requiredEngines$isSetup)){
205 | m <- paste0("All engines required for the pipelines have not been configured. ",
206 | "Please use the analysisPipelines::assessEngineSetUp() function to check")
207 | futile.logger::flog.error(m, name = 'logger.engine.assessment')
208 | stop(m)
209 | }
210 |
211 | if(nrow(object@pipelineExecutor$topologicalOrdering) == 0){
212 | object %>>% prepExecution -> object
213 | }
214 |
215 | object %>>% .executeStream -> object
216 |
217 | return(object)
218 |
219 | },error = function(e){
220 | futile.logger::flog.error(e, name = "logger.base")
221 | stop()
222 | })
223 | }
224 |
225 | #' @rdname generateOutput
226 | setMethod(
227 | f = "generateOutput",
228 | signature = "StreamingAnalysisPipeline",
229 | definition = .generateStreamingOutput
230 | )
231 |
--------------------------------------------------------------------------------
/R/r-helper-utilites-python.R:
--------------------------------------------------------------------------------
1 | #' @name setPythonEnvir
2 | #' @title Sets the python environment to be used
3 | #' @details Wrapper function over reticulate functions to set a python environment to be used
4 | #' @param type Type of python environment. Takes three possible values - 'conda' for Anaconda environments,
5 | #' 'virtualenv' for Virtual environments, and 'python' to manually set the python path to use
6 | #' @param pathOrEnvirName Name of the environment for Anaconda and Virtual environments,
7 | #' or the Python path when type is 'python'
8 | #' @family R helper utilities for Python
9 | #' @examples
10 | #' \dontrun{
11 | #' setPythonEnvir()
12 | #' }
13 | #' @export
14 | setPythonEnvir <- function(type = 'conda', pathOrEnvirName = 'base'){
15 | tryCatch({
16 | if(type == 'conda'){
17 | reticulate::use_condaenv(pathOrEnvirName, required = T)
18 | futile.logger::flog.info("|| Using conda environment of name '%s' ||", pathOrEnvirName,
19 | name = "logger.base")
20 | }else if(type == 'virtualenv'){
21 | reticulate::use_virtualenv(pathOrEnvirName, required = T)
22 | futile.logger::flog.info("|| Using virtual environment of name '%s' ||", pathOrEnvirName,
23 | name = "logger.base")
24 | }else if (type == 'python'){
25 | reticulate::use_python(pathOrEnvirName, required = T)
26 | futile.logger::flog.info("|| Using python at path: '%s' ||", pathOrEnvirName,
27 | name = "logger.base")
28 | }else{
29 | futile.logger::flog.error("|| Invalid type - Should be one of 'conda', 'virtualenv', or 'python' ||")
30 | }
31 | }, error = function(e){
32 | futile.logger::flog.error("|| %s ||", e, name = 'logger.base')
33 | })
34 | }
35 |
36 |
37 | #' @name getFeaturesForPyClassification
38 | #' @title Extracts selected columns from a data frame as a Python array
39 | #' @details Helper function, which when provided an R data frame and a set of column/ feature names,
40 | #' extracts them from the R data frame as a matrix and converts them to the equivalent Python array.
41 | #' @details Typically this function can be used when providing a feature matrix to a Python machine learning function
42 | #' @param dataset an R data frame
43 | #' @param featureNames Column names to be extracted from the R data frames. A character vector.
44 | #' @family R helper utilities for Python
45 | #' @examples
46 | #' \dontrun{
47 | #' getFeaturesForPyClassification(dataset = iris,
48 | #' featureNames = c("Sepal.Length", "Sepal.Width"))
49 | #' }
50 | #' @export
51 | getFeaturesForPyClassification <- function(dataset, featureNames){
52 | dataset %>% dplyr::select(!!featureNames) %>% as.matrix %>% reticulate::r_to_py() -> featureMatrix
53 | return(featureMatrix)
54 | }
55 |
56 | #' @name getTargetForPyClassification
57 | #' @title Extracts selected column from a data frame as a binary class Python array
58 | #' @param dataset an R data frame
59 | #' @param targetVarName Name of the target variable for classification. Should be a categorical variable.
60 | #' @param positiveClass Name of the class of the target variable which should be coded as '1'
61 | #' @details Helper function, which when provided an R dataframe and a binary categorical column,
62 | #' extracts it from the R data frame, converts it to 1/0 class coding, and converts it to a Python array
63 | #' @details Typically this function can be used to extract a target variable for a classifier to be provided to a
64 | #' Python machine learning function
65 | #' @family R helper utilities for Python
66 | #' @examples
67 | #' \dontrun{
68 | #' getTargetForPyClassification(dataset = iris,
69 | #' targetVarName = "Species", positiveClass = "setosa")
70 | #' }
71 | #' @export
72 | getTargetForPyClassification <- function(dataset, targetVarName, positiveClass){
73 | dataset %>% dplyr::mutate(target = ifelse(!!rlang::sym(targetVarName) == !!(positiveClass) , 1, 0)) %>% dplyr::select(target) %>%
74 | as.list() %>% unlist -> targetList
75 | names(targetList) <- NULL
76 | targetList %>% as.factor %>% reticulate::r_to_py() -> target
77 | return(target)
78 | }
79 |
80 |
--------------------------------------------------------------------------------
/R/spark-structured-streaming-utilities.R:
--------------------------------------------------------------------------------
1 | ######################################################################################################
2 | # Title: Utility functions for working with Spark through R
3 | # Author: Naren Srinivasan, Anoop S
4 | # Created on: August 24, 2018
5 | # Description: Functions to work with Spark, including Structured Streaming
6 | ######################################################################################################
7 |
8 |
9 | #' @name sparkRSessionCreateIfNotPresent
10 | #' @title Connect to a Spark session
11 | #' @details Loads the SparkR package and initializes a Spark session from R
12 | #' @param ... Arguments to sparkR.session
13 | #' @family Spark utilities
14 | #' @examples
15 | #' \dontrun{
16 | #' sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/"
17 | #' sparkMaster <- "local[1]"
18 | #' sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")
19 | #' sparkRSessionCreateIfNotPresent(master = sparkMaster,
20 | #' sparkPackages = sparkPackages)
21 | #' }
22 | #' @export
23 |
24 | sparkRSessionCreateIfNotPresent <- function(...){
25 | sparkHome <- if (is.null(list(...)[["sparkHome"]])) "" else list(...)[["sparkHome"]] # pick up the optional 'sparkHome' argument from '...' (it is also forwarded to sparkR.session)
26 | if(Sys.getenv("SPARK_HOME") == "" && sparkHome == ""){
27 | stop("SPARK_HOME environment variable is not set on the system, and sparkHome argument is empty")
28 | }
29 |
30 | if(!("SparkR" %in% installed.packages())){
31 | stop("SparkR package not installed. Please install from the $SPARK_HOME folder")
32 | }
33 |
34 | if(sparkHome == ""){
35 | .libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
36 | sparkHome <- Sys.getenv("SPARK_HOME")
37 | }else{
38 | .libPaths(c(file.path(sparkHome, "R", "lib"), .libPaths()))
39 | }
40 |
41 | SparkR::sparkR.session(...)
42 | }
43 |
44 | #' @name castKafkaStreamAsString
45 | #' @title Cast the key and value of a Kafka stream to string
46 | #' @details Takes in a Structured Stream from Kafka created from \code{read.stream(source = 'kafka', ...)} and returns
47 | #' a Structured Streaming DataFrame where the \code{key} and \code{value} from the Kafka stream are cast to string
48 | #' @param streamObj Spark Structured Streaming DataFrame returned by \code{read.stream} function with \code{source = 'kafka'}
49 | #' @return Updated Spark Structured Streaming DataFrame with key, value, topic and timestamp from the Kafka stream
50 | #' @family Spark utilities
51 | #' @export
52 |
53 | castKafkaStreamAsString <- function(streamObj){
54 | streamObj <- SparkR::selectExpr(streamObj, "CAST(key AS STRING)", "CAST(value AS STRING)","topic","timestamp")
55 | return(streamObj)
56 | }
57 |
58 | #' @name convertKafkaValueFromJson
59 | #' @title Parse the JSON value of a Kafka stream into a Structured Streaming DataFrame
60 | #' @details Takes in a Structured Streaming DataFrame returned by \code{castKafkaStreamAsString} and parses the JSON string
61 | #' contained in the \code{value} attribute of the Kafka stream into a DataFrame format, using the provided schema
62 | #' @param streamObj Spark Structured Streaming DataFrame which is returned by the \code{castKafkaStreamAsString} function
63 | #' @param schema A structType object created from SparkR specifying the schema of the json data present in the \code{value}
64 | #' attribute of the incoming Kafka stream
65 | #' @return Spark Structured Streaming DataFrame with the json data in the \code{value} attribute of the Kafka stream parsed
66 | #' into a DataFrame format
67 | #' @family Spark utilities
68 | #' @export
69 |
70 | convertKafkaValueFromJson <- function(streamObj, schema){
71 | streamObj <- SparkR::select(streamObj, SparkR::from_json(streamObj$value,
72 | schema = schema))
73 | return(streamObj)
74 | }
75 |
76 |
--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/R/sysdata.rda
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 |
2 | .onAttach <- function(libName, pkgName){
3 | loadPredefinedFunctionRegistry()
4 | }
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Table of contents
2 | 1. [An overview of the package](#overview)
3 | 2. [Usage](#Usage)
4 | 3. [Features](#Features)
5 |
6 | # An overview of the package
7 |
8 | In a typical data science workflow there are multiple steps involved from data aggregation, cleaning, exploratory analysis, modeling and so on. As the data science community matures, we are seeing that there are a variety of languages which provide better capabilities for specific steps in the data science workflow. *R* is typically used for data transformations, statistical models, and visualizations, while *Python* provides more robust functions for machine learning. In addition to this, *Spark* provides an environment to process high volume data - both as one-time/ batch or as streams.
9 |
10 | The job of today's data scientist is changing from one where they are married to a specific tool or language, to one where they are using all these tools for their specialized purposes. The key problem then becomes one of translation between these tools for seamless analysis. Additionally, in the work of a data scientist, there is a need to perform the same task repeatedly, as well as put certain analysis flows (or) pipelines into production to work on new data periodically, or work on streaming data.
11 |
12 | Recently, interfaces for using these various tools have been published. In terms of R packages, the *reticulate* package provides an interface to Python, and the *SparkR* and *sparklyr* packages provide an interface to Spark.
13 |
14 | The *analysisPipelines* package uses these interfaces to enable *Interoperable Pipelines*, i.e. the ability to compose and execute a reusable data science pipeline which can contain functions to be executed in an *R* environment, in a *Python* environment or in a *Spark* environment. These pipelines can be saved and loaded, to enable batch operation as datasets get updated with new data.
15 |
16 | The goal of the *analysisPipelines* package is to make the job of the data scientist easier and help them compose pipelines of analysis which consist of data manipulation, exploratory analysis & reporting, as well as modeling steps. The idea is for data scientists to use tools of their choice through an *R* interface, using this package.
17 | Essentially, it allows data scientists to:
18 |
19 | * Compose **reusable, interoperable** pipelines in a flexible manner
20 | * Leverage available utility functions for performing different analytical operations
21 | * Put these pipelines into production in order to execute repeatedly
22 | * Generate analysis reports by executing these pipelines
23 |
24 | ## Types of pipelines
25 |
26 | This package supports both *batch/ repeated* pipelines and *streaming* pipelines.
27 |
28 | For *batch* pipelines, the vision is to enable interoperable pipelines which execute efficiently with functions in *R*, *Spark* and *Python*
29 |
30 | For *streaming* pipelines, the package allows for streaming analyses through *Apache Spark Structured Streaming.*
31 |
32 | ## Classes and implementation
33 |
34 | The *analysisPipelines* package uses S4 classes and methods to implement all the core functionality. The fundamental class exposed in this package is the *BaseAnalysisPipeline* class on which most of the core functions are implemented. The user, however, interacts with the *AnalysisPipeline* and *StreamingAnalysisPipeline* classes for batch and streaming analysis respectively.
35 |
36 | ## Pipelining semantics
37 |
38 | The package stays true to the *tidyverse* pipelining style which also fits nicely into the idea of creating pipelines. The core mechanism in the package is to instantiate a pipeline with data and then pipeline required functions to the object itself.
39 |
40 | The package allows the use of both the *magrittr* pipe **(%>%)** and the *pipeR* pipe **(%>>%)**.
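As a minimal sketch, using the pre-registered `univarCatDistPlots` function on `iris`, either pipe operator appends a step to a pipeline object:

```r
library(analysisPipelines)

obj <- AnalysisPipeline(input = iris)

# pipeR pipe
obj %>>% univarCatDistPlots(uniCol = "Species") -> pipelineA

# magrittr pipe - composes the same pipeline
obj %>% univarCatDistPlots(uniCol = "Species") -> pipelineB
```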
41 |
42 | ## Supported engines
43 |
44 | As of this version, the package supports functions executed on *R*, or *Spark* through the SparkR interface, as well as Python functions run through *reticulate* for batch pipelines. It also supports *Apache Spark Structured Streaming* pipelines for streaming analyses.
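To use Spark as an engine, a Spark session first needs to be created from R. The snippet below is adapted from the documentation of `sparkRSessionCreateIfNotPresent`; the Spark home path and Kafka package coordinates are placeholders for your environment:

```r
sparkHome <- "/path/to/spark/directory"   # placeholder - your local Spark installation
sparkMaster <- "local[1]"
sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")

sparkRSessionCreateIfNotPresent(sparkHome = sparkHome, master = sparkMaster,
                                sparkPackages = sparkPackages)
```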
45 |
46 | ## Bug reports and feature requests
47 |
48 | * Bug reports/ Feature requests and other thoughts can be raised [here](https://github.com/Mu-Sigma/analysis-pipelines/issues)
49 |
50 | ## Available vignettes
51 |
52 | This package contains 7 vignettes:
53 |
54 | 1. **Analysis pipelines - Core functionality and working with R data frames and functions** - This is the main vignette describing the package's core functionality, and explaining this through **batch** pipelines in just **R**
55 | 2. **Analysis pipelines for working with Spark DataFrames for one-time/ batch analyses** - This vignette describes creating **batch** pipelines to execute solely in a *Spark* environment
56 | 3. **Analysis pipelines for working with Python functions** - This vignette describes creating **batch** pipelines to execute solely in a *Python* environment
57 | 4. **Interoperable analysis pipelines** - This vignette describes creating and executing **batch** pipelines which are composed of functions executing across *supported engines*
58 | 5. **Streaming Analysis Pipelines for working with Apache Spark Structured Streaming** - This vignette describes setting up streaming pipelines on *Apache Spark Structured Streaming*
59 | 6. **Using pipelines inside Shiny widgets or apps** - A brief vignette which illustrates an example of using a pipeline inside a shiny widget with reactive elements and changing data
60 | 7. **An introduction to meta-pipelines** - This vignette illustrates the use of meta-pipelines
61 |
62 | When the package is installed and loaded, vignettes 1 & 7 have all the chunks evaluated. Other vignettes require specific Python and Spark configurations and hence all chunks are not evaluated as part of the package. However, an evaluated version of vignettes 2-5 can be found in the `knit-vignettes` folder in the Github project. Vignette 6 is a shiny document which can be run.
63 |
64 | # Usage
65 |
66 | ## Loading the package
67 |
68 | ```r
69 | library(analysisPipelines)
70 | ```
71 |
72 | ## Creating an analysisPipeline object
73 |
74 | An object of class *AnalysisPipeline* can be created like so:
75 |
76 | ```{r creating object, warning=F}
77 | obj <- AnalysisPipeline(input = iris)
78 | class(obj)
79 | ```
80 |
81 | While initializing the object, an input dataframe can be provided on which the pipeline should work, either by providing the path to a *.csv* file through the *filePath* argument, or by providing an R dataframe available in the session through the *input* argument.
82 |
83 | The *AnalysisPipeline* object has a set of getters, for retrieving various slots containing data and metadata required for pipeline execution. The most basic of them is the *getInput* method which retrieves the input dataframe with which the object has been initialized. If not initialized with a dataframe, the *setInput* method can be used to do so.
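As a brief sketch (argument name as per the `setInput` documentation), a pipeline created without data can have its input attached later:

```r
# Create a pipeline object without data (assuming the default constructor permits
# this, as the package does internally), then attach the input dataframe
emptyObj <- AnalysisPipeline()
emptyObj %>>% setInput(input = iris) -> emptyObj
emptyObj %>>% getInput %>>% str
```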
84 |
85 | ```{r printing object contents, warning=F}
86 | obj %>>% getInput %>>% str
87 | getRegistry()
88 | ```
89 |
90 | The *getRegistry* function retrieves the set of functions and their metadata available for pipelining. Any *AnalysisPipeline* object comes with a set of pre-registered functions which can be used **out-of-the-box**. Of course, the user can register her own functions, to be used in the pipeline. We will explore this later on.
91 |
92 | There are two types of functions which can be pipelined:
93 |
94 | * **Data functions** - These functions necessarily take their **first** argument as a dataframe. These are functions focused on performing operations on data. Specifically, the nomenclature *data functions* is used for those functions which work on the input dataframe set to the pipeline object, and perform some transformation or analysis on them. They help form the main *path* in a pipeline, constituting a linear flow from the input.
95 | * **Non-data functions** - These are auxiliary helper functions which are required in a pipeline, which may or may not operate on data. However, the *key* difference is that these functions do not operate on the **input (or some direct transformation of it)**. In essence, they help form auxiliary paths in the pipeline, which eventually merge into the main path.
96 |
97 | Both pre-registered and user-defined functions work with the *AnalysisPipeline* object in the same way i.e. regardless of who writes the function, they follow the same semantics.
98 |
99 | ## Creating a simple pipeline
100 |
101 | We'll now take a look at creating a simple pipeline with some of the pre-registered functions available in the registry. We pipeline the *univarCatDistPlots* function (a pre-registered utility function which generates a chart showing the distribution of a categorical variable in a dataset) by simply using the *pipe* or *double pipe* operator, and providing the required additional parameters apart from the *data* on which it needs to operate, as we have already initialized the *AnalysisPipeline* object with the data.
102 |
103 | Note that unless assigned to the same or another object, the pipeline does not get stored.
104 |
105 | We can access the details of the pipeline as a tibble through the `getPipeline` method.
106 |
107 | ```{r pipe demo 1, warning=F}
108 | # Running univariate categorical distribution plot on the constructed object
109 | # ?analysisPipelines::univarCatDistPlots
110 | obj1 <- obj %>>% univarCatDistPlots(uniCol = "Species", priColor = "blue", optionalPlots = 0, storeOutput = T)
111 | obj1 %>>% getPipeline
112 | ```
113 |
114 | # Features
115 |
116 | ## User-defined functions
117 |
118 | ### Registering your own function
119 |
120 | You can register your own *data* or *non-data* functions by calling `registerFunction`. This adds the user-defined function to the registry. The registry is maintained by the package and, once registered, functions can be used across pipeline objects. The registry can be viewed by calling the `getRegistry` function.
121 |
122 | ```r
123 | # Currently registered functions
124 | getRegistry()
125 | ```
126 |
127 |
128 | In order to register a function, first the function must be defined in the Global environment, before calling `registerFunction`.
129 |
130 | ```r
131 | bivariatePlots <- function(dataset, select_var_name_1, select_var_name_2,
132 | priColor = "blue", secColor='black') {
133 | x=dataset[, select_var_name_1]
134 | y=dataset[, select_var_name_2]
135 | bivarPlot <- ggplot2::ggplot(dataset, ggplot2::aes(x,y)) +
136 | ggplot2::geom_point(color=priColor,alpha=0.7) +
137 | ggplot2::geom_smooth(method = lm,color=secColor) +
138 | ggplot2::xlab(select_var_name_1) +
139 | ggplot2::ylab(select_var_name_2) +
140 | ggplot2::theme_bw() +
141 | ggplot2::ggtitle(paste('Bivariate plot for', select_var_name_1,
142 | 'and', select_var_name_2, sep=' ')) +
143 | ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 10),
144 | axis.text = ggplot2::element_text(size=10),
145 | axis.title=ggplot2::element_text(size=10))
146 | return(bivarPlot)
147 | }
148 |
149 | registerFunction(functionName = "bivariatePlots", heading = "Bivariate Analysis")
150 | ```
151 |
152 | ### Adding the newly registered function to a pipeline
153 |
154 | Now the newly registered user-defined function can be used as part of the pipeline, exactly as described before. For example, we add it to a pipeline, `obj2`, which already contains some functions. The function then gets added to the end of the pipeline.
155 |
156 | ```{r register function 2, warning=F}
157 | # Chaining the user-defined function to the object's pipeline where it was registered
158 | obj2 <- obj2 %>>%
159 | bivariatePlots(select_var_name_1 = 'Sepal.Length', select_var_name_2 = 'Sepal.Width',
160 | priColor = "blue", secColor = "black")
161 |
162 | # Printing the updated pipeline
163 | obj2 %>>% getPipeline
164 | ```
165 |
166 | ## Complex pipelines and formula semantics
167 |
168 | In addition to simple linear pipelines, more complex pipelines can also be defined. There are cases where the outputs of previous functions in the pipeline need to be passed as inputs to arbitrary parameters of subsequent functions.
169 |
170 | The package defines certain *formula* semantics to accomplish this. To illustrate how this works, we take the example of two simple user-defined functions, both of which simply return the color of the graph and the column on which the graph should be plotted (their definitions are sketched below).
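Their definitions could look like the sketch below; the `isDataFunction` and `firstArgClass` arguments used to register them as *non-data* functions follow the `registerFunction` documentation and should be treated as assumptions here:

```r
getColor <- function(color){
  return(color)
}

getColumnName <- function(columnName){
  return(columnName)
}

# Register both as non-data functions (flag names assumed as per the registerFunction docs)
registerFunction(functionName = "getColor", isDataFunction = FALSE, firstArgClass = "character")
registerFunction(functionName = "getColumnName", isDataFunction = FALSE, firstArgClass = "character")
```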
171 |
172 | Preceding outputs can be passed to subsequent functions simply by specifying a **formula** of the form '~f*id*' against the argument to which the output is to be passed. The *id* represents the ID of the function in the pipeline. For example, to pass the output of the function with ID '1' as an argument to a parameter of a subsequent function, the formula '~f1' is passed to that corresponding argument.
173 |
174 | ```r
175 | obj %>>% getColor(color = "blue") %>>% getColumnName(columnName = "Sepal.Length") %>>%
176 | univarCatDistPlots(uniCol = "Species", priColor = ~f1, optionalPlots = 0, storeOutput = T) %>>%
177 | outlierPlot(method = "iqr", columnName = ~f2, cutoffValue = 0.01, priColor = ~f1 , optionalPlots = 0) -> complexPipeline
178 |
179 | complexPipeline %>>% getPipeline
180 | complexPipeline %>>% generateOutput -> op
181 | op %>>% getOutputById("4")
182 | ```
183 |
184 | ## Interoperable pipelines
185 |
186 | **Interoperable pipelines** containing functions operating on different engines such as R, Spark and Python can be configured and executed through the **analysisPipelines** package. Currently, the package supports interoperable pipelines containing *R* and *Spark* batch functions.
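For pipelines that mix in Python machine learning functions, the package also ships R helper utilities (the examples below are adapted from their documentation) to set the Python environment and to convert R data into Python arrays:

```r
# Point reticulate at a Python environment (here, the 'base' conda environment)
setPythonEnvir(type = 'conda', pathOrEnvirName = 'base')

# Convert an R data frame into Python arrays for a classifier
X <- getFeaturesForPyClassification(dataset = iris,
                                    featureNames = c("Sepal.Length", "Sepal.Width"))
y <- getTargetForPyClassification(dataset = iris,
                                  targetVarName = "Species", positiveClass = "setosa")
```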
187 |
188 |
189 |
190 | ## Pipeline visualization
191 |
192 | Pipelines can be visualized as directed graphs, providing information about the engines being used, function dependencies and so on.
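For example, the complex pipeline defined above can be visualized directly:

```r
complexPipeline %>>% visualizePipeline
```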
193 |
194 |
195 |
196 |
197 | ## Report generation
198 |
199 | Outputs generated from pipelines can easily be exported to formatted reports, showcasing the results, the generating pipeline, as well as a peek at the data.
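For example, a pipeline object with stored outputs can be exported to an HTML report in a chosen directory:

```r
obj2 %>>% generateReport(path = ".")
```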
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 | ## Meta-pipelines
209 |
210 | The meta-pipeline construct allows users to export a pipeline they have created for a particular use case into a general analysis flow which can be reused with a different dataset and a different set of parameters. In a pipeline, the data can change (while retaining the same schema), but the set of parameters for the functions stays fixed. In a meta-pipeline, only the analysis flow, function dependencies and so on are retained; the specific parameters for each of the functions can be set afresh for a new use case.
211 |
212 | The objective of a meta-pipeline is to define and execute reusable analysis flows. They can be used to:
213 | * Document best practices for a particular problem
214 | * Templatize analyses for particular situations
215 |
216 | Meta-pipelines are created by exporting existing pipelines, and new pipelines can be instantiated from a meta-pipeline, with an easy-to-use method to set new parameter values, as shown in the sketch below.
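A minimal sketch of this workflow, adapted from the package documentation (setting new argument values on the prototype is shown only as a comment, since the exact access pattern depends on the structure of the 'proto' object):

```r
# Export an existing pipeline as a meta-pipeline
obj2 %>>% exportAsMetaPipeline -> metaPipelineObj

# Obtain the pipeline prototype, which holds the functions and their arguments
metaPipelineObj %>>% getPipelinePrototype -> pipelineProto

# ... set new argument values on 'pipelineProto' for the new use case ...

# Instantiate a new pipeline from the meta-pipeline with the updated prototype
metaPipelineObj %>>% createPipelineInstance(newParams = pipelineProto) -> newPipelineObj
```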
217 |
218 | ## Execution
219 |
220 | The 'analysisPipelines' package internally converts the pipeline defined by the user into a **directed graph** which captures the dependencies of each function in the pipeline on the data, other arguments, as well as the outputs of other functions.
221 |
222 | ### Topological sort and ordering
223 |
224 | When output needs to be generated, the pipeline is first *prepped* by performing a **topological sort** of the directed graph, identifying *sets (or batches)* of independent functions and a sequence of *batches* for execution. A later release of the package will allow for parallel execution of these independent functions.
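The prep step can also be triggered explicitly through the `prepExecution` method (a minimal sketch, using the complex pipeline defined earlier):

```r
# Explicitly prep the pipeline; the topological ordering and dependency map
# are stored in the object's pipelineExecutor slot
complexPipeline %>>% prepExecution -> complexPipeline
```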
225 |
226 | ### Memory management & garbage cleaning
227 |
228 | Memory is managed efficiently, by only storing outputs which the user has explicitly specified, or temporarily storing intermediate outputs required for subsequent functions **only until** they are required for processing. Garbage cleaning is performed after the execution of each *batch* in order to manage memory effectively.
229 |
230 | ### Type conversions
231 |
232 | In the case of *interoperable pipelines* executing across multiple engines such as *R, Spark and Python*, conversions between data types in the different engines are **minimized** by identifying the optimal number of type conversions before execution starts.
233 |
234 | ## Logging & Execution times
235 |
236 | The package provides logging capabilities for the execution of pipelines, as you might have noted when the output was generated in the sections above. By default, logs are written to the *console*, but the user can instead specify an output file to which the logs should be written, through the `setLoggerDetails` function.
237 |
238 | Logs capture errors, as well as provide information on the steps being performed, execution times and so on.
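A sketch of configuring file-based logging is shown below; the argument names (`target`, `targetFile`) are assumptions and should be verified against `?setLoggerDetails`.

```r
# Redirect pipeline execution logs to a file instead of the console
obj2 %>>% setLoggerDetails(target = "file", targetFile = "pipelineExecution.out") -> obj2

# Inspect the current logger configuration
obj2 %>>% getLoggerDetails
```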
239 |
240 |
241 |
242 | ## Custom exception-handling
243 |
244 | By default, when a function is registered, a generic exception-handling function which captures the R error message in case of an error is registered against it in the registry. The user can define a custom exception-handling function by defining it and providing it at the time of registration. The function should take one argument: the error object.
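A sketch of such a custom handler is shown below; passing it via an `exceptionHandlingFunction` argument to `registerFunction` is an assumption based on the structure of the function registry.

```r
# Hypothetical custom exception handler: must accept a single error object
customPipelineException <- function(error) {
  warning(paste("Pipeline step failed with:", conditionMessage(error)))
}

# Provide the handler at registration time (argument name is an assumption)
registerFunction(functionName = "bivariatePlots", heading = "Bivariate Analysis",
                 exceptionHandlingFunction = "customPipelineException")
```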
245 |
246 |
247 |
248 |
--------------------------------------------------------------------------------
/analysisPipelines.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: No
4 | SaveWorkspace: No
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageBuildArgs: --resave-data
22 | PackageCheckArgs: --as-cran
23 | PackageRoxygenize: rd,collate,namespace,vignette
24 |
--------------------------------------------------------------------------------
/data-raw/predefFunctions.R:
--------------------------------------------------------------------------------
1 | ##################################################################################################
2 | # Title: Predefined functions as part of package
3 | # Version: 18.08.01
4 | # Created on: August 23, 2018
5 | # Description: Reproducible code to generate list of predefined functions
6 | ##################################################################################################
7 | library(pipeR) # provides the %>>% pipe operator used below
8 | ##################################################################################################
9 | # Working with batch pipelines - data frames in R, Spark or Python
10 | ##################################################################################################
11 |
12 | ##################################################################################################
13 | # EDA
14 | ##################################################################################################
15 | .batchPredefFunctions <- data.frame(functionName = c("univarCatDistPlots"),
16 | heading = c("Univariate Distribution Categorical"),
17 | engine = c("r"),
18 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))),
19 | isDataFunction = TRUE,
20 | firstArgClass = "",
21 | stringsAsFactors = F)
22 |
23 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "outlierPlot",
24 | heading = "Univariate Outlier",
25 | # outAsIn = FALSE,
26 | engine = "r",
27 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))),
28 | isDataFunction = TRUE,
29 | firstArgClass = "") -> .batchPredefFunctions
30 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "multiVarOutlierPlot",
31 | heading = "Multivariate Outlier",
32 | engine = "r",
33 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))),
34 | isDataFunction = T,
35 | firstArgClass = "") -> .batchPredefFunctions
36 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "ignoreCols",
37 | heading = "Ignore Columns",
38 | engine = "r",
39 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))),
40 | isDataFunction = TRUE,
41 | firstArgClass = "") -> .batchPredefFunctions
42 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "getFeaturesForPyClassification",
43 | heading = "",
44 | engine = "r",
45 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))),
46 | isDataFunction = T,
47 | firstArgClass = "") -> .batchPredefFunctions
48 | .batchPredefFunctions %>>% dplyr::add_row(functionName = "getTargetForPyClassification",
49 | heading = "",
50 | engine = "r",
51 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))),
52 | isDataFunction = TRUE,
53 | firstArgClass = "") -> .batchPredefFunctions
54 |
55 | ##################################################################################################
56 |
57 | ##################################################################################################
58 | # Working with Streaming pipelines - Currently supports Apache Spark Structured Streaming
59 | ##################################################################################################
60 |
61 | ##################################################################################################
62 | # Kafka Streams as input
63 | ##################################################################################################
64 |
65 | .streamingPredefFunctions <- data.frame(functionName = c("castKafkaStreamAsString"),
66 | heading = c("Cast Kafka stream to a string"),
67 | engine = c("spark-structured-streaming"),
68 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))),
69 | isDataFunction = TRUE,
70 | firstArgClass = "",
71 | stringsAsFactors = F)
72 |
73 | .streamingPredefFunctions %>>% dplyr::add_row(functionName = "convertKafkaValueFromJson",
74 | heading = "Convert Kafka Value from JSON",
75 | engine = c("spark-structured-streaming"),
76 | exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))),
77 | isDataFunction = TRUE,
78 | firstArgClass = ""
79 | ) -> .streamingPredefFunctions
80 |
81 |
82 | devtools::use_data(.batchPredefFunctions, .streamingPredefFunctions, internal = TRUE, overwrite = T)
83 |
--------------------------------------------------------------------------------
/inst/data-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/data-icon.png
--------------------------------------------------------------------------------
/inst/logging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/logging.png
--------------------------------------------------------------------------------
/inst/output-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/output-icon.png
--------------------------------------------------------------------------------
/inst/param-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/param-icon.png
--------------------------------------------------------------------------------
/inst/pipelineViz1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/pipelineViz1.png
--------------------------------------------------------------------------------
/inst/pipelineViz2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/pipelineViz2.png
--------------------------------------------------------------------------------
/inst/python-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/python-logo.png
--------------------------------------------------------------------------------
/inst/python/sampleFunctions.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn import datasets
3 | from sklearn import metrics
4 | from sklearn.tree import DecisionTreeClassifier
5 |
6 | def getColMeans(df):  # returns a list containing the mean of each column of a pandas DataFrame
7 | meanList = []
8 | for x in df.columns:
9 | meanList.append(df[x].mean())
10 | return meanList
11 |
12 | def decisionTreeTrainAndTest(data, target, newData):  # fits a decision tree on (data, target) and returns predictions for newData
13 | model = DecisionTreeClassifier()
14 | model.fit(data, target)
15 | testPred = model.predict(newData)
16 | return testPred
17 |
18 |
--------------------------------------------------------------------------------
/inst/r-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/r-logo.png
--------------------------------------------------------------------------------
/inst/report.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Analysis Pipeline Results"
3 |
4 | subtitle: '`r format(Sys.Date(), "%B %d, %Y")`'
5 |
6 | output: html_document
7 |
8 | params:
9 |   obj: !r analysisPipelines::AnalysisPipeline()
10 | ---
11 |
12 | ## Pipeline Visualization
13 |
14 | ```{r echo=FALSE, warning=FALSE, comment=FALSE, message=FALSE, results='asis', fig.width = 12, out.width = '100%'}
15 | obj <- params$obj
16 | input <- obj@input
17 | pipelineDetails <- obj@pipeline
18 | output <- obj@output
19 |
20 |
21 | analysisPipelines::visualizePipeline(obj)
22 | ```
23 |
24 |
25 | ## Quick Peek
26 | ```{r quick peek,echo=FALSE,warning=FALSE,results='asis', fig.width = 12, out.width = '100%'}
27 | DT::datatable(head(input),options = list(scrollX = T, scrollY = T))
28 | ```
29 |
30 |
31 | ```{r, echo=FALSE, warning=FALSE, results='asis'}
32 |
33 | knitString <- ""
34 |
35 | storedOps <- pipelineDetails %>>% dplyr::filter(storeOutput == T)
36 |
37 | for(i in storedOps$id){
38 | opTable <- storedOps %>>% dplyr::filter(id == i)
39 | obj %>>% getOutputById(i) -> op
40 | eval(parse(text = paste0("op_", i, " <- op")))
41 | knit_expanded <- paste0(
42 | "\n```{r chunk",i,",results='asis', fig.width = 12, out.width = '100%', echo=FALSE}
43 | \n\n
44 | cat('## ",opTable$heading," \n')
45 | \n\n
46 | op_", i, "
47 | \n```
48 | \n\n"
49 | )
50 | knitString <- paste0(knitString, knit_expanded)
51 | }
52 |
53 | ```
54 |
55 |
56 | `r paste(knitr::knit(text = knitString), collapse = '\n')`
57 |
58 |
--------------------------------------------------------------------------------
/inst/report1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/report1.png
--------------------------------------------------------------------------------
/inst/report2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/report2.png
--------------------------------------------------------------------------------
/inst/report3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/report3.png
--------------------------------------------------------------------------------
/inst/spark-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/spark-logo.png
--------------------------------------------------------------------------------
/inst/spark-structured-streaming-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mu-Sigma/analysis-pipelines/a7bfb1a0d5d251a42309b2430c11535be817dea9/inst/spark-structured-streaming-logo.png
--------------------------------------------------------------------------------
/inst/styles.css:
--------------------------------------------------------------------------------
1 | body {
2 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
3 | font-size: 14px;
4 | }
5 |
6 | td {
7 | padding-left: 10px;
8 | width: 50px;
9 | align:left;
10 | }
11 |
12 | table{
13 | width: 100%
14 | }
15 |
16 | /*h1{
17 | text-align:center;
18 | }*/
19 | .level2{
20 | width:77vw;
21 | margin-left: 20vw;
22 | }
23 | .fluid-row{
24 | position: relative;
25 | background-color: #162f47;
26 | font-weight: bold;
27 | color: white;
28 | width:98.8vw;
29 | left: -15px;
30 | padding: 0.3cm 0.5cm 0.5cm 0.3cm;
31 | z-index :100;
32 | box-shadow: 0px 2px 4px 0px rgba(0, 0, 0, 0.80);
33 | }
34 | h3{
35 | font-family: Arial, Helvetica, sans-serif;
36 |
37 | }
38 | #hideGridMsg{
39 | display:none;
40 | }
41 | .main-container{
42 | max-width:100vw !important;
43 | }
44 | h1{
45 | font-family: Arial, Helvetica, sans-serif;
46 | margin-left:2%;
47 | margin-top: 0px !important;
48 | margin-bottom: -15px !important;
49 | font-size: 34px !important;
50 | }
51 | .author{
52 | font-family: Arial, Helvetica, sans-serif;
53 | margin-left:2%;
54 | font-size: 16px;
55 | }
56 | h2{
57 | position: relative;
58 | left: -1%;
59 | bottom: 3px;
60 | border-bottom: 2px solid #444444;
61 | font-size: 24px;
62 | font-family: Arial, Helvetica, sans-serif;
63 | font-weight: 600;
64 | color:#444444;
65 | }
66 | .subtitle{
67 | margin-left: 2%;
68 | font-size: 20px;
69 | }
70 | body{
71 | font-family: Arial, Helvetica, sans-serif;
72 | overflow-x: hidden;
73 | }
74 | head{
75 | font-family: Arial, Helvetica, sans-serif;
76 | }
77 | .mulogo{
78 | position: relative;
79 | float: right;
80 | right: -20vw;
81 | margin-top: -115px;
82 | z-index:100;
83 | }
84 | .client{
85 | position: relative;
86 | float: right;
87 | right:-27vw;
88 | height: 90px;
89 | width: 90px;
90 | margin-top: -115px;
91 | z-index:100;
92 | }
93 |
94 |
95 |
96 | h3{
97 |
98 | }
99 |
100 | /*
101 | h4{
102 | text-align:center;
103 | font-style:normal;
104 | }
105 | .date, .author {
106 | font-style: italic;
107 | }
108 | */
109 |
110 | tr:nth-child(even) {background-color: #f2f2f2}
111 | tr:hover {background-color: #f5f5f5}
112 |
113 | th {
114 | background-color: #ffffff;
115 | color: black;
116 | padding-left: 10px;
117 | width: 100px;
118 | }
119 |
120 | #var_dist_table{
121 | width: 100px;
122 | table-layout:fixed;
123 | }
124 |
125 | #post_trellis, #post_time_series, #post_pkg_details,#post_peek,#post_summ,#post_miss,#post_num_var,#post_cat_var,#post_bi_var,#post_prob_outlier,#post_corr_net,#post_corr_mat, #post_univar, #post_cluster, #post_factAnalysis,#post_cat_summ,#post_univar, #post_trellis,#post_factAnalysis,#post_cluster{
126 | font-weight: bold;
127 | box-shadow: 5px 5px 5px #888888;
128 | }
129 |
130 | /*
131 | #post_num_var, #post_cat_var, #post_bi_var, #post_miss{
132 | position: absolute; right: 10%;
133 | position: relative; right: 10%;
134 | }
135 |
136 | #package_details, #post_pkg_details, #post_peek, #post_summ{
137 | position: absolute; left: 300%;
138 | position: relative; top: 10%;
139 | }
140 | */
141 |
142 | #quickFilters{
143 | margin-bottom:10px;
144 | }
145 | .tocify{
146 |
147 | width:21vw !important;
148 | margin-left: -20px;
149 | background-color: white;
150 | border-color: white;
151 | margin: 150px 0px 20px 0px !important;
152 | border-radius: 1px;
153 | z-index: 1000;
154 | top:40px;
155 | max-width: 19vw !important;
156 | }
157 | .tocify-item{
158 |
159 | padding: 10px 10px 10px 20px !important;
160 | background-color: #424C55 !important;
161 | border-bottom: 1px solid #454545 !important;
162 | font-size: 14px;
163 | color:#d8d8d8 !important;
164 | border-radius: 1px !important;
165 | transition-property: all;
166 | -moz-transition-property: all;
167 | -webkit-transition-property: all;
168 | -o-transition-property: all;
169 |
170 | transition-duration: 250ms;
171 | -webkit-transition-duration: 250ms;
172 |
173 | }
174 | .tocify-item-hover {
175 | background: #F00;
176 | color: #FFF;
177 | }
178 |
179 |
180 | .tocify-subheader .tocify-item {
181 | padding-left: 45px !important;
182 | }
183 | .tocify-item:hover{
184 | background-color: #d8d8d8 !important;
185 |
186 |
187 | border-left: 5px solid #2d6396 !important;
188 | text-decoration: none !important;
189 | color:#2d6396 !important;
190 |
191 | }
192 |
193 |
194 | .toc-content{
195 | position: absolute;
196 | }
197 |
198 | .tocify-extend-page{
199 | height: 0px !important;
200 | }
201 |
202 | .list-group-item.active, .list-group-item.active:focus, .list-group-item.active:hover .list-group-item:hover{
203 | background-color: #ffffff !important;
204 | font-weight: bold !important;
205 |
206 | border-left: 5px solid #2d6396 !important;
207 | text-decoration: none !important;
208 | color:#2d6396 !important;
209 |
210 | }
211 |
212 | .row-fluid{
213 | margin-left:-26px;
214 | }
215 |
216 | html,body{
217 | height: 100%;
218 | }
219 | .fixedToc{
220 | top:-100px;
221 | }
222 |
--------------------------------------------------------------------------------
/man/AnalysisPipeline-class.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-batch.R
3 | \docType{class}
4 | \name{AnalysisPipeline-class}
5 | \alias{AnalysisPipeline-class}
6 | \alias{AnalysisPipeline}
7 | \title{Class for constructing Analysis Pipelines for batch/one-time analyses}
8 | \description{
9 | Class for constructing Analysis Pipelines for batch/one-time analyses
10 | }
11 | \details{
12 | Inherits from the base \link{BaseAnalysisPipeline} class, which holds the metadata including the registry of available functions,
13 | the data on which the pipeline is to be applied, as well as the pipeline itself
14 |
15 | Additionally, this class is meant to be used for batch/ one-time processing. Contains additional slots to
16 | hold the data frame to be used for the pipeline and associated schema
17 | }
18 | \section{Slots}{
19 |
20 | \describe{
21 | \item{\code{input}}{The input dataset on which analysis is to be performed}
22 |
23 | \item{\code{originalSchemaDf}}{Empty data frame representing the schema of the input}
24 | }}
25 |
26 | \seealso{
27 | Other Package core functions for batch/one-time analyses: \code{\link{checkSchema}},
28 | \code{\link{generateReport}},
29 | \code{\link{initialize,BaseAnalysisPipeline-method}}
30 | }
31 | \concept{Package core functions for batch/one-time analyses}
32 |
--------------------------------------------------------------------------------
/man/BaseAnalysisPipeline-class.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{class}
4 | \name{BaseAnalysisPipeline-class}
5 | \alias{BaseAnalysisPipeline-class}
6 | \alias{BaseAnalysisPipeline}
7 | \title{Base class for \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} objects}
8 | \description{
9 | Base class for \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} objects
10 | }
11 | \details{
12 | The class which holds the metadata including the registry of available functions,
13 | the data on which the pipeline is to be applied, as well as the pipeline itself, and serves
14 | as the base class for various types of Pipeline objects such as Batch and Streaming.
15 |
16 | This base class which contains the slots related to the registry, pipeline and output can be extended
17 | to create custom class for specific scenarios if required.
18 |
19 | In the documentation, objects of classes which are subclasses of this class are referred to as 'Pipeline' objects
20 | }
21 | \section{Slots}{
22 |
23 | \describe{
24 | \item{\code{pipeline}}{A tibble which holds functions to be called}
25 |
26 | \item{\code{pipelineExecutor}}{A list containing details of the execution, such as topological ordering of functions to be executed,
27 | dependency map of functions, as well as logger configuration}
28 |
29 | \item{\code{output}}{A list which holds all the functions output}
30 | }}
31 |
32 | \seealso{
33 | Other Package core functions: \code{\link{MetaAnalysisPipeline-class}},
34 | \code{\link{assessEngineSetUp}},
35 | \code{\link{checkSchemaMatch}},
36 | \code{\link{createPipelineInstance}},
37 | \code{\link{exportAsMetaPipeline}},
38 | \code{\link{generateOutput}},
39 | \code{\link{genericPipelineException}},
40 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
41 | \code{\link{getOutputById}},
42 | \code{\link{getPipelinePrototype}},
43 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
44 | \code{\link{initDfBasedOnType}},
45 | \code{\link{initialize,BaseAnalysisPipeline-method}},
46 | \code{\link{loadMetaPipeline}},
47 | \code{\link{loadPipeline}},
48 | \code{\link{loadPredefinedFunctionRegistry}},
49 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
50 | \code{\link{registerFunction}},
51 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
52 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
53 | \code{\link{updateObject}},
54 | \code{\link{visualizePipeline}}
55 | }
56 | \concept{Package core functions}
57 |
--------------------------------------------------------------------------------
/man/CheckColumnType.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-batch-eda-utilities.R
3 | \name{CheckColumnType}
4 | \alias{CheckColumnType}
5 | \title{Check for type of column}
6 | \usage{
7 | CheckColumnType(dataVector)
8 | }
9 | \arguments{
10 | \item{dataVector}{a data vector of a column}
11 | }
12 | \value{
13 | column Type
14 | }
15 | \description{
16 | Check for type of column
17 | }
18 | \details{
19 | Checking for type of columns in the datavector
20 | }
21 | \examples{
22 | CheckColumnType(iris$Sepal.Length)
23 | }
24 | \seealso{
25 | Other Package EDA Utilites functions: \code{\link{bivarPlots}},
26 | \code{\link{correlationMatPlot}},
27 | \code{\link{getDatatype}}, \code{\link{ignoreCols}},
28 | \code{\link{multiVarOutlierPlot}},
29 | \code{\link{outlierPlot}},
30 | \code{\link{univarCatDistPlots}}
31 | }
32 | \concept{Package EDA Utilites functions}
33 |
--------------------------------------------------------------------------------
/man/MetaAnalysisPipeline-class.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-meta-pipelines.R
3 | \docType{class}
4 | \name{MetaAnalysisPipeline-class}
5 | \alias{MetaAnalysisPipeline-class}
6 | \alias{MetaAnalysisPipeline}
7 | \title{Class for creating and working with meta-pipelines}
8 | \description{
9 | Class for creating and working with meta-pipelines
10 | }
11 | \details{
12 | This class works with the \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} classes, and allows the
13 | pipeline to be exported as meta-pipeline. A meta-pipeline is a construct, where the input dataset as well as the arguments
14 | to functions in the pipeline are not defined. Only the analysis flow and dependencies are stored.
15 | }
16 | \section{Slots}{
17 |
18 | \describe{
19 | \item{\code{pipeline}}{A tibble which holds functions to be called in the pipeline}
20 |
21 | \item{\code{pipelinePrototype}}{An object of class \code{proto} from the 'proto' package which maintains the prototype of the
22 | functions in the pipeline and their respective arguments}
23 |
24 | \item{\code{type}}{A string defining whether it is a batch or streaming pipeline. Acceptable values are 'batch' & 'streaming'}
25 | }}
26 |
27 | \seealso{
28 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
29 | \code{\link{assessEngineSetUp}},
30 | \code{\link{checkSchemaMatch}},
31 | \code{\link{createPipelineInstance}},
32 | \code{\link{exportAsMetaPipeline}},
33 | \code{\link{generateOutput}},
34 | \code{\link{genericPipelineException}},
35 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
36 | \code{\link{getOutputById}},
37 | \code{\link{getPipelinePrototype}},
38 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
39 | \code{\link{initDfBasedOnType}},
40 | \code{\link{initialize,BaseAnalysisPipeline-method}},
41 | \code{\link{loadMetaPipeline}},
42 | \code{\link{loadPipeline}},
43 | \code{\link{loadPredefinedFunctionRegistry}},
44 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
45 | \code{\link{registerFunction}},
46 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
47 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
48 | \code{\link{updateObject}},
49 | \code{\link{visualizePipeline}}
50 | }
51 | \concept{Package core functions}
52 |
--------------------------------------------------------------------------------
/man/StreamingAnalysisPipeline-class.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-streaming-functions.R
3 | \docType{class}
4 | \name{StreamingAnalysisPipeline-class}
5 | \alias{StreamingAnalysisPipeline-class}
6 | \alias{StreamingAnalysisPipeline}
7 | \title{Class for constructing Analysis Pipelines for streaming analyses}
8 | \description{
9 | Class for constructing Analysis Pipelines for streaming analyses
10 | }
11 | \details{
12 | Inherits from the base \link{BaseAnalysisPipeline} class, which holds the metadata including the registry of available functions,
13 | the data on which the pipeline is to be applied, as well as the pipeline itself
14 |
15 | This class currently only supports Apache Spark Structured Streaming, implemented through the SparkR interface
16 | }
17 | \section{Slots}{
18 |
19 | \describe{
20 | \item{\code{input}}{The input Spark DataFrame on which analysis is to be performed}
21 |
22 | \item{\code{originalSchemaDf}}{Empty Spark DataFrame representing the schema of the input}
23 | }}
24 |
25 | \concept{Package core functions for Streaming Analyses}
26 |
--------------------------------------------------------------------------------
/man/analysisPipelines.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/analysisPipelines_package.R
3 | \docType{package}
4 | \name{analysisPipelines}
5 | \alias{analysisPipelines}
6 | \alias{analysisPipelines-package}
7 | \title{analysisPipelines}
8 | \description{
9 | The package aims at enabling data scientists to compose pipelines of analysis which consist of data manipulation,
10 | exploratory analysis & reporting, as well as modeling steps. It also aims to enable data scientists to use tools
11 | of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python.
12 | Credits to Mu Sigma for supporting the development of the package.
13 | }
14 | \note{
15 | To enable pipelines involving Spark tasks, the package uses the 'SparkR' package. Using Spark as an engine requires the SparkR package to be installed.
16 | SparkR is distributed natively with Apache Spark and is not distributed on CRAN. The SparkR version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly.
17 | To install from Github, run the following command, if you know the Spark version:
18 | \itemize{
19 | \item devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg')
20 | }
21 | The other option is to install SparkR by running the following terminal commands if Spark has already been installed:
22 | \itemize{
23 | \item $ export SPARK_HOME=/path/to/spark/directory
24 | \item $ cd $SPARK_HOME/R/lib/SparkR/
25 | \item $ R -e "devtools::install('.')"
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/man/assessEngineSetUp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{assessEngineSetUp}
5 | \alias{assessEngineSetUp}
6 | \alias{assessEngineSetUp,BaseAnalysisPipeline-method}
7 | \title{Assesses engine (R, Spark, Python, Spark Structured Streaming) set up}
8 | \usage{
9 | assessEngineSetUp(object)
10 |
11 | \S4method{assessEngineSetUp}{BaseAnalysisPipeline}(object)
12 | }
13 | \arguments{
14 | \item{object}{A Pipeline object}
15 | }
16 | \value{
17 | Tibble containing the details of available engines, whether they are required for a pipeline, a logical value
18 | reporting whether the engine has been set up, and comments.
19 | }
20 | \description{
21 | Assesses engine (R, Spark, Python, Spark Structured Streaming) set up
22 | }
23 | \details{
24 | Assesses whether engines required for executing functions in an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline}
25 | object have been set up
26 |
27 | This method is implemented on the base class as it is a shared functionality across Pipeline objects
28 | }
29 | \examples{
30 | \dontrun{
31 | library(analysisPipelines)
32 | pipelineObj <- AnalysisPipeline(input = iris)
33 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species", priColor = "blue",
34 | optionalPlots = 0) \%>>\% assessEngineSetUp
35 | }
36 | }
37 | \seealso{
38 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
39 | \code{\link{MetaAnalysisPipeline-class}},
40 | \code{\link{checkSchemaMatch}},
41 | \code{\link{createPipelineInstance}},
42 | \code{\link{exportAsMetaPipeline}},
43 | \code{\link{generateOutput}},
44 | \code{\link{genericPipelineException}},
45 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
46 | \code{\link{getOutputById}},
47 | \code{\link{getPipelinePrototype}},
48 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
49 | \code{\link{initDfBasedOnType}},
50 | \code{\link{initialize,BaseAnalysisPipeline-method}},
51 | \code{\link{loadMetaPipeline}},
52 | \code{\link{loadPipeline}},
53 | \code{\link{loadPredefinedFunctionRegistry}},
54 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
55 | \code{\link{registerFunction}},
56 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
57 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
58 | \code{\link{updateObject}},
59 | \code{\link{visualizePipeline}}
60 | }
61 | \concept{Package core functions}
62 |
--------------------------------------------------------------------------------
/man/bivarPlots.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-batch-eda-utilities.R
3 | \name{bivarPlots}
4 | \alias{bivarPlots}
5 | \title{Bi-Variate Plot}
6 | \usage{
7 | bivarPlots(dataset, select_var_name_1, select_var_name_2,
8 | priColor = "blue", secColor = "black")
9 | }
10 | \arguments{
11 | \item{dataset}{the dataframe that needs to be loaded}
12 |
13 | \item{select_var_name_1}{the name of the first column on which the plot needs to be generated}
14 |
15 | \item{select_var_name_2}{the name of the second column on which the plot needs to be generated}
16 |
17 | \item{priColor}{the primary color for the plots}
18 |
19 | \item{secColor}{A secondary color for the plots}
20 | }
21 | \value{
22 | Bivariate plot
23 | }
24 | \description{
25 | Bi-Variate Plot
26 | }
27 | \details{
28 | A bivariate distribution graph on the selected columns of the dataframe. The two selected columns are placed on the two axes and a plot is generated
29 | }
30 | \examples{
31 | bivarPlots(dataset = iris, select_var_name_1 = "Sepal.Length",
32 | select_var_name_2 = "Sepal.Width")
33 | }
34 | \seealso{
35 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}},
36 | \code{\link{correlationMatPlot}},
37 | \code{\link{getDatatype}}, \code{\link{ignoreCols}},
38 | \code{\link{multiVarOutlierPlot}},
39 | \code{\link{outlierPlot}},
40 | \code{\link{univarCatDistPlots}}
41 | }
42 | \concept{Package EDA Utilites functions}
43 |
--------------------------------------------------------------------------------
/man/castKafkaStreamAsString.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/spark-structured-streaming-utilities.R
3 | \name{castKafkaStreamAsString}
4 | \alias{castKafkaStreamAsString}
5 | \title{Cast the key and value of a Kafka stream to string}
6 | \usage{
7 | castKafkaStreamAsString(streamObj)
8 | }
9 | \arguments{
10 | \item{streamObj}{Spark Structured Streaming DataFrame returned by \code{read.stream} function with \code{source = 'kafka'}}
11 | }
12 | \value{
13 | Updated Spark Structured Streaming DataFrame with key, value, topic and timestamp from the Kafka stream
14 | }
15 | \description{
16 | Cast the key and value of a Kafka stream to string
17 | }
18 | \details{
19 | Takes in a Structured Stream from Kafka created from \code{read.stream(source = 'kafka', ...)} and returns
20 | a Structured Streaming DataFrame where the \code{key} and \code{value} from the Kafka stream are cast to string
21 | }
22 | \seealso{
23 | Other Spark utilities: \code{\link{convertKafkaValueFromJson}},
24 | \code{\link{sparkRSessionCreateIfNotPresent}}
25 | }
26 | \concept{Spark utilities}
27 |
--------------------------------------------------------------------------------
/man/checkSchema.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-batch.R
3 | \name{checkSchema}
4 | \alias{checkSchema}
5 | \title{Compare the schemas of two dataframes}
6 | \usage{
7 | checkSchema(dfOld, dfNew)
8 | }
9 | \arguments{
10 | \item{dfOld}{Old dataframe}
11 |
12 | \item{dfNew}{New dataframe}
13 | }
14 | \value{
15 | Returns a list with details on added columns, removed columns, comparison between column classes, and a logical
16 | whether the schema has remained the same from the old dataframe to the new one
17 | }
18 | \description{
19 | Compare the schemas of two dataframes
20 | }
21 | \details{
22 | Compares the schemas of two dataframes, providing information on added and removed columns in the new dataframe
23 | as compared to the old
24 | }
25 | \seealso{
26 | Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline-class}},
27 | \code{\link{generateReport}},
28 | \code{\link{initialize,BaseAnalysisPipeline-method}}
29 | }
30 | \concept{Package core functions for batch/one-time analyses}
31 | \keyword{internal}
32 |
--------------------------------------------------------------------------------
/man/checkSchemaMatch.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R, R/core-functions-batch.R
3 | \docType{methods}
4 | \name{checkSchemaMatch}
5 | \alias{checkSchemaMatch}
6 | \alias{checkSchemaMatch,AnalysisPipeline-method}
7 | \title{Checks the schema of the input to a Pipeline object against the original}
8 | \usage{
9 | checkSchemaMatch(object, newData)
10 |
11 | \S4method{checkSchemaMatch}{AnalysisPipeline}(object, newData)
12 | }
13 | \arguments{
14 | \item{object}{A Pipeline object}
15 |
16 | \item{newData}{The newData that the pipeline is to be initialized with}
17 | }
18 | \value{
19 | Returns a list with details on added columns, removed columns, comparison between column classes, and a logical
20 | whether the schema has remained the same from the old dataframe to the new one
21 | }
22 | \description{
23 | Checks the schema of the input to a Pipeline object against the original
24 | }
25 | \details{
26 | Checks the schema of the new data frame that the pipeline is to be initialized with against
27 | the original schema that the pipeline was saved with. Provides a detailed comparison
28 | }
29 | \seealso{
30 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
31 | \code{\link{MetaAnalysisPipeline-class}},
32 | \code{\link{assessEngineSetUp}},
33 | \code{\link{createPipelineInstance}},
34 | \code{\link{exportAsMetaPipeline}},
35 | \code{\link{generateOutput}},
36 | \code{\link{genericPipelineException}},
37 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
38 | \code{\link{getOutputById}},
39 | \code{\link{getPipelinePrototype}},
40 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
41 | \code{\link{initDfBasedOnType}},
42 | \code{\link{initialize,BaseAnalysisPipeline-method}},
43 | \code{\link{loadMetaPipeline}},
44 | \code{\link{loadPipeline}},
45 | \code{\link{loadPredefinedFunctionRegistry}},
46 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
47 | \code{\link{registerFunction}},
48 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
49 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
50 | \code{\link{updateObject}},
51 | \code{\link{visualizePipeline}}
52 | }
53 | \concept{Package core functions}
54 |
--------------------------------------------------------------------------------
/man/computeEdges.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{computeEdges}
4 | \alias{computeEdges}
5 | \title{Computes edges (dependencies) in a pipeline given the joined tibble of the pipeline and registry}
6 | \usage{
7 | computeEdges(pipelineRegistryJoin)
8 | }
9 | \description{
10 | Computes edges (dependencies) in a pipeline given the joined tibble of the pipeline and registry
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/convertKafkaValueFromJson.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/spark-structured-streaming-utilities.R
3 | \name{convertKafkaValueFromJson}
4 | \alias{convertKafkaValueFromJson}
5 | \title{Convert the JSON value of a Kafka stream into a Structured Streaming DataFrame}
6 | \usage{
7 | convertKafkaValueFromJson(streamObj, schema)
8 | }
9 | \arguments{
10 | \item{streamObj}{Spark Structured Streaming DataFrame which is returned by the \code{castKafkaStreamAsString} function}
11 |
12 | \item{schema}{A structType object created from SparkR specifying the schema of the json data present in the \code{value}
13 | attribute of the incoming Kafka stream}
14 | }
15 | \value{
16 | Spark Structured Streaming DataFrame with the json data in the \code{value} attribute of the Kafka stream parsed
17 | into a DataFrame format
18 | }
19 | \description{
20 | Convert the JSON value of a Kafka stream into a Structured Streaming DataFrame
21 | }
22 | \details{
23 | Takes in a Structured Streaming DataFrame returned by \code{castKafkaStreamAsString} and parses the JSON data
24 | in the \code{value} attribute into a DataFrame format, according to the schema provided
25 | }
26 | \seealso{
27 | Other Spark utilities: \code{\link{castKafkaStreamAsString}},
28 | \code{\link{sparkRSessionCreateIfNotPresent}}
29 | }
30 | \concept{Spark utilities}
31 |
--------------------------------------------------------------------------------
/man/correlationMatPlot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-batch-eda-utilities.R
3 | \name{correlationMatPlot}
4 | \alias{correlationMatPlot}
5 | \title{Correlation Matrix Plot}
6 | \usage{
7 | correlationMatPlot(dataset, methodused = "everything")
8 | }
9 | \arguments{
10 | \item{dataset}{the dataset that needs to be loaded}
11 |
12 | \item{methodused}{methods to be used for computing correlation}
13 | }
14 | \value{
15 | Correlation Matrix graph
16 | }
17 | \description{
18 | A correlation matrix is created and plotted across all the columns in the dataset
19 | }
20 | \examples{
21 | correlationMatPlot(dataset = iris)
22 | }
23 | \seealso{
24 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}},
25 | \code{\link{bivarPlots}}, \code{\link{getDatatype}},
26 | \code{\link{ignoreCols}},
27 | \code{\link{multiVarOutlierPlot}},
28 | \code{\link{outlierPlot}},
29 | \code{\link{univarCatDistPlots}}
30 | }
31 | \concept{Package EDA Utilites functions}
32 |
--------------------------------------------------------------------------------
/man/createPipelineInstance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-meta-pipelines.R
3 | \docType{methods}
4 | \name{createPipelineInstance}
5 | \alias{createPipelineInstance}
6 | \alias{createPipelineInstance,MetaAnalysisPipeline-method}
7 | \title{Create a Pipeline object from a meta-pipeline}
8 | \usage{
9 | createPipelineInstance(metaPipelineObj, newParams)
10 |
11 | \S4method{createPipelineInstance}{MetaAnalysisPipeline}(metaPipelineObj,
12 | newParams)
13 | }
14 | \arguments{
15 | \item{metaPipelineObj}{A \code{MetaAnalysisPipeline} object}
16 |
17 | \item{newParams}{Either a nested named list containing all the functions in the pipeline, their arguments and
18 | corresponding values (OR) an object of class \code{proto} which is a pipeline prototype, with the new values of the arguments
19 | set. Refer the \code{getPipelinePrototype} method.}
20 | }
21 | \value{
22 | A Pipeline object
23 | }
24 | \description{
25 | Create a Pipeline object from a meta-pipeline
26 | }
27 | \details{
28 | This method instantiates a Pipeline object (both \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline}) from
29 | a meta-pipeline as well as an object containing the new set of values for the arguments of all the functions in the pipeline.
30 | }
31 | \examples{
32 | \dontrun{
33 | pipelineObj <- AnalysisPipeline(input = iris)
34 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species") -> pipelineObj
35 | pipelineObj \%>>\% exportAsMetaPipeline -> exportedMetaPipeline
36 | exportedMetaPipeline \%>>\%
37 | createPipelineInstance(newParams = exportedMetaPipeline \%>>\%
38 | getPipelinePrototype)
39 | }
40 | }
41 | \seealso{
42 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
43 | \code{\link{MetaAnalysisPipeline-class}},
44 | \code{\link{assessEngineSetUp}},
45 | \code{\link{checkSchemaMatch}},
46 | \code{\link{exportAsMetaPipeline}},
47 | \code{\link{generateOutput}},
48 | \code{\link{genericPipelineException}},
49 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
50 | \code{\link{getOutputById}},
51 | \code{\link{getPipelinePrototype}},
52 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
53 | \code{\link{initDfBasedOnType}},
54 | \code{\link{initialize,BaseAnalysisPipeline-method}},
55 | \code{\link{loadMetaPipeline}},
56 | \code{\link{loadPipeline}},
57 | \code{\link{loadPredefinedFunctionRegistry}},
58 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
59 | \code{\link{registerFunction}},
60 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
61 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
62 | \code{\link{updateObject}},
63 | \code{\link{visualizePipeline}}
64 | }
65 | \concept{Package core functions}
66 |
--------------------------------------------------------------------------------
/man/dot-analysisPipelinesEnvir.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{data}
4 | \name{.analysisPipelinesEnvir}
5 | \alias{.analysisPipelinesEnvir}
6 | \title{This section defines the environment which the package uses for maintaining the registry and an outputCache}
7 | \format{An object of class \code{environment} of length 2.}
8 | \usage{
9 | .analysisPipelinesEnvir
10 | }
11 | \description{
12 | This section defines the environment which the package uses for maintaining the registry and an outputCache
13 | }
14 | \keyword{internal}
15 |
--------------------------------------------------------------------------------
/man/dot-getCache.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{.getCache}
4 | \alias{.getCache}
5 | \title{This is an internal function which returns the cache from the package namespace}
6 | \usage{
7 | .getCache()
8 | }
9 | \description{
10 | This is an internal function which returns the cache from the package namespace
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/dot-saveMetaPipeline.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-meta-pipelines.R
3 | \name{.saveMetaPipeline}
4 | \alias{.saveMetaPipeline}
5 | \title{A method definition for saving meta-pipelines, called when the 'savePipeline' method is called against the
6 | \code{MetaAnalysisPipeline} signature}
7 | \usage{
8 | .saveMetaPipeline(object, path)
9 | }
10 | \description{
11 | A method definition for saving meta-pipelines, called when the 'savePipeline' method is called against the
12 | \code{MetaAnalysisPipeline} signature
13 | }
14 | \keyword{internal}
15 |
--------------------------------------------------------------------------------
/man/dot-setRegistry.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{.setRegistry}
4 | \alias{.setRegistry}
5 | \title{Internal function used to set the registry object in case of loading pipelines or meta-pipelines}
6 | \usage{
7 | .setRegistry(.registry)
8 | }
9 | \description{
10 | Internal function used to set the registry object in case of loading pipelines or meta-pipelines
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/dot-updateRegistry.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{.updateRegistry}
4 | \alias{.updateRegistry}
5 | \title{This is an internal function used to update the registry, in order to override existing function registrations}
6 | \usage{
7 | .updateRegistry(functionName, heading = "", engine = "r",
8 | exceptionHandlingFunction = as.character(substitute(genericPipelineException)),
9 | userDefined = F, isDataFunction = T, firstArgClass = "")
10 | }
11 | \description{
12 | This is an internal function used to update the registry, in order to override existing function registrations
13 | }
14 | \keyword{internal}
15 |
--------------------------------------------------------------------------------
/man/dot-visualizeMetaPipeline.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-meta-pipelines.R
3 | \name{.visualizeMetaPipeline}
4 | \alias{.visualizeMetaPipeline}
5 | \title{A method definition for visualizing meta-pipelines, called when the 'visualizePipeline' method is called against the
6 | \code{MetaAnalysisPipeline} signature}
7 | \usage{
8 | .visualizeMetaPipeline(object)
9 | }
10 | \description{
11 | A method definition for visualizing meta-pipelines, called when the 'visualizePipeline' method is called against the
12 | \code{MetaAnalysisPipeline} signature
13 | }
14 | \keyword{internal}
15 |
--------------------------------------------------------------------------------
/man/exportAsMetaPipeline.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-meta-pipelines.R
3 | \docType{methods}
4 | \name{exportAsMetaPipeline}
5 | \alias{exportAsMetaPipeline}
6 | \alias{exportAsMetaPipeline,BaseAnalysisPipeline-method}
7 | \title{Method to export a meta-pipeline}
8 | \usage{
9 | exportAsMetaPipeline(object)
10 |
11 | \S4method{exportAsMetaPipeline}{BaseAnalysisPipeline}(object)
12 | }
13 | \arguments{
14 | \item{object}{A Pipeline object}
15 | }
16 | \value{
17 | an object of class "\code{MetaAnalysisPipeline}"
18 | }
19 | \description{
20 | Method to export a meta-pipeline
21 | }
22 | \details{
23 | This method exports a Pipeline object i.e. of the classes \code{AnalysisPipeline} or
24 | \code{StreamingAnalysisPipeline} as a meta-pipeline
25 | }
26 | \examples{
27 | \dontrun{
28 | pipelineObj <- AnalysisPipeline(input = iris)
29 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species") \%>>\%
30 | exportAsMetaPipeline -> exportedMetaPipeline
31 | }
32 | }
33 | \seealso{
34 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
35 | \code{\link{MetaAnalysisPipeline-class}},
36 | \code{\link{assessEngineSetUp}},
37 | \code{\link{checkSchemaMatch}},
38 | \code{\link{createPipelineInstance}},
39 | \code{\link{generateOutput}},
40 | \code{\link{genericPipelineException}},
41 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
42 | \code{\link{getOutputById}},
43 | \code{\link{getPipelinePrototype}},
44 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
45 | \code{\link{initDfBasedOnType}},
46 | \code{\link{initialize,BaseAnalysisPipeline-method}},
47 | \code{\link{loadMetaPipeline}},
48 | \code{\link{loadPipeline}},
49 | \code{\link{loadPredefinedFunctionRegistry}},
50 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
51 | \code{\link{registerFunction}},
52 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
53 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
54 | \code{\link{updateObject}},
55 | \code{\link{visualizePipeline}}
56 | }
57 | \concept{Package core functions}
58 |
--------------------------------------------------------------------------------
/man/generateOutput.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R, R/core-functions-batch.R,
3 | % R/core-streaming-functions.R
4 | \docType{methods}
5 | \name{generateOutput}
6 | \alias{generateOutput}
7 | \alias{generateOutput,AnalysisPipeline-method}
8 | \alias{generateOutput,StreamingAnalysisPipeline-method}
9 | \title{Generate a list of outputs from Pipeline objects}
10 | \usage{
11 | generateOutput(object)
12 |
13 | \S4method{generateOutput}{AnalysisPipeline}(object)
14 |
15 | \S4method{generateOutput}{StreamingAnalysisPipeline}(object)
16 | }
17 | \arguments{
18 | \item{object}{object that contains input, pipeline, registry and output}
19 | }
20 | \value{
21 | Updated Pipeline object with the outputs at each step stored in the \code{output} slot.
22 |
23 | Specific outputs can be obtained by using the \link{getOutputById} function
24 | }
25 | \description{
26 | Generate a list of outputs from Pipeline objects
27 | }
28 | \details{
29 | \code{generateOutput} is a generic function that is implemented for various types of pipeline objects
30 | such as \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline}
31 |
32 | The sequence of operations stored in the pipeline object
33 | are run and outputs generated, stored in a list
34 | }
35 | \seealso{
36 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
37 | \code{\link{MetaAnalysisPipeline-class}},
38 | \code{\link{assessEngineSetUp}},
39 | \code{\link{checkSchemaMatch}},
40 | \code{\link{createPipelineInstance}},
41 | \code{\link{exportAsMetaPipeline}},
42 | \code{\link{genericPipelineException}},
43 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
44 | \code{\link{getOutputById}},
45 | \code{\link{getPipelinePrototype}},
46 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
47 | \code{\link{initDfBasedOnType}},
48 | \code{\link{initialize,BaseAnalysisPipeline-method}},
49 | \code{\link{loadMetaPipeline}},
50 | \code{\link{loadPipeline}},
51 | \code{\link{loadPredefinedFunctionRegistry}},
52 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
53 | \code{\link{registerFunction}},
54 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
55 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
56 | \code{\link{updateObject}},
57 | \code{\link{visualizePipeline}}
58 | }
59 | \concept{Package core functions}
60 |
--------------------------------------------------------------------------------
/man/generateReport.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-batch.R
3 | \docType{methods}
4 | \name{generateReport}
5 | \alias{generateReport}
6 | \alias{generateReport,AnalysisPipeline,character-method}
7 | \title{Generate a HTML report from an \code{AnalysisPipeline} object}
8 | \usage{
9 | generateReport(object, path)
10 |
11 | \S4method{generateReport}{AnalysisPipeline,character}(object, path = ".")
12 | }
13 | \arguments{
14 | \item{object}{object that contains input, pipeline, registry and output}
15 |
16 | \item{path}{path on the file system, where the generated html report should be stored}
17 | }
18 | \value{
19 | Updated \code{AnalysisPipeline} object
20 | }
21 | \description{
22 | Generate an HTML report from an \code{AnalysisPipeline} object
23 | }
24 | \details{
25 | The sequence of operations stored in the \code{AnalysisPipeline} object is run, outputs are generated,
26 | and an HTML report is produced with the outputs in the same sequence as the pipeline created by the user
27 | }
28 | \examples{
29 | \dontrun{
30 | pipelineObj <- AnalysisPipeline(input = iris)
31 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species", storeOutput = T) -> pipelineObj
32 | pipelineObj \%>>\% generateReport(path = ".")
33 | }
34 | }
35 | \seealso{
36 | Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline-class}},
37 | \code{\link{checkSchema}},
38 | \code{\link{initialize,BaseAnalysisPipeline-method}}
39 | }
40 | \concept{Package core functions for batch/one-time analyses}
41 |
--------------------------------------------------------------------------------
/man/genericPipelineException.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{genericPipelineException}
4 | \alias{genericPipelineException}
5 | \title{Default exception for pipeline functions}
6 | \usage{
7 | genericPipelineException(error)
8 | }
9 | \arguments{
10 | \item{error}{Error encountered during the execution of a particular pipeline function}
11 | }
12 | \description{
13 | Default exception for pipeline functions
14 | }
15 | \details{
16 | This function defines the default function which is called in case an exception occurs while
17 | executing any of the pipeline functions. When a function is registered, the user can pass a custom function
18 | to handle exceptions incurred during calls to the function being registered. If passed, the custom function
19 | is called instead of this function
20 | }
21 | \seealso{
22 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
23 | \code{\link{MetaAnalysisPipeline-class}},
24 | \code{\link{assessEngineSetUp}},
25 | \code{\link{checkSchemaMatch}},
26 | \code{\link{createPipelineInstance}},
27 | \code{\link{exportAsMetaPipeline}},
28 | \code{\link{generateOutput}}, \code{\link{getInput}},
29 | \code{\link{getLoggerDetails}},
30 | \code{\link{getOutputById}},
31 | \code{\link{getPipelinePrototype}},
32 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
33 | \code{\link{initDfBasedOnType}},
34 | \code{\link{initialize,BaseAnalysisPipeline-method}},
35 | \code{\link{loadMetaPipeline}},
36 | \code{\link{loadPipeline}},
37 | \code{\link{loadPredefinedFunctionRegistry}},
38 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
39 | \code{\link{registerFunction}},
40 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
41 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
42 | \code{\link{updateObject}},
43 | \code{\link{visualizePipeline}}
44 | }
45 | \concept{Package core functions}
46 |
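
A sketch of registering a function with a custom exception handler instead of this default; the default value of exceptionFunction in registerFunction suggests the handler is passed by name as a character string, and customException/getNumRows below are illustrative user-defined names:

library(analysisPipelines)
customException <- function(error){
  # illustrative handler: log the failure and re-raise the condition
  message("Pipeline step failed: ", conditionMessage(error))
  stop(error)
}
getNumRows <- function(dataset){
  return(nrow(dataset))
}
registerFunction("getNumRows", exceptionFunction = "customException")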
--------------------------------------------------------------------------------
/man/getDatatype.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-batch-eda-utilities.R
3 | \name{getDatatype}
4 | \alias{getDatatype}
5 | \title{Get Data Type}
6 | \usage{
7 | getDatatype(dataset)
8 | }
9 | \arguments{
10 | \item{dataset}{the data frame whose columns are to be categorized}
11 | }
12 | \value{
13 | list with \code{numeric_cols} and \code{cat_cols}
14 | }
15 | \description{
16 | Get Data Type
17 | }
18 | \details{
19 | Based on their data types, the columns are separated into categorical and numerical columns
20 | }
21 | \examples{
22 | getDatatype(iris)
23 | }
24 | \seealso{
25 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}},
26 | \code{\link{bivarPlots}},
27 | \code{\link{correlationMatPlot}},
28 | \code{\link{ignoreCols}},
29 | \code{\link{multiVarOutlierPlot}},
30 | \code{\link{outlierPlot}},
31 | \code{\link{univarCatDistPlots}}
32 | }
33 | \concept{Package EDA Utilites functions}
34 |
--------------------------------------------------------------------------------
/man/getEndPoints.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{getEndPoints}
4 | \alias{getEndPoints}
5 | \title{Obtains end nodes in a graph given nodes and edges}
6 | \usage{
7 | getEndPoints(nodes, edgeDf)
8 | }
9 | \description{
10 | Obtains end nodes in a graph given nodes and edges
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/getFeaturesForPyClassification.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-helper-utilites-python.R
3 | \name{getFeaturesForPyClassification}
4 | \alias{getFeaturesForPyClassification}
5 | \title{Extracts selected columns from a data frame as a Python array}
6 | \usage{
7 | getFeaturesForPyClassification(dataset, featureNames)
8 | }
9 | \arguments{
10 | \item{dataset}{an R data frame}
11 |
12 | \item{featureNames}{Column names to be extracted from the R data frames. A character vector.}
13 | }
14 | \description{
15 | Extracts selected columns from a data frame as a Python array
16 | }
17 | \details{
18 | Helper function which, when provided an R data frame and a set of column/feature names,
19 | extracts them from the R data frame as a matrix and converts them to the equivalent Python array.
20 |
21 | Typically this function can be used when providing a feature matrix to a Python machine learning function
22 | }
23 | \examples{
24 | \dontrun{
25 | getFeaturesForPyClassification(dataset = iris,
26 | featureNames = c("Sepal.Length", "Sepal.Width"))
27 | }
28 | }
29 | \seealso{
30 | Other R helper utilities for Python: \code{\link{getTargetForPyClassification}},
31 | \code{\link{setPythonEnvir}}
32 | }
33 | \concept{R helper utilities for Python}
34 |
--------------------------------------------------------------------------------
/man/getInput.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{getInput}
5 | \alias{getInput}
6 | \alias{getInput,BaseAnalysisPipeline-method}
7 | \title{Obtains the initializedInput}
8 | \usage{
9 | getInput(object)
10 |
11 | \S4method{getInput}{BaseAnalysisPipeline}(object)
12 | }
13 | \arguments{
14 | \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object}
15 | }
16 | \value{
17 | Dataframe for an \code{AnalysisPipeline} & SparkDataFrame for a \code{StreamingAnalysisPipeline}
18 | }
19 | \description{
20 | Obtains the initializedInput
21 | }
22 | \details{
23 | Obtains the input from the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object
24 |
25 | This method is implemented on the base class as it is a functionality shared across all types of Analysis Pipelines
26 | which extend this class
27 | }
28 | \examples{
29 | library(analysisPipelines)
30 | pipelineObj <- AnalysisPipeline(input = iris)
31 | pipelineObj \%>>\% getInput
32 | }
33 | \seealso{
34 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
35 | \code{\link{MetaAnalysisPipeline-class}},
36 | \code{\link{assessEngineSetUp}},
37 | \code{\link{checkSchemaMatch}},
38 | \code{\link{createPipelineInstance}},
39 | \code{\link{exportAsMetaPipeline}},
40 | \code{\link{generateOutput}},
41 | \code{\link{genericPipelineException}},
42 | \code{\link{getLoggerDetails}},
43 | \code{\link{getOutputById}},
44 | \code{\link{getPipelinePrototype}},
45 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
46 | \code{\link{initDfBasedOnType}},
47 | \code{\link{initialize,BaseAnalysisPipeline-method}},
48 | \code{\link{loadMetaPipeline}},
49 | \code{\link{loadPipeline}},
50 | \code{\link{loadPredefinedFunctionRegistry}},
51 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
52 | \code{\link{registerFunction}},
53 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
54 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
55 | \code{\link{updateObject}},
56 | \code{\link{visualizePipeline}}
57 | }
58 | \concept{Package core functions}
59 |
--------------------------------------------------------------------------------
/man/getLoggerDetails.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{getLoggerDetails}
5 | \alias{getLoggerDetails}
6 | \alias{getLoggerDetails,BaseAnalysisPipeline-method}
7 | \title{Obtains the logger configuration for the pipeline}
8 | \usage{
9 | getLoggerDetails(object)
10 |
11 | \S4method{getLoggerDetails}{BaseAnalysisPipeline}(object)
12 | }
13 | \arguments{
14 | \item{object}{A Pipeline object}
15 | }
16 | \value{
17 | Logger configuration as a list
18 | }
19 | \description{
20 | Obtains the logger configuration for the pipeline
21 | }
22 | \details{
23 | This function obtains the logger configuration for the pipeline.
24 | }
25 | \examples{
26 | library(analysisPipelines)
27 | pipelineObj <- AnalysisPipeline(input = iris)
28 | pipelineObj \%>>\% getLoggerDetails
29 | }
30 | \seealso{
31 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
32 | \code{\link{MetaAnalysisPipeline-class}},
33 | \code{\link{assessEngineSetUp}},
34 | \code{\link{checkSchemaMatch}},
35 | \code{\link{createPipelineInstance}},
36 | \code{\link{exportAsMetaPipeline}},
37 | \code{\link{generateOutput}},
38 | \code{\link{genericPipelineException}},
39 | \code{\link{getInput}}, \code{\link{getOutputById}},
40 | \code{\link{getPipelinePrototype}},
41 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
42 | \code{\link{initDfBasedOnType}},
43 | \code{\link{initialize,BaseAnalysisPipeline-method}},
44 | \code{\link{loadMetaPipeline}},
45 | \code{\link{loadPipeline}},
46 | \code{\link{loadPredefinedFunctionRegistry}},
47 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
48 | \code{\link{registerFunction}},
49 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
50 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
51 | \code{\link{updateObject}},
52 | \code{\link{visualizePipeline}}
53 | }
54 | \concept{Package core functions}
55 |
--------------------------------------------------------------------------------
/man/getOutputById.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{getOutputById}
5 | \alias{getOutputById}
6 | \alias{getOutputById,BaseAnalysisPipeline-method}
7 | \title{Obtains a specific output}
8 | \usage{
9 | getOutputById(object, reqId, includeCall = F)
10 |
11 | \S4method{getOutputById}{BaseAnalysisPipeline}(object, reqId,
12 | includeCall = F)
13 | }
14 | \arguments{
15 | \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object}
16 |
17 | \item{reqId}{The position of the function for which the output is desired in the sequence of operations in the pipeline.}
18 |
19 | \item{includeCall}{Logical which defines whether the call used to generate the output should be returned. By default, this is false}
20 | }
21 | \value{
22 | If includeCall = F, the output object generated by the function is returned
23 |
24 | If includeCall = T, it is a list containing two elements:
25 | - call: tibble with 1 row containing the function call for the output desired
26 | - output: output generated
27 | }
28 | \description{
29 | Obtains a specific output
30 | }
31 | \details{
32 | Obtains a specific output from the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by passing the position
33 | of the function for which the output is desired, in the sequence of operations in the pipeline. This can be obtained by passing the number
34 | under the 'id' column in the pipeline table corresponding to the required function
35 |
36 | This method is implemented on the base class as it is a functionality shared across all types of Analysis Pipelines
37 | which extend this class
38 | }
39 | \examples{
40 | \dontrun{
41 | library(analysisPipelines)
42 | pipelineObj <- AnalysisPipeline(input = iris)
43 | getNumRows <- function(dataset){
44 | return(nrow(dataset))
45 | }
46 | registerFunction("getNumRows")
47 | pipelineObj \%>>\% getNumRows(storeOutput = TRUE) -> pipelineObj
48 | pipelineObj \%>>\% generateOutput \%>>\% getOutputById("1")
49 | }
50 | }
51 | \seealso{
52 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
53 | \code{\link{MetaAnalysisPipeline-class}},
54 | \code{\link{assessEngineSetUp}},
55 | \code{\link{checkSchemaMatch}},
56 | \code{\link{createPipelineInstance}},
57 | \code{\link{exportAsMetaPipeline}},
58 | \code{\link{generateOutput}},
59 | \code{\link{genericPipelineException}},
60 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
61 | \code{\link{getPipelinePrototype}},
62 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
63 | \code{\link{initDfBasedOnType}},
64 | \code{\link{initialize,BaseAnalysisPipeline-method}},
65 | \code{\link{loadMetaPipeline}},
66 | \code{\link{loadPipeline}},
67 | \code{\link{loadPredefinedFunctionRegistry}},
68 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
69 | \code{\link{registerFunction}},
70 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
71 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
72 | \code{\link{updateObject}},
73 | \code{\link{visualizePipeline}}
74 | }
75 | \concept{Package core functions}
76 |
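
A minimal sketch of includeCall = TRUE, building on the example above; per the value section, the returned list carries 'call' and 'output' elements:

library(analysisPipelines)
pipelineObj <- AnalysisPipeline(input = iris)
getNumRows <- function(dataset){
  return(nrow(dataset))
}
registerFunction("getNumRows")
pipelineObj %>>% getNumRows(storeOutput = TRUE) -> pipelineObj
pipelineObj %>>% generateOutput -> pipelineObj
result <- getOutputById(pipelineObj, "1", includeCall = TRUE)
result$call    # tibble with 1 row describing the generating call
result$output  # the stored output itself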
--------------------------------------------------------------------------------
/man/getPipeline.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{getPipeline}
5 | \alias{getPipeline}
6 | \alias{getPipeline,BaseAnalysisPipeline-method}
7 | \title{Obtain the pipeline}
8 | \usage{
9 | getPipeline(object)
10 |
11 | \S4method{getPipeline}{BaseAnalysisPipeline}(object)
12 | }
13 | \arguments{
14 | \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object}
15 | }
16 | \value{
17 | Tibble describing the pipeline
18 | }
19 | \description{
20 | Obtain the pipeline
21 | }
22 | \details{
23 | Obtains the pipeline from the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object as a tibble
24 |
25 | This method is implemented on the base class as it is a functionality shared across all types of Analysis Pipelines
26 | which extend this class
27 | }
28 | \examples{
29 | \dontrun{
30 | library(analysisPipelines)
31 | pipelineObj <- AnalysisPipeline(input = iris)
32 | getNumRows <- function(dataset){
33 | return(nrow(dataset))
34 | }
35 | registerFunction("getNumRows")
36 | pipelineObj \%>>\% getNumRows \%>>\% getPipeline
37 | }
38 | }
39 | \seealso{
40 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
41 | \code{\link{MetaAnalysisPipeline-class}},
42 | \code{\link{assessEngineSetUp}},
43 | \code{\link{checkSchemaMatch}},
44 | \code{\link{createPipelineInstance}},
45 | \code{\link{exportAsMetaPipeline}},
46 | \code{\link{generateOutput}},
47 | \code{\link{genericPipelineException}},
48 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
49 | \code{\link{getOutputById}},
50 | \code{\link{getPipelinePrototype}},
51 | \code{\link{getRegistry}},
52 | \code{\link{initDfBasedOnType}},
53 | \code{\link{initialize,BaseAnalysisPipeline-method}},
54 | \code{\link{loadMetaPipeline}},
55 | \code{\link{loadPipeline}},
56 | \code{\link{loadPredefinedFunctionRegistry}},
57 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
58 | \code{\link{registerFunction}},
59 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
60 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
61 | \code{\link{updateObject}},
62 | \code{\link{visualizePipeline}}
63 | }
64 | \concept{Package core functions}
65 |
--------------------------------------------------------------------------------
/man/getPipelinePrototype.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-meta-pipelines.R
3 | \docType{methods}
4 | \name{getPipelinePrototype}
5 | \alias{getPipelinePrototype}
6 | \alias{getPipelinePrototype,MetaAnalysisPipeline-method}
7 | \title{Obtain the prototype of the functions in the pipeline}
8 | \usage{
9 | getPipelinePrototype(metaPipelineObj)
10 |
11 | \S4method{getPipelinePrototype}{MetaAnalysisPipeline}(metaPipelineObj)
12 | }
13 | \arguments{
14 | \item{metaPipelineObj}{A \code{MetaAnalysisPipeline} object}
15 | }
16 | \value{
17 | An object of class \code{proto} from the 'proto' package
18 | }
19 | \description{
20 | Obtain the prototype of the functions in the pipeline
21 | }
22 | \details{
23 | This method returns the prototype of functions in the pipeline and their respective arguments as a \code{proto} object.
24 | Functions in the pipeline can be accessed easily by using the '$' operator, and within the functions the arguments can
25 | be accessed the same way. These can be accessed and set to new values. This pipeline prototype can then be passed to the
26 | \code{createPipelineInstance} method which will instantiate an executable pipeline with the inputs set in the prototype
27 | }
28 | \examples{
29 | \dontrun{
30 | pipelineObj <- AnalysisPipeline(input = iris)
31 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species") \%>>\%
32 | exportAsMetaPipeline \%>>\% getPipelinePrototype
33 | }
34 | }
35 | \seealso{
36 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
37 | \code{\link{MetaAnalysisPipeline-class}},
38 | \code{\link{assessEngineSetUp}},
39 | \code{\link{checkSchemaMatch}},
40 | \code{\link{createPipelineInstance}},
41 | \code{\link{exportAsMetaPipeline}},
42 | \code{\link{generateOutput}},
43 | \code{\link{genericPipelineException}},
44 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
45 | \code{\link{getOutputById}}, \code{\link{getPipeline}},
46 | \code{\link{getRegistry}},
47 | \code{\link{initDfBasedOnType}},
48 | \code{\link{initialize,BaseAnalysisPipeline-method}},
49 | \code{\link{loadMetaPipeline}},
50 | \code{\link{loadPipeline}},
51 | \code{\link{loadPredefinedFunctionRegistry}},
52 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
53 | \code{\link{registerFunction}},
54 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
55 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
56 | \code{\link{updateObject}},
57 | \code{\link{visualizePipeline}}
58 | }
59 | \concept{Package core functions}
60 |
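
A sketch of editing the prototype and instantiating an executable pipeline from it, per the details above; the element name accessed through '$' is hypothetical (inspect the returned proto object for the actual names), and the exact createPipelineInstance signature should be checked against its help page:

library(analysisPipelines)
pipelineObj <- AnalysisPipeline(input = iris)
pipelineObj %>>% univarCatDistPlots(uniCol = "Species") -> pipelineObj
metaObj <- pipelineObj %>>% exportAsMetaPipeline
proto <- metaObj %>>% getPipelinePrototype
# hypothetical element name; arguments are reachable through '$' as described above
proto$univarCatDistPlots$uniCol <- "Species"
newPipelineObj <- createPipelineInstance(metaObj, proto)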
--------------------------------------------------------------------------------
/man/getRegistry.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{getRegistry}
4 | \alias{getRegistry}
5 | \title{Obtains the function registry}
6 | \usage{
7 | getRegistry()
8 | }
9 | \value{
10 | Tibble describing the registry
11 | }
12 | \description{
13 | Obtains the function registry
14 | }
15 | \details{
16 | Obtains the function registry as a tibble, including both predefined and user defined functions
17 | }
18 | \examples{
19 | getRegistry()
20 | }
21 | \seealso{
22 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
23 | \code{\link{MetaAnalysisPipeline-class}},
24 | \code{\link{assessEngineSetUp}},
25 | \code{\link{checkSchemaMatch}},
26 | \code{\link{createPipelineInstance}},
27 | \code{\link{exportAsMetaPipeline}},
28 | \code{\link{generateOutput}},
29 | \code{\link{genericPipelineException}},
30 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
31 | \code{\link{getOutputById}},
32 | \code{\link{getPipelinePrototype}},
33 | \code{\link{getPipeline}},
34 | \code{\link{initDfBasedOnType}},
35 | \code{\link{initialize,BaseAnalysisPipeline-method}},
36 | \code{\link{loadMetaPipeline}},
37 | \code{\link{loadPipeline}},
38 | \code{\link{loadPredefinedFunctionRegistry}},
39 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
40 | \code{\link{registerFunction}},
41 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
42 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
43 | \code{\link{updateObject}},
44 | \code{\link{visualizePipeline}}
45 | }
46 | \concept{Package core functions}
47 |
--------------------------------------------------------------------------------
/man/getResponse.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{getResponse}
4 | \alias{getResponse}
5 | \title{Obtains the response term from the formula}
6 | \usage{
7 | getResponse(f)
8 | }
9 | \arguments{
10 | \item{f}{formula from which term is to be extracted.}
11 | }
12 | \value{
13 | The response variable in the formula as a string
14 | }
15 | \description{
16 | Obtains the response term from the formula
17 | }
18 | \details{
19 | This is a helper function to extract the response variable from a formula
20 | }
21 | \examples{
22 | library(analysisPipelines)
23 | getResponse(y ~ x1 + x2)
24 | }
25 |
--------------------------------------------------------------------------------
/man/getStartingPoints.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{getStartingPoints}
4 | \alias{getStartingPoints}
5 | \title{Obtains starting nodes in a graph given nodes and edges}
6 | \usage{
7 | getStartingPoints(nodes, edgeDf)
8 | }
9 | \description{
10 | Obtains starting nodes in a graph given nodes and edges
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/getTargetForPyClassification.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-helper-utilites-python.R
3 | \name{getTargetForPyClassification}
4 | \alias{getTargetForPyClassification}
5 | \title{Extracts a selected column from a data frame as a binary class Python array}
6 | \usage{
7 | getTargetForPyClassification(dataset, targetVarName, positiveClass)
8 | }
9 | \arguments{
10 | \item{dataset}{an R data frame}
11 |
12 | \item{targetVarName}{Name of the target variable for classification. Should be a categorical variable.}
13 |
14 | \item{positiveClass}{Name of the class of the target variable which should be coded as '1'}
15 | }
16 | \description{
17 | Extracts a selected column from a data frame as a binary class Python array
18 | }
19 | \details{
20 | Helper function, which when provided an R dataframe and a binary categorical column,
21 | extracts it from the R data frame, converts it to 1/0 class coding, and converts it to a Python array
22 |
23 | Typically this function can be used to extract a target variable for a classifier to be provided to a
24 | Python machine learning function
25 | }
26 | \examples{
27 | \dontrun{
28 | getTargetForPyClassification(dataset = iris,
29 | targetVarName = "Species", positiveClass = "setosa")
30 | }
31 | }
32 | \seealso{
33 | Other R helper utilities for Python: \code{\link{getFeaturesForPyClassification}},
34 | \code{\link{setPythonEnvir}}
35 | }
36 | \concept{R helper utilities for Python}
37 |
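
A sketch combining the two helpers to prepare inputs for a Python classifier; pyTrainClassifier stands in for any Python function the user has sourced (e.g. through reticulate) and is purely illustrative:

library(analysisPipelines)
X <- getFeaturesForPyClassification(dataset = iris,
                                    featureNames = c("Sepal.Length", "Sepal.Width"))
y <- getTargetForPyClassification(dataset = iris,
                                  targetVarName = "Species", positiveClass = "setosa")
# model <- pyTrainClassifier(X, y)  # illustrative call to a user-sourced Python function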
--------------------------------------------------------------------------------
/man/getTerm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{getTerm}
4 | \alias{getTerm}
5 | \title{Obtains the dependency term from the formula}
6 | \usage{
7 | getTerm(f)
8 | }
9 | \arguments{
10 | \item{f}{formula from which term is to be extracted.}
11 | }
12 | \value{
13 | String with the terms
14 | }
15 | \description{
16 | Obtains the dependency term from the formula
17 | }
18 | \details{
19 | This is a helper function to extract the terms from a formula
20 | }
21 | \examples{
22 | library(analysisPipelines)
23 | getTerm(y ~ x)
24 | }
25 |
--------------------------------------------------------------------------------
/man/getUpstreamDependencies.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{getUpstreamDependencies}
4 | \alias{getUpstreamDependencies}
5 | \title{Obtains upstream dependencies for \code{AnalysisPipeline} objects}
6 | \usage{
7 | getUpstreamDependencies(row)
8 | }
9 | \description{
10 | Obtains upstream dependencies for \code{AnalysisPipeline} objects
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/identifyTopLevelRecursively.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{identifyTopLevelRecursively}
4 | \alias{identifyTopLevelRecursively}
5 | \title{Recursive function to identify the topological levels of the functions in a pipeline}
6 | \usage{
7 | identifyTopLevelRecursively(input = list(topDf = dplyr::tibble(), nodes =
8 | c(), edgeDf = dplyr::tibble(), level = 1))
9 | }
10 | \description{
11 | Recursive function to identify the topological levels of the functions in a pipeline
12 | }
13 | \keyword{internal}
14 |
--------------------------------------------------------------------------------
/man/identifyTopologicalLevels.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{identifyTopologicalLevels}
4 | \alias{identifyTopologicalLevels}
5 | \title{Identifies the topological levels of the functions in a pipeline}
6 | \usage{
7 | identifyTopologicalLevels(nodes = c(), edgeDf = dplyr::tibble(),
8 | topDf = dplyr::tibble(id = character(), level = character()),
9 | level = 1)
10 | }
11 | \description{
12 | Identifies the topological levels of the functions in a pipeline
13 | }
14 | \keyword{internal}
15 |
--------------------------------------------------------------------------------
/man/ignoreCols.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-batch-eda-utilities.R
3 | \name{ignoreCols}
4 | \alias{ignoreCols}
5 | \title{Ignores the columns in the loaded dataframe object}
6 | \usage{
7 | ignoreCols(data, columns)
8 | }
9 | \arguments{
10 | \item{data}{the dataframe object that needs to be loaded}
11 |
12 | \item{columns}{the names of columns to be ignored from dataframe object}
13 | }
14 | \value{
15 | Updated dataframe object
16 | }
17 | \description{
18 | Ignores the columns in the loaded dataframe object
19 | }
20 | \details{
21 | The columns selected are removed from the object
22 | }
23 | \examples{
24 | ignoreCols(data = iris, columns = "Species")
25 | }
26 | \seealso{
27 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}},
28 | \code{\link{bivarPlots}},
29 | \code{\link{correlationMatPlot}},
30 | \code{\link{getDatatype}},
31 | \code{\link{multiVarOutlierPlot}},
32 | \code{\link{outlierPlot}},
33 | \code{\link{univarCatDistPlots}}
34 | }
35 | \concept{Package EDA Utilites functions}
36 |
--------------------------------------------------------------------------------
/man/initDfBasedOnType.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{initDfBasedOnType}
4 | \alias{initDfBasedOnType}
5 | \title{initializes the \code{AnalysisPipeline} object with the input based on the provided type}
6 | \usage{
7 | initDfBasedOnType(input, filePath)
8 | }
9 | \arguments{
10 | \item{input}{Input dataframe}
11 |
12 | \item{filePath}{File path where the .csv file is stored}
13 | }
14 | \value{
15 | \code{AnalysisPipeline} object initialized with input
16 | }
17 | \description{
18 | initializes the \code{AnalysisPipeline} object with the input based on the provided type
19 | }
20 | \details{
21 | Transforms the provided input into an R data frame regardless of the input type, be it Spark DataFrames
22 | or Python data frames
23 | }
24 | \seealso{
25 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
26 | \code{\link{MetaAnalysisPipeline-class}},
27 | \code{\link{assessEngineSetUp}},
28 | \code{\link{checkSchemaMatch}},
29 | \code{\link{createPipelineInstance}},
30 | \code{\link{exportAsMetaPipeline}},
31 | \code{\link{generateOutput}},
32 | \code{\link{genericPipelineException}},
33 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
34 | \code{\link{getOutputById}},
35 | \code{\link{getPipelinePrototype}},
36 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
37 | \code{\link{initialize,BaseAnalysisPipeline-method}},
38 | \code{\link{loadMetaPipeline}},
39 | \code{\link{loadPipeline}},
40 | \code{\link{loadPredefinedFunctionRegistry}},
41 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
42 | \code{\link{registerFunction}},
43 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
44 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
45 | \code{\link{updateObject}},
46 | \code{\link{visualizePipeline}}
47 | }
48 | \concept{Package core functions}
49 | \keyword{internal}
50 |
--------------------------------------------------------------------------------
/man/initialize-methods.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R, R/core-functions-batch.R,
3 | % R/core-functions-meta-pipelines.R, R/core-streaming-functions.R
4 | \docType{methods}
5 | \name{initialize,BaseAnalysisPipeline-method}
6 | \alias{initialize,BaseAnalysisPipeline-method}
7 | \alias{initialize,AnalysisPipeline-method}
8 | \alias{initialize,MetaAnalysisPipeline-method}
9 | \alias{initialize,StreamingAnalysisPipeline-method}
10 | \title{This is the constructor for the \link{BaseAnalysisPipeline} class}
11 | \usage{
12 | \S4method{initialize}{BaseAnalysisPipeline}(.Object)
13 |
14 | \S4method{initialize}{AnalysisPipeline}(.Object, ...,
15 | input = data.frame(), filePath = "")
16 |
17 | \S4method{initialize}{MetaAnalysisPipeline}(.Object, type = "batch")
18 |
19 | \S4method{initialize}{StreamingAnalysisPipeline}(.Object, input)
20 | }
21 | \description{
22 | BaseAnalysisPipeline constructor
23 |
24 | AnalysisPipeline constructor
25 |
26 | MetaAnalysisPipeline constructor
27 |
28 | StreamingAnalysisPipeline constructor
29 | }
30 | \seealso{
31 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
32 | \code{\link{MetaAnalysisPipeline-class}},
33 | \code{\link{assessEngineSetUp}},
34 | \code{\link{checkSchemaMatch}},
35 | \code{\link{createPipelineInstance}},
36 | \code{\link{exportAsMetaPipeline}},
37 | \code{\link{generateOutput}},
38 | \code{\link{genericPipelineException}},
39 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
40 | \code{\link{getOutputById}},
41 | \code{\link{getPipelinePrototype}},
42 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
43 | \code{\link{initDfBasedOnType}},
44 | \code{\link{loadMetaPipeline}},
45 | \code{\link{loadPipeline}},
46 | \code{\link{loadPredefinedFunctionRegistry}},
47 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
48 | \code{\link{registerFunction}},
49 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
50 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
51 | \code{\link{updateObject}},
52 | \code{\link{visualizePipeline}}
53 |
54 | Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline-class}},
55 | \code{\link{checkSchema}}, \code{\link{generateReport}}
56 |
57 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
58 | \code{\link{MetaAnalysisPipeline-class}},
59 | \code{\link{assessEngineSetUp}},
60 | \code{\link{checkSchemaMatch}},
61 | \code{\link{createPipelineInstance}},
62 | \code{\link{exportAsMetaPipeline}},
63 | \code{\link{generateOutput}},
64 | \code{\link{genericPipelineException}},
65 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
66 | \code{\link{getOutputById}},
67 | \code{\link{getPipelinePrototype}},
68 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
69 | \code{\link{initDfBasedOnType}},
70 | \code{\link{loadMetaPipeline}},
71 | \code{\link{loadPipeline}},
72 | \code{\link{loadPredefinedFunctionRegistry}},
73 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
74 | \code{\link{registerFunction}},
75 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
76 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
77 | \code{\link{updateObject}},
78 | \code{\link{visualizePipeline}}
79 | }
80 | \concept{Package core functions}
81 | \concept{Package core functions for batch/one-time analyses}
82 | \keyword{internal}
83 |
--------------------------------------------------------------------------------
/man/initializeLoggers.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{initializeLoggers}
4 | \alias{initializeLoggers}
5 | \title{Initializes the loggers with the required appenders and layout based on the provided configuration}
6 | \usage{
7 | initializeLoggers(object)
8 | }
9 | \description{
10 | Initializes the loggers with the required appenders and layout based on the provided configuration
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/isDependencyParam.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{isDependencyParam}
4 | \alias{isDependencyParam}
5 | \title{Checks if the parameter is the dependency parameter}
6 | \usage{
7 | isDependencyParam(f)
8 | }
9 | \arguments{
10 | \item{f}{formula from which term is to be extracted.}
11 | }
12 | \value{
13 | Logical as to whether it is a dependency parameter
14 | }
15 | \description{
16 | Checks if the parameter is the dependency parameter
17 | }
18 | \details{
19 | This is a helper function to check if the formula provided is a dependency parameter,
20 | as per the package's formula semantics, capturing function dependencies
21 | }
22 | \examples{
23 | library(analysisPipelines)
24 | isDependencyParam(~f1)
25 | }
26 |
--------------------------------------------------------------------------------
/man/loadMetaPipeline.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions-meta-pipelines.R
3 | \name{loadMetaPipeline}
4 | \alias{loadMetaPipeline}
5 | \title{Load a meta-pipeline}
6 | \usage{
7 | loadMetaPipeline(path)
8 | }
9 | \arguments{
10 | \item{path}{the path at which the .Rds file containing the pipeline is located}
11 | }
12 | \value{
13 | A \code{MetaAnalysisPipeline} object
14 | }
15 | \description{
16 | Load a meta-pipeline
17 | }
18 | \details{
19 | This function loads a meta-pipeline from a file system, and returns the meta-pipeline object, which can be assigned
20 | to an object in the environment.
21 |
22 | Note - When a meta-pipeline is loaded, the existing registry is overwritten with the registry saved with the
23 | meta-pipeline
24 | }
25 | \examples{
26 | \dontrun{
27 | loadMetaPipeline(path = "./metaPipeline.RDS")
28 | }
29 | }
30 | \seealso{
31 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
32 | \code{\link{MetaAnalysisPipeline-class}},
33 | \code{\link{assessEngineSetUp}},
34 | \code{\link{checkSchemaMatch}},
35 | \code{\link{createPipelineInstance}},
36 | \code{\link{exportAsMetaPipeline}},
37 | \code{\link{generateOutput}},
38 | \code{\link{genericPipelineException}},
39 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
40 | \code{\link{getOutputById}},
41 | \code{\link{getPipelinePrototype}},
42 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
43 | \code{\link{initDfBasedOnType}},
44 | \code{\link{initialize,BaseAnalysisPipeline-method}},
45 | \code{\link{loadPipeline}},
46 | \code{\link{loadPredefinedFunctionRegistry}},
47 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
48 | \code{\link{registerFunction}},
49 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
50 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
51 | \code{\link{updateObject}},
52 | \code{\link{visualizePipeline}}
53 | }
54 | \concept{Package core functions}
55 |
--------------------------------------------------------------------------------
/man/loadPipeline.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{loadPipeline}
4 | \alias{loadPipeline}
5 | \title{Loads the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object from the file system}
6 | \usage{
7 | loadPipeline(path, input = data.frame(), filePath = "")
8 | }
9 | \arguments{
10 | \item{path}{the path at which the .Rds file containing the pipeline is located}
11 |
12 | \item{input}{(optional) data frame with which the pipeline object should be initialized}
13 |
14 | \item{filePath}{(optional) path where a dataset in .CSV format is present which is to be loaded}
15 | }
16 | \value{
17 | An \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object, optionally initialized with the data frame provided
18 | }
19 | \description{
20 | Loads the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object from the file system
21 | }
22 | \details{
23 | The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object is loaded into the R session from the file system
24 | based on the path specified.
25 |
26 | Optionally, the \code{input} parameter can be provided to
27 | initialize the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object with an R data frame
28 | or Streaming Spark DataFrame (in case of \code{StreamingAnalysisPipeline} object) present in the R session.
29 |
30 | Another option is to specify a filePath where the input dataset is present (in .CSV format),
31 | and the object will be initialized with this data frame. The \code{filePath} parameter takes precedence over
32 | the \code{input} parameter. This is applicable only to \code{AnalysisPipeline} objects
33 |
34 | Note - When a pipeline is loaded, the existing registry is overwritten with the registry saved with the
35 | pipeline
36 | }
37 | \examples{
38 | \dontrun{
39 | library(analysisPipelines)
40 | loadPipeline(path = "./pipeline.RDS")
41 | }
42 | }
43 | \seealso{
44 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
45 | \code{\link{MetaAnalysisPipeline-class}},
46 | \code{\link{assessEngineSetUp}},
47 | \code{\link{checkSchemaMatch}},
48 | \code{\link{createPipelineInstance}},
49 | \code{\link{exportAsMetaPipeline}},
50 | \code{\link{generateOutput}},
51 | \code{\link{genericPipelineException}},
52 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
53 | \code{\link{getOutputById}},
54 | \code{\link{getPipelinePrototype}},
55 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
56 | \code{\link{initDfBasedOnType}},
57 | \code{\link{initialize,BaseAnalysisPipeline-method}},
58 | \code{\link{loadMetaPipeline}},
59 | \code{\link{loadPredefinedFunctionRegistry}},
60 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
61 | \code{\link{registerFunction}},
62 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
63 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
64 | \code{\link{updateObject}},
65 | \code{\link{visualizePipeline}}
66 | }
67 | \concept{Package core functions}
68 |
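
A sketch of the input/filePath options described above; the paths are placeholders:

library(analysisPipelines)
# initialize the loaded pipeline with an in-memory data frame ...
pipelineObj <- loadPipeline(path = "./pipeline.RDS", input = iris)
# ... or with a dataset read from a .csv file; filePath takes precedence if both are given
pipelineObj <- loadPipeline(path = "./pipeline.RDS", filePath = "./data.csv")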
--------------------------------------------------------------------------------
/man/loadPredefinedFunctionRegistry.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{loadPredefinedFunctionRegistry}
4 | \alias{loadPredefinedFunctionRegistry}
5 | \title{Loading the registry of predefined functions}
6 | \usage{
7 | loadPredefinedFunctionRegistry()
8 | }
9 | \description{
10 | Loading the registry of predefined functions
11 | }
12 | \details{
13 | Loads the registry of predefined functions
14 | }
15 | \examples{
16 | \dontrun{
17 | library(analysisPipelines)
18 | loadPredefinedFunctionRegistry()
19 | }
20 | }
21 | \seealso{
22 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
23 | \code{\link{MetaAnalysisPipeline-class}},
24 | \code{\link{assessEngineSetUp}},
25 | \code{\link{checkSchemaMatch}},
26 | \code{\link{createPipelineInstance}},
27 | \code{\link{exportAsMetaPipeline}},
28 | \code{\link{generateOutput}},
29 | \code{\link{genericPipelineException}},
30 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
31 | \code{\link{getOutputById}},
32 | \code{\link{getPipelinePrototype}},
33 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
34 | \code{\link{initDfBasedOnType}},
35 | \code{\link{initialize,BaseAnalysisPipeline-method}},
36 | \code{\link{loadMetaPipeline}},
37 | \code{\link{loadPipeline}}, \code{\link{loadRegistry}},
38 | \code{\link{prepExecution}},
39 | \code{\link{registerFunction}},
40 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
41 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
42 | \code{\link{updateObject}},
43 | \code{\link{visualizePipeline}}
44 | }
45 | \concept{Package core functions}
46 |
--------------------------------------------------------------------------------
/man/loadRegistry.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{loadRegistry}
4 | \alias{loadRegistry}
5 | \title{Loads a function registry from a file}
6 | \usage{
7 | loadRegistry(path)
8 | }
9 | \arguments{
10 | \item{path}{path on the file system, where the registry is to be loaded from}
11 | }
12 | \description{
13 | Loads a function registry from a file
14 | }
15 | \details{
16 | This function loads a function registry and associated function definition stored in an RDS file into the
17 | environment. The existing registry is overwritten with the newly loaded registry
18 | }
19 | \examples{
20 | \dontrun{
21 | library(analysisPipelines)
22 | loadRegistry(path = "./registry.RDS")
23 | }
24 | }
25 | \seealso{
26 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
27 | \code{\link{MetaAnalysisPipeline-class}},
28 | \code{\link{assessEngineSetUp}},
29 | \code{\link{checkSchemaMatch}},
30 | \code{\link{createPipelineInstance}},
31 | \code{\link{exportAsMetaPipeline}},
32 | \code{\link{generateOutput}},
33 | \code{\link{genericPipelineException}},
34 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
35 | \code{\link{getOutputById}},
36 | \code{\link{getPipelinePrototype}},
37 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
38 | \code{\link{initDfBasedOnType}},
39 | \code{\link{initialize,BaseAnalysisPipeline-method}},
40 | \code{\link{loadMetaPipeline}},
41 | \code{\link{loadPipeline}},
42 | \code{\link{loadPredefinedFunctionRegistry}},
43 | \code{\link{prepExecution}},
44 | \code{\link{registerFunction}},
45 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
46 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
47 | \code{\link{updateObject}},
48 | \code{\link{visualizePipeline}}
49 | }
50 | \concept{Package core functions}
51 |
--------------------------------------------------------------------------------
/man/multiVarOutlierPlot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-batch-eda-utilities.R
3 | \name{multiVarOutlierPlot}
4 | \alias{multiVarOutlierPlot}
5 | \title{Multi-Variate Outlier Plot}
6 | \usage{
7 | multiVarOutlierPlot(data, depCol, indepCol, sizeCol, priColor = "blue",
8 | optionalPlots = 0, cutoffValue = 0.05)
9 | }
10 | \arguments{
11 | \item{data}{the dataframe that needs to be loaded}
12 |
13 | \item{depCol}{the name of column which is to be identified as dependent column}
14 |
15 | \item{indepCol}{the name of an independent column}
16 |
17 | \item{sizeCol}{the name of column used to define the size of point in plots}
18 |
19 | \item{priColor}{the primary color for the plots}
20 |
21 | \item{optionalPlots}{A Flag for optional plots}
22 |
23 | \item{cutoffValue}{A p-value cutoff for detecting outliers}
24 | }
25 | \value{
26 | Outliers plot
27 | }
28 | \description{
29 | Multi-Variate Outlier Plot
30 | }
31 | \details{
32 | Multivariate outlier plot using the selected columns from the dataframe
33 | }
34 | \examples{
35 | \dontrun{
36 | multiVarOutlierPlot(data = iris, depCol = "Sepal.Length",
37 | indepCol = "Sepal.Width", sizeCol = "Petal.Length")
38 | }
39 | }
40 | \seealso{
41 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}},
42 | \code{\link{bivarPlots}},
43 | \code{\link{correlationMatPlot}},
44 | \code{\link{getDatatype}}, \code{\link{ignoreCols}},
45 | \code{\link{outlierPlot}},
46 | \code{\link{univarCatDistPlots}}
47 | }
48 | \concept{Package EDA Utilites functions}
49 |
--------------------------------------------------------------------------------
/man/outlierPlot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-batch-eda-utilities.R
3 | \name{outlierPlot}
4 | \alias{outlierPlot}
5 | \title{Outlier detection plot}
6 | \usage{
7 | outlierPlot(data, method = "iqr", columnName, cutoffValue = 0.05,
8 | priColor = "blue", optionalPlots = 0)
9 | }
10 | \arguments{
11 | \item{data}{the dataframe that needs to be loaded}
12 |
13 | \item{method}{the method by which outliers are to be identified}
14 |
15 | \item{columnName}{the name of column for which the outliers are identified}
16 |
17 | \item{cutoffValue}{the cut off value to define the threshold for outliers}
18 |
19 | \item{priColor}{the primary color for the plots}
20 |
21 | \item{optionalPlots}{A Flag for optional plots}
22 | }
23 | \value{
24 | Outliers plot object
25 | }
26 | \description{
27 | Outlier detection plot
28 | }
29 | \details{
30 | Outliers are identified on the selected column of the dataframe
31 | }
32 | \examples{
33 | \dontrun{
34 | outlierPlot(data = iris, columnName = "Sepal.Length")
35 | }
36 | }
37 | \seealso{
38 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}},
39 | \code{\link{bivarPlots}},
40 | \code{\link{correlationMatPlot}},
41 | \code{\link{getDatatype}}, \code{\link{ignoreCols}},
42 | \code{\link{multiVarOutlierPlot}},
43 | \code{\link{univarCatDistPlots}}
44 | }
45 | \concept{Package EDA Utilites functions}
46 |
--------------------------------------------------------------------------------
/man/prepExecution.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{prepExecution}
5 | \alias{prepExecution}
6 | \alias{prepExecution,BaseAnalysisPipeline-method}
7 | \title{Prepare the pipeline for execution}
8 | \usage{
9 | prepExecution(object)
10 |
11 | \S4method{prepExecution}{BaseAnalysisPipeline}(object)
12 | }
13 | \arguments{
14 | \item{object}{A Pipeline object}
15 | }
16 | \value{
17 | Updated \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object
18 | }
19 | \description{
20 | Prepare the pipeline for execution
21 | }
22 | \details{
23 | The pipeline is prepared for execution by identifying the graph of the pipeline, its topological ordering,
24 | and its dependency map
25 | }
26 | \examples{
27 | \dontrun{
28 | library(analysisPipelines)
29 | pipelineObj <- AnalysisPipeline(input = iris)
30 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species",
31 | priColor = "blue", optionalPlots = 0, storeOutput = T) \%>>\%
32 | prepExecution -> pipelineObj
33 | }
34 | }
35 | \seealso{
36 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
37 | \code{\link{MetaAnalysisPipeline-class}},
38 | \code{\link{assessEngineSetUp}},
39 | \code{\link{checkSchemaMatch}},
40 | \code{\link{createPipelineInstance}},
41 | \code{\link{exportAsMetaPipeline}},
42 | \code{\link{generateOutput}},
43 | \code{\link{genericPipelineException}},
44 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
45 | \code{\link{getOutputById}},
46 | \code{\link{getPipelinePrototype}},
47 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
48 | \code{\link{initDfBasedOnType}},
49 | \code{\link{initialize,BaseAnalysisPipeline-method}},
50 | \code{\link{loadMetaPipeline}},
51 | \code{\link{loadPipeline}},
52 | \code{\link{loadPredefinedFunctionRegistry}},
53 | \code{\link{loadRegistry}},
54 | \code{\link{registerFunction}},
55 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
56 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
57 | \code{\link{updateObject}},
58 | \code{\link{visualizePipeline}}
59 | }
60 | \concept{Package core functions}
61 |
--------------------------------------------------------------------------------
/man/registerFunction.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{registerFunction}
4 | \alias{registerFunction}
5 | \title{Register a user-defined function to be used with a \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object}
6 | \usage{
7 | registerFunction(functionName, heading = "", functionType = "batch",
8 | engine = "r",
9 | exceptionFunction = as.character(substitute(genericPipelineException)),
10 | isDataFunction = T, firstArgClass = "", loadPipeline = F,
11 | userDefined = T)
12 | }
13 | \arguments{
14 | \item{functionName}{name of function to be registered}
15 |
16 | \item{heading}{heading of that section in report}
17 |
18 | \item{functionType}{type of function - 'batch' for \code{AnalysisPipeline} objects, 'streaming' for \code{StreamingAnalysisPipeline} objects}
19 |
20 | \item{engine}{specifies which engine the function is to be run on. Available engines include "r", "spark", and "python"}
21 |
22 | \item{exceptionFunction}{R object corresponding to the exception function}
23 |
24 | \item{isDataFunction}{logical parameter which defines whether the function to be registered operates on data, i.e. the first parameter is a dataframe}
25 |
26 | \item{firstArgClass}{character string with the class of the first argument to the function, if it is a non-data function}
27 |
28 | \item{loadPipeline}{logical parameter indicating whether the function is being registered as part of a \code{loadPipeline} call. This is for internal use}
29 |
30 | \item{userDefined}{logical parameter defining whether the function is user defined. By default, set to true}
31 | }
32 | \description{
33 | Register a user-defined function to be used with a \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object
34 | }
35 | \details{
36 | The specified operation along with the heading and engine details is stored in the registry, after which it can be added to a pipeline.
37 |
38 | If the function already exists in the registry, registration will be skipped. In order to change the definition, the function needs
39 | to be reassigned in the Global Environment and \code{registerFunction} called again.
40 | }
41 | \examples{
42 | \dontrun{
43 | library(analysisPipelines)
44 | getNumRows <- function(dataset){
45 | return(nrow(dataset))
46 | }
47 |
48 | registerFunction("getNumRows")
49 | }
50 | }
51 | \seealso{
52 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
53 | \code{\link{MetaAnalysisPipeline-class}},
54 | \code{\link{assessEngineSetUp}},
55 | \code{\link{checkSchemaMatch}},
56 | \code{\link{createPipelineInstance}},
57 | \code{\link{exportAsMetaPipeline}},
58 | \code{\link{generateOutput}},
59 | \code{\link{genericPipelineException}},
60 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
61 | \code{\link{getOutputById}},
62 | \code{\link{getPipelinePrototype}},
63 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
64 | \code{\link{initDfBasedOnType}},
65 | \code{\link{initialize,BaseAnalysisPipeline-method}},
66 | \code{\link{loadMetaPipeline}},
67 | \code{\link{loadPipeline}},
68 | \code{\link{loadPredefinedFunctionRegistry}},
69 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
70 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
71 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
72 | \code{\link{updateObject}},
73 | \code{\link{visualizePipeline}}
74 | }
75 | \concept{Package core functions}
76 |
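
A sketch of registering a non-data function using the isDataFunction and firstArgClass arguments documented above; toUpperCase is an illustrative user-defined helper:

library(analysisPipelines)
toUpperCase <- function(x){
  return(toupper(x))
}
registerFunction("toUpperCase",
                 heading = "Upper-case conversion",
                 isDataFunction = FALSE,
                 firstArgClass = "character")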
--------------------------------------------------------------------------------
/man/savePipeline.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R,
3 | % R/core-functions-meta-pipelines.R
4 | \docType{methods}
5 | \name{savePipeline}
6 | \alias{savePipeline}
7 | \alias{savePipeline,BaseAnalysisPipeline-method}
8 | \alias{savePipeline,MetaAnalysisPipeline-method}
9 | \title{Saves the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object to the file system without outputs}
10 | \usage{
11 | savePipeline(object, path)
12 |
13 | \S4method{savePipeline}{BaseAnalysisPipeline}(object, path)
14 |
15 | \S4method{savePipeline}{MetaAnalysisPipeline}(object, path)
16 | }
17 | \arguments{
18 | \item{object}{object that contains input, pipeline, registry and output}
19 |
20 | \item{path}{the path at which the .RDS file containing the pipeline should be stored, along with the name of the file including
21 | a .RDS extension}
22 | }
23 | \value{
24 | Does not return a value
25 | }
26 | \description{
27 | Saves the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object to the file system without outputs
28 | }
29 | \details{
30 | The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object is saved to the file system in the paths specified
31 |
32 | This method is implemented on the base class as it is a functionality shared across all types of Analysis Pipelines
33 | which extend this class
34 | }
35 | \examples{
36 | \dontrun{
37 | library(analysisPipelines)
38 | pipelineObj <- AnalysisPipeline(input = iris)
39 | pipelineObj \%>>\% savePipeline(path = "./test.RDS")
40 | }
41 | }
42 | \seealso{
43 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
44 | \code{\link{MetaAnalysisPipeline-class}},
45 | \code{\link{assessEngineSetUp}},
46 | \code{\link{checkSchemaMatch}},
47 | \code{\link{createPipelineInstance}},
48 | \code{\link{exportAsMetaPipeline}},
49 | \code{\link{generateOutput}},
50 | \code{\link{genericPipelineException}},
51 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
52 | \code{\link{getOutputById}},
53 | \code{\link{getPipelinePrototype}},
54 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
55 | \code{\link{initDfBasedOnType}},
56 | \code{\link{initialize,BaseAnalysisPipeline-method}},
57 | \code{\link{loadMetaPipeline}},
58 | \code{\link{loadPipeline}},
59 | \code{\link{loadPredefinedFunctionRegistry}},
60 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
61 | \code{\link{registerFunction}},
62 | \code{\link{saveRegistry}}, \code{\link{setInput}},
63 | \code{\link{setLoggerDetails}},
64 | \code{\link{updateObject}},
65 | \code{\link{visualizePipeline}}
66 | }
67 | \concept{Package core functions}
68 |
--------------------------------------------------------------------------------
/man/saveRegistry.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{saveRegistry}
4 | \alias{saveRegistry}
5 | \title{Saves the registry to the file system}
6 | \usage{
7 | saveRegistry(path)
8 | }
9 | \arguments{
10 | \item{path}{path on the file system, where the registry is to be saved to}
11 | }
12 | \description{
13 | Saves the registry to the file system
14 | }
15 | \details{
16 | This function saves the existing function registry, along with the associated function definitions loaded in the
17 | environment, to a file.
18 | }
19 | \examples{
20 | \dontrun{
21 | library(analysisPipelines)
22 | saveRegistry(path = "./registry.RDS")
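# Illustrative: the saved registry can be restored in a later session with loadRegistry
loadRegistry(path = "./registry.RDS")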
23 | }
24 | }
25 | \seealso{
26 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
27 | \code{\link{MetaAnalysisPipeline-class}},
28 | \code{\link{assessEngineSetUp}},
29 | \code{\link{checkSchemaMatch}},
30 | \code{\link{createPipelineInstance}},
31 | \code{\link{exportAsMetaPipeline}},
32 | \code{\link{generateOutput}},
33 | \code{\link{genericPipelineException}},
34 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
35 | \code{\link{getOutputById}},
36 | \code{\link{getPipelinePrototype}},
37 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
38 | \code{\link{initDfBasedOnType}},
39 | \code{\link{initialize,BaseAnalysisPipeline-method}},
40 | \code{\link{loadMetaPipeline}},
41 | \code{\link{loadPipeline}},
42 | \code{\link{loadPredefinedFunctionRegistry}},
43 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
44 | \code{\link{registerFunction}},
45 | \code{\link{savePipeline}}, \code{\link{setInput}},
46 | \code{\link{setLoggerDetails}},
47 | \code{\link{updateObject}},
48 | \code{\link{visualizePipeline}}
49 | }
50 | \concept{Package core functions}
51 |
--------------------------------------------------------------------------------
/man/setInput.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{setInput}
5 | \alias{setInput}
6 | \alias{setInput,BaseAnalysisPipeline-method}
7 | \title{Sets the input for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object}
8 | \usage{
9 | setInput(object, input, filePath = "")
10 |
11 | \S4method{setInput}{BaseAnalysisPipeline}(object, input, filePath = "")
12 | }
13 | \arguments{
14 | \item{object}{object that contains input, pipeline, registry and output}
15 |
16 | \item{input}{the input data frame}
17 |
18 | \item{filePath}{path to the file which needs to be read (currently supports .csv files)}
19 | }
20 | \value{
21 | Updated \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object
22 | }
23 | \description{
24 | Sets the input for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object
25 | }
26 | \details{
27 | Assigns the input to the pipeline for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object
28 |
29 | This method is implemented on the base class as it is shared functionality across the types of Analysis Pipelines
30 | which extend this class
31 | }
32 | \examples{
33 | library(analysisPipelines)
34 | pipelineObj <- AnalysisPipeline()
35 | pipelineObj \%>>\% setInput(input = iris) -> pipelineObj
36 | }
37 | \seealso{
38 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
39 | \code{\link{MetaAnalysisPipeline-class}},
40 | \code{\link{assessEngineSetUp}},
41 | \code{\link{checkSchemaMatch}},
42 | \code{\link{createPipelineInstance}},
43 | \code{\link{exportAsMetaPipeline}},
44 | \code{\link{generateOutput}},
45 | \code{\link{genericPipelineException}},
46 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
47 | \code{\link{getOutputById}},
48 | \code{\link{getPipelinePrototype}},
49 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
50 | \code{\link{initDfBasedOnType}},
51 | \code{\link{initialize,BaseAnalysisPipeline-method}},
52 | \code{\link{loadMetaPipeline}},
53 | \code{\link{loadPipeline}},
54 | \code{\link{loadPredefinedFunctionRegistry}},
55 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
56 | \code{\link{registerFunction}},
57 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
58 | \code{\link{setLoggerDetails}},
59 | \code{\link{updateObject}},
60 | \code{\link{visualizePipeline}}
61 | }
62 | \concept{Package core functions}
63 |
--------------------------------------------------------------------------------
/man/setLoggerDetails.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{setLoggerDetails}
5 | \alias{setLoggerDetails}
6 | \alias{setLoggerDetails,BaseAnalysisPipeline-method}
7 | \title{Sets the logger configuration for the pipeline}
8 | \usage{
9 | setLoggerDetails(object, target = "console",
10 | targetFile = "pipelineExecution.out", layout = "layout.simple")
11 |
12 | \S4method{setLoggerDetails}{BaseAnalysisPipeline}(object,
13 | target = "console", targetFile = "pipelineExecution.out",
14 | layout = "layout.simple")
15 | }
16 | \arguments{
17 | \item{object}{A Pipeline object}
18 |
19 | \item{target}{A string value. 'console' for appending to console, 'file' for appending to a file, or 'console&file' for both}
20 |
21 | \item{targetFile}{File name of the log file in case the target is 'file'}
22 |
23 | \item{layout}{Specify the layout according to 'futile.logger' package convention}
24 | }
25 | \description{
26 | Sets the logger configuration for the pipeline
27 | }
28 | \details{
29 | This function sets the logger configuration for the pipeline.
30 | }
31 | \examples{
32 | library(analysisPipelines)
33 | pipelineObj <- AnalysisPipeline(input = iris)
34 | pipelineObj \%>>\% setLoggerDetails(target = "file",
35 | targetFile = "pipeline.out") -> pipelineObj
36 | }
37 | \seealso{
38 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
39 | \code{\link{MetaAnalysisPipeline-class}},
40 | \code{\link{assessEngineSetUp}},
41 | \code{\link{checkSchemaMatch}},
42 | \code{\link{createPipelineInstance}},
43 | \code{\link{exportAsMetaPipeline}},
44 | \code{\link{generateOutput}},
45 | \code{\link{genericPipelineException}},
46 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
47 | \code{\link{getOutputById}},
48 | \code{\link{getPipelinePrototype}},
49 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
50 | \code{\link{initDfBasedOnType}},
51 | \code{\link{initialize,BaseAnalysisPipeline-method}},
52 | \code{\link{loadMetaPipeline}},
53 | \code{\link{loadPipeline}},
54 | \code{\link{loadPredefinedFunctionRegistry}},
55 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
56 | \code{\link{registerFunction}},
57 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
58 | \code{\link{setInput}}, \code{\link{updateObject}},
59 | \code{\link{visualizePipeline}}
60 | }
61 | \concept{Package core functions}
62 |
--------------------------------------------------------------------------------
/man/setPythonEnvir.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-helper-utilites-python.R
3 | \name{setPythonEnvir}
4 | \alias{setPythonEnvir}
5 | \title{Sets the python environment to be used}
6 | \usage{
7 | setPythonEnvir(type = "conda", pathOrEnvirName = "base")
8 | }
9 | \arguments{
10 | \item{type}{Type of python environment. Takes three possible values - 'conda' for Anaconda environments,
11 | 'virtualenv' for Virtual environments, and 'python' to manually set the python path to use}
12 |
13 | \item{pathOrEnvirName}{Name of the environment for Anaconda and Virtual environments,
14 | or the Python path when type is 'python'}
15 | }
16 | \description{
17 | Sets the python environment to be used
18 | }
19 | \details{
20 | Wrapper function over reticulate functions to set a python environment to be used
21 | }
22 | \examples{
23 | \dontrun{
24 | setPythonEnvir()
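# Illustrative calls with a hypothetical environment name and path; adjust to your machine
setPythonEnvir(type = 'conda', pathOrEnvirName = 'my-conda-env')
setPythonEnvir(type = 'python', pathOrEnvirName = '/usr/bin/python3')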
25 | }
26 | }
27 | \seealso{
28 | Other R helper utilities for Python: \code{\link{getFeaturesForPyClassification}},
29 | \code{\link{getTargetForPyClassification}}
30 | }
31 | \concept{R helper utilities for Python}
32 |
--------------------------------------------------------------------------------
/man/setUpstreamDependencies.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \name{setUpstreamDependencies}
4 | \alias{setUpstreamDependencies}
5 | \title{Sets upstream dependencies for the entire pipeline}
6 | \usage{
7 | setUpstreamDependencies(pipeline)
8 | }
9 | \description{
10 | Sets upstream dependencies for the entire pipeline
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/sparkRSessionCreateIfNotPresent.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/spark-structured-streaming-utilities.R
3 | \name{sparkRSessionCreateIfNotPresent}
4 | \alias{sparkRSessionCreateIfNotPresent}
5 | \title{Connect to a Spark session}
6 | \usage{
7 | sparkRSessionCreateIfNotPresent(...)
8 | }
9 | \arguments{
10 | \item{...}{Arguments to sparkR.session}
11 | }
12 | \description{
13 | Connect to a Spark session
14 | }
15 | \details{
16 | Loads the SparkR package and initializes a Spark session from R
17 | }
18 | \examples{
19 | \dontrun{
20 | sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/"
21 | sparkMaster <- "local[1]"
22 | sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")
23 | sparkRSessionCreateIfNotPresent(master = sparkMaster,
24 | sparkPackages = sparkPackages)
25 | }
26 | }
27 | \seealso{
28 | Other Spark utilities: \code{\link{castKafkaStreamAsString}},
29 | \code{\link{convertKafkaValueFromJson}}
30 | }
31 | \concept{Spark utilities}
32 |
--------------------------------------------------------------------------------
/man/univarCatDistPlots.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/r-batch-eda-utilities.R
3 | \name{univarCatDistPlots}
4 | \alias{univarCatDistPlots}
5 | \title{Univariate Categoric Distribution}
6 | \usage{
7 | univarCatDistPlots(data, uniCol, priColor = "blue", optionalPlots = 0)
8 | }
9 | \arguments{
10 | \item{data}{the dataset where the column on which the plot is to be generated is present}
11 |
12 | \item{uniCol}{the name of column on which the plot needs to be generated}
13 |
14 | \item{priColor}{the primary color for the plots}
15 |
16 | \item{optionalPlots}{A Flag for optional plots}
17 | }
18 | \value{
19 | A univariate categoric distribution plot
20 | }
21 | \description{
22 | Univariate Categoric Distribution
23 | }
24 | \details{
25 | A univariate distribution graph on the selected categorical columns from the dataframe
26 | }
27 | \examples{
28 | univarCatDistPlots(data = iris, uniCol = "Species")
29 | }
30 | \seealso{
31 | Other Package EDA Utilites functions: \code{\link{CheckColumnType}},
32 | \code{\link{bivarPlots}},
33 | \code{\link{correlationMatPlot}},
34 | \code{\link{getDatatype}}, \code{\link{ignoreCols}},
35 | \code{\link{multiVarOutlierPlot}},
36 | \code{\link{outlierPlot}}
37 | }
38 | \concept{Package EDA Utilites functions}
39 |
--------------------------------------------------------------------------------
/man/updateObject.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R
3 | \docType{methods}
4 | \name{updateObject}
5 | \alias{updateObject}
6 | \alias{updateObject,BaseAnalysisPipeline-method}
7 | \title{Update the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by adding an operation to the pipeline}
8 | \usage{
9 | updateObject(object, operation, heading = "", parameters, outAsIn = F,
10 | storeOutput = F)
11 |
12 | \S4method{updateObject}{BaseAnalysisPipeline}(object, operation,
13 | heading = "", parameters, outAsIn = F, storeOutput = F)
14 | }
15 | \arguments{
16 | \item{object}{object that contains input, pipeline, registry and output}
17 |
18 | \item{operation}{name of the function to be added to the pipeline tibble}
19 |
20 | \item{heading}{heading of that section in report}
21 |
22 | \item{parameters}{parameters passed to that function}
23 |
24 | \item{outAsIn}{whether to use original input or output from previous function}
25 |
26 | \item{storeOutput}{whether the output of this operation is to be stored}
27 | }
28 | \value{
29 | Updated \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object
30 | }
31 | \description{
32 | Update the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by adding an operation to the pipeline
33 | }
34 | \details{
35 | The specified operation along with the heading and parameters is updated in the pipeline slot
36 | of the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object, where the sequence of operations
37 | to be performed is stored
38 |
39 | This method is implemented on the base class as it is shared functionality across the types of Analysis Pipelines
40 | which extend this class
41 | }
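\examples{
\dontrun{
# Illustrative sketch only: updateObject is normally invoked for you when a
# registered function is pipelined with \%>>\%; a direct call using the
# predefined 'univarCatDistPlots' function could look like this
library(analysisPipelines)
pipelineObj <- AnalysisPipeline(input = iris)
pipelineObj <- updateObject(pipelineObj,
                            operation = "univarCatDistPlots",
                            heading = "Distribution of Species",
                            parameters = list(uniCol = "Species"),
                            storeOutput = TRUE)
pipelineObj \%>>\% getPipeline
}
}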
42 | \seealso{
43 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
44 | \code{\link{MetaAnalysisPipeline-class}},
45 | \code{\link{assessEngineSetUp}},
46 | \code{\link{checkSchemaMatch}},
47 | \code{\link{createPipelineInstance}},
48 | \code{\link{exportAsMetaPipeline}},
49 | \code{\link{generateOutput}},
50 | \code{\link{genericPipelineException}},
51 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
52 | \code{\link{getOutputById}},
53 | \code{\link{getPipelinePrototype}},
54 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
55 | \code{\link{initDfBasedOnType}},
56 | \code{\link{initialize,BaseAnalysisPipeline-method}},
57 | \code{\link{loadMetaPipeline}},
58 | \code{\link{loadPipeline}},
59 | \code{\link{loadPredefinedFunctionRegistry}},
60 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
61 | \code{\link{registerFunction}},
62 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
63 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
64 | \code{\link{visualizePipeline}}
65 | }
66 | \concept{Package core functions}
67 |
--------------------------------------------------------------------------------
/man/visualizePipeline.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/core-functions.R,
3 | % R/core-functions-meta-pipelines.R
4 | \docType{methods}
5 | \name{visualizePipeline}
6 | \alias{visualizePipeline}
7 | \alias{visualizePipeline,BaseAnalysisPipeline-method}
8 | \alias{visualizePipeline,MetaAnalysisPipeline-method}
9 | \title{Visualizes the pipeline as a graph}
10 | \usage{
11 | visualizePipeline(object)
12 |
13 | \S4method{visualizePipeline}{BaseAnalysisPipeline}(object)
14 |
15 | \S4method{visualizePipeline}{MetaAnalysisPipeline}(object)
16 | }
17 | \arguments{
18 | \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object}
19 | }
20 | \value{
21 | A graph object which can be printed (or) plotted to visualize the pipeline
22 | }
23 | \description{
24 | Visualizes the pipeline as a graph
25 | }
26 | \details{
27 | Indicates dependencies amongst functions as well as functions for which output
28 | needs to be stored
29 | }
30 | \examples{
31 | \dontrun{
32 | library(analysisPipelines)
33 | pipelineObj <- AnalysisPipeline(input = iris)
34 | pipelineObj \%>>\% univarCatDistPlots(uniCol = "Species",
35 | priColor = "blue", optionalPlots = 0, storeOutput = T) \%>>\%
36 | visualizePipeline
37 | }
38 | }
39 | \seealso{
40 | Other Package core functions: \code{\link{BaseAnalysisPipeline-class}},
41 | \code{\link{MetaAnalysisPipeline-class}},
42 | \code{\link{assessEngineSetUp}},
43 | \code{\link{checkSchemaMatch}},
44 | \code{\link{createPipelineInstance}},
45 | \code{\link{exportAsMetaPipeline}},
46 | \code{\link{generateOutput}},
47 | \code{\link{genericPipelineException}},
48 | \code{\link{getInput}}, \code{\link{getLoggerDetails}},
49 | \code{\link{getOutputById}},
50 | \code{\link{getPipelinePrototype}},
51 | \code{\link{getPipeline}}, \code{\link{getRegistry}},
52 | \code{\link{initDfBasedOnType}},
53 | \code{\link{initialize,BaseAnalysisPipeline-method}},
54 | \code{\link{loadMetaPipeline}},
55 | \code{\link{loadPipeline}},
56 | \code{\link{loadPredefinedFunctionRegistry}},
57 | \code{\link{loadRegistry}}, \code{\link{prepExecution}},
58 | \code{\link{registerFunction}},
59 | \code{\link{savePipeline}}, \code{\link{saveRegistry}},
60 | \code{\link{setInput}}, \code{\link{setLoggerDetails}},
61 | \code{\link{updateObject}}
62 | }
63 | \concept{Package core functions}
64 |
--------------------------------------------------------------------------------
/vignettes/Analysis_pipelines_for_working_with_Python_functions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Analysis pipelines for working with Python functions"
3 | author: "Naren Srinivasan"
4 | date: "11/27/2018"
5 | output:
6 | rmarkdown::html_vignette:
7 | toc: true
8 | fig_width: 8
9 | vignette: >
10 | %\VignetteIndexEntry{Analysis pipelines for working with Python functions}
11 | %\VignetteEngine{knitr::rmarkdown}
12 | %\VignetteEncoding{UTF-8}
13 | ---
14 | # Introduction
15 |
16 | *Python* has grown exponentially over the past few years in terms of usage for data science, and specifically machine learning. It provides an extensive set of modules for executing various machine learning tasks. The *reticulate* R package provides a mechanism for interoperability between R and Python. It provides direct translation between equivalent commonly used object types, as well as functions.
17 |
18 | The *analysisPipelines* package uses the *reticulate* package under the hood, and provides a consistent high-level interface for the data scientist, as discussed in other vignettes.
19 |
20 | The vignette describes defining and executing *Python*-only pipelines using the *analysisPipelines* package.
21 |
22 | # Important Note
23 |
24 | The functionality of adding Python functions to the pipeline is enabled under the hood by the *reticulate* package. As the *reticulate* package itself is in its early stages of development and usage, some things might not work as expected. Additionally, for reticulating *Python* code itself in R Markdown chunks (as opposed to sourcing Python files), **RStudio 1.2** is required, though it was still in the Preview phase at the time of writing this vignette.
25 |
26 | On a separate note, there is a slight difference between how *SparkR* and *reticulate* are designed. SparkR provides wrappers to Spark functions and stays true to the conventions and classes used in *Apache Spark*, with the main type conversion offered being that on a data frame. *reticulate* is different in the sense that its aim is to provide interoperability, and provides type conversion between a wide range of object types between R and Python.
27 |
28 | The biggest difference is in terms of functions - with SparkR, user-defined functions written in Scala, Python and so on for a Spark session cannot be accessed from an R session. However, using *reticulate*, user-defined functions written in Python and sourced can be accessed as objects in an R session. This allows greater flexibility: custom functions can be written in Python, the file sourced, and those functions called from R. This difference in design is important to understand in order to construct functions which can then be used to compose pipelines.
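
As a minimal sketch of this idea, assuming a hypothetical file `myFuncs.py` that defines a Python function `addOne(x)`, sourcing the file makes that function directly callable from R:

```{r eval = FALSE}
# 'myFuncs.py' and 'addOne' are hypothetical and shown purely for illustration
reticulate::source_python("myFuncs.py")
addOne(41) # the sourced Python function is now available as an R object
```
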
29 | ```{r}
30 | knitr::opts_chunk$set(
31 | eval = FALSE
32 | )
33 | ```
34 |
35 |
36 | # Setup
37 |
38 | The *analysisPipelines* package provides a couple of helper functions in R, making it easier to interact with the Python environment. One of them sets the Python environment to be used, which we do like so:
39 |
40 | ```{r}
41 |
42 | library(analysisPipelines)
43 |
44 | analysisPipelines::setPythonEnvir('python', '/Users/naren/anaconda3/bin/python')
45 | os <- reticulate::import("os")
46 | numpy <- reticulate::import("numpy")
47 | pandas <- reticulate::import("pandas")
48 | sklearn <- reticulate::import("sklearn")
49 |
50 | reticulate::source_python(system.file("python/sampleFunctions.py", package = "analysisPipelines"))
51 |
52 | reticulate::py_config()
53 | ```
54 |
55 | # Registering Python functions
56 |
57 | Python functions which have been sourced through *reticulate* are available as references in the R environment and can be directly registered as part of the pipeline, through the usual mechanism.
58 |
59 | For non-R engines, such as Spark and Python, a suffix with the engine name is added to the function name on registration. So, functions with this suffix need to be used when pipelining to an *Analysis Pipeline* object. The engine is added as a suffix for better readability. A suffix is used (as opposed to a prefix) to enable easier auto-completes.
60 |
61 | The *analysisPipelines* package creates wrapper methods which contain the *argument* signature of the Python function. This allows the user to know what arguments need to be passed. Normal *reticulate* imports have a `...` signature.
62 |
63 | In our Python sample function file, we have a function called `decisionTreeTrainAndTest` which was sourced. We register this function:
64 |
65 | ```{r}
66 | registerFunction('decisionTreeTrainAndTest', engine = "python", isDataFunction = F, firstArgClass = "numpy.ndarray")
67 | getRegistry()
68 | ```
69 |
70 | # Defining pipelines
71 |
72 | Pipelines are defined and executed as usual. Regardless of the engine being used, the high-level interface remains the same.
73 |
74 | ```{r}
75 | data("iris")
76 | trainSample <- sample(1:150, size = 100)
77 | train <- iris[trainSample,]
78 | test <- iris[-trainSample,] #%>>% getFeaturesForPyClassification(featureNames = colnames(iris)[-ncol(iris)])
79 | obj <- AnalysisPipeline(input = train)
80 |
81 | obj %>>% getFeaturesForPyClassification(featureNames = colnames(train)[-ncol(train)]) %>>%
82 | getTargetForPyClassification(targetVarName = "Species", positiveClass = "setosa") %>>%
83 | getFeaturesForPyClassification(dataset = test, featureNames = colnames(test)[-ncol(test)]) %>>%
84 | decisionTreeTrainAndTest_py(data = ~f1, target = ~f2, newData = ~f3, storeOutput = T) -> objDecisionTree
85 |
86 | objDecisionTree %>>% assessEngineSetUp
87 | objDecisionTree %>>% visualizePipeline
88 | ```
89 |
90 | # Execution
91 |
92 | ```{r}
93 | objDecisionTree %>>% generateOutput -> op
94 | #op %>>% generateReport("~/Desktop")
95 | op %>>% getOutputById("4")
96 | ```
97 |
98 |
--------------------------------------------------------------------------------
/vignettes/Analysis_pipelines_for_working_with_sparkR.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Analysis pipelines for working with Spark DataFrames for one-time/ batch analyses"
3 | author: "Naren S, Anoop S"
4 | date: "11/13/2018"
5 | output:
6 | rmarkdown::html_vignette:
7 | toc: true
8 | fig_width: 8
9 | vignette: >
10 | %\VignetteIndexEntry{Analysis pipelines for working with Spark DataFrames for batch analyses}
11 | %\VignetteEngine{knitr::rmarkdown}
12 | %\VignetteEncoding{UTF-8}
13 | ---
14 | # Introduction
15 |
16 | *Apache Spark* can be leveraged to process large volumes of distributed data that are typically impossible to process on standalone R servers. The vignette describes defining and executing *Spark*-only pipelines using the *analysisPipelines* package.
17 |
18 | # Important Note
19 |
20 | Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN. The *SparkR* version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly.
21 |
22 | To install from Github, run the following command, if you know the Spark version:
23 |
24 | ```{r eval = F}
25 | devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg')
26 | ```
27 |
28 | The other option is to install SparkR by running the following *terminal* commands if Spark has already been installed.
29 |
30 | ```{bash eval = F}
31 | $ export SPARK_HOME=/path/to/spark/directory
32 | $ cd $SPARK_HOME/R/lib/SparkR/
33 | $ R -e "devtools::install('.')"
34 | ```
35 |
36 |
37 | # Initialize libraries
38 |
39 | * Load the *analysisPipelines* and *SparkR* libraries
40 | * Check if the SPARK_HOME environment variable is set to the Spark installation folder. Otherwise, define it using the `Sys.setenv()` function.
41 | ```{r}
42 | knitr::opts_chunk$set(
43 | eval = FALSE
44 | )
45 | ```
46 |
47 | ```{r, include=FALSE}
48 |
49 | library(ggplot2)
50 | library(analysisPipelines)
51 | library(SparkR)
52 |
53 | ## Define these variables as per the configuration of your machine. This is just an example.
54 | sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/"
55 | sparkMaster <- "local[1]"
56 | sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")
57 | # Set spark home variable if not present
58 | if(Sys.getenv("SPARK_HOME") == "") {
59 | Sys.setenv(SPARK_HOME = sparkHome)
60 | }
61 | ```
62 |
63 | # Connect to Spark cluster
64 |
65 | * Define the Spark master URL
66 | * Specify dependency packages if any during Spark connection. Example: `sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")`
67 | * Connect to the cluster using the package's `sparkRSessionCreateIfNotPresent` function
68 |
69 | ```{r}
70 | sparkRSessionCreateIfNotPresent(master = sparkMaster, sparkPackages = sparkPackages)
71 | ```
72 |
73 | # Read data from csv and initialize pipeline object
74 |
75 | Spark can connect to data sources like Hive and Kafka, and can also read parquet, json and csv files. In this example we use the *iris* dataset for simplicity; reading a csv directly into Spark is sketched after the chunk below.
76 |
77 | ```{r}
78 | inputDataset <- iris
79 |
80 | # Replacing '.' in column names with '_' as SparkR is not able to deal with '.' in column names
81 | colnames(inputDataset) <- gsub(".", "_", colnames(inputDataset), fixed = T)
82 |
83 | pipelineObj <- AnalysisPipeline(input = inputDataset)
84 | ```
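
For reference, a csv file could instead be read directly into a Spark DataFrame using SparkR; a minimal sketch, assuming a hypothetical file path, is shown below:

```{r eval = FALSE}
# Hypothetical path; adjust to the location of your data
sparkDf <- SparkR::read.df("/path/to/data.csv", source = "csv",
                           header = "true", inferSchema = "true")
```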
85 |
86 | # User-defined Spark functions
87 | The example below shows a user-defined function that performs a simple aggregation.
88 |
89 | ```{r}
90 | meanByGroup <- function(inputDataset, groupByColumn, colToSummarize) {
91 | groupSummary <- SparkR::summarize( SparkR::groupBy(inputDataset,inputDataset[[groupByColumn]]),
92 | avg = SparkR::mean(inputDataset[[colToSummarize]]))
93 | return(groupSummary)
94 | }
95 | ```
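
A similar user-defined function could, for instance, count the number of rows per group; this is an illustrative sketch along the same lines as `meanByGroup` above:

```{r}
countByGroup <- function(inputDataset, groupByColumn) {
  # Count rows per level of the grouping column using SparkR aggregation functions
  groupCount <- SparkR::summarize(SparkR::groupBy(inputDataset, inputDataset[[groupByColumn]]),
                                  n = SparkR::count(inputDataset[[groupByColumn]]))
  return(groupCount)
}
```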
96 |
97 | # Registering user-defined functions to the pipeline object
98 |
99 | Each user-defined function needs to be registered to the pipeline object. For non-R engines, such as Spark and Python, a suffix with the engine name is added to the function name on registration. So, functions with this suffix need to be used when pipelining to an *Analysis Pipeline* object. The engine is added as a suffix for better readability. A suffix is used (as opposed to a prefix) to enable easier auto-completes.
100 |
101 | Post registration, the function can be used to construct a pipeline. A pipeline is a set of multiple functions called in a particular sequence.
102 |
103 | ```{r}
104 | # Register user-defined functions
105 | registerFunction("meanByGroup", "Mean By Group",
106 | engine = "spark")
107 |
108 | # List all registered functions
109 | getRegistry()
110 |
111 | # Define pipeline from list of registered functions
112 | pipelineObj %>>% meanByGroup_spark(groupByColumn = "Species", colToSummarize = "Sepal_Length", storeOutput = T) %>>%
113 | meanByGroup_spark(groupByColumn = "Species", colToSummarize = "Petal_Length", storeOutput = T) -> pipelineObj
114 |
115 | pipelineObj %>>% getPipeline
116 | pipelineObj %>>% visualizePipeline
117 | ```
118 |
119 | # Running the pipeline and generating an output
120 |
121 | The pipeline is run by calling the `generateOutput()` function. A particular output in the sequence of evaluations can be accessed by calling the `getOutputById` function.
122 |
123 |
124 | ```{r fig.width=6, fig.height=3}
125 | pipelineObj %>>% generateOutput -> pipelineObj
126 |
127 | sepalLengthBySpecies <- pipelineObj %>>% getOutputById(1)
128 | sepalLengthBySpeciesDf <- as.data.frame(sepalLengthBySpecies)
129 | DT::datatable(head(sepalLengthBySpeciesDf),options = list(scrollX = T, scrollY = T))
130 |
131 | petalLengthBySpecies <- pipelineObj %>>% getOutputById(2)
132 | petalLengthBySpeciesDf <- as.data.frame(petalLengthBySpecies)
133 | DT::datatable(head(petalLengthBySpeciesDf),options = list(scrollX = T, scrollY = T))
134 | ```
135 |
136 | # Supplementary Note
137 |
138 | The *analysisPipelines* package internally uses the *SparkR* package to interface with *Spark*. *SparkR* masks many typical data manipulation and processing functions from *base* as well as packages like *dplyr*. Therefore, ensure you use function scoping when calling a function.
139 |
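For example, once SparkR is attached, an unqualified call to `filter` resolves to the SparkR version; explicit scoping avoids surprises (the `sparkDf` object below is hypothetical):

```{r eval = FALSE}
dplyr::filter(iris, Species == "setosa")              # operates on a local R data frame
SparkR::filter(sparkDf, sparkDf$Species == "setosa")  # operates on a Spark DataFrame (hypothetical object)
```
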
--------------------------------------------------------------------------------
/vignettes/Interoperable_Pipelines.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Interoperable analysis pipelines"
3 | author: "Naren Srinivasan"
4 | date: "11/13/2018"
5 | output:
6 | rmarkdown::html_vignette:
7 | toc: true
8 | fig_width: 8
9 | vignette: >
10 | %\VignetteIndexEntry{Interoperable analysis pipelines}
11 | %\VignetteEngine{knitr::rmarkdown}
12 | %\VignetteEncoding{UTF-8}
13 | ---
14 |
15 | # Introduction
16 |
17 | This vignette explains how **interoperable pipelines** containing functions operating on different engines such as R, Spark and Python can be configured and executed through the **analysisPipelines** package. Currently, the package supports interoperable pipelines containing R, Spark batch and Python functions.
18 |
19 | If the package is new to you, it is recommended that you go through the *Analysis pipelines - Core functionality and working with R data frames and functions* vignette first.
20 |
21 | # Important Note
22 |
23 | Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN.
24 |
25 | ```{r echo = FALSE}
26 | library(analysisPipelines)
27 | knitr::opts_chunk$set(
28 | eval = FALSE
29 | )
30 | ```
31 |
32 | # An example of an interoperable pipeline
33 |
34 | In this vignette we demonstrate an interoperable pipeline built using the **analysisPipelines** package, which contains a couple of filtering/ aggregation functions performed in *Spark*, the output of which is subsequently visualized through *R* functions using *ggplot2*.
35 |
36 | ## Initializing a Spark connection from R and loading the data
37 |
38 | We initialize a Spark session using the `sparkRSessionCreateIfNotPresent` helper function in the **analysisPipelines** package, which internally uses *SparkR*. We then read data into the Spark session using functions in the SparkR package. In this case we use the *iris* dataset for illustration, though SparkR can read *.csv* files and work with multiple other data sources.
39 |
40 | ```{r}
41 |
42 | ## Define these variables as per the configuration of your machine. This is just an example.
43 | sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/"
44 | sparkMaster <- "local[1]"
45 | sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")
46 |
47 | sparkRSessionCreateIfNotPresent(sparkHome = sparkHome, master = sparkMaster, sparkPackages = sparkPackages)
48 |
49 | inputDataset <- iris
50 |
51 | # Replacing '.' in column names with '_' as SparkR is not able to deal with '.' in column names
52 | colnames(inputDataset) <- gsub(".", "_", colnames(inputDataset), fixed = T)
53 | ```
54 |
55 |
56 | ## Initializing Python connection
57 |
58 | ```{r}
59 |
60 | ## Define these variables as per the configuration of your machine. This is just an example.
61 |
62 | analysisPipelines::setPythonEnvir('python', '/Users/naren/anaconda3/bin/python')
63 | os <- reticulate::import("os")
64 | numpy <- reticulate::import("numpy")
65 | pandas <- reticulate::import("pandas")
66 | sklearn <- reticulate::import("sklearn")
67 |
68 | reticulate::source_python(system.file("python/sampleFunctions.py", package = "analysisPipelines"))
69 |
70 | reticulate::py_config()
71 | ```
72 |
73 |
74 | ## Creating an analysisPipeline object
75 |
76 | We then initialize an *AnalysisPipeline*, with the input dataset
77 |
78 | ```{r}
79 | pipelineObj <- AnalysisPipeline(input = inputDataset)
80 | ```
81 |
82 |
83 | ## Registering functions to work in the Spark environment
84 |
85 | In order to manipulate the data in the Spark environment, we define our own functions using SparkR interface functions. We then **register** these functions with the **AnalysisPipeline** object, so that they can be used in constructing a pipeline.
86 |
87 | The `getRegistry` function lists all the registered functions, along with details such as which **engine** they should run on.
88 |
89 | ```{r}
90 | getSchema <- function(inputDataset) {
91 | sparkSchema <- SparkR::schema(inputDataset)
92 | return(sparkSchema)
93 | }
94 |
95 | filterData <- function(inputDataset, condition) {
96 | filteredData <- SparkR::filter(inputDataset, condition)
97 | return(filteredData)
98 | }
99 |
100 | registerFunction(functionName = "getSchema", engine = "spark")
101 | registerFunction(functionName = "filterData", engine = "spark")
102 |
103 |
104 | getRegistry()
105 | ```
106 |
107 | ## Registering R functions
108 |
109 | Similar to the Spark functions, we register some user-defined functions in R - in this case, a function to create a bivariate plot using *ggplot2*.
110 |
111 | ```{r}
112 |
113 | rBivarPlots <- function(dataset, select_var_name_1, select_var_name_2, priColor = "blue", secColor= "black") {
114 |
115 | numeric_cols <- unlist(getDatatype(dataset)['numeric_cols'])
116 | cat_cols <- unlist(getDatatype(dataset)['cat_cols'])
117 |
118 | if (select_var_name_1 %in% numeric_cols && select_var_name_2 %in% numeric_cols) {
119 | x = dataset[, select_var_name_1]
120 | y = dataset[, select_var_name_2]
121 | bivarPlot <-
122 | ggplot2::ggplot(dataset, ggplot2::aes(x, y)) +
123 | ggplot2::geom_point(color = priColor, alpha = 0.7) +
124 | ggplot2::geom_smooth(method = lm, color = secColor) +
125 | ggplot2::xlab(select_var_name_1) +
126 | ggplot2::ylab(select_var_name_2) + ggplot2::theme_bw() +
127 | ggplot2::ggtitle(paste(
128 | 'Bivariate plot for',
129 | select_var_name_1,
130 | 'and',
131 | select_var_name_2,
132 | sep = ' '
133 | )) +
134 | ggplot2::theme(
135 | plot.title = ggplot2::element_text(hjust = 0.5, size = 10),
136 | axis.text = ggplot2::element_text(size = 10),
137 | axis.title = ggplot2::element_text(size = 10)
138 | )
139 |
140 |
141 |
142 | } else if (select_var_name_1 %in% cat_cols &&
143 | select_var_name_2 %in% cat_cols) {
144 | new_df <- dataset %>% dplyr::group_by_(.dots=c(select_var_name_1,select_var_name_2)) %>% dplyr::summarise(n = dplyr::n())
145 | colfunc <- grDevices::colorRampPalette(c(priColor, "white" , secColor))
146 | colorvar <- length(unique(new_df[[select_var_name_2]]))
147 | a=as.vector(as.character(unique(new_df[[select_var_name_1]])))
148 | y=new_df[[select_var_name_1]]
149 | label=new_df[[select_var_name_2]]
150 | bivarPlot <-ggplot2::ggplot(new_df, ggplot2::aes(x = y, y= n, fill = label)) +
151 | ggplot2::geom_bar(position = "dodge", stat = "identity",alpha=0.9) +
152 | ggplot2::guides(fill=ggplot2::guide_legend(title=select_var_name_2)) +
153 | ggplot2::coord_flip()+
154 | ggplot2::xlab(select_var_name_1) +
155 | ggplot2::ylab("count") + ggplot2::theme_bw() +
156 | ggplot2::ggtitle(paste('Bivariate plot for',select_var_name_1,'and',select_var_name_2,sep=' '))+
157 | ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 10),axis.text = ggplot2::element_text(size=10),
158 | axis.title=ggplot2::element_text(size=10),legend.position="bottom",axis.text.x=ggplot2::element_text(angle=45, hjust=1))+ ggplot2::scale_fill_manual(values = colfunc(colorvar))
159 |
160 |
161 | } else {
162 | cols <- c(select_var_name_1, select_var_name_2)
163 | cat_col <- cols[which(cols %in% cat_cols)]
164 | num_col <- cols[which(cols %in% numeric_cols)]
165 | a = as.vector(as.character(unique(dataset[[cat_col]])))
166 | y = dataset[[cat_col]]
167 | x = dataset[[num_col]]
168 | bivarPlot <-
169 | ggplot2::ggplot(dataset, ggplot2::aes(x = y, y = x)) +
170 | ggplot2::geom_point(color = priColor, alpha = 0.7) +
171 | ggplot2::coord_flip() +
172 | ggplot2::xlab(cat_col) +
173 | ggplot2::ylab(num_col) + ggplot2::theme_bw() +
174 | ggplot2::ggtitle(paste(
175 | 'Bivariate plot for',
176 | select_var_name_1,
177 | 'and',
178 | select_var_name_2,
179 | sep = ' '
180 | )) +
181 | ggplot2::theme(
182 | plot.title = ggplot2::element_text(hjust = 0.5, size = 10),
183 | axis.text = ggplot2::element_text(size = 10),
184 | axis.title = ggplot2::element_text(size = 10)
185 | )
186 | }
187 |
188 | return(bivarPlot)
189 | }
190 |
191 | registerFunction(functionName = "rBivarPlots", engine = "r", heading = "Bivariate analysis")
192 |
193 | getRegistry()
194 | ```
195 | ## Registering Python functions
196 |
197 | ```{r}
198 | registerFunction("decisionTreeTrainAndTest", engine = "python", isDataFunction = F, firstArgClass = "numpy.ndarray")
199 | getRegistry()
200 | ```
201 |
202 |
203 | ## Interoperable pipeline containing R, Spark and Python functions
204 |
205 | * Here we consider a typical use case of performing data filtering/ aggregations and so on in Spark, then using R to visualize the results, and Python to run a machine learning model
206 |
207 | We first visualize the data without filtering:
208 |
209 |
210 | ```{r}
211 |
212 | pipelineObj %>>% rBivarPlots(select_var_name_1 = "Sepal_Length", select_var_name_2 = "Sepal_Width",
213 | priColor = "blue", secColor = "green", storeOutput = T) -> vizWithoutFilterPipeline
214 | vizWithoutFilterPipeline %>>% getPipeline
215 | vizWithoutFilterPipeline %>>% assessEngineSetUp
216 | vizWithoutFilterPipeline %>>% generateOutput -> opWithoutFilter
217 | opWithoutFilter %>>% getOutputById(1)
218 | ```
219 |
220 | We then perform filtering on one of the variables in Spark, before visualizing in R
221 |
222 | ```{r}
223 | pipelineObj %>>% filterData_spark(condition = "Species == 'setosa'") %>>%
224 | rBivarPlots(select_var_name_1 = "Sepal_Length", select_var_name_2 = "Sepal_Width",
225 | priColor = "blue", secColor = "green", outAsIn = T, storeOutput = T) -> singleFilterPipeline
226 | singleFilterPipeline %>>% visualizePipeline
227 |
228 | singleFilterPipeline %>>% generateOutput -> opWithFilter
229 | opWithFilter %>>% getOutputById(2)
230 | ```
231 |
232 | Finally, we show a case, where sequential filtering steps are performed in Spark, before visualizing in R, and running a decision tree model in Python.
233 |
234 | Note that in this case, `getFeaturesForPyClassification` and `getTargetForPyClassification` have been registered as *data* functions. Type conversions between R, Spark and Python for data functions are performed automatically by the package.
235 |
236 | ```{r}
237 | pipelineObj %>>% filterData_spark(condition = "Species == 'setosa' or Species == 'virginica'") %>>%
238 | filterData_spark(condition = "Petal_Length > 3.7", outAsIn = T) %>>%
239 | rBivarPlots(select_var_name_1 = "Sepal_Length", select_var_name_2 = "Sepal_Width",
240 | priColor = "blue", secColor = "green", outAsIn = T, storeOutput = T) %>>%
241 | getFeaturesForPyClassification(dataset = ~f2, featureNames = c("Sepal_Length",
242 | "Sepal_Width",
243 | "Petal_Length")) %>>%
244 | getTargetForPyClassification(dataset = ~f2, targetVarName = "Species", positiveClass = "setosa") %>>%
245 | decisionTreeTrainAndTest_py(data = ~f4, target = ~f5, newData = ~f4, storeOutput = T) -> twoFilterPipeline
246 |
247 | twoFilterPipeline %>>% visualizePipeline
248 |
249 | twoFilterPipeline %>>% generateOutput -> opWith2Filters
250 | opWith2Filters %>>% getOutputById(3)
251 | opWith2Filters %>>% getOutputById(6)
252 |
253 | ```
254 |
255 | # Supplementary Note
256 |
257 | The *analysisPipelines* package internally uses the *SparkR* package to interface with *Spark*. *SparkR* masks many typical data manipulation and processing functions from *base* as well as packages like *dplyr*. Therefore, ensure you use function scoping when calling a function.
258 |
--------------------------------------------------------------------------------
/vignettes/Meta_Pipelines.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "An introduction to meta-pipelines"
3 | author: "Naren Srinivasan"
4 | date: "11/19/2018"
5 | output:
6 | rmarkdown::html_vignette:
7 | toc: true
8 | fig_width: 8
9 | vignette: >
10 | %\VignetteIndexEntry{Meta-pipelines}
11 | %\VignetteEngine{knitr::rmarkdown}
12 | %\VignetteEncoding{UTF-8}
13 | ---
14 |
15 | ```{r message=F, results='hide'}
16 | knitr::opts_chunk$set(
17 | eval = TRUE
18 | )
19 | library(analysisPipelines)
20 | ```
21 |
22 | # Introduction
23 |
24 | The **meta-pipeline** construct allows users to export pipelines they have created for a particular use case to a general analysis flow which can be used with a different dataset and a different set of parameters. In a *pipeline*, the data can change (while retaining the same schema) but the set of parameters for the functions stays the same. In a *meta-pipeline*, only the analysis flow, function dependencies and so on are retained; the specific parameters for each of the functions can be set differently for a new use case.
25 |
26 | The objective of a meta-pipeline is to define and execute reusable analysis flows. They can be used to:
27 |
28 | * Document best practices for a particular problem
29 | * Templatize analyses for particular situations
30 |
31 | # Using meta-pipelines
32 |
33 | ## Creating a meta-pipeline
34 |
35 | Through this package, *meta-pipelines* can be created by exporting an already created *pipeline* to a *meta-pipeline*. The export retains the following items:
36 |
37 | * Function definitions
38 | * Flow of the functions and the dependencies (specified through formula semantics)
39 | * The registry from which the pipeline is defined
40 |
41 | In the example below, we first create a pipeline, similar to the one described in the other vignettes.
42 |
43 | ```{r}
44 | pipeline <- AnalysisPipeline(input = iris)
45 | getColor <- function(color){
46 | return(color)
47 | }
48 |
49 | getColumnName <-function(columnName){
50 | return(columnName)
51 | }
52 |
53 | registerFunction(functionName = "getColor", isDataFunction = F, firstArgClass = "character")
54 | registerFunction(functionName = "getColumnName", isDataFunction = F, firstArgClass = "character")
55 |
56 | getRegistry()
57 | ```
58 |
59 | We then generate an output from the pipeline, just to validate that the pipeline works properly. Of course, generating the output is not required in order to define a meta-pipeline.
60 |
61 | ```{r}
62 | pipeline %>>% getColor(color = "blue") %>>% getColumnName(columnName = "Sepal.Length") %>>%
63 | univarCatDistPlots(uniCol = "Species", priColor = ~f1, optionalPlots = 0, storeOutput = T) %>>%
64 | outlierPlot(method = "iqr", columnName = ~f2, cutoffValue = 0.01, priColor = ~f1 , optionalPlots = 0) -> complexPipeline
65 |
66 | complexPipeline %>>% getPipeline
67 | complexPipeline %>>% prepExecution -> complexPipeline
68 |
69 | complexPipeline %>>% generateOutput -> op
70 | op %>>% getOutputById("3")
71 |
72 | ```
73 |
74 | ## Exporting and reusing for a different case
75 |
76 | Once a pipeline has been created, be it a batch or a streaming pipeline, it can be exported using the `exportAsMetaPipeline` method. This returns an object of class `MetaAnalysisPipeline` which stores the required information.
77 |
78 | The meta-pipeline can be *visualized* similar to a normal pipeline object by calling the `visualizePipeline` method on the `MetaAnalysisPipeline` object.
79 |
80 | ```{r}
81 |
82 | complexPipeline %>>% exportAsMetaPipeline -> complexMetaPipeline
83 |
84 | # complexMetaPipeline %>>% visualizePipeline
85 | ```
86 |
87 | ## Setting the new parameters
88 |
89 | The next part of using the meta-pipeline is creating another pipeline with a different set of parameters. For this purpose, the user can first export the *pipeline prototype* which basically contains the set of functions used in the pipeline and their respective arguments.
90 |
91 | The pipeline prototype is exported as an object of class `proto` from the 'proto' package, which is a thin skin over environments, with usability advantages such as using methods like `names` to get the names of objects contained in it, as well as using the '$' operator to refer to specific objects. The aim of using this class is to provide an easy-to-use interface to set the new values of the arguments.
92 |
93 | The pipeline prototype has a nested structure. The first level is a list of objects which represent the list of functions in the pipeline. A specific function can just be referred to through its name. The second level, is the list of arguments for each of those functions (again referred by the usual name).
94 |
95 | The new values of the parameters can simply be set by using the '$' operator to refer to the values. The exported pipeline prototype by default contains the values of the parameters defined in the original pipeline. Therefore, the user can simply change some of the values as required or for all of the parameters.
96 |
97 | In the following example, we reconfigure the pipeline for use with the 'ToothGrowth' dataset.
98 |
99 | ```{r}
100 | pipelineProto <- getPipelinePrototype(complexMetaPipeline)
101 | str(pipelineProto)
102 |
103 | #Setting new parameters on ToothGrowth dataset
104 | pipelineProto$getColor$color<- "green"
105 | pipelineProto$getColumnName$columnName<- "len"
106 | pipelineProto$univarCatDistPlots$uniCol <- "supp"
107 |
108 | #complexMetaPipeline %>>% visualizePipeline
109 | ```
110 |
111 | ## Execution
112 |
113 | Now once the parameters have been set, a new pipeline object (which is executable) can be created by calling the `createPipelineInstance` method, and passing the meta-pipeline object and the pipeline prototype. This creates a pipeline object with the usual properties.
114 |
115 | We set the input of the pipeline object to the `iris` dataset and then execute to generate the output.
116 |
117 | ```{r}
118 | complexMetaPipeline %>>% createPipelineInstance(pipelineProto) -> newPipelineObj
119 |
120 | newPipelineObj %>>% setInput(input = ToothGrowth) -> newPipelineObj
121 |
122 | newPipelineObj %>>% generateOutput %>>% getOutputById("3")
123 | ```
124 |
125 |
126 | # Saving and loading meta-pipelines
127 |
128 | Similar to pipelines, meta-pipelines can be saved and loaded using the `savePipeline` method and the `loadMetaPipeline` function. As with pipelines, when a meta-pipeline is loaded, it overwrites the existing registry with the registry stored with the meta-pipeline.
129 |
130 | ```{r eval = FALSE}
131 |
132 | complexMetaPipeline %>>% savePipeline("metaPipeline.RDS")
133 |
134 | #Checking if registry is updated
135 | getC <- function(color){
136 | return(color)
137 | }
138 |
139 | getCol <-function(columnName){
140 | return(columnName)
141 | }
142 |
143 | registerFunction(functionName = "getC", isDataFunction = F, firstArgClass = "character")
144 | registerFunction(functionName = "getCol", isDataFunction = F, firstArgClass = "character")
145 |
146 | getRegistry()
147 | loadMetaPipeline(path = "metaPipeline.RDS") -> loadedMetaPipeline
148 | getRegistry()
149 |
150 | pipelineProtoLoaded <- getPipelinePrototype(loadedMetaPipeline)
151 | str(pipelineProtoLoaded)
152 |
153 | pipelineProtoLoaded$getColor$color<- "green"
154 | pipelineProtoLoaded$getColumnName$columnName<- "Sepal.Length"
155 | pipelineProtoLoaded$univarCatDistPlots$uniCol <- "Species"
156 |
157 | loadedMetaPipeline %>>% createPipelineInstance(pipelineProtoLoaded) -> newPipelineObjLoaded
158 |
159 | newPipelineObjLoaded %>>% setInput(input = iris) %>>%
160 | generateOutput %>>% getOutputById("3")
161 | ```
162 |
163 |
--------------------------------------------------------------------------------
/vignettes/Streaming_pipelines_for_working_Apache_Spark_Structured_Streaming.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Streaming Analysis Pipelines for working with Apache Spark Structured Streaming"
3 | author: "Naren Srinivasan, Anoop S"
4 | date: "9/11/2018"
5 | output:
6 | rmarkdown::html_vignette:
7 | toc: true
8 | fig_width: 8
9 | vignette: >
10 | %\VignetteIndexEntry{Streaming Analysis Pipelines for working with Apache Spark Structured Streaming}
11 | %\VignetteEngine{knitr::rmarkdown}
12 | %\VignetteEncoding{UTF-8}
13 | ---
14 |
15 | # Introduction
16 |
17 | This vignette aims to show examples of using SparkR as an interface to run streaming Spark jobs from R, using the analysisPipelines package. The major use case is implementing a pipeline on SparkR DataFrames containing streaming data.
18 |
19 | # Important Note
20 |
21 | Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN. The *SparkR* version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly.
22 |
23 | To install from Github, run the following command, if you know the Spark version:
24 |
25 | ```{r eval = F}
26 | devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg')
27 | ```
28 |
29 | The other option is to install SparkR by running the following *terminal* commands if Spark has already been installed.
30 |
31 | ```{bash eval = F}
32 | $ export SPARK_HOME=/path/to/spark/directory
33 | $ cd $SPARK_HOME/R/lib/SparkR/
34 | $ R -e "devtools::install('.')"
35 | ```
36 |
37 | # Initialize libraries
38 |
39 | * Initialize the analysisPipelines and SparkR libraries
40 | * Ensure you have a local installation of Spark and that the SparkR package is installed
41 | * Check if the SPARK_HOME environment variable is set to the Spark installation folder. Otherwise, define it using the `Sys.setenv()` function.
42 |
43 | ```{r}
44 | knitr::opts_chunk$set(
45 | eval = FALSE
46 | )
47 | ```
48 |
49 |
50 | ```{r, include=FALSE}
51 | library(analysisPipelines)
52 | library(SparkR)
53 |
54 | ## Define these variables as per the configuration of your machine. The below example is just illustrative.
55 |
56 | sparkHome <- "/path/to/spark/directory/"
57 | sparkMaster <- "local[1]"
58 | sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")
59 | # Set spark home variable if not present
60 | if(Sys.getenv("SPARK_HOME") == "") {
61 | Sys.setenv(SPARK_HOME = sparkHome)
62 | }
63 | ```
64 |
65 | # Connect to Spark cluster
66 | * Define the Spark master URL
67 | * Specify dependency packages if any during Spark connection. Example: `sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1")`
68 | * Connect to the cluster using the package's `sparkRSessionCreateIfNotPresent` function
69 |
70 | ```{r, results='hide'}
71 | sparkRSessionCreateIfNotPresent(master = sparkMaster, sparkPackages = sparkPackages)
72 | ```
73 |
74 |
75 | # Streaming Analysis Pipelines using Apache Spark Structured Streaming
76 |
77 | This example illustrates the usage of pipelines for a streaming application. In this use case, streaming data is read from Kafka, data preparation and aggregations are performed, and the output is written to the console.
78 |
79 | ## Read stream from Kafka
80 |
81 | Read streaming data from Kafka.
82 |
83 | ```{r}
84 | ## Define these variables as per the configuration of your machine. The below example is just illustrative.
85 |
86 | kafkaBootstrapServers <- "192.168.0.256:9092,192.168.0.257:9092,192.168.0.258:9092"
87 | consumerTopic <- "topic1"
88 | streamObj <- read.stream(source = "kafka", kafka.bootstrap.servers = kafkaBootstrapServers, subscribe = consumerTopic, startingOffsets="earliest")
89 | printSchema(streamObj)
90 | ```
91 |
92 | ## User-defined Spark functions
93 |
94 | Users can define their own functions and use them as part of the pipeline. These functions range from data preparation and aggregations to casting data into a format suitable for the write stream.
95 |
96 | ```{r}
97 |
98 | # Function to convert datatype json struct to columns
99 | convertStructToDf <- function(streamObj) {
100 | streamObj <- SparkR::select(streamObj,list(getField(streamObj$`jsontostructs(value)`,"bannerId"),
101 | getField(streamObj$`jsontostructs(value)`,"mobile"),
102 | getField(streamObj$`jsontostructs(value)`,"homeAppliance"),
103 | getField(streamObj$`jsontostructs(value)`,"gamingConsole"),
104 | getField(streamObj$`jsontostructs(value)`,"accessories"),
105 | getField(streamObj$`jsontostructs(value)`,"brand"),
106 | getField(streamObj$`jsontostructs(value)`,"previousPrice"),
107 | getField(streamObj$`jsontostructs(value)`,"currentPrice"),
108 | getField(streamObj$`jsontostructs(value)`,"discount"),
109 | getField(streamObj$`jsontostructs(value)`,"emi"),
110 | getField(streamObj$`jsontostructs(value)`,"crossSale"),
111 | getField(streamObj$`jsontostructs(value)`,"customerId"),
112 | getField(streamObj$`jsontostructs(value)`,"ts"),
113 | getField(streamObj$`jsontostructs(value)`,"click"),
114 | getField(streamObj$`jsontostructs(value)`,"conversion"),
115 | getField(streamObj$`jsontostructs(value)`,"age"),
116 | getField(streamObj$`jsontostructs(value)`,"income"),
117 | getField(streamObj$`jsontostructs(value)`,"maritalStatus"),
118 | getField(streamObj$`jsontostructs(value)`,"segment")))
119 | colnames(streamObj) <- c("bannerId","mobile","homeAppliance","gamingConsole","accessories","brand","previousPrice","currentPrice",
120 | "discount","emi","crossSale","customerId","ts","click","conversion","age","income","maritalStatus","segment")
121 | return(streamObj)
122 | }
123 |
124 | # Function to cast columns as string, integer, etc
125 | castDfColumns <- function(streamObj) {
126 | streamObj <- SparkR::selectExpr(streamObj, "bannerId","mobile","homeAppliance","gamingConsole","accessories","brand",
127 | "CAST(previousPrice as INTEGER)","CAST(currentPrice as INTEGER)","CAST(discount as INTEGER)","emi",
128 | "crossSale","customerId","ts","CAST(click as INTEGER)","CAST(conversion as INTEGER)",
129 | "CAST(age as INTEGER)","CAST(income as INTEGER)","maritalStatus","segment")
130 | streamObj$ts <- SparkR::to_timestamp(streamObj$ts,"yyyy-MM-dd HH:mm:ss")
131 | return (streamObj)
132 | }
133 |
134 | # Function to convert datatype json struct to columns
135 | convertDfToKafkaKeyValuePairs <- function (streamObj, kafkaKey) {
136 | streamObj <- SparkR::toJSON(streamObj)
137 | streamObj$key <- kafkaKey
138 | return(streamObj)
139 | }
140 |
141 | # Function to summarize click stream data
142 | globalUiMetrics <- function (streamObj) {
143 | ## Aggregation query
144 | streamObj <- SparkR::summarize(SparkR::groupBy(streamObj,streamObj$bannerId),
145 | impressions=count(streamObj$customerId),
146 | clicks=sum(streamObj$click),
147 | conversions=sum(streamObj$conversion))
148 | SparkR::colnames(streamObj) <- c("banner_id","impressions","clicks","conversions")
149 | return (streamObj)
150 | }
151 |
152 | ```
153 |
154 | ## Define pipeline object, register user-defined functions to the pipeline object
155 |
156 | In order to use pipelines, a pipeline object needs to be defined. Notice that streaming Spark pipelines are defined using the `StreamingAnalysisPipeline` constructor, with the streaming Spark DataFrame as the input.
157 |
158 | Each user-defined function needs to be registered so that it can be used with the pipeline object. Once registered, a function can be used to construct a pipeline; a pipeline is simply a set of functions called in a particular sequence.
159 |
160 | ```{r}
161 | # Define pipeline object
162 | pipelineObj <- analysisPipelines::StreamingAnalysisPipeline(input = streamObj)
163 |
164 | consumerDataSchema <- structType(structField("bannerId", "string"),
165 | structField("mobile", "string"),
166 | structField("homeAppliance", "string"),
167 | structField("gamingConsole", "string"),
168 | structField("accessories", "string"),
169 | structField("brand", "string"),
170 | structField("previousPrice", "string"),
171 | structField("currentPrice", "string"),
172 | structField("discount", "string"),
173 | structField("emi", "string"),
174 | structField("crossSale", "string"),
175 | structField("customerId", "string"),
176 | structField("ts", "string"),
177 | structField("click", "string"),
178 | structField("conversion", "string"),
179 | structField("age", "string"),
180 | structField("income", "string"),
181 | structField("maritalStatus", "string"),
182 | structField("segment", "string"))
183 |
184 | # Register user-defined functions
185 | registerFunction("convertStructToDf", "", functionType = "streaming", engine = "spark-structured-streaming")
186 | registerFunction("castDfColumns", "", functionType = "streaming", engine = "spark-structured-streaming")
187 | registerFunction("convertDfToKafkaKeyValuePairs", "", functionType = "streaming", engine = "spark-structured-streaming")
188 |
189 | getRegistry()
190 |
191 | # Define pipeline
192 | # Do data prep
193 | pipelineObj %>% castKafkaStreamAsString_sparkSS() %>%
194 | convertKafkaValueFromJson_sparkSS(schema = consumerDataSchema, outAsIn = T) %>% convertStructToDf_sparkSS(outAsIn = T) %>% castDfColumns_sparkSS(outAsIn = T, storeOutput = T) -> pipelineObj
195 |
196 | pipelineObj %>>% getPipeline
197 | pipelineObj %>>% visualizePipeline
198 | ```
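
Before executing the pipeline, it can also help to confirm that the required engines are available. This is an optional check; the minimal sketch below uses the package's `assessEngineSetUp` function, which reports whether the engines needed by the registered functions (here, Spark Structured Streaming) are set up.

```{r}
# Optional sanity check: report whether the required engines are configured
pipelineObj %>>% assessEngineSetUp()
```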
199 |
200 | ## Running the pipeline and generating an output
201 |
202 | The pipeline is run by calling the `generateOutput()` function. The `output` attribute of the pipeline object contains the resultant Spark dataframe(s).
203 |
204 | In this example, the output of the final function in the pipeline (ID `"4"`) is retrieved to inspect the resulting streaming Spark DataFrame.
205 |
206 |
207 | ```{r}
208 |
209 | ## Run pipeline
210 | pipelineObj %>% generateOutput() -> pipelineObj
211 |
212 | ## Retrieve the output stream of the final function in the pipeline
213 | streamObj <- pipelineObj %>>% getOutputById("4")
214 | streamObj
215 | ```
216 |
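The retrieved streaming DataFrame can then be written out to a sink. The following is an illustrative sketch, not part of the pipeline above: it reuses the `convertDfToKafkaKeyValuePairs` function defined earlier together with `SparkR::write.stream` to publish the records back to Kafka. The producer topic, Kafka key, and checkpoint location are hypothetical placeholders.

```{r}
# Prepare the output as Kafka key-value pairs and start a streaming write
outputStream <- convertDfToKafkaKeyValuePairs(streamObj, kafkaKey = "consumerEvents")

query <- SparkR::write.stream(outputStream,
                              source = "kafka",
                              outputMode = "append",
                              kafka.bootstrap.servers = kafkaBootstrapServers,
                              topic = "producerTopic",
                              checkpointLocation = "/tmp/kafka-checkpoints")
```
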
217 | # Supplementary note
218 |
219 | Currently, streaming pipelines have the limitation that they can execute only linear flows, as this is a constraint imposed by *Apache Spark Structured Streaming.* Non-linear flows can be defined, but they might throw execution errors at runtime. Also, streaming pipelines can be implemented using only one engine, i.e. *Apache Spark Structured Streaming.*
220 |
--------------------------------------------------------------------------------
/vignettes/Using_pipelines_inside_shiny_widgets.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Using pipelines inside Shiny widgets or apps"
3 | author: "Naren Srinivasan"
4 | date: "11/14/2018"
5 | runtime: shiny-prerendered
6 | output:
7 | rmarkdown::html_vignette:
8 | fig_width: 8
9 | vignette: >
10 | %\VignetteIndexEntry{Using pipelines inside Shiny widgets or apps}
11 | %\VignetteEngine{knitr::rmarkdown}
12 | %\VignetteEncoding{UTF-8}
13 | ---
14 |
15 | ```{r setup, include=FALSE, echo=FALSE, message=FALSE}
16 | library(analysisPipelines)
17 | library(shiny)
18 | knitr::opts_chunk$set(
19 | eval = FALSE
20 | )
21 | ```
22 |
23 | # Pipelines in shiny apps
24 | Pipelines can be used as part of Shiny widgets or apps. In the following example, we define a simple pipeline which generates a chart, and use it to power a Shiny widget.
25 |
26 | In this example, we emulate a streaming dataset by using the `shiny::reactivePoll` function to repeatedly draw random samples from the `iris` dataset.
27 |
28 |
29 |
30 | ```{r data, echo=T, results='hide'}
31 | data("iris")
32 | shinyPipeline <- AnalysisPipeline()
33 | shinyPipeline %>>% setLoggerDetails(target = "none") -> shinyPipeline
34 | shinyPipeline %>>% univarCatDistPlots(uniCol = "Species", priColor = "blue", optionalPlots = 0, storeOutput = T) -> shinyPipeline
35 | ```
36 |
37 |
38 | We then use the pipeline within the `shiny::renderPlot` function, setting the sampled data as the pipeline's input before executing it to generate the chart. Since the data keeps changing due to the reactive poll, the expression within the `shiny::renderPlot` function keeps getting re-evaluated in the reactive context.
39 |
40 |
41 | ```{r shiny, context="server", message=FALSE, warning=FALSE, echo =TRUE, results='asis'}
42 | sampled_data <- shiny::reactivePoll(intervalMillis = 2000,
43 | session = NULL,
44 | checkFunc = function() return(base::sample(1:100, 1)),
45 | valueFunc = function() return(iris[sample(1:nrow(iris), 100),]))
46 | ```
47 |
48 | ```{r}
49 | shiny::renderPlot(height = 400, {
50 | sampled_data <- sampled_data()
51 | shinyPipeline %>>% setInput(input = sampled_data) -> shinyPipeline
52 | shinyPipeline %>>% generateOutput %>>% getOutputById("1")
53 | })
54 | ```
55 |
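The same pattern can be reused in a standalone Shiny app. The sketch below is illustrative and assumes the `shinyPipeline` object defined above is available; the output ID `speciesPlot` is a hypothetical name. It wraps the reactive poll and the plot rendering in a conventional `ui`/`server` pair.

```{r}
library(shiny)
library(analysisPipelines)

ui <- fluidPage(
  plotOutput("speciesPlot", height = 400)
)

server <- function(input, output, session) {
  # Emulate a streaming source by re-sampling iris every 2 seconds
  sampled_data <- reactivePoll(intervalMillis = 2000,
                               session = session,
                               checkFunc = function() base::sample(1:100, 1),
                               valueFunc = function() iris[sample(1:nrow(iris), 100), ])

  output$speciesPlot <- renderPlot({
    # Set the freshly sampled data as the pipeline input, execute, and fetch the chart
    shinyPipeline %>>% setInput(input = sampled_data()) %>>%
      generateOutput() %>>% getOutputById("1")
  })
}

shinyApp(ui = ui, server = server)
```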
56 |
--------------------------------------------------------------------------------