├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── AllClass.R ├── AllGeneric.R ├── MapSchedule.R ├── Platform.R ├── codemIpsum.R ├── columnsUsed.R ├── conversion.R ├── dataParallel.R ├── dataSource.R ├── expandData.R.bak ├── expandData_scratch.txt ├── forLoop.R ├── generateDataParallel.R ├── generateTask.R ├── inferGraph.R ├── makeParallel.R ├── old_scheduleFork.R ├── order.R ├── plot.R ├── print.R ├── propagate.R ├── reduce.R ├── runMeasure.R ├── scheduleDataParallel.R ├── scheduleFork.R ├── scheduleTaskList.R ├── standardize.R ├── utils.R ├── writeCode.R └── zzz.R ├── README.md ├── TODO.md ├── cran_comments.md ├── inst ├── dev │ └── vectorBlock.R ├── examples │ ├── MapReduce.R │ ├── MapReduce_analysis.R │ ├── MapReduce_ideal.R │ ├── as_date2.R │ ├── benchmarks.txt │ ├── cov_chunked_efficiency.png │ ├── cov_prechunked_efficiency.png │ ├── cov_prechunked_parallel_eff.png │ ├── cov_times.csv │ ├── cov_timings.R │ ├── covariance.R │ ├── covariance.Rmd │ ├── d1.csv │ ├── d2.csv │ ├── duncan.R │ ├── efficiency_by_chunks.png │ ├── extend.R │ ├── isdataframe_strange_profile.Rprof │ ├── issue19.R │ ├── lda.R │ ├── mp_example.R │ ├── plot_cov.R │ ├── plot_cov2.R │ ├── qq_baseline.png │ ├── recurse_globals.R │ └── scale.R ├── oldcode │ ├── README │ ├── RHive.Rmd │ ├── apply.R │ ├── benchmark_transform.R │ ├── brokecode.R │ ├── canon.R │ ├── codegraph │ │ └── codegraph.R │ ├── columns.R │ ├── design.Rmd │ ├── distribute.R │ ├── evolve.R │ ├── evolve.Rmd │ ├── experiments.Rmd │ ├── hive.R │ ├── interactive.R │ ├── interactive.Rmd │ ├── internal.Rmd │ ├── intro.R │ ├── intro.Rmd │ ├── read_faster.R │ ├── read_faster.Rmd │ ├── related.Rmd │ ├── scratch.R │ ├── script.Rmd │ ├── simple.R │ ├── simple2.R │ ├── snow.R │ ├── task_parallel.Rmd │ ├── test_apply.R │ ├── test_canon.R │ ├── test_distribute.R │ ├── test_evolve.R │ ├── test_read_faster.R │ ├── test_tune.R │ ├── test_utils.R │ ├── tune.R │ ├── tuning.Rmd │ └── utils.R ├── pems │ ├── .gitignore │ ├── gen2.R │ ├── intermediate_transformed_code.R │ ├── notes.md │ ├── pems.R │ ├── pems_with_data_load.R │ ├── rbind_problem.R │ ├── slurm.sh │ ├── target_code.R │ ├── transform_code.R │ ├── transform_code2.R │ └── transform_subset.R ├── templates │ ├── snow_manager.R │ ├── snow_notransfer.R │ ├── snow_worker.R │ ├── udaf.R │ ├── udaf.sql │ └── vector.R └── use_cases │ └── lemmatize │ ├── Lemmatizer.R │ ├── Lemmatizer_parrallel_BETA.R │ ├── README.txt │ └── goodnightmoon.csv ├── man ├── Assignment-class.Rd ├── AssignmentOneVectorFunction-class.Rd ├── ChunkDataFiles-class.Rd ├── ChunkDataFiles.Rd ├── CodeBlock-class.Rd ├── DataLoadBlock-class.Rd ├── DataSource-class.Rd ├── FinalBlock-class.Rd ├── FixedWidthFiles-class.Rd ├── ForkSchedule-class.Rd ├── GeneratedCode-class.Rd ├── InitBlock-class.Rd ├── KnownAssignment-class.Rd ├── MapSchedule-class.Rd ├── MeasuredTaskGraph-class.Rd ├── NoDataSource-class.Rd ├── ParallelBlock-class.Rd ├── ParallelLocalCluster-class.Rd ├── Platform.Rd ├── ReduceBlock-class.Rd ├── ReduceFun-class.Rd ├── Schedule-class.Rd ├── SerialBlock-class.Rd ├── SerialSchedule-class.Rd ├── SimpleReduce-class.Rd ├── SplitBlock-class.Rd ├── Statement-class.Rd ├── TaskGraph-class.Rd ├── TaskSchedule-class.Rd ├── TextTableFiles-class.Rd ├── TimedTaskGraph-class.Rd ├── XXX.Rd ├── dataSource.Rd ├── expandData.Rd ├── file.Rd ├── fileSetter.Rd ├── findFirstDataSource.Rd ├── forLoopToLapply.Rd ├── generate.Rd ├── inferGraph.Rd ├── inferReadFuncFromFile.Rd ├── makeParallel.Rd ├── 
mapSchedule.Rd ├── orderBottomLevel.Rd ├── plot-TaskSchedule-missing-method.Rd ├── plotDOT.Rd ├── reduceFun.Rd ├── runMeasure.Rd ├── schedule.Rd ├── scheduleDataParallel.Rd ├── scheduleFork.Rd ├── scheduleFork_old.Rd ├── scheduleTaskList.Rd ├── standardizeData.Rd ├── time.Rd ├── use_def.Rd └── writeCode.Rd ├── notes.md ├── tests ├── generated │ ├── fail.R │ ├── script1.R │ ├── script2.R │ ├── script3.R │ ├── script4.R │ ├── script5.R │ ├── script6.R │ ├── script7.R │ └── test_generated_scripts.R ├── testthat.R └── testthat │ ├── .gitignore │ ├── by_example │ ├── setup_data.R │ ├── vector_actual_generated.R │ ├── vector_transform.R │ ├── x1.rds │ ├── x2.rds │ ├── x3.rds │ └── x4.rds │ ├── codewall.R │ ├── dates.txt │ ├── expected │ ├── med_petal.rds │ ├── result_custom_reduce.rds │ ├── result_median_reduce.rds │ └── result_two_blocks.rds │ ├── iris_csv │ ├── 1.csv │ ├── 2.csv │ ├── 3.csv │ ├── 4.csv │ ├── 5.csv │ └── generate_data.R │ ├── local_test.R │ ├── range_of_dates.R │ ├── single_numeric_few_distinct │ ├── .gitignore │ ├── big.rds │ ├── setup_data.R │ ├── small1.rds │ └── small2.rds │ ├── single_numeric_vector │ ├── .gitignore │ ├── big.rds │ ├── setup_data.R │ ├── small1.rds │ └── small2.rds │ ├── test_Data.R │ ├── test_MapSchedule.R │ ├── test_columnsUsed.R │ ├── test_custom_reduce.R │ ├── test_dependGraph.R │ ├── test_extend_platform.R │ ├── test_forLoop.R │ ├── test_forkSchedule.R │ ├── test_group_by.R │ ├── test_makeParallel.R │ ├── test_map_reduce.R │ ├── test_median_reduce.R │ ├── test_one_text_file.R │ ├── test_order.R │ ├── test_scheduleTaskList.R │ ├── test_two_blocks.R │ └── test_utils.R └── vignettes ├── .gitignore ├── basic_model.dot ├── basic_model.png ├── concepts.Rmd ├── extensible.dot ├── extensible.png ├── old_concepts.Rmd ├── quickstart.Rmd └── standardized_code.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^\.travis\.yml$ 2 | ^codecov\.yml$ 3 | Makefile 4 | TODO.txt 5 | README.md 6 | cran_comments.md 7 | NEWS.md 8 | ^appveyor\.yml$ 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | gen_*.R 3 | *.log 4 | tests/testthat/*.pdf 5 | tests/testthat/ex.R 6 | scratch/* 7 | vignettes/overview_files/* 8 | vignettes/figure/* 9 | vignettes/*.md 10 | 11 | # History files 12 | .Rhistory 13 | .Rapp.history 14 | 15 | # Session Data files 16 | .RData 17 | 18 | # Example code in package build process 19 | *-Ex.R 20 | 21 | # Output files from R CMD build 22 | /*.tar.gz 23 | 24 | # Output files from R CMD check 25 | /*.Rcheck/ 26 | 27 | # RStudio files 28 | .Rproj.user/ 29 | 30 | # produced vignettes 31 | vignettes/*.html 32 | vignettes/*.pdf 33 | 34 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 35 | .httr-oauth 36 | 37 | # knitr and R markdown default cache directories 38 | /*_cache/ 39 | /cache/ 40 | 41 | # Temporary files created by R markdown 42 | *.utf8.md 43 | *.knit.md 44 | 45 | # Profiling files 46 | *.out 47 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | #sudo: false 5 | cache: packages 6 | bioc_packages: graph 7 | warnings_are_errors: false 8 | r: 9 | - release 10 | - devel 11 | 12 | # I think this is necessary 
because CodeDepends depends on the graph 13 | # package in BioConductor, so this comes from a larger issue with devtools? 14 | # https://community.rstudio.com/t/r-devel-error-on-travis-ci-and-appveyor/2708 15 | # https://github.com/r-lib/devtools/issues/1530 16 | #install: 17 | # - R -e 'install.packages(c("CodeDepends" 18 | # , "whisker" 19 | # , "igraph" 20 | # , "roxygen2" 21 | # , "knitr" 22 | # , "rmarkdown" 23 | # , "testthat" 24 | # ))' 25 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: makeParallel 2 | Version: 0.2.1 3 | Date: 2018-07-18 4 | Title: Transform Serial R Code into Parallel R Code 5 | Authors@R: person("Clark", "Fitzgerald", role = c("aut", "cre"), 6 | email = "clarkfitzg@gmail.com", 7 | comment = c(ORCID = "0000-0003-3446-6389")) 8 | Maintainer: Clark Fitzgerald 9 | Depends: 10 | R (>= 3.1.0) 11 | Imports: 12 | methods, 13 | utils, 14 | graphics, 15 | parallel, 16 | codetools, 17 | CodeDepends, 18 | rstatic 19 | Suggests: 20 | igraph, 21 | roxygen2, 22 | knitr, 23 | rmarkdown, 24 | testthat 25 | Description: 26 | Writing parallel R code can be difficult, particularly for code 27 | that is not "embarrassingly parallel". 28 | This experimental package automates the transformation of serial R code 29 | into more efficient parallel versions. It identifies task parallelism by 30 | statically analyzing entire scripts to detect dependencies between 31 | statements. It implements an extensible system for scheduling 32 | and generating new code. It includes a reference implementation of the 33 | 'List Scheduling' approach to the general task scheduling problem of scheduling 34 | statements on multiple processors. 35 | License: MIT + file LICENSE 36 | URL: https://github.com/clarkfitzg/makeParallel 37 | BugReports: https://github.com/clarkfitzg/makeParallel 38 | RoxygenNote: 6.1.1 39 | VignetteBuilder: knitr 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Clark Fitzgerald 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Generic Makefile that can live in the same directory as an R package. 2 | 3 | PKGNAME = $(shell awk '{if(/Package:/) print $$2}' DESCRIPTION) 4 | VERSION = $(shell awk '{if(/Version:/) print $$2}' DESCRIPTION) 5 | PKG = $(PKGNAME)_$(VERSION).tar.gz 6 | 7 | # Helpful for debugging: 8 | $(info R package is: $(PKG)) 9 | 10 | RFILES = $(wildcard R/*.R) 11 | TESTFILES = $(wildcard tests/testthat/test*.R) 12 | VIGNETTES = $(wildcard vignettes/*.Rmd) 13 | GRAPHVIZ_PNGS = $(addsuffix .png, $(basename $(wildcard vignettes/*.dot))) 14 | TEMPLATES = $(wildcard inst/templates/*.R) 15 | 16 | #GEN_SCRIPT_OUTPUT = $(addsuffix .log, $(wildcard tests/testthat/scripts/script*.R)) 17 | ## Log files that go with each test 18 | #%.R.log: %.R 19 | # Rscript $< 20 | 21 | # User local install 22 | install: $(PKG) 23 | R CMD INSTALL $< 24 | 25 | #NAMESPACE: $(RFILES) 26 | 27 | test: $(TESTFILES) $(GEN_SCRIPT_OUTPUT) 28 | make install 29 | cd tests && Rscript testthat.R && cd .. 
30 | 31 | $(PKG): $(RFILES) $(TESTFILES) $(TEMPLATES) $(VIGNETTES) DESCRIPTION 32 | R -e "devtools::document()" 33 | rm -f $(PKG) # Otherwise it's included in build 34 | R CMD build . --no-build-vignettes 35 | 36 | check: $(PKG) 37 | R CMD check $(PKG) --as-cran --run-dontrun 38 | 39 | docs: $(VIGNETTES) $(GRAPHVIZ_PNGS) 40 | make install 41 | R -e "tools::buildVignettes(dir = '.')" 42 | 43 | clean: 44 | rm -rf vignettes/*.html $(PKG) *.Rcheck 45 | 46 | # Graphviz images 47 | %.png: %.dot 48 | dot -Tpng $< -o $@ 49 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(dataSource,"<-") 4 | S3method(dataSource,GeneratedCode) 5 | S3method(dataSource,Schedule) 6 | S3method(dataSource,expression) 7 | S3method(inferDataSourceFromCall,default) 8 | export("file<-") 9 | export(ChunkDataFiles) 10 | export(FixedWidthFiles) 11 | export(GeneratedCode) 12 | export(Platform) 13 | export(TextTableFiles) 14 | export(combine_tables) 15 | export(dataSource) 16 | export(expandData) 17 | export(generate) 18 | export(inferDataSourceFromCall) 19 | export(inferGraph) 20 | export(makeParallel) 21 | export(mapSchedule) 22 | export(orderBottomLevel) 23 | export(platform) 24 | export(plotDOT) 25 | export(reduceFun) 26 | export(schedule) 27 | export(scheduleDataParallel) 28 | export(scheduleFork) 29 | export(scheduleFork_old) 30 | export(scheduleTaskList) 31 | export(standardizeData) 32 | export(writeCode) 33 | exportClasses(ChunkDataFiles) 34 | exportClasses(DataParallelSchedule) 35 | exportClasses(DataSource) 36 | exportClasses(FixedWidthFiles) 37 | exportClasses(ForkSchedule) 38 | exportClasses(GeneratedCode) 39 | exportClasses(MapSchedule) 40 | exportClasses(MeasuredTaskGraph) 41 | exportClasses(ParallelLocalCluster) 42 | exportClasses(Schedule) 43 | exportClasses(SerialSchedule) 44 | exportClasses(TaskGraph) 45 | exportClasses(TaskSchedule) 46 | exportClasses(TextTableFiles) 47 | exportClasses(TimedTaskGraph) 48 | exportClasses(UnixPlatform) 49 | exportMethods("file<-") 50 | exportMethods(file) 51 | exportMethods(generate) 52 | exportMethods(inferGraph) 53 | exportMethods(plot) 54 | exportMethods(schedule) 55 | exportMethods(time) 56 | exportMethods(writeCode) 57 | import(methods) 58 | importFrom(graphics,plot) 59 | importFrom(stats,time) 60 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # News 2 | 3 | ## 0.2 4 | 5 | - Enhancement: Sort `DependGraph` by node priority before scheduling with `scheduleTaskList`. 6 | - New feature: Plot method for `DependGraph` 7 | - Enhancement: Prevent sending the same data to the same worker multiple times in `scheduleTaskList` function. 8 | - Enhancement: Redesigned `DependGraph@graph` for extensibility. 9 | Columns are `from`, `to`, `type`, and `value`, where `value` is a list of lists that can contain anything for any row. 10 | 11 | 12 | ## 0.1 13 | 14 | 31 July 2018 15 | 16 | - Initial CRAN submission 17 | -------------------------------------------------------------------------------- /R/Platform.R: -------------------------------------------------------------------------------- 1 | #' Describe Platform 2 | #' 3 | #' Constructor for \linkS4class{Platform} classes, by default uses the current local platform. 
4 | #' 5 | #' @export 6 | #' @param OS.type character, \code{"unix"} or \code{"windows"} 7 | #' @param nWorkers integer, number of parallel workers 8 | #' @return \linkS4class{Platform} 9 | Platform = function(OS.type = .Platform[["OS.type"]] , nWorkers = parallel::detectCores(logical = FALSE) 10 | , name = "cls", scratchDir = ".") 11 | { 12 | nWorkers = as.integer(nWorkers) 13 | p = ParallelLocalCluster(name = name, nWorkers = nWorkers, scratchDir = scratchDir) 14 | if(OS.type == "unix"){ 15 | p = as(p, "UnixPlatform") 16 | } 17 | p 18 | } 19 | 20 | setMethod("platform", "GeneratedCode", function(x, ...) callGeneric(schedule(x), ...)) 21 | 22 | # #' @export 23 | # parallelLocalCluster = function(name = "cls", nWorkers = 2L, scratchDir = ".") 24 | # new("ParallelLocalCluster", name = name, nWorkers = nWorkers, scratchDir = scratchDir) 25 | -------------------------------------------------------------------------------- /R/codemIpsum.R: -------------------------------------------------------------------------------- 1 | 2 | # Generate Random Code 3 | # 4 | # Random code would be useful in this package to test the correctness of 5 | # scheduling algorithms. 6 | # 7 | # Ethan talked about doing this. I wonder if we can put a probability 8 | # distribution on how scripts write code? 9 | # 10 | codemIpsum = function() 11 | { 12 | } 13 | -------------------------------------------------------------------------------- /R/columnsUsed.R: -------------------------------------------------------------------------------- 1 | # TODO: Have CodeAnalysis::readFaster use this code. 2 | 3 | # find the names of all columns of the data frame `dfname` that code uses. 4 | # See tests for what it currently does and does not handle 5 | columnsUsed = function(code, dfname) 6 | { 7 | 8 | locations = find_var(code, dfname) 9 | found = character() 10 | for(loc in locations){ 11 | r = oneUsage(loc, code, dfname) 12 | status = r[["status"]] 13 | if(status == "all_columns") 14 | return(NULL) 15 | found = c(found, r[["found"]]) 16 | if(status == "complete") 17 | return(found) 18 | } 19 | found 20 | } 21 | 22 | 23 | # Return a list with the columns found and a status 24 | oneUsage = function(loc, code, dfname, subset_funcs = c("[", "[["), assign_funcs = c("=", "<-")) 25 | { 26 | out = list(status = "continue", found = NULL) 27 | ll = length(loc) 28 | 29 | # Handle all possible cases 30 | 31 | parent = code[[loc[-ll]]] 32 | parent_func = as.character(parent[[1]]) 33 | 34 | if(parent_func %in% assign_funcs){ 35 | # Assignment to the variable. 36 | # This can be more robust - For now we'll handle it in the rhs. 37 | return(out) 38 | } 39 | 40 | if(!(parent_func %in% subset_funcs)){ 41 | # We don't know about this function, so assume the worst, that it uses all columns 42 | out[["status"]] = "all_columns" 43 | return(out) 44 | } 45 | 46 | # Assumes that the column is the last argument, 47 | # which is true for common uses of `[` and `[[`. 
48 | # We can't use match.call here because `[` is Primitive 49 | colcode = parent[[length(parent)]] 50 | 51 | if(is.character(colcode) || c_with_literals(colcode)){ 52 | # It's just a character vector, safe to evaluate 53 | out[["found"]] = eval(colcode) 54 | } else { 55 | # It's not a simple character vector, so assume the worst, that it uses all columns 56 | out[["status"]] = "all_columns" 57 | return(out) 58 | } 59 | 60 | grandparent = code[[loc[-c(ll-1, ll)]]] 61 | grandparent_func = as.character(grandparent[[1]]) 62 | 63 | # The special case when we reassign into the same variable 64 | if(grandparent_func %in% assign_funcs){ 65 | varname = as.character(grandparent[[2L]]) 66 | if(varname == dfname){ 67 | out[["status"]] = "complete" 68 | } 69 | } 70 | 71 | out 72 | } 73 | 74 | 75 | # Check if expr creates a character vector from literals, meaning it looks like c("foo", "bar", ...) 76 | c_with_literals = function(expr) 77 | { 78 | if(as.character(expr[[1]]) != "c") 79 | return(FALSE) 80 | for(i in seq(2, length(expr))){ 81 | if(class(expr[[i]]) != "character") 82 | return(FALSE) 83 | } 84 | TRUE 85 | } 86 | -------------------------------------------------------------------------------- /R/conversion.R: -------------------------------------------------------------------------------- 1 | # TODO: Ask Duncan. Is it reasonable to define this conversion in this way? 2 | # The idea is to keep igraph a "soft" dependency 3 | 4 | setAs("TaskGraph", "igraph", function(from) 5 | { 6 | if(requireNamespace("igraph", quietly = TRUE)){ 7 | g = igraph::graph_from_data_frame(from@graph) 8 | # From https://stackoverflow.com/questions/17433402/r-igraph-rename-vertices 9 | igraph::V(g)$label = as(from@code, "character") 10 | g 11 | } else stop("Install igraph to use this conversion.") 12 | }) 13 | 14 | 15 | # It might make more sense to have a class for a filename 16 | setAs("character", "expression", function(from) 17 | { 18 | # This means that to do a single string literal we'll need to coerce it to a string literal. 19 | # For example, as.expression("foo") 20 | if(length(from) == 1){ 21 | parse(from, keep.source = TRUE) 22 | } else { 23 | stop("Expected a single file name.") 24 | } 25 | }) 26 | -------------------------------------------------------------------------------- /R/dataParallel.R: -------------------------------------------------------------------------------- 1 | #' Estimate Time To Execute Function 2 | #' 3 | #' @param maxWorker integer number of parallel workers to use 4 | #' @param sizeInput numeric size of each input element in bytes 5 | #' @param sizeOutput numeric size of each output element in bytes 6 | #' @return list with the following elements: 7 | #' \describe{ 8 | #' \item{serialTime}{Time in seconds to execute the function in serial} 9 | #' \item{parallelTime}{Time in seconds to execute the function in 10 | #' parallel} 11 | #' \item{elementsParallelFaster}{Number of data elements required for a 12 | #' parallel version with maxWorker workers to be faster than serial. Can 13 | #' be Inf if parallel will never be faster than serial.} 14 | #' \item{}{} 15 | #' } 16 | XXX = function(maxWorker, sizeInput, sizeOutput) 17 | { 18 | } 19 | 20 | 21 | #' Create Functions Estimating Data Run Time 22 | #' 23 | #' @param sizeInput numeric size of each input element in bytes 24 | #' @param sizeOutput numeric size of each output element in bytes 25 | #' @return list with functions for estimating time required for serial and 26 | #' parallel execution. 
Serial is a function of $n$ 27 | -------------------------------------------------------------------------------- /R/old_scheduleFork.R: -------------------------------------------------------------------------------- 1 | # TODO: I haven't limited the number of processors yet- so this assumes we 2 | # have unlimited processors. Add the constraint later. 3 | 4 | 5 | #' Single sequential forks scheduler 6 | #' 7 | #' @export 8 | #' @inheritParams scheduleTaskList 9 | #' @param graph object of class \code{TaskGraph} as returned from \code{\link{inferGraph}} 10 | #' expression. 11 | #' @return schedule object of class \code{ForkSchedule} 12 | scheduleFork_old = function(graph 13 | , overhead = 1e3 14 | , bandwidth = 1.5e9 15 | ){ 16 | 17 | nnodes = length(graph@code) 18 | 19 | might_fork = seq(nnodes) 20 | might_fork = might_fork[time(graph) > overhead] 21 | 22 | partialSchedule = data.frame(expression = seq(nnodes), fork = "run", time = time(graph)) 23 | 24 | # I imagine this pattern generalizes to other greedy algorithms. 25 | # But I'm not going to generalize it now. 26 | for(i in seq_len(might_fork)){ 27 | reduction = sapply(might_fork, forkTimeReduction 28 | , partialSchedule = partialSchedule 29 | , graph = graph 30 | ) 31 | if(all(reduction < 0)){ 32 | break 33 | } 34 | node_to_fork = might_fork[which.max(reduction)] 35 | might_fork = setdiff(might_fork, node_to_fork) 36 | partialSchedule = scheduleOne(node_to_fork, partialSchedule, graph) 37 | } 38 | 39 | new("ForkSchedule", graph = graph 40 | , fork = partialSchedule 41 | , overhead = overhead 42 | , bandwidth = bandwidth 43 | ) 44 | } 45 | 46 | 47 | # How long does the partial schedule take to complete if we fork one node? 48 | forkTimeReduction = function(node_to_fork, partialSchedule, graph) 49 | { 50 | newSchedule = scheduleOne(node_to_fork, partialSchedule, graph) 51 | runTime(partialSchedule) - runTime(newSchedule) 52 | } 53 | 54 | 55 | # How long does the partial schedule take to complete? 56 | runTime = function(partialSchedule) 57 | { 58 | sum(partialSchedule[, "time"]) 59 | } 60 | 61 | scheduleOne = function(node_to_fork, partialSchedule, graph) 62 | { 63 | } 64 | 65 | ############################################################ 66 | 67 | # Base case when there is no parallelism left. Returns 68 | # the nodes ordered topologically according to the graph. 69 | scheduleForkBase = function(nodes) 70 | { 71 | # Can just do this because the original input order is a topological 72 | # sort. 73 | sort(nodes) 74 | } 75 | 76 | 77 | -------------------------------------------------------------------------------- /R/order.R: -------------------------------------------------------------------------------- 1 | #setMethod(sort, "TaskGraph", sortBottomLevel) 2 | 3 | 4 | #' Order Nodes By Bottom Level Order 5 | #' 6 | #' Permute the nodes of the graph so that they are ordered in decreasing 7 | #' bottom level precedence order. The bottom level of a node is the length 8 | #' of the longest path starting at that node and going to the end of the 9 | #' program. 10 | #' 11 | #' This permutation respects the partial order of the graph, so executing 12 | #' the permuted code will produce the same result as the original code. 13 | #' There are many possible node precedence orders. 14 | #' 15 | #' @references \emph{Task Scheduling for Parallel Systems}, Sinnen, O. 16 | #' claim bottom level order provides good average performance. I'm not sure 17 | #' if this claim holds for general data analysis scripts. 
18 | #' 19 | #' @export 20 | #' @param graph \linkS4class{TimedTaskGraph} 21 | #' @return integer vector to permute the expressions in \code{x@code} 22 | #' @examples 23 | #' graph <- inferGraph(code = parse(text = "x <- 1:100 24 | #' y <- rep(1, 100) 25 | #' z <- x + y"), time = c(1, 2, 1)) 26 | #' bl <- orderBottomLevel(graph) 27 | orderBottomLevel = function(graph) 28 | { 29 | bl = bottomLevel(graph) 30 | order(bl, decreasing = TRUE) 31 | } 32 | 33 | 34 | bottomLevel = function(graph) 35 | { 36 | n = length(graph@code) 37 | alltimes = graph@time 38 | bl = rep(0, n) 39 | g = graph@graph 40 | # Iterating in reverse guarantees bl elements are defined for all 41 | # successors. 42 | for(node in seq(n, 1)){ 43 | nodetime = alltimes[node] 44 | bl[node] = oneBottomLevel(node, nodetime, g, bl) 45 | } 46 | bl 47 | } 48 | 49 | 50 | oneBottomLevel = function(node, nodetime, graph, bl) 51 | { 52 | s = successors(node, graph) 53 | if(length(s) == 0) nodetime else max(bl[s]) + nodetime 54 | } 55 | -------------------------------------------------------------------------------- /R/print.R: -------------------------------------------------------------------------------- 1 | minimalPrint = function(object) 2 | { 3 | msg = sprintf('An object of class "%s" 4 | Slots: ', class(object)) 5 | slots = paste(slotNames(object), collapse = ", ") 6 | cat(paste0(msg, slots, "\n\n")) 7 | } 8 | 9 | 10 | setMethod("show", "Schedule", minimalPrint) 11 | 12 | setMethod("show", "GeneratedCode", minimalPrint) 13 | -------------------------------------------------------------------------------- /R/reduce.R: -------------------------------------------------------------------------------- 1 | #' Construct ReduceFun Objects 2 | #' 3 | #' @export 4 | reduceFun = function(reduce, summary = reduce, combine = "c", query = summary, predicate = function(...) TRUE) 5 | { 6 | if(!is.character(reduce)) 7 | stop("Expected the name of a reducible function for reduce argument.") 8 | 9 | funClasses = sapply(list(summary, combine, query), class) 10 | if(all(funClasses == "character")){ 11 | return(SimpleReduce(reduce = reduce, summary = summary 12 | , combine = combine, query = query 13 | , predicate = predicate)) 14 | } 15 | 16 | UserDefinedReduce(reduce = reduce, summary = summary 17 | , combine = combine, query = query 18 | , predicate = predicate) 19 | } 20 | 21 | 22 | combine_two_tables = function(x, y) 23 | { 24 | # Assume not all values will appear in each table 25 | levels = union(names(x), names(y)) 26 | out = rep(0L, length(levels)) 27 | out[levels %in% names(x)] = out[levels %in% names(x)] + x 28 | out[levels %in% names(y)] = out[levels %in% names(y)] + y 29 | names(out) = levels 30 | as.table(out) 31 | } 32 | 33 | 34 | #' @export 35 | combine_tables = function(...){ 36 | dots = list(...) 37 | Reduce(combine_two_tables, dots, init = table(logical())) 38 | } 39 | 40 | -------------------------------------------------------------------------------- /R/runMeasure.R: -------------------------------------------------------------------------------- 1 | #' Run and Measure Code 2 | #' 3 | #' Will export this once I the full pipeline works. 4 | #' 5 | #' Run the serial code in the task graph and measure how long each expression 6 | #' takes to run as well as the object sizes of each variable that can 7 | #' possibly be transferred. 8 | #' 9 | #' This does naive and biased timing since it doesn't account for the 10 | #' overhead in evaluating a single expression. 
However, this is fine for 11 | #' this application since the focus is on measuring statements that take at 12 | #' least on the order of 1 second to run. 13 | #' 14 | #' @param code to be passed into \code{\link{inferGraph}} 15 | #' @param graph object of class \code{TaskGraph} 16 | #' @param envir environment to evaluate the code in 17 | #' @param timer function that returns a timestamp. 18 | #' @return graph object of class \code{MeasuredTaskGraph} 19 | runMeasure = function(code, graph = inferGraph(code), envir = globalenv(), timer = Sys.time) 20 | { 21 | expr = graph@code 22 | n = length(expr) 23 | times = numeric(n) 24 | tg = graph@graph 25 | 26 | # Eliminate as much overhead for the timing as possible. 27 | force(envir) 28 | 29 | for(i in seq(n)){ 30 | gc() 31 | start_time = timer() 32 | eval(expr[[i]], envir) 33 | end_time = timer() 34 | times[i] = end_time - start_time 35 | 36 | from_rows = which(tg$type == "use-def" & tg$from == i) 37 | 38 | # This is redundant because we may check and store the same object 39 | # size multiple times. Seems like more trouble than it's worth to 40 | # do it in a non redundant way. 41 | for(row_index in from_rows){ 42 | varname = tg[["value"]][[row_index]][["varname"]] 43 | size = as.numeric(utils::object.size(get(varname, envir))) 44 | tg[["value"]][[row_index]][["size"]] = size 45 | } 46 | } 47 | 48 | new("MeasuredTaskGraph", code = expr, graph = tg, time = times) 49 | } 50 | -------------------------------------------------------------------------------- /R/standardize.R: -------------------------------------------------------------------------------- 1 | #' schedulers expect to see the data in a standard form 2 | #' 3 | #' @export 4 | standardizeData = function(data) 5 | { 6 | # TODO: Implement more checks and different interfaces here. 7 | if(!is(data, "DataSource")) 8 | stop("Expected a DataSource here") 9 | data 10 | } 11 | -------------------------------------------------------------------------------- /R/writeCode.R: -------------------------------------------------------------------------------- 1 | # Methods and functions associated with writing files. 2 | 3 | #' @param overWrite logical write over existing file 4 | #' @param prefix character prefix for generating file names 5 | #' @export 6 | #' @rdname writeCode 7 | setMethod("writeCode", c("GeneratedCode", "logical"), 8 | function(code, file, overWrite = FALSE, prefix = "gen_") 9 | { 10 | oldname = file(schedule(code)) 11 | fname = prefixFileName(oldname, prefix) 12 | if(file && !is.na(fname)){ 13 | writeHelper(code, fname, overWrite = overWrite) 14 | } 15 | code@code 16 | }) 17 | 18 | 19 | #' @export 20 | #' @rdname writeCode 21 | setMethod("writeCode", c("GeneratedCode", "missing"), function(code, file, ...) 22 | { 23 | callGeneric(code, file = FALSE, ...) 24 | }) 25 | 26 | 27 | #' @export 28 | #' @rdname writeCode 29 | setMethod("writeCode", c("GeneratedCode", "character"), 30 | function(code, file, overWrite = FALSE, ...) 31 | { 32 | if(!is.na(file)) 33 | writeHelper(code, file, overWrite = overWrite) 34 | code@code 35 | }) 36 | 37 | 38 | #' @export 39 | #' @rdname writeCode 40 | setMethod("writeCode", c("expression", "character"), 41 | function(code, file, overWrite = FALSE, ...) 42 | { 43 | writeHelper(fname = file, overWrite = overWrite, content = as.character(code)) 44 | }) 45 | 46 | 47 | writeHelper = function(code, fname, overWrite, content = as.character(code@code)) 48 | { 49 | if(file.exists(fname) && !overWrite){ 50 | e = simpleError(sprintf("The file %s already exists. 
Pass overWrite = TRUE to replace %s with a new version.", fname, fname)) 51 | class(e) = c("FileExistsError", class(e)) 52 | stop(e) 53 | } 54 | writeLines(content, fname) 55 | message(sprintf("generated parallel code is in %s", fname)) 56 | fname 57 | } 58 | 59 | 60 | # Extract the original file name from the schedule and prefix it. 61 | prefixFileName = function(oldname, prefix) 62 | { 63 | if(!is.na(oldname)){ 64 | newname = paste0(prefix, basename(oldname)) 65 | dir = dirname(oldname) 66 | if(dir == ".") newname else file.path(dir, newname) 67 | # normalizePath needed here? 68 | } else as.character(NA) 69 | #} else NA 70 | } 71 | 72 | 73 | #' @export 74 | #' @rdname file 75 | setMethod("file", "TaskGraph", function(description) 76 | { 77 | srcfile = attr(description@code, "srcfile") 78 | 79 | # Interactively using parse(text = "...") names the file "". We 80 | # don't want this name. So this function will fail is someone actually 81 | # has an R script named "". 82 | 83 | if(is.environment(srcfile)){ 84 | out = srcfile$filename 85 | if(out == "") 86 | out = NA 87 | } else { 88 | out = NA 89 | } 90 | 91 | as.character(out) 92 | }) 93 | 94 | 95 | #' Get File containing code 96 | #' 97 | #' @export 98 | #' @rdname file 99 | #' @param description object that may have a file associated with it 100 | setMethod("file", "Schedule", function(description) 101 | { 102 | callGeneric(description@graph) 103 | }) 104 | 105 | 106 | #' @export 107 | #' @rdname file 108 | setMethod("file", "GeneratedCode", function(description) 109 | { 110 | description@file 111 | }) 112 | 113 | 114 | #setMethod("file<-", c("GeneratedCode", "LogicalOrCharacter"), function(description, value) 115 | 116 | #' @export 117 | #' @rdname fileSetter 118 | setMethod("file<-", c("GeneratedCode", "character"), function(description, value) 119 | { 120 | description@file = value 121 | description 122 | }) 123 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | # Defaults 2 | 3 | #' @export 4 | #' @rdname schedule 5 | setMethod("schedule", signature(graph = "TaskGraph", data = "ANY", platform = "ANY"), scheduleDataParallel) 6 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | 2 | TODO list: 3 | 4 | - handle library calls 5 | - Move some simple cases from tests to the function examples 6 | 7 | Can defer: 8 | 9 | - See about using CodeDepends::getTaskGraph 10 | - Revisit Apache hive code generation function 11 | - Global option for overwriting files 12 | 13 | Done: 14 | 15 | - Make sure documentation can be accessed 16 | - Properly cite references 17 | - Drop all the functions and objects that I don't use 18 | - All items in code marked `TODO:*` 19 | - Don't open up sockets for tests on CRAN machines 20 | - Only write to tmp directories 21 | - Make sure object naming convention and argument names are consistent for 22 | all exported objects. 23 | - Pass R CMD check 24 | - igraph conversion example 25 | - Add error classes to messages 26 | - Unit tests passing 27 | - Run tests on generated code 28 | - Switch to S4 29 | - Read templates when necessary 30 | - Return expression objects rather than text 31 | 32 | 33 | 34 | - What's the best way to keep the arguments to schedule() consistent? I could 35 | put them in the generic, but it's not clear that I want to dispatch on 36 | them. 
Right now they're in scheduleTaskList. 37 | 38 | Duncan's answer: Put the arguments in the generic if and only if every 39 | method should use and respect this same set of arguments. 40 | 41 | 42 | Fri Jun 8 12:00:36 PDT 2018 43 | 44 | Talking with Duncan has gotten me to think more deeply about what kind of 45 | object oriented system I want to use. S3 is simpler while S4 is more 46 | complex. But I don't understand either of them, because this passing 47 | default arguments has got me confused. Do any R S3 methods use default 48 | arguments? 49 | 50 | Conceptually there are 3 important objects: TaskGraph, Schedule, 51 | GeneratedCode 52 | 53 | I would like to have these features right now: 54 | 55 | - methods to create a TaskGraph from different inputs 56 | - plot methods for TaskGraph 57 | - Allow user to define their own code_generator function to dispatch on 58 | Schedule and return object of GeneratedCode 59 | - The flexibility to add arbitrary elements to classes 60 | 61 | In the future I might like to have these features: 62 | 63 | - summary, print, and more plot methods 64 | - ways to describe data and systems so that these feed into the scheduling 65 | - object validation for Schedule objects, because one can 66 | create schedules that aren't valid (which implies problems with the 67 | schedule generator) 68 | 69 | 70 | # TODO 71 | 72 | 73 | Less urgent: 74 | 75 | - Conversion to igraph objects 76 | - preprocessing step 77 | - Alternative scheduling algorithm and code generator based on fork / join. 78 | - Measure CPU utilization during timings to see what's parallel / threaded. 79 | 80 | 81 | ## Done 82 | 83 | - Show a realistic script as an example that actually benefits from task parallelism. 84 | - Vignettes. 85 | - Robust test for expression equality 86 | - Write the `data_parallel()` function, including modification of for loops 87 | into parallel code. 88 | - Handle assignment inside of a for loop of the form `x[[i]] = ...` 89 | -------------------------------------------------------------------------------- /cran_comments.md: -------------------------------------------------------------------------------- 1 | This is a re-submission for the new package 'makeParallel', now on version 2 | 0.1.1. I have incorporated the feedback kindly provided by Swetlana 3 | Herbrandt on July 30th, 2018. See responses inline below. 4 | 5 | Thank you, 6 | Clark Fitzgerald 7 | 8 | > Thanks, 9 | > 10 | > makeParallel("script.R") 11 | > 12 | > cannot run: 13 | > cannot open file 'script.R': No such file or directory 14 | > 15 | > Please add such a file in your package. 16 | 17 | Fixed. 18 | 19 | > Please ensure that your functions do not write by default or in your 20 | > examples/vignettes/tests in the user's home filespace. That is not allow by 21 | > CRAN policies. Please only write/save files if the user has specified a 22 | > directory. In your examples/vignettes/tests you can write to tempdir(). 23 | > 24 | > Please fix and resubmit. 25 | > 26 | > Best, 27 | > Swetlana Herbrandt 28 | 29 | I changed the default arguments so that none of the functions write to 30 | files by default. The two offending functions were 'writeCode' and 31 | 'makeParallel'. Now the user must explicitly supply the 'file' argument to 32 | write to a file. I verified that all of the examples/vignettes/tests only 33 | write to temporary files or into temporary directories, and then remove 34 | these once they are finished. I clarified this behavior in the 35 | documentation and vignette titled 'quickstart'. 
36 | 37 | 38 | ## Test environments 39 | * ubuntu 16.04 local (R 3.4.4) 40 | * ubuntu 14.04 travis-ci (R devel and release) 41 | * win-builder (R devel and release) 42 | 43 | 44 | ## R CMD check results 45 | There were no ERRORs or WARNINGs. 46 | 47 | This is the first submission of this package to CRAN, so there is a NOTE. 48 | -------------------------------------------------------------------------------- /inst/dev/vectorBlock.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | 3 | # This is a list that says which parameters a function is vectorized in. 4 | vecfuncs = list(qnorm = c("p", "mean", "sd") 5 | , exp = "x" 6 | ) 7 | 8 | # This will be easiest if we have the code in a particular form: 9 | # single lines of function calls with only named arguments. 10 | # No nesting. 11 | # This will be an issue with ... args, but we can deal with that later. 12 | 13 | code = parse(text = " 14 | x = seq(from = 0, to = 1, length.out = 100) 15 | y = qnorm(p = x) 16 | z = exp(x = y) 17 | result = sum(z) 18 | ") 19 | 20 | g = inferGraph(code) 21 | e = g@code[[2]] 22 | 23 | # We basically want to follow the flow of the large vector x, and any derivative vectors, through the dependency graph. 24 | 25 | is_vectorized = function(e, .vecfuncs = vecfuncs) 26 | { 27 | # Assuming it's the RHS of an expression. 28 | call = e[[3]] 29 | if(is.call(call)){ 30 | func_name = as.character(call[[1]]) 31 | func_name %in% names(.vecfuncs) 32 | } 33 | } 34 | 35 | v = which(sapply(g@code, is_vectorized)) 36 | gdf = g@graph 37 | vblock_condition = (gdf[, "from"] %in% v) & (gdf[, "to"] %in% v) 38 | 39 | vector_block_edges = gdf[vblock_condition, ] 40 | 41 | vector_block_edges 42 | 43 | # "being a large vector / object" is a property of a variable. 44 | # Edges come from variable usage. 45 | # Nodes are function calls. 46 | # A function call is vectorized in some of its parameters. 47 | # We can consider a node to be a vectorized function call if that function is vectorized in all of the parameters where a large vector is passed. 48 | 49 | -------------------------------------------------------------------------------- /inst/examples/MapReduce.R: -------------------------------------------------------------------------------- 1 | # Tue Aug 15 09:25:48 PDT 2017 2 | # 3 | # Goal: automatically parallelize this 4 | # 5 | 6 | x = as.list(1:10) 7 | y = Map(function(xi) 2 * xi, x) 8 | sy = Reduce(`+`, y) # Push partially to worker 9 | z = Map(function(yi) yi - 3 + sy, y) # Never bring to manager 10 | sz = Reduce(`+`, z) # Push to worker 11 | print(sz) 12 | 13 | -------------------------------------------------------------------------------- /inst/examples/MapReduce_analysis.R: -------------------------------------------------------------------------------- 1 | library(CodeDepends) 2 | 3 | s = readScript("MapReduce.R") 4 | 5 | g = autoparallel::depend_graph(s) 6 | 7 | # Find if a variable assigned to a Map result is only input to a Reduce. 8 | # If so, then we can combine the Map and Reduce. But is this the best way? 9 | 10 | mapuse = sapply(s, autoparallel::apply_location, apply_func = "Map") 11 | 12 | # Making assumptions on how code is written here, ie. x = Map(...) 
13 | map_assign = which(mapuse == 3) 14 | 15 | names(map_assign) = sapply(s[map_assign], `[[`, 2) 16 | 17 | # Next: find out where they are inputs 18 | -------------------------------------------------------------------------------- /inst/examples/MapReduce_ideal.R: -------------------------------------------------------------------------------- 1 | # This is the target for the transformed program, ie. 2 | # whatever autoparallel spits out should look something like this. 3 | # 4 | # This requires an associative Reduce function, different from R. 5 | 6 | library(parallel) 7 | 8 | cl = makeCluster(2L) 9 | 10 | x = as.list(1:10) 11 | 12 | autoparallel::assign_workers(cl, "x") 13 | 14 | 15 | # Relying on `{` returning the last statement, and the `=` assignment 16 | # operator returning the object of assignment 17 | sy_partial_reduce = clusterEvalQ(cl, { 18 | y = Map(function(xi) 2 * xi, x) 19 | sy = Reduce(`+`, y) 20 | }) 21 | sy = Reduce(`+`, sy_partial_reduce) 22 | 23 | clusterExport(cl, "sy") 24 | 25 | sz_partial_reduce = clusterEvalQ(cl, { 26 | z = Map(function(yi) yi - 3 + sy, y) 27 | sz = Reduce(`+`, z) 28 | }) 29 | sz = Reduce(`+`, sz_partial_reduce) 30 | 31 | print(sz) 32 | 33 | -------------------------------------------------------------------------------- /inst/examples/as_date2.R: -------------------------------------------------------------------------------- 1 | b = as.Date(x[, "b"], origin = "2010-01-01") 2 | d = as.Date(x[, "d"], origin = "2010-01-01") 3 | rb = range(b) 4 | rd = range(d) 5 | print(rb) 6 | print(rd) 7 | -------------------------------------------------------------------------------- /inst/examples/cov_chunked_efficiency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/inst/examples/cov_chunked_efficiency.png -------------------------------------------------------------------------------- /inst/examples/cov_prechunked_efficiency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/inst/examples/cov_prechunked_efficiency.png -------------------------------------------------------------------------------- /inst/examples/cov_prechunked_parallel_eff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/inst/examples/cov_prechunked_parallel_eff.png -------------------------------------------------------------------------------- /inst/examples/covariance.Rmd: -------------------------------------------------------------------------------- 1 | Computing the covariance between two data vectors of length $n$ requires 2 | $O(n)$ operations. A sample $n \times p$ covariance matrix requires 3 | $(p + 1)p / 2$ vector covariances. So the whole covariance matrix needs 4 | 5 | $$ 6 | O(n) frac{(p + 1) p}{2} 7 | $$ 8 | 9 | Suppose we model wall time required for an $n \times p$ sample covariance 10 | calculation as: 11 | 12 | $$ 13 | t = c + b n (p + 1) p / 2 + \epsilon 14 | $$ 15 | 16 | where $c$ is the constant overhead and $\epsilon$ is the random error. 
17 | 18 | 19 | ```{R} 20 | 21 | cov_times = read.csv("cov_times.csv") 22 | cov_times$complexity = with(cov_times, n * (p + 1) * p / 2) 23 | 24 | fit_baseline = lm(baseline ~ complexity, cov_times) 25 | fit_chunked = lm(chunked ~ complexity, cov_times) 26 | fit_parallel_chunked = lm(parallel_chunked ~ complexity, cov_times) 27 | 28 | ``` 29 | 30 | The `baseline` model uses the builtin `cov()` function, while the chunked 31 | version builds on `cov()` by partitioning the sample matrix into groups of 32 | columns. 33 | 34 | Side note: great QQ plot here. 35 | ```{R} 36 | 37 | #png("qq_baseline.png") 38 | plot(fit_baseline, which = 2) 39 | #dev.off() 40 | 41 | ``` 42 | 43 | If the coefficients for complexity are similar this is good for the model. 44 | 45 | ```{R} 46 | 47 | confint(fit_baseline) 48 | confint(fit_chunked) 49 | 50 | ``` 51 | 52 | They're reasonably close at 1.2 and 1.3. The units are in nanoseconds. What 53 | does this mean in terms of processor speed and actual number of operations? 54 | The clock speed is around 3 GHz, so a single clock cycle takes around 1 / 3 55 | nanoseconds. Really understanding this will require significant low level 56 | knowledge, and depends on how the code was compiled. I also need to know 57 | exactly how the covariance is computed. But without considering all this I 58 | can see that the coefficient for computational complexity is on the same 59 | order as a single clock cycle. 60 | 61 | Parallelism should cut the computational complexity coefficient 62 | approximately in half, because this experiment was done with 2 cores. 63 | 64 | ```{R} 65 | 66 | confint(fit_parallel_chunked) 67 | 68 | # This is not nearly as symmetrical as the others. 69 | plot(fit_parallel_chunked, which = 2) 70 | 71 | ``` 72 | 73 | Hmmm, doesn't cut it in half. Overhead in the intercept is around $1.4 74 | \times 10^7$ nanoseconds = 14 milliseconds. This is consistent with each 75 | use of parallel taking around 1 ms. But it would be better to model this in 76 | terms of $n$ and $p$. 77 | 78 | We can also ask, for what values of computational complexity will the 79 | parallel version be faster? 80 | 81 | ```{R} 82 | 83 | cov_times$parallel_faster = cov_times$parallel_chunked < cov_times$baseline 84 | 85 | plot(cov_times$complexity, cov_times$parallel_faster, log = "x") 86 | 87 | fit2 = glm(parallel_faster ~ complexity, family = "binomial" 88 | , data = cov_times) 89 | 90 | curve(predict(fit2, data.frame(complexity=x), type="resp"), add=TRUE) 91 | 92 | summary(fit2) 93 | 94 | ``` 95 | 96 | TODO: Look at outlying cases. What makes them exceptional? 97 | 98 | ```{R} 99 | 100 | outliers = abs(residuals(fit2)) > 3 101 | 102 | cov_times[outliers, ] 103 | 104 | ``` 105 | 106 | This one has $n = 2848, p = 433$, and all the times around 0.3 seconds. 107 | Probably just random that the parallel version took longer. 
108 | -------------------------------------------------------------------------------- /inst/examples/d1.csv: -------------------------------------------------------------------------------- 1 | 1,4,7,10 2 | 2,5,8,11 3 | 3,6,9,12 4 | -------------------------------------------------------------------------------- /inst/examples/d2.csv: -------------------------------------------------------------------------------- 1 | 1,14,7,10 2 | 2,15,8,11 3 | 3,16,9,12 4 | -------------------------------------------------------------------------------- /inst/examples/duncan.R: -------------------------------------------------------------------------------- 1 | 2 | x + 1 3 | 4 | function() 1 5 | 6 | 7 | 8 | e = quote(apply(x, mean)) 9 | 10 | sub_one_eval(e, list(apply = as.name("mclapply"))) 11 | 12 | f1 = quote(function() 1) 13 | 14 | f2 = sub_one_eval(quote(function() 1), list(apply = as.name("mclapply"))) 15 | 16 | caller = function() sub_one_eval(function() 1, list(apply = as.name("mclapply"))) 17 | 18 | caller = function(f) sub_one_eval(f, list(apply = as.name("mclapply"))) 19 | 20 | sub_one_docall(e, list(apply = as.name("mclapply"))) 21 | 22 | 23 | # TODO: tests to clarify what and how this should work 24 | sub_one = sub_one_eval 25 | 26 | 27 | -------------------------------------------------------------------------------- /inst/examples/efficiency_by_chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/inst/examples/efficiency_by_chunks.png -------------------------------------------------------------------------------- /inst/examples/extend.R: -------------------------------------------------------------------------------- 1 | { 2 | message("This code was generated from R by makeParallel version 0.2.1 at 2019-10-02 17:36:03") 3 | { 4 | } 5 | library(parallel) 6 | assignments = c(1, 2, 1) 7 | nWorkers = 2 8 | cls = makeCluster(nWorkers) 9 | c.data.frame = rbind 10 | clusterExport(cls, character(0)) 11 | clusterExport(cls, c("assignments", "c.data.frame")) 12 | parLapply(cls, seq(nWorkers), function(i) assign("workerID", i, globalenv())) 13 | clusterEvalQ(cls, { 14 | assignments = which(assignments == workerID) 15 | NULL 16 | }) 17 | } 18 | { 19 | clusterEvalQ(cls, { 20 | read_args = c("d1.csv", "d2.csv", "d3.csv") 21 | read_args = read_args[assignments] 22 | chunks = lapply(read_args, function(fname) { 23 | command = paste("cut -d , -f 2,4", fname) 24 | read.table(pipe(command), header = FALSE, sep = ",", col.names = c("b", "d"), colClasses = c("numeric", "integer")) 25 | }) 26 | x = do.call(rbind, chunks) 27 | NULL 28 | }) 29 | } 30 | { 31 | collected = clusterEvalQ(cls, { 32 | list(x = x) 33 | }) 34 | vars_to_collect = names(collected[[1]]) 35 | for (i in seq_along(vars_to_collect)) { 36 | varname = vars_to_collect[i] 37 | chunks = lapply(collected, `[[`, i) 38 | value = do.call(c, chunks) 39 | assign(varname, value) 40 | } 41 | } 42 | b = as.Date(x[, "b"], origin = "2010-01-01") 43 | d = as.Date(x[, "d"], origin = "2010-01-01") 44 | rb = range(b) 45 | rd = range(d) 46 | print(rb) 47 | print(rd) 48 | stopCluster(cls) 49 | -------------------------------------------------------------------------------- /inst/examples/issue19.R: -------------------------------------------------------------------------------- 1 | f <- function(x) g(x) 2 | g <- function(x) { 3 | h(x) 4 | } 5 | h <- function(x) { 6 | k1(x) + k2(x) + my_var 7 | } 8 | my_var <- 1 9 | 
-------------------------------------------------------------------------------- /inst/examples/lda.R: -------------------------------------------------------------------------------- 1 | # Linear Discriminant Analysis 2 | 3 | library(Matrix) 4 | 5 | source("covariance.R") 6 | 7 | # LDA computations 8 | 9 | # One component in LDA calc 10 | di = function(i, means, Sigma) 11 | { 12 | xi = means[i, ] 13 | # This isn't storing the matrix factorization. Maybe solving for a 14 | # vector doesn't require this? 15 | a = solve(Sigma, xi) 16 | as.numeric(xi %*% a) 17 | } 18 | 19 | 20 | lda2 = function(X0, groups) 21 | { 22 | 23 | # Each row contains a group mean 24 | means = by(X0, groups, colMeans) 25 | means = do.call(rbind, means) 26 | means = Matrix(means) 27 | 28 | Sigma = cov_Matrix_pkg(X0) 29 | 30 | # Cholesky decompositions are cached. Doing it here so it propagates into 31 | # the functions. 32 | chol(Sigma) 33 | 34 | d = sapply(1:k, di, means = means, Sigma = Sigma) 35 | d = d / 2 36 | 37 | out = list(Sigma = Sigma, d = d, means = means) 38 | class(out) = "lda2" 39 | out 40 | } 41 | 42 | 43 | 44 | predict.lda2 = function(fit, X) 45 | { 46 | Sigma = fit$Sigma 47 | d = fit$d 48 | means = fit$means 49 | 50 | Sigma_inv_Xt = solve(Sigma, t(X)) 51 | obj = means %*% Sigma_inv_Xt - d 52 | maxs = apply(obj, 2, which.max) 53 | maxs 54 | } 55 | 56 | 57 | # Testing data: 58 | ############################################################ 59 | 60 | library(MASS) 61 | 62 | n = 10000 63 | p = 50 64 | k = 4 65 | 66 | set.seed(891234) 67 | X0 = matrix(rnorm(n * p), ncol = p) 68 | colnames(X0) = paste0("X", 1:p) 69 | 70 | groups = rep(1:k, length.out = n) 71 | 72 | X = Matrix(rnorm(10000 * p), ncol = p) 73 | 74 | Xd = as.data.frame(as.matrix(X)) 75 | colnames(Xd) = colnames(X0) 76 | X0groups = data.frame(X0, groups) 77 | 78 | 79 | fit = lda(groups ~ ., X0groups) 80 | 81 | p0 = as.integer(predict(fit, Xd)$class) 82 | 83 | fit2 = lda2(X0, groups) 84 | 85 | p1 = predict(fit2, X) 86 | 87 | 88 | mean(p0 == p1) 89 | # 1 in 10000 is off, but not sure why. 90 | # This is in the docs: 91 | # 92 | # This version centres the linear discriminants so that the weighted 93 | # mean (weighted by ‘prior’) of the group centroids is at the 94 | # origin. 95 | # 96 | 97 | 98 | # Timings 99 | ############################################################ 100 | 101 | if(FALSE) 102 | { 103 | 104 | library(microbenchmark) 105 | 106 | microbenchmark(lda(groups ~ ., X0groups), times = 10L) 107 | 108 | microbenchmark(lda2(X0, groups), times = 10L) 109 | 110 | # So we get a speedup of 2-3 x 111 | 112 | # How much time is spent in covariance calc? 113 | # Over 40% 114 | # 115 | # Also 48% in `by`. Which means it's quite inefficient, considering that 116 | # column means can be computed in place with exactly one loop through the 117 | # data. I'll bet data.table is really good at this. 118 | # 119 | # scale() is also a big offender at 19%, half the time of the covariance 120 | # calc. The inefficient part of scale() is in the sweep() function. 
All we 121 | # really need to do is subtract the column means 122 | 123 | Rprof("lda.out") 124 | replicate(100, lda2(X0, groups)) 125 | Rprof(NULL) 126 | 127 | summaryRprof("lda.out") 128 | 129 | } 130 | 131 | 132 | -------------------------------------------------------------------------------- /inst/examples/mp_example.R: -------------------------------------------------------------------------------- 1 | x = 1:100 2 | y = rep(1, 100) 3 | z = x + y 4 | lapply(z, sin) 5 | -------------------------------------------------------------------------------- /inst/examples/plot_cov2.R: -------------------------------------------------------------------------------- 1 | # Tue Aug 22 16:44:07 PDT 2017 2 | # 3 | # Does the number of chunks affect the efficiency? 4 | 5 | library(microbenchmark) 6 | library(lattice) 7 | 8 | source("covariance.R") 9 | 10 | 11 | # previous work showed a prechunked version was less than 50 percent 12 | # efficient. 13 | 14 | # n = 2000, p = 200 means it's not really worth it to go parallel 15 | n = 10000 16 | p = 200 17 | x = matrix(rnorm(n * p), nrow = n) 18 | 19 | percent_efficiency = function(nchunks, times = 5L){ 20 | 21 | baseline = microbenchmark(cov(x), times = times)$time 22 | t_chunked = microbenchmark(cov_chunked(x, nchunks), times = times)$time 23 | t_prechunked = microbenchmark(cov_with_prechunk(x, nchunks), times = times)$time 24 | t_par_chunked = microbenchmark(cov_with_prechunk_parallel(x, nchunks), times = times)$time 25 | 26 | # Hopefully doing it a few times and taking the best will eliminate 27 | # things like gc() 28 | baseline = 100 * min(baseline) 29 | data.frame(chunked = baseline / min(t_chunked) 30 | , prechunked = baseline / min(t_prechunked) 31 | , par_chunked = baseline / min(t_par_chunked) 32 | ) 33 | } 34 | 35 | set.seed(2318) 36 | 37 | #nchunks = round(seq(from = 2, to = p, length.out = 10)) 38 | nchunks = 2:10 39 | 40 | times = lapply(nchunks, percent_efficiency) 41 | 42 | times = do.call(rbind, times) 43 | 44 | png("efficiency_by_chunks.png") 45 | 46 | plot(nchunks, times$prechunked 47 | , ylim = range(times) 48 | , main = sprintf("Efficiency for chunked cov() on %i x %i matrix", n, p) 49 | , ylab = "percent efficiency (100% ideal)" 50 | , xlab = "number of chunks" 51 | ) 52 | points(nchunks, times$chunked, pch = 2) 53 | points(nchunks, times$par_chunked, pch = 3) 54 | legend("topright", c("prechunked", "not prechunked", "parallel"), pch = 1:3) 55 | 56 | dev.off() 57 | -------------------------------------------------------------------------------- /inst/examples/qq_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/inst/examples/qq_baseline.png -------------------------------------------------------------------------------- /inst/examples/recurse_globals.R: -------------------------------------------------------------------------------- 1 | # From wlandau 2 | # https://github.com/duncantl/CodeDepends/issues/19 3 | 4 | library(CodeDepends) 5 | 6 | 7 | #' Recursively Find Global Variables 8 | #' 9 | #' TODO: Modify this to work without requiring that the code be evaluated 10 | #' Probably means we can't use codetools::findGlobals 11 | #' 12 | #' fun closure, see codetools::findGlobals 13 | #' possible_funs character vector of variable names to recurse into 14 | findGlobals_recursive <- function(fun, possible_funs) 15 | { 16 | globals <- codetools::findGlobals(fun) 17 | 18 | for(varname in 
intersect(globals, possible_funs)){ 19 | var = get(varname, envir = .GlobalEnv) 20 | if(is.function(var)){ 21 | globals <- c(globals, Recall(var, possible_funs)) 22 | } 23 | } 24 | unique(globals) 25 | } 26 | 27 | 28 | # Usage 29 | ############################################################ 30 | 31 | code = parse(text = " 32 | f <- function(x) g(x) 33 | g <- function(x) { 34 | h(x) 35 | } 36 | h <- function(x) { 37 | sin(x) + cos(x) + my_var 38 | } 39 | my_var <- 1 40 | ") 41 | 42 | eval(code) 43 | 44 | 45 | info = getInputs(code) 46 | 47 | findGlobals_recursive(f, possible_funs = info@outputs) 48 | 49 | g <- function(x) { 50 | if(x > 10) 51 | h1(x) 52 | else 53 | h2(x) 54 | } 55 | 56 | getInputs(body(g))@functions 57 | -------------------------------------------------------------------------------- /inst/examples/scale.R: -------------------------------------------------------------------------------- 1 | # Mon Aug 28 16:33:46 PDT 2017 2 | # 3 | # sweep() used to implement scale() is inefficient. Profiling shows that 4 | # only 2% of the time is spent in colMeans. The only other thing to do is 5 | # subtract the mean, which should be fast, but isn't because memory 6 | # layout requires a transpose to use recycling (broadcasting). 7 | # 8 | # But I don't know how to do any better short of writing in C 9 | 10 | library(microbenchmark) 11 | 12 | n = 10000 13 | p = 100 14 | 15 | x = matrix(rnorm(n * p), nrow = n) 16 | 17 | 18 | # This isn't any better!! 19 | scale2 = function (x) 20 | { 21 | n = nrow(x) 22 | mu = colMeans(x) 23 | #mu_broadcasted = matrix(rep(mu, each = n), nrow = n) 24 | mu_broadcasted = rep(mu, each = n) 25 | x - mu_broadcasted 26 | } 27 | 28 | 29 | # Takes about the same time as base::scale.default 30 | scale3 = function (x) 31 | { 32 | n = nrow(x) 33 | mu = colMeans(x) 34 | # Tricky broadcasting 35 | t(t(x) - mu) 36 | } 37 | 38 | 39 | # Bad again 40 | scale4 = function (x) 41 | { 42 | n = nrow(x) 43 | mu = colMeans(x) 44 | t(apply(x, 1L, `-`, mu)) 45 | } 46 | 47 | 48 | 49 | s1 = scale(x, center = TRUE, scale = FALSE) 50 | s2 = scale2(x) 51 | s3 = scale3(x) 52 | s4 = scale4(x) 53 | 54 | max(abs(s1 - s2)) 55 | max(abs(s1 - s3)) 56 | max(abs(s1 - s4)) 57 | 58 | microbenchmark(scale(x, center = TRUE, scale = FALSE), times = 10L) 59 | 60 | microbenchmark(scale2(x), times = 10L) 61 | 62 | microbenchmark(scale3(x), times = 10L) 63 | 64 | microbenchmark(scale4(x), times = 10L) 65 | 66 | Rprof() 67 | replicate(100, scale(x)) 68 | Rprof(NULL) 69 | 70 | summaryRprof() 71 | -------------------------------------------------------------------------------- /inst/oldcode/README: -------------------------------------------------------------------------------- 1 | These files are works in progress that may go into the package eventually. 
2 | -------------------------------------------------------------------------------- /inst/oldcode/RHive.Rmd: -------------------------------------------------------------------------------- 1 | # RHive 2 | 3 | Run R code in Hive 4 | 5 | Extending and generalizing the ideas from [this blog 6 | post](http://clarkfitzg.github.io/2017/10/31/3-billion-rows-with-R/) 7 | 8 | ## Prior Work 9 | 10 | The 11 | 12 | ## Example 13 | 14 | ```{R} 15 | 16 | library(RHive) 17 | 18 | testfunc = function(x) 19 | { 20 | data.frame(station = 1L, n_total = 2L, slope = 3.14) 21 | } 22 | 23 | write_udaf_scripts(f = testfunc 24 | , cluster_by = "station" 25 | , input_table = "pems" 26 | , input_cols = c("station", "flow2", "occ2") 27 | , input_classes = c("integer", "integer", "numeric", "character") 28 | , output_table = "fundamental_diagram" 29 | , output_cols = c("station", "n_total", "slope") 30 | , output_classes = c("integer", "integer", "numeric") 31 | , overwrite_script = TRUE 32 | , overwrite_table = TRUE 33 | , try = TRUE 34 | ) 35 | 36 | ``` 37 | 38 | 39 | Some ideas talking with the code review group: 40 | 41 | ``` 42 | input_table 43 | data 44 | data_in 45 | SELECT_FROM 46 | select_from 47 | 48 | sql_in = sql_builder(SELECT = c("col_a", "col_b")...) 49 | ``` 50 | -------------------------------------------------------------------------------- /inst/oldcode/brokecode.R: -------------------------------------------------------------------------------- 1 | 2 | # Keeping this around just in case: 3 | #' The current version sends all the global functions to the parallel 4 | #' workers each time the evaluator is called. This is useful when 5 | #' iteratively building functions within the global environment. 6 | #' The smarter thing to do is keep track of which functions change, and 7 | #' then send those over. But it's not clear that is worth it. 
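#
# A sketch of that change-tracking idea, left commented out like the rest of
# this file: remember each function's deparsed source and report only the
# functions whose source changed since the last call. `old_sources` is a
# hypothetical named character vector from a previous call, not package API.
#changed_global_functions = function(old_sources)
#{
#    funcs = global_functions()
#    sources = sapply(funcs, function(f)
#        paste(deparse(get(f, envir = .GlobalEnv)), collapse = "\n"))
#    new_or_changed = !(funcs %in% names(old_sources)) |
#        sources != old_sources[funcs]
#    funcs[new_or_changed]
#}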
8 | #' Return the names of all global functions 9 | #global_functions = function() 10 | #{ 11 | # varnames = ls(.GlobalEnv, all.names = TRUE) 12 | # funcs = sapply(varnames, function(x) is.function(get(x, envir = .GlobalEnv))) 13 | # varnames[funcs] 14 | #} 15 | 16 | 17 | -------------------------------------------------------------------------------- /inst/oldcode/codegraph/codegraph.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Wed Mar 8 16:09:51 PST 2017 3 | # 4 | # pass in an R script to build and visualize a code dependency graph 5 | 6 | library(igraph) 7 | library(CodeDepends) 8 | 9 | source("depend_graph.R") 10 | 11 | script_name = commandArgs(trailingOnly=TRUE) 12 | 13 | s = readScript(script_name) 14 | f = tempfile() 15 | outfile = gsub("\\.R", "\\.pdf", script_name) 16 | 17 | g = depend_graph(s, add_source = TRUE) 18 | write_graph(g, f, format = "dot") 19 | 20 | system2("dot", c("-Tpdf", f, "-o", outfile)) 21 | 22 | unlink(f) 23 | 24 | message("Processed ", script_name) 25 | -------------------------------------------------------------------------------- /inst/oldcode/columns.R: -------------------------------------------------------------------------------- 1 | # Column selection 2 | 3 | x = mtcars$mpg 4 | 5 | y = mtcars[, 2] 6 | 7 | z = mtcars[, "disp"] 8 | 9 | a = mtcars[[4]] 10 | -------------------------------------------------------------------------------- /inst/oldcode/interactive.R: -------------------------------------------------------------------------------- 1 | ## ------------------------------------------------------------------------ 2 | 3 | library(autoparallel) 4 | 5 | x = list(1:10, rnorm(10), rep(pi, 10)) 6 | 7 | do = makeParallel(x) 8 | do 9 | 10 | 11 | ## ------------------------------------------------------------------------ 12 | 13 | lapply(x, head) 14 | 15 | do(lapply(x, head)) 16 | 17 | 18 | ## ------------------------------------------------------------------------ 19 | 20 | y <<- 20 21 | z <<- 30 22 | do(y + z, verbose = TRUE) 23 | 24 | 25 | ## ------------------------------------------------------------------------ 26 | 27 | # An analysis function 28 | myfun <<- function(x) x[1:2] 29 | 30 | do(lapply(x, myfun)) 31 | 32 | # Oops I actually need the first 4 33 | myfun <<- function(x) x[1:4] 34 | 35 | # Now we see the new results of myfun 36 | do(lapply(x, myfun)) 37 | 38 | 39 | ## ---- eval = FALSE------------------------------------------------------- 40 | # 41 | # # Any large R object 42 | # big = 1:1e8 43 | # 44 | # object.size(big) 45 | # 46 | # # BAD IDEA: this sends `big` over every time 47 | # do(sum(big + x[[1]][1])) 48 | # 49 | 50 | ## ------------------------------------------------------------------------ 51 | 52 | print.function(do) 53 | 54 | 55 | ## ------------------------------------------------------------------------ 56 | 57 | do(lapply(x, head), simplify = FALSE) 58 | 59 | 60 | ## ------------------------------------------------------------------------ 61 | 62 | stop_cluster(do) 63 | 64 | 65 | ## ---- echo = FALSE------------------------------------------------------- 66 | 67 | # Used on my local machine only 68 | datadir = "~/data/vets/appeals_sample" 69 | 70 | 71 | ## ----download, eval = FALSE---------------------------------------------- 72 | # 73 | # datadir = "vets_appeals" 74 | # dir.create(datadir) 75 | # 76 | # fnames = paste0("1719", 100:266, ".txt") 77 | # urls = paste0("https://www.va.gov/vetapp17/files3/", fnames) 78 | # 79 | # Map(download.file, urls, 
fnames) 80 | # 81 | 82 | ## ------------------------------------------------------------------------ 83 | 84 | filenames = list.files(datadir, full.names = TRUE) 85 | length(filenames) 86 | 87 | do = makeParallel(filenames) 88 | 89 | 90 | ## ------------------------------------------------------------------------ 91 | 92 | do({ 93 | appeals <- lapply(filenames, readLines) 94 | appeals <- sapply(appeals, paste, collapse = "\n") 95 | appeals <- enc2utf8(appeals) 96 | NULL 97 | }) 98 | 99 | 100 | ## ------------------------------------------------------------------------ 101 | 102 | "appeals" %in% ls() 103 | 104 | 105 | ## ------------------------------------------------------------------------ 106 | 107 | ten <<- 10 108 | do(ten + 1, verbose = TRUE) 109 | 110 | 111 | ## ------------------------------------------------------------------------ 112 | 113 | do(length(appeals)) 114 | do(class(appeals)) 115 | 116 | 117 | ## ------------------------------------------------------------------------ 118 | 119 | # Check how many we're about to bring back 120 | do(sum(grepl("REMAND", appeals))) 121 | 122 | # Bring them back from the workers 123 | remand <- do(appeals[grepl("REMAND", appeals)]) 124 | 125 | length(remand) 126 | 127 | 128 | ## ------------------------------------------------------------------------ 129 | 130 | stop_cluster(do) 131 | 132 | 133 | -------------------------------------------------------------------------------- /inst/oldcode/intro.R: -------------------------------------------------------------------------------- 1 | ## ----setup, include = FALSE---------------------------------------------- 2 | knitr::opts_chunk$set( 3 | eval = FALSE 4 | ) 5 | 6 | ## ------------------------------------------------------------------------ 7 | # 8 | # library(autoparallel) 9 | # 10 | # autoparallel("code.R") 11 | 12 | ## ------------------------------------------------------------------------ 13 | # pcode = makeParallel("code.R" 14 | # , clean_first = FALSE 15 | # , run_now = FALSE 16 | # , cluster_type = "FORK" 17 | # , nnodes = 4 18 | # ) 19 | # 20 | # # visual representation of the graph structure 21 | # plot(pcode) 22 | # 23 | # # Save the parallel version of the script 24 | # save_code(pcode, "pcode.R") 25 | # 26 | # # Run the whole thing interactively 27 | # run_code(pcode) 28 | 29 | -------------------------------------------------------------------------------- /inst/oldcode/related.Rmd: -------------------------------------------------------------------------------- 1 | ## Related Work 2 | 3 | Several existing packages provide a more consistent interface to parallel 4 | computation. 5 | 6 | Landau's [drake](https://ropensci.github.io/drake/) provides task 7 | parallelism similar to GNU make. 8 | 9 | Bengstton's 10 | [futures](https://cran.r-project.org/web/packages/future/index.html) 11 | provides a mechanism for parallel asynchronous evaluation of R code 12 | across different systems. 13 | 14 | Bischl and Lang's 15 | [parallelMap](https://cran.r-project.org/package=parallelMap) provides a 16 | parallel version of `Map()` supporting different execution backends 17 | including local, multicore, mpi and BatchJobs. The 18 | [batchtools](https://cran.r-project.org/package=batchtools) package 19 | supports HPC systems. 20 | 21 | Böhringer's 22 | [parallelize.dynamic](https://cran.r-project.org/package=parallelize.dynamic) 23 | provides the `parallelize_call()` function to dynamically parallelize a 24 | single function call. 
25 | 26 | Wang's [valor](https://github.com/wanghc78/valor) vectorizes `lapply` calls 27 | into single function calls. In some sense this is the most closely related project, 28 | because the main purpose of valor is to actually transform code. 29 | -------------------------------------------------------------------------------- /inst/oldcode/scratch.R: -------------------------------------------------------------------------------- 1 | # How to combine multiple use-def chains? 2 | library(igraph) 3 | 4 | 5 | df1 = data.frame(from = c(1, 1) 6 | , to = c(2, 3) 7 | , edgetype = "use-def" 8 | , var = "x" 9 | ) 10 | 11 | g1 = graph_from_data_frame(df1) 12 | 13 | 14 | df2 = data.frame(from = 2 15 | , to = 3 16 | , edgetype = "use-def" 17 | , var = "y" 18 | ) 19 | 20 | g2 = graph_from_data_frame(df2) 21 | 22 | 23 | g = union(g1, g2, byname = TRUE) 24 | 25 | 26 | 27 | g1 = make_empty_graph(n = 3) 28 | g1 = add_edges(g1, c(1, 2, 1, 3), type = "use-def", var = "x") 29 | 30 | edge_attr(g1) 31 | 32 | g2 = make_empty_graph(n = 3) 33 | g2 = add_edges(g2, c(2, 3), type = "use-def", var = "y") 34 | 35 | g = union(g1, g2, byname = TRUE) 36 | 37 | edge_attr(g) 38 | 39 | 40 | # Thinking of a more elegant way to make the use-def chain 41 | 42 | def = c(1, 10, Inf) 43 | use = c(2, 3, 7, 10, 13) 44 | 45 | # Ninja level R programming here. 46 | def[cut(use, breaks = def)] 47 | -------------------------------------------------------------------------------- /inst/oldcode/script.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "autoparallel-script" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{autoparallel-script} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | # Script 11 | 12 | Parallelism is useful if it improves the speed of a slow program. If speed 13 | doesn't improve then parallelism is an unnecessary complication. 14 | `autoparallel` transforms programs from serial into multicore parallel, and 15 | then benchmarks the modified program to determine whether the transformation 16 | increases the speed. 17 | 18 | The word 'program' means a collection of valid R statements. Typically this 19 | means a script or a function. 20 | 21 | ## Basic Transformations 22 | 23 | We begin with the simplest and most obvious way to transform a program. 24 | Top level occurrences of `lapply, mapply, Map` are changed to `mclapply, 25 | mcmapply, mcMap` from the `parallel` package, and the run times are 26 | compared. Below `lapply, mapply, Map` are referred to as the 'target 27 | statements'; a sketch of how they might be located follows the list below. 28 | 29 | This technique may be useful if the following conditions are met: 30 | 31 | - System supports fork based parallelism (not Windows) 32 | - Program spends a large amount of time in the target statements 33 | - Program will be run many times (benchmarking requires the program to run) 34 | - Repeated evaluation of the target statements doesn't change the output or 35 | have other adverse effects, i.e. repeatedly writing data to places it 36 | should not.
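
One way to locate the target statements, shown only as a sketch here rather
than the package's actual implementation, is to scan each top level statement
of the parsed program for calls to the target functions:

```{R}

targets = c("lapply", "mapply", "Map")

# TRUE when a statement mentions any target function.
# A rough test: it also flags mentions nested inside other calls.
calls_target = function(statement)
{
    any(all.names(statement) %in% targets)
}

# "myscript.R" is a hypothetical path standing in for the user's program
code = parse("myscript.R")
which(sapply(code, calls_target))

```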
37 | 38 | Consider the following simple program: 39 | 40 | ```{R} 41 | 42 | # simple.R 43 | 44 | ffast = function(x) rnorm(1) 45 | 46 | fslow = function(x){ 47 | Sys.sleep(0.1) 48 | rnorm(1) 49 | } 50 | 51 | z = 1:10 52 | r1 = lapply(z, ffast) 53 | r2 = lapply(z, fslow) 54 | 55 | ``` 56 | 57 | To make this run faster the last line should be changed to: 58 | 59 | ```{R} 60 | 61 | r2 = parallel::mclapply(z, fslow) 62 | 63 | ``` 64 | 65 | To transform it: 66 | 67 | ```{R} 68 | 69 | library(autoparallel) 70 | 71 | benchmark_transform("simple.R", output = "simple2.R") 72 | 73 | ``` 74 | -------------------------------------------------------------------------------- /inst/oldcode/simple.R: -------------------------------------------------------------------------------- 1 | # simple.R 2 | 3 | ffast = function(x) rnorm(1) 4 | 5 | fslow = function(x){ 6 | Sys.sleep(0.1) 7 | rnorm(1) 8 | } 9 | 10 | z = 1:10 11 | r1 = lapply(z, ffast) 12 | r2 = lapply(z, fslow) 13 | -------------------------------------------------------------------------------- /inst/oldcode/simple2.R: -------------------------------------------------------------------------------- 1 | ffast = function(x) rnorm(1) 2 | fslow = function(x) { 3 | Sys.sleep(0.1) 4 | rnorm(1) 5 | } 6 | z = 1:10 7 | r1 = lapply(z, ffast) 8 | r2 = lapply(z, fslow) 9 | -------------------------------------------------------------------------------- /inst/oldcode/snow.R: -------------------------------------------------------------------------------- 1 | #' Make A SNOW Cluster Act Like A Unix Fork 2 | #' 3 | #' Evaluate code that appears before a call to lapply 4 | #' 5 | #' @export 6 | snow_fork = function(code) 7 | { 8 | 9 | #TODO: Implement me 10 | find_call(code, "lapply") 11 | 12 | } 13 | -------------------------------------------------------------------------------- /inst/oldcode/task_parallel.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "autoparallel-task-parallel" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{autoparallel-task-parallel} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | # Task Parallel 11 | 12 | Parallel infrastructure for computing on data generally centers around data 13 | parallelism. This means calling the same function on many different data: 14 | Same Instruction, Multiple Data (SIMD). 15 | 16 | Independent tasks can also run in parallel. This is task parallelism. It 17 | means calling different functions on different data simultaneously. 18 | 19 | This can be done today through R's included `parallel` package: 20 | 21 | 22 | ```{R} 23 | 24 | library(parallel) 25 | 26 | # Begins asynchronous evaluation of rnorm(10) 27 | job1 = mcparallel(expr = rnorm(10)) 28 | 29 | # This can happen before the above expression is finished 30 | x = mean(1:10) 31 | 32 | y = mccollect(job1)[[1]] 33 | 34 | ``` 35 | 36 | This introduces overhead compared to standard serial evaluation, but it may 37 | speed up the program if the following conditions hold: 38 | 39 | - The system has available computing resources, i.e. processor cores which 40 | are idle. If an R package uses threads through internal compiled code 41 | then introducing parallelism on top of this will generally hurt rather than 42 | help, because the processors must now compete for resources. Linear algebra 43 | computations with a multithreaded BLAS / LAPACK are a common operation 44 | with this effect.
45 | - There are two or more relatively long running tasks that can occur simultaneously. 46 | For a multicore fork based approach the tasks should take at least 10 ms, 47 | and [preferably much 48 | longer](https://www.kernel.org/pub/linux/kernel/people/paulmck/perfbook/perfbook.html). 49 | - At least one task returns a relatively small object. This allows one to 50 | avoid the cost of serializing R objects between processes. For example, 51 | the code `1:1e8` generates a sequence of 100 million integers. This takes 52 | 10 times longer in parallel because the serialization time 53 | far exceeds the time for actual computation. 54 | 55 | 56 | ## Ideas 57 | 58 | Suppose the user would like to run a script multiple times. 59 | The software essentially needs to do the following: 60 | 61 | 1. run the script once, measuring time required to evaluate each 62 | expression, as well as the sizes of the resulting objects to be 63 | serialized 64 | 2. infer the dependency structure of the code, which determines where 65 | and how statements can run in parallel 66 | 3. solve an optimization problem specifying which statements ideally happen 67 | in parallel 68 | 4. rewrite the code to use the optimal strategy 69 | 70 | 71 | -------------------------------------------------------------------------------- /inst/oldcode/test_apply.R: -------------------------------------------------------------------------------- 1 | library(autoparallel) 2 | 3 | context("apply") 4 | 5 | test_that("convert single apply to parallel", { 6 | 7 | x = matrix(1:10, ncol = 2) 8 | 9 | code = quote(apply(x, 2, max)) 10 | parcode = apply_parallel(code) 11 | 12 | expect_identical(eval(code), eval(parcode)) 13 | 14 | }) 15 | 16 | 17 | if(FALSE) 18 | { 19 | 20 | # Testing code: 21 | n = 1000000L 22 | p = 20L 23 | 24 | x = matrix(1:(n*p), ncol = p) 25 | 26 | incode = quote(apply(x, 2, max)) 27 | parcode = apply_parallel(incode) 28 | 29 | system.time(eval(incode)) 30 | system.time(eval(parcode)) 31 | 32 | fast = benchmark_parallel(incode, times = 10L) 33 | 34 | 35 | 36 | 37 | library(microbenchmark) 38 | 39 | bm = microbenchmark(eval(incode), eval(parcode)) 40 | 41 | # Distribution of these timings? 
42 | # Most are small, with just a few high outliers (likely GC) 43 | # Right skew 44 | 45 | par(mfrow = c(2, 2)) 46 | 47 | tapply(bm$time, bm$expr, function(x){ 48 | qqnorm(x) 49 | qqline(x) 50 | }) 51 | tapply(bm$time, bm$expr, hist) 52 | } 53 | -------------------------------------------------------------------------------- /inst/oldcode/test_canon.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | test_that("names_to_ssb helper functions", { 4 | 5 | code = quote(dframe$b) 6 | transformed = dollar_to_ssb(code, letters)$statement 7 | expect_equal(quote(dframe[, 2L]), transformed) 8 | 9 | code = quote(dframe[["b"]]) 10 | transformed = double_to_ssb(code, letters)$statement 11 | expect_equal(quote(dframe[, 2L]), transformed) 12 | 13 | code = quote(dframe[[2L]]) 14 | transformed = double_to_ssb(code, letters)$statement 15 | expect_equal(quote(dframe[, 2L]), transformed) 16 | 17 | code = quote(dframe[, "b"]) 18 | transformed = single_to_ssb(code, letters)$statement 19 | expect_equal(quote(dframe[, 2L]), transformed) 20 | 21 | code = quote(dframe[, c("b", "c")]) 22 | transformed = single_to_ssb(code, letters)$statement 23 | expect_equal(quote(dframe[, 2:3]), transformed) 24 | 25 | code = quote(dframe[, 4:7]) 26 | transformed = single_to_ssb(code, letters)$statement 27 | expect_equal(quote(dframe[, 4:7]), transformed) 28 | 29 | code = quote(dframe[, c(2L, 4L)]) 30 | transformed = single_to_ssb(code, letters)$statement 31 | expect_equal(quote(dframe[, c(2L, 4L)]), transformed) 32 | 33 | code = quote(dframe[condition, "b"]) 34 | transformed = single_to_ssb(code, letters)$statement 35 | expect_equal(quote(dframe[condition, 2L]), transformed) 36 | 37 | }) 38 | 39 | 40 | test_that("canon_form", { 41 | 42 | # Wait until I have a reason to use this 43 | # code = quote(dframe[condition, "b"]) 44 | # actual = canon_form(code, "dframe", letters) 45 | # expect_true(actual$found) 46 | # 47 | # code = quote(xxx[condition, "b"]) 48 | # actual = canon_form(code, "dframe", letters) 49 | # expect_false(actual$found) 50 | 51 | # Currently fails, not sure how I want this corner case to behave. 
52 | # code = quote(dframe) 53 | # actual = canon_form(code, "dframe", letters) 54 | # expect_equal(actual$statement, code) 55 | 56 | code = quote(dframe[condition, c("b", "c")]) 57 | actual = canon_form(code, "dframe", letters) 58 | 59 | expect_equal(actual$transformed, quote(dframe[condition, 2:3])) 60 | expect_equal(actual$column_indices, 2:3) 61 | expect_equal(actual$index_locs, list(4L)) 62 | 63 | code = quote(xxx[condition, "b"]) 64 | actual = canon_form(code, "dframe", letters) 65 | 66 | expect_equal(actual$transformed, code) 67 | expect_equal(actual$column_indices, integer()) 68 | expect_equal(actual$index_locs, list()) 69 | 70 | code = quote(plot(dframe[, "d"])) 71 | actual = canon_form(code, "dframe", letters) 72 | 73 | expect_equal(actual$transformed, quote(plot(dframe[, 4L]))) 74 | expect_equal(actual$column_indices, 4) 75 | expect_equal(actual$index_locs, list(c(2, 4))) 76 | 77 | code = quote(dframe[dframe[, "d"] > 10, "b"]) 78 | actual = canon_form(code, "dframe", letters) 79 | 80 | expect_equal(actual$transformed, quote(dframe[dframe[, 4L] > 10, 2L])) 81 | expect_equal(actual$column_indices, c(2, 4)) 82 | expect_equal(actual$index_locs, list(4, c(3, 2, 4))) 83 | 84 | }) 85 | -------------------------------------------------------------------------------- /inst/oldcode/test_distribute.R: -------------------------------------------------------------------------------- 1 | context("distribute") 2 | 3 | # distribute works off variables in the global environment 4 | # testthat does some other things here. 5 | assign("y", list(letters, LETTERS, 1:10), envir = .GlobalEnv) 6 | 7 | 8 | test_that("basics with default", { 9 | 10 | do = distribute(y) 11 | 12 | actual = do(lapply(y, head)) 13 | 14 | expect_identical(actual, lapply(y, head)) 15 | 16 | head2 = function(y) y[1:2] 17 | assign("head2", head2, envir = .GlobalEnv) 18 | 19 | actual = do(lapply(y, head2)) 20 | 21 | expect_identical(actual, lapply(y, head2)) 22 | 23 | stop_cluster(do) 24 | 25 | }) 26 | 27 | 28 | test_that("finds global variables", { 29 | 30 | do = distribute(y, spec = 2L) 31 | # assigning n must happen after cluster creation, otherwise forking 32 | # will send n 33 | assign("n", 10, envir = .GlobalEnv) 34 | actual = do(n) 35 | 36 | expect_identical(actual, c(n, n)) 37 | 38 | stop_cluster(do) 39 | }) 40 | 41 | 42 | test_that("splits data frames into groups of rows", { 43 | 44 | do = distribute(iris) 45 | dims = do(dim(iris)) 46 | 47 | expect_equal(dims, c(75, 5, 75, 5)) 48 | stop_cluster(do) 49 | }) 50 | -------------------------------------------------------------------------------- /inst/oldcode/test_evolve.R: -------------------------------------------------------------------------------- 1 | 2 | # Include y to match the signature for crossprod() 3 | crossprod_flops = function(x, y) 4 | { 5 | n = nrow(x) 6 | p = ncol(x) 7 | (2*n - 1) * p * (p + 1) / 2 8 | } 9 | 10 | n = 20 11 | p = 2 12 | x = matrix(rnorm(n * p), nrow = n) 13 | x2 = matrix(rnorm(2 * n * p), nrow = n) 14 | 15 | 16 | 17 | ############################################################ 18 | 19 | test_that("get_timings", { 20 | 21 | f = function(x) 20 22 | 23 | f2 = smartfunc(f) 24 | 25 | f2(50) 26 | 27 | timings = get("timings", environment(f2)) 28 | 29 | expect_true(is.data.frame(timings)) 30 | 31 | }) 32 | 33 | 34 | 35 | test_that("smartfunc with metadata function", { 36 | 37 | cp = smartfunc(crossprod, crossprod_flops) 38 | 39 | replicate(10, cp(x)) 40 | replicate(10, cp(x2)) 41 | 42 | predict(cp, x) 43 | 44 | environment(cp) 45 | 46 | }) 47 | 48 | 
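# Note on crossprod_flops at the top of this file: crossprod(x) forms
# t(x) %*% x, which by symmetry has p * (p + 1) / 2 unique entries, and each
# entry is an inner product of two length n columns, costing n multiplications
# and n - 1 additions, giving (2 * n - 1) * p * (p + 1) / 2 flops in total.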
49 | 50 | test_that("prediction of smartfunc", { 51 | 52 | sleeptime = 0.1 53 | epsilon = sleeptime / 2 54 | 55 | f = function(x) 56 | { 57 | Sys.sleep(sleeptime) 58 | } 59 | 60 | f2 = smartfunc(f) 61 | 62 | # This is an implementation detail, I hesitate to test it. 63 | expect_equal(predict(f2, 100), -Inf) 64 | 65 | # Order important here! Function call forces a timing 66 | f2(50) 67 | 68 | # Prediction time is in nanoseconds 69 | time_expected = predict(f2, 100) / 1e9 70 | 71 | expect_gt(time_expected, sleeptime - epsilon) 72 | expect_lt(time_expected, sleeptime + epsilon) 73 | 74 | }) 75 | 76 | 77 | test_that("evolve with multiple implementations", { 78 | 79 | ffast = function(x) "fast" 80 | fslow = function(x){ 81 | Sys.sleep(0.001) 82 | "slow" 83 | } 84 | 85 | f = evolve(fslow, ffast) 86 | 87 | f(1) 88 | f(2) 89 | f(3) 90 | f(4) 91 | 92 | expect_equal(f(5), "fast") 93 | 94 | 95 | }) 96 | 97 | 98 | 99 | # All details subject to change 100 | test_that("global timings exist", { 101 | 102 | #debug(autoparallel:::startstop) 103 | 104 | trace_timings(crossprod, metadata_func = crossprod_flops) 105 | 106 | crossprod(x) 107 | 108 | crossprod(x) 109 | 110 | untrace(crossprod) 111 | 112 | timings = env$crossprod 113 | 114 | expect_equal(nrow(timings), 2) 115 | 116 | expect_gte(ncol(timings), 3) 117 | 118 | }) 119 | 120 | 121 | test_that("defaults for trace_timings", { 122 | 123 | skip("The way I'm using parent.frame() and eval() internally is not 124 | compatible with testthat I believe") 125 | 126 | n = 20 127 | x = rnorm(n) 128 | y = rnorm(n) 129 | 130 | trace_timings(cov) 131 | 132 | cov(x, y) 133 | 134 | untrace(cov) 135 | 136 | timings = env$cov 137 | 138 | expect_equal(timings$metadata, length(x)) 139 | 140 | }) 141 | -------------------------------------------------------------------------------- /inst/oldcode/test_tune.R: -------------------------------------------------------------------------------- 1 | test_that("basic function tuning", { 2 | 3 | f = function(x, t=0.1){ 4 | if(x > 0) Sys.sleep(abs(t)) 5 | x 6 | } 7 | 8 | f2 = tune(f, x = 100, t = tune_param(list(-0.05, 0.01, 0.1))) 9 | 10 | expect_equal(0.01, formals(f2)$t) 11 | 12 | }) 13 | -------------------------------------------------------------------------------- /inst/oldcode/test_utils.R: -------------------------------------------------------------------------------- 1 | test_that("Longest path", { 2 | 3 | skip() 4 | 5 | g = make_graph(c(1, 2, 1, 3, 2, 3)) 6 | 7 | expect_equal(longest_path(g), 3) 8 | 9 | }) 10 | 11 | 12 | test_that("replacing functions", { 13 | 14 | expr = parse(text = " 15 | # Testing code: 16 | n = 1000000L 17 | p = 20L 18 | x = matrix(1:(n*p), ncol = p) 19 | x 20 | colmaxs = apply(x, 2, max) 21 | colmaxs2 <- apply(x, 2, max) 22 | assign('colmaxs3', apply(x, 2, max)) 23 | apply(x, 2, min) 24 | ") 25 | 26 | sub_one_docall(expr, list(apply = quote(FANCY_APPLY))) 27 | 28 | }) 29 | 30 | 31 | test_that("all_symbols", { 32 | 33 | e = quote(plot(x, y)) 34 | actual = sort(all_symbols(e)) 35 | expected = sort(c("plot", "x", "y")) 36 | 37 | expect_equal(actual, expected) 38 | 39 | # Using x as a function also. Yuck! 
40 | e = parse(text = "x(plot(x, y)) 41 | plot(x)") 42 | actual = sort(all_symbols(e)) 43 | 44 | expect_equal(actual, expected) 45 | 46 | }) 47 | 48 | 49 | test_that("only_literals", { 50 | 51 | expect_true(only_literals(quote(1:5))) 52 | 53 | expect_true(only_literals(quote(c(1, 4)))) 54 | 55 | expect_false(only_literals(quote(f(3)))) 56 | 57 | expect_false(only_literals(quote(1:n))) 58 | 59 | }) 60 | 61 | 62 | test_that("even_split", { 63 | 64 | actual = even_split(6, 2) 65 | expect_equal(actual, c(1, 1, 1, 2, 2, 2)) 66 | 67 | actual = even_split(7, 2) 68 | expect_equal(actual, c(1, 1, 1, 1, 2, 2, 2)) 69 | 70 | }) 71 | 72 | 73 | if(FALSE){ 74 | 75 | expr = parse(text = " 76 | # Testing code: 77 | n = 100000L 78 | p = 10L 79 | x = matrix(1:(n*p), ncol = p) 80 | x 81 | nitenite = function(x) Sys.sleep(0.01) 82 | colmaxs = apply(x, 2, max) 83 | apply(x, 2, nitenite) 84 | ") 85 | 86 | # Seems to work fine 87 | expr_out = parallelize_script(expr) 88 | 89 | e = lapply(expr, CodeDepends::getInputs) 90 | 91 | lapply(e, function(x) x@inputs) 92 | 93 | } 94 | -------------------------------------------------------------------------------- /inst/oldcode/tune.R: -------------------------------------------------------------------------------- 1 | #' Tune Function For Specific Arguments 2 | #' 3 | #' Return a modified version of the input function 4 | #' 5 | #' @export 6 | #' @param FUN function to be tuned 7 | #' @param ... arguments to FUN that are being tuned 8 | #' @param times number of times to run microbenchmark for each call 9 | #' @examples 10 | #' # t is the tuning parameter 11 | #' f = function(x, t){ 12 | #' if(x > 0) Sys.sleep(t^2) 13 | #' x 14 | #' } 15 | #' # Suppose we plan to call this many times with positive x values 16 | #' f2 = tune(f, x = 100, t = tune_param(list(-0.05, 0.01, 0.1))) 17 | #' # f2 should now have 0.01 as the default argument for t 18 | tune = function(FUN, ..., times = 5L) 19 | { 20 | NEWFUN = FUN 21 | 22 | args = list(...) 23 | params_to_tune = which(sapply(args, function(x) !is.null(attr(x, "tune")))) 24 | 25 | nparams = length(params_to_tune) 26 | if(nparams > 1) stop("Multiple tuning parameters not yet implemented") 27 | 28 | # Benchmark all the tuning parameters 29 | if(nparams == 1){ 30 | params_to_try = args[[params_to_tune]] 31 | median_times = sapply(params_to_try, function(x){ 32 | timing_args = args 33 | timing_args[[params_to_tune]] = x 34 | median_time(FUN, timing_args, times = times) 35 | }) 36 | fastest_param = params_to_try[[which.min(median_times)]] 37 | formals(NEWFUN)[[params_to_tune]] = fastest_param 38 | } 39 | 40 | NEWFUN 41 | } 42 | 43 | 44 | #' Median Time To Evaluate Function 45 | #' 46 | #' @param FUN function 47 | #' @param args list of arguments to call 48 | #' @param ... additional arguments to microbenchmark 49 | median_time = function(FUN, args, ...) 50 | { 51 | bm = microbenchmark(do.call(FUN, args), ...) 
52 | median(bm$time) 53 | } 54 | 55 | 56 | #' Mark Parameter For Tuning 57 | #' 58 | #' @param x list of arguments to time 59 | #' @export 60 | tune_param = function(x) 61 | { 62 | attr(x, "tune") = TRUE 63 | x 64 | } 65 | -------------------------------------------------------------------------------- /inst/oldcode/utils.R: -------------------------------------------------------------------------------- 1 | 2 | # Adapted from Hadley Wickham's pryr / Advanced R 3 | sub_one = function(statement, env) 4 | { 5 | #stopifnot(is.language(statement)) 6 | call <- substitute(substitute(statement, env), list(statement = statement)) 7 | eval(call) 8 | } 9 | 10 | 11 | # Doesn't work 12 | sub_one_docall = function(expr, env) 13 | { 14 | e = substitute(expr) 15 | do.call(substitute, list(e, env)) 16 | } 17 | 18 | 19 | # Substitute Expressions 20 | # 21 | # Replace code with new code objects in env. 22 | # Handles expression objects as well as single objects. 23 | sub_expr = function(expr, env) { 24 | if(is.expression(expr)){ 25 | as.expression(lapply(expr, sub_one, env)) 26 | } else { 27 | sub_one(expr, env) 28 | } 29 | } 30 | 31 | 32 | # Check if code only contains literal expressions 33 | # 34 | # If only literals and simple functions such as \code{:, c} then code can be 35 | # evaluated regardless of context. Assuming those functions haven't been 36 | # redefined. 37 | # 38 | # @param code single R statement 39 | only_literals = function(code, simple_funcs = c("c", ":")) 40 | { 41 | 42 | info = CodeDepends::getInputs(code) 43 | 44 | if(length(info@inputs) > 0) { 45 | return(FALSE) 46 | } 47 | 48 | funcs = names(info@functions) 49 | ok = funcs %in% simple_funcs 50 | if(any(!ok)) { 51 | return(FALSE) 52 | } 53 | 54 | # TODO: Other code can be safely literally evaluated, for example 55 | # sapply(1:5, function(x) (x %% 2) == 0) 56 | # 57 | # So we could relax the above to check for funcs available through R's 58 | # search path. 
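    # A relaxed check along those lines could be (not enabled here):
    #   all(sapply(funcs, exists, mode = "function"))
    # though it assumes nothing on the search path shadows the base
    # definitions, so the conservative whitelist above stays for now.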
59 | 60 | TRUE 61 | } 62 | 63 | 64 | #' Find All Symbols In Expression 65 | #' 66 | #' @param expr R language object 67 | all_symbols = function(expr) 68 | { 69 | expr = as.expression(expr) 70 | symbols = character() 71 | walker = codetools::makeCodeWalker(leaf = function(e, w){ 72 | if(is.symbol(e)){ 73 | symbols <<- union(symbols, as.character(e)) 74 | } 75 | }) 76 | lapply(expr, codetools::walkCode, walker) 77 | symbols 78 | } 79 | 80 | 81 | #' Approximately Even Split 82 | #' 83 | #' @param n_elements integer number of elements to split 84 | #' @param n_groups integer number of resulting groups 85 | #' @return integer vector for use as splitting factor in \code{\link[base]{split}} 86 | even_split = function(n_elements, n_groups) 87 | { 88 | splits = rep(seq(n_groups), (n_elements %/% n_groups) + 1) 89 | sort(splits[1:n_elements]) 90 | } 91 | -------------------------------------------------------------------------------- /inst/pems/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | data/* 3 | pems/* 4 | -------------------------------------------------------------------------------- /inst/pems/intermediate_transformed_code.R: -------------------------------------------------------------------------------- 1 | pipe("cut -d , -f stationID/313368.csv") 2 | pipe("cut -d , -f stationID/313369.csv") 3 | pems = pems[, c("station", "flow2", "occupancy2")] 4 | pems2 = split(pems, pems$station) 5 | results = lapply(pems2, npbin) 6 | results = do.call(rbind, results) 7 | write.csv(results, "results.csv") 8 | -------------------------------------------------------------------------------- /inst/pems/pems.R: -------------------------------------------------------------------------------- 1 | # This is the high level code that I would *like* to run. It won't work 2 | # because it will run out of memory 3 | 4 | message("starting") 5 | old_time = Sys.time() 6 | 7 | 8 | 9 | dyncut = function(x, pts_per_bin = 200, lower = 0, upper = 1, min_bin_width = 0.01) 10 | { 11 | x = x[x < upper] 12 | N = length(x) 13 | max_num_cuts = ceiling(upper / min_bin_width) 14 | eachq = pts_per_bin / N 15 | 16 | possible_cuts = quantile(x, probs = seq(from = 0, to = 1, by = eachq), na.rm = TRUE) 17 | cuts = rep(NA, max_num_cuts) 18 | current_cut = lower 19 | for(i in seq_along(cuts)){ 20 | # Find the first possible cuts that is at least min_bin_width away from 21 | # the current cut 22 | possible_cuts = possible_cuts[possible_cuts >= current_cut + min_bin_width] 23 | if(length(possible_cuts) == 0) 24 | break 25 | current_cut = possible_cuts[1] 26 | cuts[i] = current_cut 27 | } 28 | cuts = cuts[!is.na(cuts)] 29 | c(lower, cuts, upper) 30 | } 31 | 32 | 33 | # Non parametric binned means 34 | npbin = function(x) 35 | { 36 | breaks = dyncut(x$occupancy2, pts_per_bin = 200) 37 | binned = cut(x$occupancy2, breaks, right = FALSE) 38 | groups = split(x$flow2, binned) 39 | 40 | out = data.frame(station = rep(x[1, "station"], length(groups)) 41 | , right_end_occ = breaks[-1] 42 | , mean_flow = sapply(groups, mean) 43 | , sd_flow = sapply(groups, sd) 44 | , number_observed = sapply(groups, length) 45 | ) 46 | out 47 | } 48 | 49 | 50 | # Actual program 51 | ############################################################ 52 | # 53 | # - Load data 54 | # - Split based on column value 55 | # - Apply a function to each group 56 | # - Write the result 57 | 58 | # We'll generate the reading code 59 | 60 | # From this line we can infer that we only need these 3 columns. 
61 | # How do we know for sure? 62 | # Because it writes over the pems variable. 63 | # I wrote code to do this in the CodeAnalysis package. 64 | cols = c("station", "flow2", "occupancy2") 65 | pems = pems[, cols] 66 | 67 | new_time = Sys.time() 68 | message("read in files and rbind: ", capture.output(new_time - old_time)) 69 | old_time = new_time 70 | 71 | # The data description will tell us if the data starts grouped by the "station" column 72 | station = pems[, "station"] 73 | pems2 = split(pems, station) 74 | 75 | new_time = Sys.time() 76 | message("split: ", capture.output(new_time - old_time)) 77 | old_time = new_time 78 | 79 | results = lapply(pems2, npbin) 80 | 81 | new_time = Sys.time() 82 | message("actual computations: ", capture.output(new_time - old_time)) 83 | old_time = new_time 84 | 85 | results = do.call(rbind, results) 86 | 87 | write.csv(results, "results.csv", row.names = FALSE) 88 | 89 | new_time = Sys.time() 90 | message("save output: ", capture.output(new_time - old_time)) 91 | old_time = new_time 92 | -------------------------------------------------------------------------------- /inst/pems/rbind_problem.R: -------------------------------------------------------------------------------- 1 | # Wed Sep 18 09:15:03 PDT 2019 2 | # 3 | # Why did the pems example fail? 4 | # Can I make a minimal reproducible example? 5 | 6 | # Here I'm using R 3.6. 7 | 8 | 9 | # This is fast because of ALTREP in R 3.6 10 | d1 = data.frame(a = seq(.Machine$integer.max)) 11 | d2 = data.frame(a = seq(10)) 12 | 13 | system.time( 14 | d12 <- rbind(d1, d2) 15 | ) 16 | 17 | # Same error, good: 18 | 19 | # Error in seq.int(from = nrow + 1L, length.out = ni) : 20 | # 'from' must be a finite number 21 | # In addition: Warning message: 22 | # In nrow + 1L : NAs produced by integer overflow 23 | 24 | 25 | # Lets see if this fixes it. 26 | system.time( 27 | d12 <- rbind(d1, d2, deparse.level = 0, make.row.names = FALSE) 28 | ) 29 | # Nope, same error. 30 | 31 | # Wow, integers overflow to NA? 32 | # That's inconvenient. 33 | 34 | a = seq(.Machine$integer.max + 10) 35 | b = as.character(a) 36 | 37 | d3 = data.frame(a = a, row.names = b) 38 | # Error in if (nrows[i] > 0L && (nr%%nrows[i] == 0L)) { : 39 | # missing value where TRUE/FALSE needed 40 | # In addition: Warning message: 41 | # In attributes(.Data) <- c(attributes(.Data), attrib) : 42 | # NAs introduced by coercion to integer range 43 | 44 | # What seems to be going on is that even though R allows long vectors, data frame cannot have more than 2^31 (about 2 billion) rows because it attempts to use integers for the names, and the integers overflow. 45 | # Thus the failure. 46 | # Which means the serial code won't run as it stands, and so must be rewritten. 47 | # It could use a higher performance third party library, or we could be clever and chunk the data frames. 48 | # If we're going to do the latter, then we may as well make it parallel also. 49 | -------------------------------------------------------------------------------- /inst/pems/slurm.sh: -------------------------------------------------------------------------------- 1 | # For starting up a shell and trying out the parallelism 2 | 3 | srun -n 10 --pty bash -i 4 | -------------------------------------------------------------------------------- /inst/pems/target_code.R: -------------------------------------------------------------------------------- 1 | # This is what pems.R should be expanded to before feeding it to the list scheduler. 
2 | # The list scheduler should be able to handle the following code directly. 3 | # A 4 | # 5 | 6 | # omit the function bodies for brevity here- they will actually be included. 7 | dyncut = function(...) "see pems.R" 8 | 9 | npbin = function(...) "see pems.R" 10 | 11 | 12 | # We can expand in this way because we saw the combination of the following: 13 | # 14 | # 1. A split based on a data partition 15 | # 2. An lapply on the results of that split 16 | 17 | tmp0 = c("integer", "integer", "numeric") 18 | tmp1 = c("station", "flow2", "occupancy2") 19 | 20 | pems_1 = read.csv( 21 | pipe("cut -d , -f 2,6,7 stationID/313368.csv") 22 | , col.names = tmp1 23 | , colClasses = tmp0 24 | ) 25 | 26 | pems_2 = read.csv( 27 | pipe("cut -d , -f 2,6,7 stationID/313369.csv") 28 | , col.names = tmp1 29 | , colClasses = tmp0 30 | ) 31 | 32 | pems_1 = pems_1[, tmp1] 33 | pems_2 = pems_2[, tmp1] 34 | 35 | tmp2_1 = pems_1[, "station"] 36 | tmp2_2 = pems_2[, "station"] 37 | 38 | pems2_1 = split(pems_1, tmp2_1) 39 | pems2_2 = split(pems_2, tmp2_2) 40 | 41 | # treat lapply as vectorized 42 | results_1 = lapply(pems2_1, npbin) 43 | results_2 = lapply(pems2_2, npbin) 44 | 45 | results = c(results_1, results_2) 46 | 47 | # Unmodified after this point 48 | results = do.call(rbind, results) 49 | 50 | write.csv(results, "results.csv") 51 | -------------------------------------------------------------------------------- /inst/pems/transform_code.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | d = dataFiles(dir = "stationID" 3 | , varname = "pems" 4 | , delimiter = "," 5 | , splitColumn = "station" 6 | , header = FALSE 7 | , columns = c(timeperiod = "character", station = "integer" 8 | , flow1 = "integer", occupancy1 = "numeric", speed1 = "numeric" 9 | , flow2 = "integer", occupancy2 = "numeric", speed2 = "numeric" 10 | , flow3 = "integer", occupancy3 = "numeric", speed3 = "numeric" 11 | , flow4 = "integer", occupancy4 = "numeric", speed4 = "numeric" 12 | , flow5 = "integer", occupancy5 = "numeric", speed5 = "numeric" 13 | , flow6 = "integer", occupancy6 = "numeric", speed6 = "numeric" 14 | , flow7 = "integer", occupancy7 = "numeric", speed7 = "numeric" 15 | , flow8 = "integer", occupancy8 = "numeric", speed8 = "numeric" 16 | ) 17 | ) 18 | p = platform(OS.type = "unix", workers = 10L) 19 | 20 | # The named list for the data argument means that the symbol 'pems' in the code corresponds to the data in 'pems_ds'. 21 | # makeParallel("pems.R", data = list(pems = pems_ds), scheduler = scheduleTaskList, workers = 10L) 22 | 23 | # It's more convenient at the moment for me to use the varname in the object. 24 | out = makeParallel("pems.R", data = d, platform = p, scheduler = scheduleTaskList) 25 | 26 | # Could use a more convenient way to extract this code 27 | tcode = schedule(out)@graph@code 28 | writeCode(tcode[-c(3,4)], "intermediate_transformed_code.R", overWrite = TRUE) 29 | -------------------------------------------------------------------------------- /inst/pems/transform_code2.R: -------------------------------------------------------------------------------- 1 | # Thu Sep 12 10:31:46 PDT 2019 2 | # 3 | # Experimenting to see what makeParallel can currently do with the PEMS example 4 | # 5 | # To actually run we need: 6 | # 7 | # 1. To only read the necessary columns 8 | 9 | 10 | library(makeParallel) 11 | 12 | nWorkers = 10L 13 | files = list.files("/scratch/clarkf/pems/district4" 14 | # , pattern = "d04_text_station_raw_2016_08_2." 
15 | , full.names = TRUE 16 | ) 17 | 18 | #files = list.files("data" 19 | # , full.names = TRUE) 20 | 21 | columns = c(timeperiod = "character", station = "integer" 22 | , flow1 = "integer", occupancy1 = "numeric", speed1 = "numeric" 23 | , flow2 = "integer", occupancy2 = "numeric", speed2 = "numeric" 24 | , flow3 = "integer", occupancy3 = "numeric", speed3 = "numeric" 25 | , flow4 = "integer", occupancy4 = "numeric", speed4 = "numeric" 26 | , flow5 = "integer", occupancy5 = "numeric", speed5 = "numeric" 27 | , flow6 = "integer", occupancy6 = "numeric", speed6 = "numeric" 28 | , flow7 = "integer", occupancy7 = "numeric", speed7 = "numeric" 29 | , flow8 = "integer", occupancy8 = "numeric", speed8 = "numeric" 30 | ) 31 | 32 | pems_data = DataFrameFiles(varName = "pems" 33 | , files = files 34 | , sizes = file.info(files)$size 35 | , readFuncName = "read.csv" 36 | , col.names = names(columns) 37 | , colClasses = unname(columns) 38 | , header = FALSE 39 | ) 40 | 41 | 42 | outFile = "gen2.R" 43 | 44 | out = makeParallel("pems.R" 45 | , data = pems_data 46 | , scheduler = scheduleDataParallel 47 | , platform = parallelLocalCluster(nWorkers = nWorkers) 48 | , chunkFuncs = c("[", "lapply") 49 | , outFile = outFile 50 | , overWrite = TRUE 51 | ) 52 | 53 | s = schedule(out) 54 | -------------------------------------------------------------------------------- /inst/pems/transform_subset.R: -------------------------------------------------------------------------------- 1 | # Thu Sep 12 10:31:46 PDT 2019 2 | # 3 | # Experimenting to see what makeParallel can currently do with the PEMS example 4 | # 5 | # To actually run we need: 6 | # 7 | # 1. To only read the necessary columns 8 | 9 | 10 | library(makeParallel) 11 | 12 | nWorkers = 10L 13 | outFile = "gen_subset.R" 14 | nfiles = 50L 15 | 16 | 17 | files = list.files("/scratch/clarkf/pems/district4" 18 | , full.names = TRUE 19 | ) 20 | 21 | # Adding this in an attempt to avoid memory problems. 
22 | files = files[seq(nfiles)] 23 | 24 | 25 | #files = list.files("data" 26 | # , full.names = TRUE) 27 | 28 | columns = c(timeperiod = "character", station = "integer" 29 | , flow1 = "integer", occupancy1 = "numeric", speed1 = "numeric" 30 | , flow2 = "integer", occupancy2 = "numeric", speed2 = "numeric" 31 | , flow3 = "integer", occupancy3 = "numeric", speed3 = "numeric" 32 | , flow4 = "integer", occupancy4 = "numeric", speed4 = "numeric" 33 | , flow5 = "integer", occupancy5 = "numeric", speed5 = "numeric" 34 | , flow6 = "integer", occupancy6 = "numeric", speed6 = "numeric" 35 | , flow7 = "integer", occupancy7 = "numeric", speed7 = "numeric" 36 | , flow8 = "integer", occupancy8 = "numeric", speed8 = "numeric" 37 | ) 38 | 39 | pems_data = DataFrameFiles(varName = "pems" 40 | , files = files 41 | , sizes = file.info(files)$size 42 | , readFuncName = "read.csv" 43 | , col.names = names(columns) 44 | , colClasses = unname(columns) 45 | , header = FALSE 46 | ) 47 | 48 | 49 | 50 | out = makeParallel("pems.R" 51 | , data = pems_data 52 | , scheduler = scheduleDataParallel 53 | , platform = parallelLocalCluster(nWorkers = nWorkers) 54 | , chunkFuncs = c("[", "lapply") 55 | , outFile = outFile 56 | , overWrite = TRUE 57 | ) 58 | 59 | s = schedule(out) 60 | -------------------------------------------------------------------------------- /inst/templates/snow_manager.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # {{{gen_time}}} 4 | # Automatically generated from R by autoparallel version {{{version}}} 5 | 6 | library(parallel) 7 | 8 | nworkers = {{{nworkers}}} 9 | timeout = {{{timeout}}} 10 | 11 | cls = makeCluster(nworkers, "PSOCK") 12 | 13 | # Each worker updates a copy of this object. On worker j workers[[i]] will 14 | # contain an open socket connection between workers j and i. 15 | workers = vector(nworkers, mode = "list") 16 | 17 | close.NULL = function(...) NULL 18 | 19 | 20 | #' Connect workers as peers 21 | connect = function(server, client, port, timeout, sleep = 0.1, ...) 22 | { 23 | if(ID == server){ 24 | con = socketConnection(port = port, server = TRUE 25 | , blocking = TRUE, open = "a+b", timeout = timeout, ...) 26 | workers[[client]] <<- con 27 | } 28 | if(ID == client){ 29 | Sys.sleep(sleep) 30 | con = socketConnection(port = port, server = FALSE 31 | , blocking = TRUE, open = "a+b", timeout = timeout, ...) 32 | workers[[server]] <<- con 33 | } 34 | NULL 35 | } 36 | 37 | # Setting environment so that <<- in `connect` works correctly and to avoid 38 | # transferring potentially large amounts of data in case the user evaluates 39 | # this code from within an environment, ie. a function. 40 | environment(connect) = environment(close.NULL) = .GlobalEnv 41 | 42 | clusterExport(cls, c("workers", "connect", "close.NULL"), envir = environment()) 43 | 44 | # Each worker has an ID 45 | clusterMap(cls, assign, "ID", seq(nworkers) 46 | , MoreArgs = list(envir = .GlobalEnv)) 47 | 48 | # Define the peer to peer connections 49 | socket_map = read.csv(text = ' 50 | {{{socket_map_csv}}} 51 | ') 52 | 53 | # Open the connections 54 | by(socket_map, seq(nrow(socket_map)), function(x){ 55 | clusterCall(cls, connect, x$server, x$client, x$port, timeout = timeout) 56 | }) 57 | 58 | worker_code = {{{worker_code}}} 59 | 60 | evalg = function(codestring) 61 | { 62 | code = parse(text = codestring) 63 | eval(code, .GlobalEnv) 64 | NULL 65 | } 66 | 67 | # Action! 
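# parLapply hands one code string to each worker (the ID check inside each
# generated string guards against a mismatched assignment), and evalg runs it
# in that worker's global environment.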
68 | parLapply(cls, worker_code, evalg) 69 | 70 | # Close peer to peer connections 71 | clusterEvalQ(cls, lapply(workers, close)) 72 | 73 | stopCluster(cls) 74 | -------------------------------------------------------------------------------- /inst/templates/snow_notransfer.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # {{{gen_time}}} 4 | # Automatically generated from R by autoparallel version {{{version}}} 5 | 6 | library(parallel) 7 | 8 | nworkers = {{{nworkers}}} 9 | 10 | cls = makeCluster(nworkers, "PSOCK") 11 | 12 | # Each worker has an ID 13 | clusterMap(cls, assign, "ID", seq(nworkers) 14 | , MoreArgs = list(envir = .GlobalEnv)) 15 | 16 | worker_code = {{{worker_code}}} 17 | 18 | evalg = function(codestring) 19 | { 20 | code = parse(text = codestring) 21 | eval(code, .GlobalEnv) 22 | NULL 23 | } 24 | 25 | # Action! 26 | parLapply(cls, worker_code, evalg) 27 | 28 | stopCluster(cls) 29 | -------------------------------------------------------------------------------- /inst/templates/snow_worker.R: -------------------------------------------------------------------------------- 1 | if(ID != {{{processor}}}) 2 | stop(sprintf("Worker is attempting to execute wrong code. 3 | This code is for {{{processor}}}, but manager assigned ID %s", ID)) 4 | 5 | {{{code_body}}} 6 | -------------------------------------------------------------------------------- /inst/templates/udaf.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # {{{gen_time}}} 4 | # Automatically generated from R by autoparallel version {{{version}}} 5 | 6 | # These values are specific to the analysis 7 | verbose = {{{verbose}}} 8 | rows_per_chunk = {{{rows_per_chunk}}} 9 | cluster_by = {{{cluster_by}}} 10 | sep = {{{sep}}} 11 | input_cols = {{{input_cols}}} 12 | input_classes = {{{input_classes}}} 13 | try = {{{try}}} 14 | f = {{{f}}} 15 | 16 | 17 | # Other code that the user wanted to include, such as supporting functions 18 | # or variables: 19 | ############################################################ 20 | 21 | {{{include_script}}} 22 | 23 | # The remainder of the script is a generic template 24 | ############################################################ 25 | 26 | 27 | # Logging to stderr() writes to the Hadoop logs where we can find them. 
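# (Hive's TRANSFORM reads this script's stdout as the query's output rows,
# so stdout is reserved for data and diagnostics must go to stderr.)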
28 | msg = function(..., log = verbose) 29 | { 30 | if(log) writeLines(paste(...), stderr()) 31 | } 32 | 33 | 34 | multiple_groups = function(queue, g = cluster_by) length(unique(queue[, g])) > 1 35 | 36 | 37 | process_group = function(grp, outfile, .try = try) 38 | { 39 | msg("Processing group", grp[1, cluster_by]) 40 | 41 | if(.try) {try({ 42 | # TODO: log these failures 43 | out = f(grp) 44 | write.table(out, outfile, col.names = FALSE, row.names = FALSE, sep = sep) 45 | })} else { 46 | out = f(grp) 47 | write.table(out, outfile, col.names = FALSE, row.names = FALSE, sep = sep) 48 | } 49 | } 50 | 51 | 52 | msg("BEGIN R SCRIPT") 53 | ############################################################ 54 | 55 | stream_in = file("stdin") 56 | open(stream_in) 57 | stream_out = stdout() 58 | 59 | # Initialize the queue 60 | # TODO: parameterize Hive's na.strings 61 | queue = read.table(stream_in, nrows = rows_per_chunk, colClasses = input_classes 62 | , col.names = input_cols, na.strings = "\\N") 63 | 64 | while(TRUE) { 65 | while(multiple_groups(queue)) { 66 | # Pop the first group out of the queue 67 | nextgrp = queue[, cluster_by] == queue[1, cluster_by] 68 | working = queue[nextgrp, ] 69 | queue = queue[!nextgrp, ] 70 | 71 | process_group(working, stream_out) 72 | } 73 | 74 | # Fill up the queue 75 | nextqueue = read.table(stream_in, nrows = rows_per_chunk 76 | , colClasses = input_classes, col.names = input_cols, na.strings = "\\N") 77 | if(nrow(nextqueue) == 0) { 78 | msg("Last group") 79 | try(process_group(queue, stream_out)) 80 | break 81 | } 82 | queue = rbind(queue, nextqueue) 83 | } 84 | 85 | msg("END R SCRIPT") 86 | -------------------------------------------------------------------------------- /inst/templates/udaf.sql: -------------------------------------------------------------------------------- 1 | -- {{{gen_time}}} 2 | -- Automatically generated from R by autoparallel version {{{autoparallel_version}}} 3 | 4 | add FILE {{{udaf_dot_R}}} 5 | ; 6 | 7 | {{{#overwrite_table}}} 8 | DROP TABLE {{{output_table}}} 9 | ; 10 | 11 | CREATE TABLE {{{output_table}}} ( 12 | {{{#output_table_definition}}}{{{^first}}} , {{{/first}}}{{{ddl}}} 13 | {{{/output_table_definition}}}) 14 | ROW FORMAT DELIMITED 15 | FIELDS TERMINATED BY {{{sep}}} 16 | ; 17 | 18 | INSERT OVERWRITE TABLE {{{output_table}}} {{{/overwrite_table}}} 19 | SELECT 20 | TRANSFORM ({{{input_cols}}}) 21 | USING "Rscript {{{udaf_dot_R}}}" 22 | AS ( 23 | {{{output_cols}}} 24 | ) 25 | FROM ( 26 | SELECT {{{input_cols}}} 27 | FROM {{{input_table}}} 28 | CLUSTER BY {{{cluster_by}}} 29 | ) AS {{{tmptable}}} 30 | ; 31 | -------------------------------------------------------------------------------- /inst/templates/vector.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Using nonsyntactic names with backticks should avoid most name collisions. 3 | # To be really safe we could test for name collisions, and then modify them, but I'll wait until it becomes an issue. 
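#
# For orientation, a hypothetical substitution for a small CSV workflow,
# chosen here purely for illustration and not fixed by the template itself:
#   `_NWORKERS`           -> 2L
#   `_ASSIGNMENT_INDICES` -> c(1L, 2L)
#   `_READ_ARGS`          -> list("d1.csv", "d2.csv")
#   `_READ_FUNC`          -> read.csv
#   `_COMBINE_FUNC`       -> rbind
#   `_DATA_VARNAME`       -> pems
# The code generator supplies the real values.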
4 | 5 | message(`_MESSAGE`) 6 | 7 | library(parallel) 8 | 9 | nworkers = `_NWORKERS` 10 | assignments = `_ASSIGNMENT_INDICES` 11 | read_args = `_READ_ARGS` 12 | 13 | cls = makeCluster(nworkers) 14 | 15 | clusterExport(cls, c("assignments", "read_args")) 16 | parLapply(cls, seq(nworkers), function(i) assign("workerID", i, globalenv())) 17 | 18 | collected = clusterEvalQ(cls, { 19 | assignments = which(assignments == workerID) 20 | read_args = read_args[assignments] 21 | chunks = lapply(read_args, `_READ_FUNC`) 22 | # TODO: Generalize this to other combining functions besides c, rbind for data.frame 23 | # For this we need to know if value is a data.frame 24 | `_DATA_VARNAME` = do.call(`_COMBINE_FUNC`, chunks) 25 | 26 | `_VECTOR_BODY` 27 | 28 | `_OBJECTS_RECEIVE_FROM_WORKERS` 29 | }) 30 | 31 | # Unpack and assemble the objects 32 | vars_to_collect = names(collected[[1]]) 33 | for(i in seq_along(vars_to_collect)){ 34 | varname = vars_to_collect[i] 35 | chunks = lapply(collected, `[[`, i) 36 | # TODO: This assumes the same _COMBINE_FUNC will work, which is not necessarily true. 37 | value = do.call(`_COMBINE_FUNC`, chunks) 38 | assign(varname, value) 39 | } 40 | 41 | stopCluster(cls) 42 | 43 | `_REMAINDER` 44 | -------------------------------------------------------------------------------- /inst/use_cases/lemmatize/Lemmatizer_parrallel_BETA.R: -------------------------------------------------------------------------------- 1 | library(parallel) 2 | library(doParallel) 3 | # options(warn = -1) 4 | 5 | # Start up a parallel cluster 6 | parallelCluster <- makeCluster(25) 7 | print(parallelCluster) 8 | registerDoParallel(parallelCluster) 9 | 10 | clusterEvalQ(cl = parallelCluster, { 11 | library(koRpus) 12 | 13 | loc = "..." 14 | doc = read.csv(loc, header = TRUE, stringsAsFactors = FALSE) 15 | 16 | # The only things you need to modify 17 | LemmatizerSourceDir = 'C:/TreeTagger/' # Where the lemmatizer source files live 18 | 19 | # Set the koRpus environment 20 | set.kRp.env(TT.cmd = "manual", 21 | lang = 'en', 22 | preset = 'en', 23 | treetagger = 'manual', 24 | format = 'obj', 25 | TT.tknz = TRUE, 26 | encoding = 'UTF-8', 27 | TT.options = list(path = LemmatizerSourceDir, 28 | preset = 'en')) 29 | 30 | # This function will take in a character vector and output a data frame 31 | lemmatize = function(txt){ 32 | tagged.words = treetag(txt, 33 | format = "obj", 34 | treetagger ='manual', 35 | lang = 'en', 36 | TT.options = list(path = paste0(LemmatizerSourceDir), 37 | preset = 'en')) 38 | results = tagged.words@TT.res 39 | return(results) 40 | } 41 | }) 42 | #### Function #### 43 | 44 | GSRLem = function(text.col){ 45 | lemdflist = lapply(X = text.col, function(x) lemmatize(x)) 46 | rcv = vector() 47 | for(i in 1:length(lemdflist)){ 48 | activedf = lemdflist[[i]] 49 | activedf$lemma = as.character(activedf$lemma) 50 | activedf[which(activedf$lemma == ""), "lemma"] = activedf[which(activedf$lemma == ""), "token"] 51 | coltext = paste(activedf$lemma, collapse = " ") 52 | rcv = c(rcv, coltext) 53 | print(paste("#", i, " of ", length(lemdflist), " done!")) 54 | } 55 | return(rcv) 56 | } 57 | 58 | 59 | 60 | lemed = GSRLem(doc[ , "paragraph"]) 61 | 62 | stopImplicitCluster() 63 | stopCluster(parallelCluster) 64 | rm(parallelCluster) 65 | 66 | # If this gives you the following warning: 67 | # Warning message: 68 | # Can't find the lexicon file, hence omitted! Please ensure this path is valid: 69 | # ~/Desktop/test/lib/english-lexicon.txt 70 | # Do not worry about it. 
From the documentation for the treetag() function: 71 | # you can omit all the following elements, because they will be filled with defaults. Of course this only makes sense if you have a working default installation. 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /inst/use_cases/lemmatize/README.txt: -------------------------------------------------------------------------------- 1 | Mon Oct 9 13:56:53 PDT 2017 2 | 3 | Code thanks to Jared in the DSI 4 | -------------------------------------------------------------------------------- /inst/use_cases/lemmatize/goodnightmoon.csv: -------------------------------------------------------------------------------- 1 | paragraph 2 | In the great green room there was a telephone 3 | and a red balloon 4 | and a picture of the cow jumping over the moon 5 | -------------------------------------------------------------------------------- /man/Assignment-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{Assignment-class} 5 | \alias{Assignment-class} 6 | \alias{Assignment} 7 | \title{Assignment} 8 | \description{ 9 | Assignment 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{lhs}}{name of the variable to be assigned} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/AssignmentOneVectorFunction-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{AssignmentOneVectorFunction-class} 5 | \alias{AssignmentOneVectorFunction-class} 6 | \alias{AssignmentOneVectorFunction} 7 | \title{Assignment From Single Vectorized Function} 8 | \description{ 9 | Assignment From Single Vectorized Function 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{functionName}}{name of the function that's called} 15 | 16 | \item{\code{args}}{arguments to the function} 17 | }} 18 | 19 | -------------------------------------------------------------------------------- /man/ChunkDataFiles-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{ChunkDataFiles-class} 5 | \alias{ChunkDataFiles-class} 6 | \title{One or More Files Representing One Object} 7 | \description{ 8 | One or More Files Representing One Object 9 | } 10 | -------------------------------------------------------------------------------- /man/ChunkDataFiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataSource.R 3 | \name{ChunkDataFiles} 4 | \alias{ChunkDataFiles} 5 | \title{Constructor for ChunkDataFiles} 6 | \usage{ 7 | ChunkDataFiles(files, sizes = file.info(files)$size, 8 | readFuncName = inferReadFuncFromFile(files[1]), ...) 
9 | } 10 | \description{ 11 | Constructor for ChunkDataFiles 12 | } 13 | -------------------------------------------------------------------------------- /man/CodeBlock-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{CodeBlock-class} 5 | \alias{CodeBlock-class} 6 | \alias{CodeBlock} 7 | \title{Abstract base class for blocks comprising a DataParallelSchedule} 8 | \description{ 9 | These are NOT basic blocks in the sense of compilers, because they may contain control flow. 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{code}}{to evaluate in serial on the manager.} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/DataLoadBlock-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{DataLoadBlock-class} 5 | \alias{DataLoadBlock-class} 6 | \alias{DataLoadBlock} 7 | \title{Load Data} 8 | \description{ 9 | Load Data 10 | } 11 | -------------------------------------------------------------------------------- /man/DataSource-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{DataSource-class} 5 | \alias{DataSource-class} 6 | \alias{DataSource} 7 | \title{Abstract Base Class For Data Descriptions} 8 | \description{ 9 | Abstract Base Class For Data Descriptions 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{varName}}{name of the variable in the code} 15 | 16 | \item{\code{uniqueValueBound}}{upper bound for number of distinct values. 17 | TODO: Define what this means for tables and vectors.} 18 | }} 19 | 20 | -------------------------------------------------------------------------------- /man/FinalBlock-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{FinalBlock-class} 5 | \alias{FinalBlock-class} 6 | \alias{FinalBlock} 7 | \title{Finalize, shut down the platform, free the resources, because everything is done.} 8 | \description{ 9 | Finalize, shut down the platform, free the resources, because everything is done. 
10 | } 11 | -------------------------------------------------------------------------------- /man/FixedWidthFiles-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{FixedWidthFiles-class} 5 | \alias{FixedWidthFiles-class} 6 | \alias{FixedWidthFiles} 7 | \title{A Collection Of One Or More Fixed Width Files} 8 | \description{ 9 | A Collection Of One Or More Fixed Width Files 10 | } 11 | -------------------------------------------------------------------------------- /man/ForkSchedule-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{ForkSchedule-class} 5 | \alias{ForkSchedule-class} 6 | \alias{ForkSchedule} 7 | \title{Fork based parallel schedule} 8 | \description{ 9 | Class for schedules that should be parallelized by forks from one single 10 | process 11 | } 12 | \section{Slots}{ 13 | 14 | \describe{ 15 | \item{\code{sequence}}{vector of statement indices} 16 | }} 17 | 18 | -------------------------------------------------------------------------------- /man/GeneratedCode-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{GeneratedCode-class} 5 | \alias{GeneratedCode-class} 6 | \title{Generated code ready to write} 7 | \description{ 8 | This class contains code that is ready to run and execute, as well as 9 | the steps taken to generate this code. 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{schedule}}{contains all information to generate code} 15 | 16 | \item{\code{code}}{executable R code} 17 | 18 | \item{\code{file}}{name of a file where code will be written} 19 | }} 20 | 21 | -------------------------------------------------------------------------------- /man/InitBlock-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{InitBlock-class} 5 | \alias{InitBlock-class} 6 | \alias{InitBlock} 7 | \title{Initialize the platform} 8 | \description{ 9 | Initialize the platform 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{funcNames}}{list of functions defined in slot code to make available for the remainder of the program.} 15 | 16 | \item{\code{assignmentIndices}}{assigns each data chunk to a worker. 
For example, c(2, 1, 1) assigns the 1st chunk to worker 2, and chunks 2 and 3 to worker 1.} 17 | }} 18 | 19 | -------------------------------------------------------------------------------- /man/KnownAssignment-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{KnownAssignment-class} 5 | \alias{KnownAssignment-class} 6 | \alias{KnownAssignment} 7 | \title{Simple Statement With Known Value} 8 | \description{ 9 | Simple Statement With Known Value 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{value}}{the value that the lhs will be bound to} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/MapSchedule-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{MapSchedule-class} 5 | \alias{MapSchedule-class} 6 | \alias{MapSchedule} 7 | \title{Data parallel schedule} 8 | \description{ 9 | Class for schedules that should be parallelized with apply style parallelism 10 | } 11 | -------------------------------------------------------------------------------- /man/MeasuredTaskGraph-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{MeasuredTaskGraph-class} 5 | \alias{MeasuredTaskGraph-class} 6 | \alias{MeasuredTaskGraph} 7 | \title{Graph where the size of each variable that can be transferred is known} 8 | \description{ 9 | Graph where the size of each variable that can be transferred is known 10 | } 11 | -------------------------------------------------------------------------------- /man/NoDataSource-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{NoDataSource-class} 5 | \alias{NoDataSource-class} 6 | \alias{NoDataSource} 7 | \title{Data Unspecified} 8 | \description{ 9 | Data Unspecified 10 | } 11 | -------------------------------------------------------------------------------- /man/ParallelBlock-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{ParallelBlock-class} 5 | \alias{ParallelBlock-class} 6 | \alias{ParallelBlock} 7 | \title{Code to run in parallel} 8 | \description{ 9 | Code to run in parallel 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{export}}{names of objects to export from manager to workers.} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/ParallelLocalCluster-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{ParallelLocalCluster-class} 5 | \alias{ParallelLocalCluster-class} 6 | \alias{ParallelLocalCluster} 7 | \title{Placeholder for local \code{cluster} objects from the parallel package, for example, those produced by 
\code{parallel::makeCluster}.} 8 | \description{ 9 | Placeholder for local \code{cluster} objects from the parallel package, for example, those produced by \code{parallel::makeCluster}. 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{name}}{symbol to use for the cluster name when generating code} 15 | 16 | \item{\code{scratchDir}}{place to write intermediate data files} 17 | }} 18 | 19 | -------------------------------------------------------------------------------- /man/Platform.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R 3 | \name{platform} 4 | \alias{platform} 5 | \title{extract platform} 6 | \usage{ 7 | platform(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{object with a platform} 11 | 12 | \item{...}{additional arguments to methods} 13 | } 14 | \value{ 15 | \linkS4class{Platform} object 16 | } 17 | \description{ 18 | extract platform 19 | } 20 | -------------------------------------------------------------------------------- /man/ReduceBlock-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{ReduceBlock-class} 5 | \alias{ReduceBlock-class} 6 | \alias{ReduceBlock} 7 | \title{Reduce in parallel on the workers} 8 | \description{ 9 | Reduce in parallel on the workers 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{objectToReduce}}{name of the object to apply the reduce to} 15 | 16 | \item{\code{resultName}}{name of the object to save the result as} 17 | 18 | \item{\code{reduceFun}}{implementation of a reduce to use} 19 | }} 20 | 21 | -------------------------------------------------------------------------------- /man/ReduceFun-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{ReduceFun-class} 5 | \alias{ReduceFun-class} 6 | \alias{ReduceFun} 7 | \title{Abstract base class for reducible function implementations} 8 | \description{ 9 | Abstract base class for reducible function implementations 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{reduce}}{name of a reducible function} 15 | 16 | \item{\code{predicate}}{function that takes in a resource and returns TRUE if this particular resource can be reduced using this ReduceFun, and FALSE otherwise. 17 | TODO: Define resource and make it more user accessible if users are expected to compute on it.} 18 | }} 19 | 20 | -------------------------------------------------------------------------------- /man/Schedule-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{Schedule-class} 5 | \alias{Schedule-class} 6 | \alias{Schedule} 7 | \title{Schedule base class} 8 | \description{ 9 | Subclasses of schedule contain an abstract plan to run the code in 10 | parallel using various models. 
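A schedule is typically created from a \linkS4class{TaskGraph} by the \code{\link{schedule}} generic and turned into runnable code by \code{\link{generate}}.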
11 | } 12 | \section{Slots}{ 13 | 14 | \describe{ 15 | \item{\code{graph}}{\linkS4class{TaskGraph} used to create the schedule} 16 | 17 | \item{\code{data}}{\linkS4class{DataSource} used to create the schedule} 18 | }} 19 | 20 | -------------------------------------------------------------------------------- /man/SerialBlock-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{SerialBlock-class} 5 | \alias{SerialBlock-class} 6 | \alias{SerialBlock} 7 | \title{Code to run in serial} 8 | \description{ 9 | Code to run in serial 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{collect}}{names of objects to collect from the workers to the manager.} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/SerialSchedule-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{SerialSchedule-class} 5 | \alias{SerialSchedule-class} 6 | \alias{SerialSchedule} 7 | \title{Schedule that contains no parallelism at all} 8 | \description{ 9 | Schedule that contains no parallelism at all 10 | } 11 | -------------------------------------------------------------------------------- /man/SimpleReduce-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{SimpleReduce-class} 5 | \alias{SimpleReduce-class} 6 | \alias{SimpleReduce} 7 | \title{Implementation for a reducible function using function names only} 8 | \description{ 9 | This assumes that all of the summary, combine and query functions are defined and available in the R environment where it will run. 10 | See \linkS4class{UserDefinedReduce} to define and use your own functions. 11 | } 12 | \section{Slots}{ 13 | 14 | \describe{ 15 | \item{\code{summary}}{name of a function that each worker will call on their chunk of the data. 16 | This produces an intermediate result.} 17 | 18 | \item{\code{combine}}{name of a function to combine many intermediate results into a single intermediate result} 19 | 20 | \item{\code{query}}{name of a function to produce the actual final result from an intermediate result} 21 | }} 22 | 23 | -------------------------------------------------------------------------------- /man/SplitBlock-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{SplitBlock-class} 5 | \alias{SplitBlock-class} 6 | \alias{SplitBlock} 7 | \title{Split one chunked object using another as a factor} 8 | \description{ 9 | GROUP BY style code becomes a split followed by an lapply, and both are parallel blocks. 10 | The semantic meaning of this in a schedule is that the data will be grouped, ready for an lapply on the groups. 11 | } 12 | \section{Slots}{ 13 | 14 | \describe{ 15 | \item{\code{groupData}}{names of chunked variables to split according to groupIndex} 16 | 17 | \item{\code{groupIndex}}{names of chunked variables that define the split} 18 | 19 | \item{\code{lhs}}{name of the chunked variable that holds the result of the split.
20 | This doesn't necessarily need to be here, but we use it to generate code.} 21 | }} 22 | 23 | -------------------------------------------------------------------------------- /man/Statement-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{Statement-class} 5 | \alias{Statement-class} 6 | \alias{Statement} 7 | \title{Single Top Level Statement} 8 | \description{ 9 | Scripts consist of many such statements. 10 | This class is necessary to help out with method dispatch in \code{expandData}. 11 | We would use expression, but there's already a method for that. 12 | } 13 | \section{Slots}{ 14 | 15 | \describe{ 16 | \item{\code{statement}}{language object that is the statement} 17 | }} 18 | 19 | -------------------------------------------------------------------------------- /man/TaskGraph-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{TaskGraph-class} 5 | \alias{TaskGraph-class} 6 | \alias{TaskGraph} 7 | \title{Dependency graph between expressions} 8 | \description{ 9 | Subclasses of this class contain all the information that we know about 10 | the code and the problem, such as the time to run each expression and 11 | the variable sizes. 12 | } 13 | \section{Slots}{ 14 | 15 | \describe{ 16 | \item{\code{code}}{input code} 17 | 18 | \item{\code{graph}}{data frame representing the graph with indices corresponding 19 | to code} 20 | }} 21 | 22 | -------------------------------------------------------------------------------- /man/TaskSchedule-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{TaskSchedule-class} 5 | \alias{TaskSchedule-class} 6 | \alias{TaskSchedule} 7 | \title{Task Parallel Schedule} 8 | \description{ 9 | Task Parallel Schedule 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{transfer}}{transfer variables between processes} 15 | 16 | \item{\code{evaluation}}{data.frame assigning expressions to processors} 17 | 18 | \item{\code{maxWorker}}{maximum number of processors, similar to \code{mc.cores} 19 | in the parallel package} 20 | 21 | \item{\code{overhead}}{minimum time in seconds to evaluate a single expression} 22 | 23 | \item{\code{bandwidth}}{network bandwidth in bytes per second} 24 | }} 25 | 26 | -------------------------------------------------------------------------------- /man/TextTableFiles-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{TextTableFiles-class} 5 | \alias{TextTableFiles-class} 6 | \alias{TextTableFiles} 7 | \title{One or More Files Representing One Data Frame} 8 | \description{ 9 | Slots correspond to arguments in read.table 10 | } 11 | -------------------------------------------------------------------------------- /man/TimedTaskGraph-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllClass.R 3 | \docType{class} 4 | \name{TimedTaskGraph-class} 
5 | \alias{TimedTaskGraph-class} 6 | \alias{TimedTaskGraph} 7 | \title{Graph where the run time for each expression is known} 8 | \description{ 9 | Graph where the run time for each expression is known 10 | } 11 | \section{Slots}{ 12 | 13 | \describe{ 14 | \item{\code{time}}{time in seconds to run each expression} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/XXX.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataParallel.R 3 | \name{XXX} 4 | \alias{XXX} 5 | \title{Estimate Time To Execute Function} 6 | \usage{ 7 | XXX(maxWorker, sizeInput, sizeOutput) 8 | } 9 | \arguments{ 10 | \item{maxWorker}{integer number of parallel workers to use} 11 | 12 | \item{sizeInput}{numeric size of each input element in bytes} 13 | 14 | \item{sizeOutput}{numeric size of each output element in bytes} 15 | } 16 | \value{ 17 | list with the following elements: 18 | \describe{ 19 | \item{serialTime}{Time in seconds to execute the function in serial} 20 | \item{parallelTime}{Time in seconds to execute the function in 21 | parallel} 22 | \item{elementsParallelFaster}{Number of data elements required for a 23 | parallel version with maxWorker workers to be faster than serial. Can 24 | be Inf if parallel will never be faster than serial.} 25 | } 26 | } 27 | \description{ 28 | Estimate Time To Execute Function 29 | } 30 | -------------------------------------------------------------------------------- /man/dataSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R 3 | \name{dataSource} 4 | \alias{dataSource} 5 | \title{Infer Or Extract Data Source Object} 6 | \usage{ 7 | dataSource(expr, ...) 8 | } 9 | \arguments{ 10 | \item{expr}{object to infer the data source from} 11 | 12 | \item{...}{additional arguments to methods} 13 | } 14 | \value{ 15 | \linkS4class{DataSource} object 16 | } 17 | \description{ 18 | Infer Or Extract Data Source Object 19 | } 20 | -------------------------------------------------------------------------------- /man/expandData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R 3 | \name{expandData} 4 | \alias{expandData} 5 | \title{Expand Data} 6 | \usage{ 7 | expandData(code, data, platform, ...) 8 | } 9 | \arguments{ 10 | \item{code}{file name or a string containing code to be parsed} 11 | 12 | \item{data}{list of data descriptions. 13 | Each element is a \linkS4class{DataSource}.
14 | The names of the list elements correspond to the variables in the code that these objects are bound to.} 15 | 16 | \item{platform}{\linkS4class{Platform} describing resource to compute on} 17 | 18 | \item{...}{additional arguments to schedule methods} 19 | } 20 | \description{ 21 | Updates the code to include statements that load the data 22 | } 23 | -------------------------------------------------------------------------------- /man/file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/writeCode.R 3 | \docType{methods} 4 | \name{file,TaskGraph-method} 5 | \alias{file,TaskGraph-method} 6 | \alias{file,Schedule-method} 7 | \alias{file,GeneratedCode-method} 8 | \title{Get File containing code} 9 | \usage{ 10 | \S4method{file}{TaskGraph}(description) 11 | 12 | \S4method{file}{Schedule}(description) 13 | 14 | \S4method{file}{GeneratedCode}(description) 15 | } 16 | \arguments{ 17 | \item{description}{object that may have a file associated with it} 18 | } 19 | \description{ 20 | Get File containing code 21 | } 22 | -------------------------------------------------------------------------------- /man/fileSetter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R, R/writeCode.R 3 | \docType{methods} 4 | \name{file<-} 5 | \alias{file<-} 6 | \alias{file<-,GeneratedCode,character-method} 7 | \title{Set File for generated code object} 8 | \usage{ 9 | file(description) <- value 10 | 11 | \S4method{file}{GeneratedCode,character}(description) <- value 12 | } 13 | \arguments{ 14 | \item{description}{\linkS4class{GeneratedCode}} 15 | 16 | \item{value}{file name to associate with object} 17 | } 18 | \description{ 19 | Set File for generated code object 20 | } 21 | -------------------------------------------------------------------------------- /man/findFirstDataSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataSource.R 3 | \name{findFirstDataSource} 4 | \alias{findFirstDataSource} 5 | \title{Find the first expression that we can infer a DataSource from} 6 | \usage{ 7 | findFirstDataSource(expr, ...) 8 | } 9 | \description{ 10 | Find the first expression that we can infer a DataSource from 11 | } 12 | -------------------------------------------------------------------------------- /man/forLoopToLapply.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/forLoop.R 3 | \name{forLoopToLapply} 4 | \alias{forLoopToLapply} 5 | \title{Transform For Loop To Lapply} 6 | \usage{ 7 | forLoopToLapply(forloop) 8 | } 9 | \arguments{ 10 | \item{forloop}{R language object with class \code{for}.} 11 | } 12 | \value{ 13 | call R call to \code{parallel::mclapply} if successful, 14 | otherwise the original forloop. 15 | } 16 | \description{ 17 | Determine if a for loop can be parallelized, and if so transform it into 18 | a call to \code{lapply}. This first version will modify loops if and 19 | only if the body of the loop does not do any assignments at all. 20 | } 21 | \details{ 22 | Recommended use case: 23 | 24 | The functions in the body of the loop write to different files on each 25 | loop iteration.
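For example, a loop such as \code{for(f in filenames) writeLines(result(f), f)} does no assignment in its body and qualifies; here \code{result} stands for any user-supplied function.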
26 | 27 | The generated code WILL FAIL if: 28 | \itemize{ 29 | \item Code in the body of the loop is truly iterative. 30 | \item Functions update global state in any way other than direct assignment. 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/generate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R, R/MapSchedule.R, R/generateTask.R 3 | \docType{methods} 4 | \name{generate} 5 | \alias{generate} 6 | \alias{generate,SerialSchedule,ANY,ANY-method} 7 | \alias{generate,MapSchedule,ANY,ANY-method} 8 | \alias{generate,TaskSchedule,ANY,ANY-method} 9 | \title{Generate Code} 10 | \usage{ 11 | generate(schedule, platform, data, ...) 12 | 13 | \S4method{generate}{SerialSchedule,ANY,ANY}(schedule, platform, data, ...) 14 | 15 | \S4method{generate}{MapSchedule,ANY,ANY}(schedule, platform, data, ...) 16 | 17 | \S4method{generate}{TaskSchedule,ANY,ANY}(schedule, platform, 18 | portStart = 33000L, minTimeout = 600) 19 | } 20 | \arguments{ 21 | \item{schedule}{object to generate code from, typically an object of class \linkS4class{Schedule}} 22 | 23 | \item{platform}{object of class \linkS4class{Platform}} 24 | 25 | \item{data}{object of class \linkS4class{DataSource}} 26 | 27 | \item{...}{additional arguments to methods} 28 | 29 | \item{portStart}{first local port to use; can possibly use up to n * (n - 30 | 1) / 2 subsequent ports if every pair of n workers must communicate.} 31 | 32 | \item{minTimeout}{timeout for socket connection will be at least this 33 | many seconds.} 34 | } 35 | \value{ 36 | x object of class \linkS4class{GeneratedCode} 37 | } 38 | \description{ 39 | Produces executable code that relies on a SNOW cluster on a single 40 | machine and sockets. 41 | } 42 | \seealso{ 43 | \code{\link{schedule}} generic function to create 44 | \linkS4class{Schedule}, \code{\link{writeCode}} to write and extract the 45 | actual code, and 46 | \code{\link{makeParallel}} to do everything all at once. 47 | } 48 | -------------------------------------------------------------------------------- /man/inferGraph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R, R/inferGraph.R 3 | \docType{methods} 4 | \name{inferGraph} 5 | \alias{inferGraph} 6 | \alias{inferGraph,character,missing-method} 7 | \alias{inferGraph,language,missing-method} 8 | \alias{inferGraph,expression,missing-method} 9 | \alias{inferGraph,ANY,numeric-method} 10 | \title{Infer Task Dependency Graph} 11 | \usage{ 12 | inferGraph(code, time, ...) 13 | 14 | \S4method{inferGraph}{character,missing}(code, time, ...) 15 | 16 | \S4method{inferGraph}{language,missing}(code, time, ...) 17 | 18 | \S4method{inferGraph}{expression,missing}(code, time, ...) 19 | 20 | \S4method{inferGraph}{ANY,numeric}(code, time, ...)
21 | } 22 | \arguments{ 23 | \item{code}{the file path to a script or an object that can be coerced 24 | to an expression.} 25 | 26 | \item{time}{time in seconds to run each expression} 27 | 28 | \item{...}{additional arguments to methods} 29 | } 30 | \value{ 31 | object of class \linkS4class{TaskGraph} 32 | } 33 | \description{ 34 | Statically analyze code to determine implicit dependencies 35 | } 36 | \examples{ 37 | g <- inferGraph(parse(text = " 38 | a <- 1 39 | b <- 2 40 | c <- a + b 41 | d <- b * c 42 | ")) 43 | 44 | ig <- as(g, "igraph") 45 | plot(ig) 46 | 47 | # To specify the time each expression takes: 48 | g2 <- inferGraph(g@code, time = c(1.1, 2, 0.5, 6)) 49 | } 50 | -------------------------------------------------------------------------------- /man/inferReadFuncFromFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataSource.R 3 | \name{inferReadFuncFromFile} 4 | \alias{inferReadFuncFromFile} 5 | \title{Attempt to infer the type of a file.} 6 | \usage{ 7 | inferReadFuncFromFile(fname) 8 | } 9 | \description{ 10 | Attempt to infer the type of a file. 11 | } 12 | -------------------------------------------------------------------------------- /man/makeParallel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/makeParallel.R 3 | \name{makeParallel} 4 | \alias{makeParallel} 5 | \title{Make Parallel Code From Serial} 6 | \usage{ 7 | makeParallel(code, isFile = file.exists(code), expr = if (isFile) 8 | parse(code, keep.source = TRUE) else parse(text = code, keep.source = 9 | FALSE), data = NULL, nWorkers = parallel::detectCores(), 10 | platform = Platform(nWorkers = nWorkers), run = FALSE, 11 | scheduler = schedule, ..., generator = generate, 12 | generatorArgs = list(), outFile = FALSE, prefix = "gen_", 13 | overWrite = FALSE) 14 | } 15 | \arguments{ 16 | \item{code}{file name or a string containing code to be parsed} 17 | 18 | \item{isFile}{logical, is the code a file name?} 19 | 20 | \item{expr}{expression, for example from \code{\link[base]{parse}}} 21 | 22 | \item{data}{list of data descriptions. 23 | Each element is a \linkS4class{DataSource}. 24 | The names of the list elements correspond to the variables in the code that these objects are bound to.} 25 | 26 | \item{nWorkers}{integer, number of parallel workers} 27 | 28 | \item{platform}{\linkS4class{Platform} describing resource to compute on} 29 | 30 | \item{run}{logical, evaluate the code once to gather timings?} 31 | 32 | \item{scheduler}{function to produce a \linkS4class{Schedule} 33 | from a \linkS4class{TaskGraph}.} 34 | 35 | \item{...}{additional arguments to schedule methods} 36 | 37 | \item{generator}{function to produce \linkS4class{GeneratedCode} from a \linkS4class{Schedule}} 38 | 39 | \item{generatorArgs}{list of named arguments to use with 40 | \code{generator}} 41 | 42 | \item{outFile}{character name of the file to write the generated script. 43 | If FALSE then don't write anything to disk.
44 | If TRUE and code comes from a file then use \code{prefix} to make a new 45 | name and write a script.} 46 | 47 | \item{prefix}{character added to front of file name} 48 | 49 | \item{overWrite}{logical write over existing generated file} 50 | } 51 | \value{ 52 | code object of class \linkS4class{GeneratedCode} 53 | } 54 | \description{ 55 | \code{makeParallel} is a high level function that performs all the steps 56 | to generate parallel code. 57 | } 58 | \details{ 59 | The following are the high level steps: 60 | 61 | \enumerate{ 62 | \item Infer the task graph 63 | \item Schedule the statements 64 | \item Generate parallel code 65 | } 66 | 67 | The arguments allow the user to control every aspect of this process. 68 | For more details see \code{vignette("makeParallel-concepts")}. 69 | } 70 | \examples{ 71 | # Make an existing R script parallel 72 | script <- system.file("examples/mp_example.R", package = "makeParallel") 73 | makeParallel(script) 74 | 75 | # Write generated code to a new file 76 | newfile <- tempfile() 77 | makeParallel(script, outFile = newfile) 78 | 79 | # Clean up 80 | unlink(newfile) 81 | 82 | # Pass in code directly 83 | d <- makeParallel(expr = parse(text = "lapply(mtcars, mean)")) 84 | 85 | # Examine generated code 86 | writeCode(d) 87 | 88 | # Specify a different scheduler 89 | pcode <- makeParallel("x <- 1:100 90 | y <- rep(1, 100) 91 | z <- x + y", scheduler = scheduleTaskList) 92 | 93 | # Some schedules have plotting methods 94 | plot(schedule(pcode)) 95 | } 96 | -------------------------------------------------------------------------------- /man/mapSchedule.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/MapSchedule.R 3 | \name{mapSchedule} 4 | \alias{mapSchedule} 5 | \title{Data Parallel Scheduler} 6 | \usage{ 7 | mapSchedule(graph) 8 | } 9 | \arguments{ 10 | \item{graph}{\linkS4class{TaskGraph}} 11 | } 12 | \description{ 13 | DEPRECATED. Use scheduleDataParallel instead. 14 | } 15 | \details{ 16 | This function 17 | detects parallelism through the use of top level calls to R's 18 | apply family of functions and through analysis of \code{for} loops. 19 | Currently supported apply style functions include 20 | \code{\link[base]{lapply}} and \code{\link[base]{mapply}}. It doesn't 21 | parallelize all for loops that can be parallelized, but it does do the 22 | common ones listed in the example. 23 | 24 | Consider using this if: 25 | 26 | \itemize{ 27 | \item \code{code} is slow 28 | \item \code{code} uses for loops or one of the apply functions mentioned above 29 | \item You have access to a machine with multiple cores that supports 30 | \code{\link[parallel]{makeForkCluster}} (any UNIX variant should work, 31 | e.g. Mac) 32 | \item You're unfamiliar with parallel programming in R 33 | } 34 | 35 | Don't use this if: 36 | 37 | \itemize{ 38 | \item \code{code} is fast enough for your application 39 | \item \code{code} is already parallel, either explicitly with a package 40 | such as parallel, or implicitly, say through a multi threaded BLAS 41 | \item You need maximum performance at all costs. In this case you need 42 | to carefully profile and interface appropriately with a high 43 | performance library. 44 | } 45 | 46 | Currently this function supports \code{for} loops that update 0 or 1 47 | global variables.
For those that update a single variable the update 48 | must be on the last line of the loop body, so the for loop should have 49 | the following form: 50 | 51 | \code{ 52 | for(i in ...){ 53 | ... 54 | x[i] <- ... 55 | } 56 | } 57 | 58 | If the last line doesn't update the variable then it's not clear that 59 | the loop can be parallelized. 60 | 61 | Road map of features to implement: 62 | 63 | \itemize{ 64 | \item Prevent parallelization of calls that are themselves in the body 65 | of a loop. 66 | } 67 | } 68 | \examples{ 69 | 70 | # Each iteration of the for loop writes to a different file - good! 71 | # If they write to the same file this will break. 72 | pfile <- makeParallel(parse(text = " 73 | fnames <- paste0(1:10, '.txt') 74 | for(f in fnames){ 75 | writeLines('testing...', f) 76 | }")) 77 | 78 | # A couple examples in one script 79 | serial_code <- parse(text = " 80 | x1 <- lapply(1:10, exp) 81 | n <- 10 82 | x2 <- rep(NA, n) 83 | for(i in seq(n)) x2[[i]] <- exp(i + 1) 84 | ") 85 | 86 | p <- makeParallel(serial_code) 87 | 88 | eval(serial_code) 89 | x1 90 | x2 91 | rm(x1, x2) 92 | 93 | # x1 and x2 should now be back and the same as they were for serial 94 | eval(writeCode(p)) 95 | x1 96 | x2 97 | } 98 | -------------------------------------------------------------------------------- /man/orderBottomLevel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/order.R 3 | \name{orderBottomLevel} 4 | \alias{orderBottomLevel} 5 | \title{Order Nodes By Bottom Level Order} 6 | \usage{ 7 | orderBottomLevel(graph) 8 | } 9 | \arguments{ 10 | \item{graph}{\linkS4class{TimedTaskGraph}} 11 | } 12 | \value{ 13 | integer vector to permute the expressions in \code{x@code} 14 | } 15 | \description{ 16 | Permute the nodes of the graph so that they are ordered in decreasing 17 | bottom level precedence order. The bottom level of a node is the length 18 | of the longest path starting at that node and going to the end of the 19 | program. 20 | } 21 | \details{ 22 | This permutation respects the partial order of the graph, so executing 23 | the permuted code will produce the same result as the original code. 24 | There are many possible node precedence orders. 25 | } 26 | \examples{ 27 | graph <- inferGraph(code = parse(text = "x <- 1:100 28 | y <- rep(1, 100) 29 | z <- x + y"), time = c(1, 2, 1)) 30 | bl <- orderBottomLevel(graph) 31 | } 32 | \references{ 33 | \emph{Task Scheduling for Parallel Systems}, Sinnen, O. 34 | claims bottom level order provides good average performance. I'm not sure 35 | if this claim holds for general data analysis scripts. 36 | } 37 | -------------------------------------------------------------------------------- /man/plot-TaskSchedule-missing-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \docType{methods} 4 | \name{plot,TaskSchedule,missing-method} 5 | \alias{plot,TaskSchedule,missing-method} 6 | \title{Gantt chart of a schedule} 7 | \usage{ 8 | \S4method{plot}{TaskSchedule,missing}(x, blockHeight = 0.25, 9 | main = "schedule plot", xlab = "Time (seconds)", 10 | ylab = "Processor", evalColor = "gray", sendColor = "orchid", 11 | receiveColor = "slateblue", labelTransfer = TRUE, labelExpr = NULL, 12 | rectAes = list(density = NA, border = "black", lwd = 2), ...)
13 | } 14 | \arguments{ 15 | \item{x}{\linkS4class{TaskSchedule}} 16 | 17 | \item{blockHeight}{height of rectangle, between 0 and 0.5} 18 | 19 | \item{main}{title} 20 | 21 | \item{xlab}{x axis label} 22 | 23 | \item{ylab}{y axis label} 24 | 25 | \item{evalColor}{color for evaluation blocks} 26 | 27 | \item{sendColor}{color for send blocks} 28 | 29 | \item{receiveColor}{color for receive blocks} 30 | 31 | \item{labelTransfer}{add labels for transfer arrows} 32 | 33 | \item{labelExpr}{NULL to use default numbering labels, FALSE to suppress 34 | labels, or a character vector of custom labels.} 35 | 36 | \item{rectAes}{list of additional arguments for 37 | \code{\link[graphics]{rect}}} 38 | 39 | \item{...}{additional arguments to \code{plot}} 40 | } 41 | \description{ 42 | Gantt chart of a schedule 43 | } 44 | -------------------------------------------------------------------------------- /man/plotDOT.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{plotDOT} 4 | \alias{plotDOT} 5 | \title{Plot Dependency Graph} 6 | \usage{ 7 | plotDOT(graph, file = NULL, dotfile = NULL, args = "") 8 | } 9 | \arguments{ 10 | \item{graph}{dependGraph} 11 | 12 | \item{file}{character where to save pdf image} 13 | 14 | \item{dotfile}{character where to save dot commands to produce plot} 15 | 16 | \item{args}{character additional arguments to \code{dot} command line program} 17 | } 18 | \description{ 19 | Produces a PDF image using graphviz 20 | } 21 | -------------------------------------------------------------------------------- /man/reduceFun.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reduce.R 3 | \name{reduceFun} 4 | \alias{reduceFun} 5 | \title{Construct ReduceFun Objects} 6 | \usage{ 7 | reduceFun(reduce, summary = reduce, combine = "c", query = summary, 8 | predicate = function(...) TRUE) 9 | } 10 | \description{ 11 | Construct ReduceFun Objects 12 | } 13 | -------------------------------------------------------------------------------- /man/runMeasure.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/runMeasure.R 3 | \name{runMeasure} 4 | \alias{runMeasure} 5 | \title{Run and Measure Code} 6 | \usage{ 7 | runMeasure(code, graph = inferGraph(code), envir = globalenv(), 8 | timer = Sys.time) 9 | } 10 | \arguments{ 11 | \item{code}{to be passed into \code{\link{inferGraph}}} 12 | 13 | \item{graph}{object of class \code{TaskGraph}} 14 | 15 | \item{envir}{environment to evaluate the code in} 16 | 17 | \item{timer}{function that returns a timestamp.} 18 | } 19 | \value{ 20 | graph object of class \code{MeasuredTaskGraph} 21 | } 22 | \description{ 23 | Will export this once the full pipeline works. 24 | } 25 | \details{ 26 | Run the serial code in the task graph and measure how long each expression 27 | takes to run as well as the object sizes of each variable that can 28 | possibly be transferred. 29 | 30 | This does naive and biased timing since it doesn't account for the 31 | overhead in evaluating a single expression. However, this is fine for 32 | this application since the focus is on measuring statements that take at 33 | least on the order of 1 second to run.
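For example, a call like \code{g <- runMeasure("myscript.R")} (\code{myscript.R} being any script of interest) evaluates the script once and returns a \code{MeasuredTaskGraph} with the recorded times and sizes.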
34 | } 35 | -------------------------------------------------------------------------------- /man/schedule.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R, R/zzz.R 3 | \docType{methods} 4 | \name{schedule} 5 | \alias{schedule} 6 | \alias{schedule,GeneratedCode-method} 7 | \alias{schedule,TaskGraph-method} 8 | \title{Schedule Dependency Graph} 9 | \usage{ 10 | schedule(graph, data, platform, ...) 11 | 12 | \S4method{schedule}{GeneratedCode}(graph, data, platform, ...) 13 | 14 | \S4method{schedule}{TaskGraph}(graph, data, platform = Platform(), 15 | nWorkers = platform@nWorkers, chunkFuncs = character(), 16 | reduceFuncs = list(), knownReduceFuncs = getKnownReduceFuncs(), 17 | knownChunkFuncs = getKnownChunkFuncs(), 18 | allChunkFuncs = c(knownChunkFuncs, chunkFuncs)) 19 | } 20 | \arguments{ 21 | \item{graph}{\linkS4class{TaskGraph}, code dependency graph} 22 | 23 | \item{data}{list of data descriptions. 24 | Each element is a \linkS4class{DataSource}. 25 | The names of the list elements correspond to the variables in the code that these objects are bound to.} 26 | 27 | \item{platform}{\linkS4class{Platform} describing resource to compute on} 28 | 29 | \item{...}{additional arguments to schedule methods} 30 | } 31 | \description{ 32 | Creates the schedule for a dependency graph. The schedule is the 33 | assignment of the expressions to different processors at different 34 | times. There are many possible scheduling algorithms. The default is 35 | \code{\link{scheduleDataParallel}}, which schedules around data 36 | parallelism and supersedes the deprecated \code{\link{mapSchedule}}. 37 | } 38 | \references{ 39 | See \emph{Task Scheduling for Parallel Systems}, Sinnen, O. 40 | for a thorough treatment of what it means to have a valid schedule. 41 | } 42 | -------------------------------------------------------------------------------- /man/scheduleDataParallel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scheduleDataParallel.R 3 | \name{scheduleDataParallel} 4 | \alias{scheduleDataParallel} 5 | \title{Schedule Based On Data Parallelism} 6 | \usage{ 7 | scheduleDataParallel(graph, data, platform = Platform(), 8 | nWorkers = platform@nWorkers, chunkFuncs = character(), 9 | reduceFuncs = list(), knownReduceFuncs = getKnownReduceFuncs(), 10 | knownChunkFuncs = getKnownChunkFuncs(), 11 | allChunkFuncs = c(knownChunkFuncs, chunkFuncs)) 12 | } 13 | \arguments{ 14 | \item{graph}{\linkS4class{TaskGraph}, code dependency graph} 15 | 16 | \item{data}{list of data descriptions. 17 | Each element is a \linkS4class{DataSource}.
18 | The names of the list elements correspond to the variables in the code that these objects are bound to.} 19 | 20 | \item{platform}{\linkS4class{Platform} describing resource to compute on} 21 | 22 | \item{chunkFuncs}{character, names of additional chunkable functions known to the user.} 23 | 24 | \item{reduceFuncs}{list of ReduceFun objects; these can override the knownReduceFuncs.} 25 | 26 | \item{knownReduceFuncs}{list of known ReduceFun objects} 27 | 28 | \item{knownChunkFuncs}{character, the names of chunkable functions from recommended and base packages.} 29 | 30 | \item{allChunkFuncs}{character, names of all chunkable functions to use in the analysis.} 31 | } 32 | \description{ 33 | If you're doing a series of computations over a large data set, then start with this scheduler. 34 | This scheduler combines as many chunkable expressions as it can into large blocks of chunkable expressions to run in parallel. 35 | The initial data chunks and intermediate objects stay on the workers and do not return to the manager, so you can think of it as "chunk fusion". 36 | } 37 | \details{ 38 | It statically balances the load of the data chunks among workers, assuming that loading and processing times are linear in the size of the data. 39 | 40 | TODO: 41 | \enumerate{ 42 | \item Populate \code{chunkableFuncs} based on code analysis. 43 | \item Identify which parameters a function is chunkable in, and respect these by matching arguments. 44 | See \code{update_resource.Call}. 45 | \item Clarify behavior of subexpressions, handling cases such as \code{min(sin(large_object))} 46 | } 47 | } 48 | \seealso{ 49 | \link{makeParallel}, \link{schedule} 50 | } 51 | -------------------------------------------------------------------------------- /man/scheduleFork.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scheduleFork.R 3 | \name{scheduleFork} 4 | \alias{scheduleFork} 5 | \title{Single sequential forks scheduler} 6 | \usage{ 7 | scheduleFork(graph, platform = Platform(), data = list(), 8 | overhead = 1e-03, bandwidth = 1.5e+09) 9 | } 10 | \arguments{ 11 | \item{graph}{\linkS4class{TaskGraph} as returned from \code{\link{inferGraph}}} 12 | 13 | \item{overhead}{seconds required to initialize a fork} 14 | 15 | \item{bandwidth}{numeric speed that the network can transfer an object 16 | between processors in bytes per second. We don't take network 17 | contention into account. This will have to be extended to account for 18 | multiple machines.} 19 | } 20 | \value{ 21 | schedule \linkS4class{ForkSchedule} 22 | } 23 | \description{ 24 | Single sequential forks scheduler 25 | } 26 | -------------------------------------------------------------------------------- /man/scheduleFork_old.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/old_scheduleFork.R 3 | \name{scheduleFork_old} 4 | \alias{scheduleFork_old} 5 | \title{Single sequential forks scheduler} 6 | \usage{ 7 | scheduleFork_old(graph, overhead = 1000, bandwidth = 1.5e+09) 8 | } 9 | \arguments{ 10 | \item{graph}{object of class \code{TaskGraph} as returned from 11 | \code{\link{inferGraph}}.} 12 | 13 | \item{overhead}{numeric seconds to send any object} 14 | 15 | \item{bandwidth}{numeric speed that the network can transfer an object 16 | between processors in bytes per second.
We don't take network 17 | contention into account. This will have to be extended to account for 18 | multiple machines.} 19 | } 20 | \value{ 21 | schedule object of class \code{ForkSchedule} 22 | } 23 | \description{ 24 | Single sequential forks scheduler 25 | } 26 | -------------------------------------------------------------------------------- /man/scheduleTaskList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scheduleTaskList.R 3 | \name{scheduleTaskList} 4 | \alias{scheduleTaskList} 5 | \title{Minimize Expression Start Time} 6 | \usage{ 7 | scheduleTaskList(graph, platform = Platform(), data = list(), 8 | orderFun = orderBottomLevel, timeDefault = 1e-05, 9 | sizeDefault = as.numeric(utils::object.size(1L)), overhead = 8e-06, 10 | bandwidth = 1.5e+09) 11 | } 12 | \arguments{ 13 | \item{graph}{\linkS4class{TaskGraph} as returned from \code{\link{inferGraph}}} 14 | 15 | \item{orderFun}{function that takes in a \code{graph} and 16 | returns a permutation of \code{1:length(graph@code)} that respects the 17 | topological ordering of the graph.} 18 | 19 | \item{timeDefault}{numeric time in seconds to execute a single 20 | expression. Expression times default to this value, with a warning, if 21 | we can't find \code{time} from \code{graph}.} 22 | 23 | \item{sizeDefault}{numeric default size of objects to transfer in bytes} 24 | 25 | \item{overhead}{numeric seconds to send any object} 26 | 27 | \item{bandwidth}{numeric speed that the network can transfer an object 28 | between processors in bytes per second. We don't take network 29 | contention into account. This will have to be extended to account for 30 | multiple machines.} 31 | 32 | \item{nWorkers}{integer maximum number of processors} 33 | } 34 | \value{ 35 | schedule object of class \code{TaskSchedule} 36 | } 37 | \description{ 38 | Implementation of "list scheduling". 39 | This is a greedy algorithm that assigns each expression to the earliest 40 | possible processor. 41 | } 42 | \details{ 43 | This function is experimental and unstable. If you're trying to actually 44 | speed up your code through parallelism then consider using the default 45 | method in \code{\link{schedule}} for data parallelism. 46 | This function rewrites code to use task parallelism. 47 | Task parallelism means two or more processors run different R 48 | expressions simultaneously. 
49 | } 50 | \examples{ 51 | code <- parse(text = "a <- 100 52 | b <- 200 53 | c <- a + b") 54 | 55 | g <- inferGraph(code) 56 | s <- scheduleTaskList(g) 57 | plot(s) 58 | } 59 | \references{ 60 | Algorithm 10 in \emph{Task Scheduling for Parallel 61 | Systems}, Sinnen (2007) 62 | } 63 | -------------------------------------------------------------------------------- /man/standardizeData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/standardize.R 3 | \name{standardizeData} 4 | \alias{standardizeData} 5 | \title{schedulers expect to see the data in a standard form} 6 | \usage{ 7 | standardizeData(data) 8 | } 9 | \description{ 10 | schedulers expect to see the data in a standard form 11 | } 12 | -------------------------------------------------------------------------------- /man/time.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R 3 | \docType{methods} 4 | \name{time,TimedTaskGraph-method} 5 | \alias{time,TimedTaskGraph-method} 6 | \alias{time,Schedule-method} 7 | \title{Expression Run Time} 8 | \usage{ 9 | \S4method{time}{TimedTaskGraph}(x) 10 | 11 | \S4method{time}{Schedule}(x) 12 | } 13 | \arguments{ 14 | \item{x}{object containing expression run times} 15 | } 16 | \description{ 17 | Extract a numeric vector of expression run times 18 | } 19 | -------------------------------------------------------------------------------- /man/use_def.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferGraph.R 3 | \name{use_def} 4 | \alias{use_def} 5 | \title{Use Definition Chain} 6 | \usage{ 7 | use_def(x, all_uses, all_definitions) 8 | } 9 | \arguments{ 10 | \item{x}{variable name} 11 | 12 | \item{all_uses}{list containing variable names used in each expression} 13 | 14 | \item{all_definitions}{list containing variable names defined in each expression} 15 | } 16 | \value{ 17 | data frame of edges suitable for use with 18 | \code{\link[igraph]{graph_from_data_frame}}. 19 | } 20 | \description{ 21 | Compute a data frame of edges with one edge connecting each use of the 22 | variable x to the most recent definition or update of x. 23 | } 24 | -------------------------------------------------------------------------------- /man/writeCode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGeneric.R, R/writeCode.R 3 | \docType{methods} 4 | \name{writeCode} 5 | \alias{writeCode} 6 | \alias{writeCode,GeneratedCode,logical-method} 7 | \alias{writeCode,GeneratedCode,missing-method} 8 | \alias{writeCode,GeneratedCode,character-method} 9 | \alias{writeCode,expression,character-method} 10 | \title{Write Generated Code} 11 | \usage{ 12 | writeCode(code, file, ...) 13 | 14 | \S4method{writeCode}{GeneratedCode,logical}(code, file, 15 | overWrite = FALSE, prefix = "gen_") 16 | 17 | \S4method{writeCode}{GeneratedCode,missing}(code, file, ...) 18 | 19 | \S4method{writeCode}{GeneratedCode,character}(code, file, 20 | overWrite = FALSE, ...) 21 | 22 | \S4method{writeCode}{expression,character}(code, file, overWrite = FALSE, 23 | ...) 
24 | }
25 | \arguments{
26 | \item{code}{object of class \linkS4class{GeneratedCode}}
27 | 
28 | \item{...}{additional arguments to methods}
29 | 
30 | \item{overWrite}{logical, whether to write over an existing file}
31 | 
32 | \item{prefix}{character prefix for generating file names}
33 | }
34 | \value{
35 | expression R language object, suitable for further manipulation
36 | }
37 | \description{
38 | Write the generated code to a file and return the code.
39 | }
40 | \seealso{
41 | \code{\link{generate}} to generate the code from a schedule,
42 | \code{\link{makeParallel}} to do everything all at once.
43 | }
44 | 
-------------------------------------------------------------------------------- /tests/generated/fail.R: --------------------------------------------------------------------------------
1 | ab = "a" + "b"
2 | print("If this line prints then execution proceeded after the error. Fix it.")
3 | 
-------------------------------------------------------------------------------- /tests/generated/script1.R: --------------------------------------------------------------------------------
1 | v1 = "foo1"
2 | v2 = "foo2"
3 | x <- paste0(v1, v1)
4 | y <- paste0(v2, v2)
5 | xy <- paste0(x, y)
6 | writeLines(xy, "script1.R.log")
7 | 
-------------------------------------------------------------------------------- /tests/generated/script2.R: --------------------------------------------------------------------------------
1 | # worker 1
2 | a = 7
3 | b = a + 4
4 | # worker 2
5 | x = 1
6 | y = x + 2
7 | # worker 1
8 | c = b + y
9 | # If the work is assigned as above then worker 1 will have variables c and
10 | # y, while worker 2 will have variables x and y. As I've implemented it,
11 | # ties should go to the lower worker, so worker 1 should compute the final
12 | # line. But I don't think I've added the logic so that we know variable y
13 | # is available on worker 1. Thus worker 2 should do the following:
14 | output = c("got:", c*x*y, "expected:", (7 + 4 + 1 + 2) * 1 * (1 + 2))
15 | writeLines(output, "script2.R.log")
16 | 
-------------------------------------------------------------------------------- /tests/generated/script3.R: --------------------------------------------------------------------------------
1 | # This is as good as it gets: three processors almost all busy with no communication
2 | a1 = 1
3 | a2 = a1 + 1
4 | b1 = 1
5 | c1 = 1
6 | c2 = c1 + 1
7 | b2 = b1 + 1
8 | b3 = b2 + 1
9 | b4 = b3 + 1
10 | a3 = a2 + 1
11 | a4 = a3 + 1
12 | c3 = c2 + 1
13 | writeLines(as.character(c3), "script3.R.log")
14 | 
-------------------------------------------------------------------------------- /tests/generated/script4.R: --------------------------------------------------------------------------------
1 | # The ping pong script. Assignments to a* should happen on worker 1,
2 | # assignments to b* should happen on worker 2.
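# Hand-checking the expected value: a3 = b3 = 1 + 2 + 2 = 5,
# a4 = b4 = 2 + 5 + 5 = 12, and a5 = b5 = 5 + 12 + 12 = 29,
# so the log should contain "29".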
3 | 
4 | a1 = 1
5 | b1 = 1
6 | a2 = 2
7 | b2 = 2
8 | 
9 | a3 = a1 + a2 + b2
10 | b3 = b1 + b2 + a2
11 | 
12 | a4 = a2 + a3 + b3
13 | b4 = b2 + b3 + a3
14 | 
15 | a5 = a3 + a4 + b4
16 | b5 = b3 + b4 + a4
17 | 
18 | writeLines(as.character(b5), "script4.R.log")
19 | 
-------------------------------------------------------------------------------- /tests/generated/script5.R: --------------------------------------------------------------------------------
1 | # Three processes
2 | x = 1
3 | y = 2
4 | z = 3
5 | 
6 | a = x + y
7 | b = y + z
8 | out = a + b + x
9 | write.table(out, "script5.R.log")
10 | 
-------------------------------------------------------------------------------- /tests/generated/script6.R: --------------------------------------------------------------------------------
1 | # Sends a big object over
2 | x = 1
3 | tenmb = as.numeric(seq(10 * 2^20/8))
4 | y = 2
5 | out = sum(x, y, tenmb)
6 | write.table(out, "script6.R.log")
7 | 
-------------------------------------------------------------------------------- /tests/generated/script7.R: --------------------------------------------------------------------------------
1 | # Duncan's "revisit" script
2 | 
3 | # First worker
4 | a1 = 1
5 | b1 = a1 + 1
6 | b2 = a1 + 1
7 | b3 = a1 + 1
8 | b4 = a1 + 1
9 | 
10 | # Second worker
11 | a2 = 2
12 | b5 = a2 + 2
13 | b6 = a2 + b4 + b5
14 | writeLines(as.character(b6), "script7.R.log")
15 | 
16 | # First worker
17 | b7 = a1 + 1
18 | 
-------------------------------------------------------------------------------- /tests/generated/test_generated_scripts.R: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | # This script tests generated code. It requires certain ports to be open,
4 | # writes local files, and sometimes uses 3 worker processes, so it can't
5 | # run on CRAN.
6 | #
7 | # Each script should write results to a file following the current naming
8 | # convention. For example, script1.R writes output to script1.R.log.
9 | 
10 | 
11 | library(makeParallel)
12 | 
13 | 
14 | expect_generated = function(script, scheduler = scheduleTaskList, plot = FALSE, ...)
15 | {
16 | cat(sprintf("Testing %s\n", script))
17 | 
18 | # Check that the generated script produces the same output as the
19 | # serial script.
20 | 
21 | outfile = paste0(basename(script), ".log")
22 | serfile = paste0("expected_", outfile)
23 | p = makeParallel(script, scheduler = scheduler, overWrite = TRUE, ...)
24 | 
25 | if(plot){
26 | pdf(paste0(script, ".pdf"))
27 | plot(schedule(p))
28 | dev.off()
29 | }
30 | 
31 | # Serial
32 | source(script, local = new.env())
33 | file.rename(outfile, serfile)
34 | expected = readLines(serfile)
35 | 
36 | # Parallel
37 | # Generated scripts have a cluster `cls`. It needs to be cleaned up if
38 | # something fails midway through.
39 | #on.exit(try(parallel::stopCluster(cls), silent = TRUE)) 40 | 41 | code = writeCode(p, file = FALSE) 42 | eval(code) 43 | actual = readLines(outfile) 44 | 45 | stopifnot(identical(actual, expected)) 46 | 47 | cat(sprintf("Pass %s\n\n", script)) 48 | } 49 | 50 | # A test of the test :) 51 | e = tryCatch(expect_generated("fail.R"), error = identity) 52 | 53 | 54 | # Special cases: 55 | ############################################################ 56 | 57 | expect_generated("script3.R", maxWorker = 3) 58 | 59 | 60 | # Run all with the defaults: 61 | ############################################################ 62 | 63 | scripts = Sys.glob("script*.R") 64 | lapply(scripts, expect_generated) 65 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | # https://github.com/r-lib/testthat/issues/86 3 | #Sys.setenv("R_TESTS" = "") 4 | library(makeParallel) 5 | 6 | # Allows parse() to check equality between expressions 7 | #options(keep.source = FALSE) 8 | 9 | test_check("makeParallel") 10 | -------------------------------------------------------------------------------- /tests/testthat/.gitignore: -------------------------------------------------------------------------------- 1 | chunk_dates.txt/* 2 | big_dates.txt 3 | -------------------------------------------------------------------------------- /tests/testthat/by_example/setup_data.R: -------------------------------------------------------------------------------- 1 | # Set up some toy data 2 | gen_one = function(i, fname) 3 | { 4 | d = data.frame(y = i, z = i * 1:10) 5 | saveRDS(d, file = fname) 6 | } 7 | nchunks = 4L 8 | fnames = paste0("x", seq(nchunks), ".rds") 9 | Map(gen_one, seq(nchunks), fnames) 10 | -------------------------------------------------------------------------------- /tests/testthat/by_example/vector_actual_generated.R: -------------------------------------------------------------------------------- 1 | message("This code was generated from R by makeParallel version 0.2.0 at 2019-08-02 11:46:08") 2 | library(parallel) 3 | nworkers = 3 4 | assignments = list(1, 2:3, 4) 5 | read_args = c("x1.rds", "x2.rds", "x3.rds", "x4.rds") 6 | cls = makeCluster(nworkers) 7 | clusterExport(cls, c("assignments", "read_args")) 8 | parLapply(cls, seq(nworkers), function(i) assign("workerID", i, globalenv())) 9 | clusterEvalQ(cls, { 10 | read_args = read_args[assignments[[workerID]]] 11 | chunks = lapply(read_args, readRDS) 12 | x = do.call(rbind, chunks) 13 | { 14 | y = x[, "y"] 15 | y2 = 4 * y/2 16 | } 17 | fname = paste0("y2", "_", workerID, ".rds") 18 | saveRDS(y2, file = fname) 19 | }) 20 | 2 * 3 21 | -------------------------------------------------------------------------------- /tests/testthat/by_example/vector_transform.R: -------------------------------------------------------------------------------- 1 | # TODO: Write the narrative for what's going on here, what does it represent / exemplify? 2 | # Tell what's manual and will be automated later. 
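# Roughly (a first pass at that narrative, to be checked): the data arrive
# pre-chunked in x1.rds ... x4.rds. Today the user writes the ChunkLoadFunc
# description below by hand; inferring it from the read code is the part that
# could be automated later. Given that description, makeParallel() schedules
# the group-by median over 3 workers, and writeCode() saves the generated
# script so we can inspect it.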
3 | 4 | # The code to do the actual transformation 5 | # The user of makeParallel must write something like the following: 6 | 7 | library(makeParallel) 8 | 9 | fnames = list.files(pattern = "x[1-4]\\.rds") 10 | 11 | # Description of the data 12 | d = ChunkLoadFunc(read_func_name = "readRDS" 13 | , read_args = fnames 14 | , varname = "x" 15 | , combine_func_name = "rbind" 16 | , split_column_name = "y" 17 | , column_names = c(y = 1L, z = 2L) 18 | , sizes = c(10, 5, 5, 10) 19 | ) 20 | 21 | 22 | # TODO: Grow the example by scheduling based on size of files 23 | # x.csv (is chunked as) x1.csv = 200 rows, x2.csv = 300 rows, etc. 24 | 25 | out = makeParallel(' 26 | f = function(grp){ 27 | median_z = median(grp[, "z"]) 28 | data.frame(y = grp[1L, "y"], median_z = median_z) 29 | } 30 | result = by(x, x[, "y"], f) 31 | saveRDS(result, "result.rds") 32 | ', scheduler = scheduleDataParallel, data = d, nWorkers = 3L) 33 | 34 | writeCode(out, "vector_actual_generated.R", overWrite = TRUE) 35 | -------------------------------------------------------------------------------- /tests/testthat/by_example/x1.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/by_example/x1.rds -------------------------------------------------------------------------------- /tests/testthat/by_example/x2.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/by_example/x2.rds -------------------------------------------------------------------------------- /tests/testthat/by_example/x3.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/by_example/x3.rds -------------------------------------------------------------------------------- /tests/testthat/by_example/x4.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/by_example/x4.rds -------------------------------------------------------------------------------- /tests/testthat/codewall.R: -------------------------------------------------------------------------------- 1 | # Some random data analysis script 2 | library(helperfuncs) 3 | cmp_plots <- function(cmp1, cmp2, ...){ 4 | pdf(plotname(cmp1)) 5 | plot(cmp1, cmp2, ...) 
6 | dev.off() 7 | } 8 | results <- get_results("previous") 9 | results$time_processed <- Sys.time() 10 | params <- get_param() 11 | params["flag"] <- TRUE 12 | for(p in params){ 13 | check_conformance(p) 14 | } 15 | simulated <- simulate(params) 16 | cmp1 <- compare1(results, simulated) 17 | normalized_results <- normalize(results) 18 | simulated[, "y"] <- addy(simulated) 19 | cmp2 <- compare2(normalized_results, simulated) 20 | verify_compare(cmp1, cmp2) 21 | cmp_plots(cmp1, cmp2, col = "blue") 22 | save_sim(simulated) 23 | save_cmp(cmp2) 24 | -------------------------------------------------------------------------------- /tests/testthat/dates.txt: -------------------------------------------------------------------------------- 1 | 1940-07-22 2 | 1961-09-11 3 | 1910-10-19 4 | 1968-11-10 5 | 1975-04-24 6 | 1906-03-29 7 | 1916-05-25 8 | 1942-06-12 9 | 1982-12-08 10 | 1980-03-22 11 | 1969-08-06 12 | 1939-08-20 13 | 1974-02-11 14 | 1980-12-23 15 | 1951-09-22 16 | 1980-10-30 17 | 1970-05-30 18 | 1944-11-30 19 | 1947-11-20 20 | 1960-06-06 21 | 1985-05-11 22 | 1950-05-16 23 | 1999-12-12 24 | 1935-10-13 25 | 1942-11-22 26 | 1988-02-21 27 | 1930-06-24 28 | 1911-04-03 29 | 1928-04-11 30 | 1907-12-15 31 | 1975-01-02 32 | 1974-07-06 33 | 1983-12-24 34 | 1981-09-21 35 | 1916-04-26 36 | 1909-05-19 37 | 1929-07-28 38 | 1943-04-30 39 | 1909-02-26 40 | 1958-10-13 41 | 1956-01-13 42 | 1900-02-15 43 | 1988-02-15 44 | 1945-03-18 45 | 1995-07-03 46 | 1916-12-10 47 | 1964-10-16 48 | 1933-10-28 49 | 1994-06-20 50 | 1988-07-25 51 | 1953-10-11 52 | 1957-03-21 53 | 1916-06-30 54 | 1957-09-10 55 | 1987-12-22 56 | 1927-08-30 57 | 1982-10-01 58 | 1980-07-19 59 | 1942-09-09 60 | 1977-06-26 61 | 1998-01-21 62 | 1936-09-27 63 | 1976-09-21 64 | 1991-07-22 65 | 1966-08-27 66 | 1921-04-04 67 | 1972-03-28 68 | 1911-01-16 69 | 1971-07-01 70 | 1925-08-09 71 | 1933-03-24 72 | 1977-10-17 73 | 1945-06-27 74 | 1951-08-07 75 | 1962-10-10 76 | 1934-07-25 77 | 1975-12-06 78 | 1953-03-23 79 | 1910-07-16 80 | 1901-07-05 81 | 1925-03-28 82 | 1963-07-17 83 | 1990-10-18 84 | 1964-10-26 85 | 1960-05-04 86 | 1997-10-14 87 | 1902-08-30 88 | 1900-07-13 89 | 1964-10-21 90 | 1962-07-21 91 | 1959-05-07 92 | 1907-01-31 93 | 1998-11-14 94 | 1977-06-21 95 | 1913-08-02 96 | 1937-03-16 97 | 1920-11-23 98 | 1922-02-10 99 | 1977-07-03 100 | 1903-08-24 101 | -------------------------------------------------------------------------------- /tests/testthat/expected/med_petal.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/expected/med_petal.rds -------------------------------------------------------------------------------- /tests/testthat/expected/result_custom_reduce.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/expected/result_custom_reduce.rds -------------------------------------------------------------------------------- /tests/testthat/expected/result_median_reduce.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/expected/result_median_reduce.rds -------------------------------------------------------------------------------- /tests/testthat/expected/result_two_blocks.rds: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/expected/result_two_blocks.rds -------------------------------------------------------------------------------- /tests/testthat/iris_csv/1.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "9",4.4,2.9,1.4,0.2,"setosa" 3 | "10",4.9,3.1,1.5,0.1,"setosa" 4 | "13",4.8,3,1.4,0.1,"setosa" 5 | "14",4.3,3,1.1,0.1,"setosa" 6 | "15",5.8,4,1.2,0.2,"setosa" 7 | "26",5,3,1.6,0.2,"setosa" 8 | "29",5.2,3.4,1.4,0.2,"setosa" 9 | "50",5,3.3,1.4,0.2,"setosa" 10 | "51",7,3.2,4.7,1.4,"versicolor" 11 | "55",6.5,2.8,4.6,1.5,"versicolor" 12 | "57",6.3,3.3,4.7,1.6,"versicolor" 13 | "60",5.2,2.7,3.9,1.4,"versicolor" 14 | "65",5.6,2.9,3.6,1.3,"versicolor" 15 | "76",6.6,3,4.4,1.4,"versicolor" 16 | "79",6,2.9,4.5,1.5,"versicolor" 17 | "80",5.7,2.6,3.5,1,"versicolor" 18 | "81",5.5,2.4,3.8,1.1,"versicolor" 19 | "84",6,2.7,5.1,1.6,"versicolor" 20 | "97",5.7,2.9,4.2,1.3,"versicolor" 21 | "101",6.3,3.3,6,2.5,"virginica" 22 | "107",4.9,2.5,4.5,1.7,"virginica" 23 | "109",6.7,2.5,5.8,1.8,"virginica" 24 | "111",6.5,3.2,5.1,2,"virginica" 25 | "114",5.7,2.5,5,2,"virginica" 26 | "118",7.7,3.8,6.7,2.2,"virginica" 27 | "123",7.7,2.8,6.7,2,"virginica" 28 | "125",6.7,3.3,5.7,2.1,"virginica" 29 | "130",7.2,3,5.8,1.6,"virginica" 30 | "134",6.3,2.8,5.1,1.5,"virginica" 31 | "141",6.7,3.1,5.6,2.4,"virginica" 32 | -------------------------------------------------------------------------------- /tests/testthat/iris_csv/2.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "2",4.9,3,1.4,0.2,"setosa" 3 | "5",5,3.6,1.4,0.2,"setosa" 4 | "11",5.4,3.7,1.5,0.2,"setosa" 5 | "23",4.6,3.6,1,0.2,"setosa" 6 | "33",5.2,4.1,1.5,0.1,"setosa" 7 | "37",5.5,3.5,1.3,0.2,"setosa" 8 | "38",4.9,3.6,1.4,0.1,"setosa" 9 | "40",5.1,3.4,1.5,0.2,"setosa" 10 | "41",5,3.5,1.3,0.3,"setosa" 11 | "49",5.3,3.7,1.5,0.2,"setosa" 12 | "54",5.5,2.3,4,1.3,"versicolor" 13 | "70",5.6,2.5,3.9,1.1,"versicolor" 14 | "73",6.3,2.5,4.9,1.5,"versicolor" 15 | "82",5.5,2.4,3.7,1,"versicolor" 16 | "83",5.8,2.7,3.9,1.2,"versicolor" 17 | "99",5.1,2.5,3,1.1,"versicolor" 18 | "113",6.8,3,5.5,2.1,"virginica" 19 | "136",7.7,3,6.1,2.3,"virginica" 20 | "144",6.8,3.2,5.9,2.3,"virginica" 21 | "149",6.2,3.4,5.4,2.3,"virginica" 22 | -------------------------------------------------------------------------------- /tests/testthat/iris_csv/3.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "8",5,3.4,1.5,0.2,"setosa" 3 | "16",5.7,4.4,1.5,0.4,"setosa" 4 | "22",5.1,3.7,1.5,0.4,"setosa" 5 | "27",5,3.4,1.6,0.4,"setosa" 6 | "32",5.4,3.4,1.5,0.4,"setosa" 7 | "35",4.9,3.1,1.5,0.2,"setosa" 8 | "42",4.5,2.3,1.3,0.3,"setosa" 9 | "44",5,3.5,1.6,0.6,"setosa" 10 | "45",5.1,3.8,1.9,0.4,"setosa" 11 | "47",5.1,3.8,1.6,0.2,"setosa" 12 | "52",6.4,3.2,4.5,1.5,"versicolor" 13 | "53",6.9,3.1,4.9,1.5,"versicolor" 14 | "56",5.7,2.8,4.5,1.3,"versicolor" 15 | "63",6,2.2,4,1,"versicolor" 16 | "71",5.9,3.2,4.8,1.8,"versicolor" 17 | "72",6.1,2.8,4,1.3,"versicolor" 18 | "74",6.1,2.8,4.7,1.2,"versicolor" 19 | "75",6.4,2.9,4.3,1.3,"versicolor" 20 | "88",6.3,2.3,4.4,1.3,"versicolor" 21 | 
"89",5.6,3,4.1,1.3,"versicolor" 22 | "92",6.1,3,4.6,1.4,"versicolor" 23 | "94",5,2.3,3.3,1,"versicolor" 24 | "95",5.6,2.7,4.2,1.3,"versicolor" 25 | "96",5.7,3,4.2,1.2,"versicolor" 26 | "98",6.2,2.9,4.3,1.3,"versicolor" 27 | "102",5.8,2.7,5.1,1.9,"virginica" 28 | "104",6.3,2.9,5.6,1.8,"virginica" 29 | "105",6.5,3,5.8,2.2,"virginica" 30 | "106",7.6,3,6.6,2.1,"virginica" 31 | "115",5.8,2.8,5.1,2.4,"virginica" 32 | "116",6.4,3.2,5.3,2.3,"virginica" 33 | "121",6.9,3.2,5.7,2.3,"virginica" 34 | "128",6.1,3,4.9,1.8,"virginica" 35 | "132",7.9,3.8,6.4,2,"virginica" 36 | "142",6.9,3.1,5.1,2.3,"virginica" 37 | "147",6.3,2.5,5,1.9,"virginica" 38 | "150",5.9,3,5.1,1.8,"virginica" 39 | -------------------------------------------------------------------------------- /tests/testthat/iris_csv/4.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "4",4.6,3.1,1.5,0.2,"setosa" 3 | "7",4.6,3.4,1.4,0.3,"setosa" 4 | "17",5.4,3.9,1.3,0.4,"setosa" 5 | "19",5.7,3.8,1.7,0.3,"setosa" 6 | "24",5.1,3.3,1.7,0.5,"setosa" 7 | "31",4.8,3.1,1.6,0.2,"setosa" 8 | "39",4.4,3,1.3,0.2,"setosa" 9 | "43",4.4,3.2,1.3,0.2,"setosa" 10 | "46",4.8,3,1.4,0.3,"setosa" 11 | "58",4.9,2.4,3.3,1,"versicolor" 12 | "61",5,2,3.5,1,"versicolor" 13 | "66",6.7,3.1,4.4,1.4,"versicolor" 14 | "67",5.6,3,4.5,1.5,"versicolor" 15 | "85",5.4,3,4.5,1.5,"versicolor" 16 | "86",6,3.4,4.5,1.6,"versicolor" 17 | "90",5.5,2.5,4,1.3,"versicolor" 18 | "93",5.8,2.6,4,1.2,"versicolor" 19 | "108",7.3,2.9,6.3,1.8,"virginica" 20 | "120",6,2.2,5,1.5,"virginica" 21 | "124",6.3,2.7,4.9,1.8,"virginica" 22 | "131",7.4,2.8,6.1,1.9,"virginica" 23 | "135",6.1,2.6,5.6,1.4,"virginica" 24 | "138",6.4,3.1,5.5,1.8,"virginica" 25 | "139",6,3,4.8,1.8,"virginica" 26 | "145",6.7,3.3,5.7,2.5,"virginica" 27 | "146",6.7,3,5.2,2.3,"virginica" 28 | "148",6.5,3,5.2,2,"virginica" 29 | -------------------------------------------------------------------------------- /tests/testthat/iris_csv/5.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "1",5.1,3.5,1.4,0.2,"setosa" 3 | "3",4.7,3.2,1.3,0.2,"setosa" 4 | "6",5.4,3.9,1.7,0.4,"setosa" 5 | "12",4.8,3.4,1.6,0.2,"setosa" 6 | "18",5.1,3.5,1.4,0.3,"setosa" 7 | "20",5.1,3.8,1.5,0.3,"setosa" 8 | "21",5.4,3.4,1.7,0.2,"setosa" 9 | "25",4.8,3.4,1.9,0.2,"setosa" 10 | "28",5.2,3.5,1.5,0.2,"setosa" 11 | "30",4.7,3.2,1.6,0.2,"setosa" 12 | "34",5.5,4.2,1.4,0.2,"setosa" 13 | "36",5,3.2,1.2,0.2,"setosa" 14 | "48",4.6,3.2,1.4,0.2,"setosa" 15 | "59",6.6,2.9,4.6,1.3,"versicolor" 16 | "62",5.9,3,4.2,1.5,"versicolor" 17 | "64",6.1,2.9,4.7,1.4,"versicolor" 18 | "68",5.8,2.7,4.1,1,"versicolor" 19 | "69",6.2,2.2,4.5,1.5,"versicolor" 20 | "77",6.8,2.8,4.8,1.4,"versicolor" 21 | "78",6.7,3,5,1.7,"versicolor" 22 | "87",6.7,3.1,4.7,1.5,"versicolor" 23 | "91",5.5,2.6,4.4,1.2,"versicolor" 24 | "100",5.7,2.8,4.1,1.3,"versicolor" 25 | "103",7.1,3,5.9,2.1,"virginica" 26 | "110",7.2,3.6,6.1,2.5,"virginica" 27 | "112",6.4,2.7,5.3,1.9,"virginica" 28 | "117",6.5,3,5.5,1.8,"virginica" 29 | "119",7.7,2.6,6.9,2.3,"virginica" 30 | "122",5.6,2.8,4.9,2,"virginica" 31 | "126",7.2,3.2,6,1.8,"virginica" 32 | "127",6.2,2.8,4.8,1.8,"virginica" 33 | "129",6.4,2.8,5.6,2.1,"virginica" 34 | "133",6.4,2.8,5.6,2.2,"virginica" 35 | "137",6.3,3.4,5.6,2.4,"virginica" 36 | "140",6.9,3.1,5.4,2.1,"virginica" 37 | "143",5.8,2.7,5.1,1.9,"virginica" 38 | 
-------------------------------------------------------------------------------- /tests/testthat/iris_csv/generate_data.R: -------------------------------------------------------------------------------- 1 | set.seed(803) 2 | random_ints = sample(5L, size = nrow(iris), replace = TRUE) 3 | s = split(iris, random_ints) 4 | Map(write.csv, s, paste0(names(s), ".csv")) 5 | -------------------------------------------------------------------------------- /tests/testthat/local_test.R: -------------------------------------------------------------------------------- 1 | # Tests that should not run on CRAN. 2 | 3 | test_that("depend graph plotting through command line graphviz (dot)", { 4 | 5 | g = inferGraph("ex.R") 6 | #f = tempfile(pattern = "plot", fileext = ".pdf") 7 | f = "ex_plot.pdf" 8 | 9 | plotDOT(g, file = f) 10 | 11 | expect_true(file.exists(f)) 12 | 13 | unlink(f) 14 | 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/range_of_dates.R: -------------------------------------------------------------------------------- 1 | dt = read.table('dates.txt') 2 | d = as.Date(dt[, 1]) 3 | rd = range(d) 4 | print(rd) 5 | -------------------------------------------------------------------------------- /tests/testthat/single_numeric_few_distinct/.gitignore: -------------------------------------------------------------------------------- 1 | result.rds 2 | -------------------------------------------------------------------------------- /tests/testthat/single_numeric_few_distinct/big.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/single_numeric_few_distinct/big.rds -------------------------------------------------------------------------------- /tests/testthat/single_numeric_few_distinct/setup_data.R: -------------------------------------------------------------------------------- 1 | # Set up some toy data 2 | n = 100 3 | nDistinct = 10L 4 | set.seed(3890) 5 | vals = rnorm(nDistinct) 6 | 7 | saveRDS(sample(vals, size = n, replace = TRUE), "small1.rds", compress = FALSE) 8 | saveRDS(sample(vals, size = n, replace = TRUE), "small2.rds", compress = FALSE) 9 | saveRDS(sample(vals, size = 2 *n, replace = TRUE), "big.rds", compress = FALSE) 10 | -------------------------------------------------------------------------------- /tests/testthat/single_numeric_few_distinct/small1.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/single_numeric_few_distinct/small1.rds -------------------------------------------------------------------------------- /tests/testthat/single_numeric_few_distinct/small2.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/single_numeric_few_distinct/small2.rds -------------------------------------------------------------------------------- /tests/testthat/single_numeric_vector/.gitignore: -------------------------------------------------------------------------------- 1 | result.rds 2 | -------------------------------------------------------------------------------- /tests/testthat/single_numeric_vector/big.rds: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/single_numeric_vector/big.rds -------------------------------------------------------------------------------- /tests/testthat/single_numeric_vector/setup_data.R: -------------------------------------------------------------------------------- 1 | # Set up some toy data 2 | n = 100 3 | small = seq(from = 0, to = 1, length.out = n) 4 | 5 | saveRDS(small, "small1.rds", compress = FALSE) 6 | saveRDS(small, "small2.rds", compress = FALSE) 7 | 8 | big = c(small, small) 9 | saveRDS(big, "big.rds", compress = FALSE) 10 | -------------------------------------------------------------------------------- /tests/testthat/single_numeric_vector/small1.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/single_numeric_vector/small1.rds -------------------------------------------------------------------------------- /tests/testthat/single_numeric_vector/small2.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/tests/testthat/single_numeric_vector/small2.rds -------------------------------------------------------------------------------- /tests/testthat/test_Data.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | 3 | test_that("calls into DataSource classes", { 4 | 5 | fwf_call = quote(read.fwf("foo.txt", widths = c(5, 5, 10))) 6 | 7 | #actual = dataSource(fwf_call) 8 | 9 | #expected = ChunkDataFiles( 10 | 11 | }) 12 | 13 | 14 | test_that("simple case of chunked input data descriptions", { 15 | 16 | skip("Dropping data expansion idea for the moment") 17 | 18 | incode = parse(text = " 19 | y = 2L * x 20 | m_y = median(y) 21 | ", keep.source = FALSE) 22 | 23 | xfile1 = tempfile() 24 | xfile2 = tempfile() 25 | 26 | saveRDS(1:5, xfile1) 27 | saveRDS(6:10, xfile2) 28 | 29 | # Build the expression by grabbing the literal filenames 30 | e = list(xfile1 = xfile1, xfile2 = xfile2) 31 | chunk_load_code = as.expression(list( 32 | substitute(readRDS(xfile1), e), 33 | substitute(readRDS(xfile2), e) 34 | )) 35 | 36 | xdescription = tableChunkData(expr = chunk_load_code, varname = "x", collector = "c") 37 | 38 | # Mon May 27 16:10:48 PDT 2019 39 | # The issue I'm having now is that I want to use the list signature for expandData. 40 | # This API below also uses the list signature. 41 | # In this API each element of the list is a data source. 42 | # I could dispatch on the class of the elements of the list. 43 | # But that seems excessive, because I only need it when I read in a bunch of data in the first place. 44 | out = makeParallel(incode 45 | , scheduler = scheduleTaskList 46 | , data = list(x = xdescription) 47 | , maxWorker = 1L 48 | ) 49 | 50 | outcode = writeCode(out) 51 | 52 | eval(outcode) 53 | 54 | # These variable names subject to change. 
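# (Assumed naming convention, for illustration only: the generated code
# leaves chunk i of variable v in v_i, hence x_1, x_2 and y_1, y_2 below.)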
55 | x = c(x_1, x_2) 56 | y_out = c(y_1, y_2) 57 | 58 | # Makes y available, writing over previous version 59 | eval(incode) 60 | 61 | expect_identical(y, y_out) 62 | 63 | }) 64 | -------------------------------------------------------------------------------- /tests/testthat/test_MapSchedule.R: -------------------------------------------------------------------------------- 1 | # Use NSE to make the tests more readable. 2 | generated_code_matches = function(input, expected) 3 | { 4 | expr = substitute(input) 5 | desired_expr = as.expression(substitute(expected)) 6 | actual = makeParallel(expr = expr)@code 7 | expect_equal(actual, desired_expr) 8 | } 9 | 10 | 11 | test_that("Basic transformation to parallel", { 12 | 13 | skip("plan to deprecate") 14 | generated_code_matches(lapply(f, x) 15 | , parallel::mclapply(f, x)) 16 | 17 | generated_code_matches(f(a, b) 18 | , f(a, b)) 19 | 20 | }) 21 | 22 | 23 | test_that("Nested parallelism", { 24 | 25 | skip("plan to deprecate") 26 | generated_code_matches(lapply(lapply(x, f), g) 27 | , parallel::mclapply(lapply(x, f), g)) 28 | 29 | generated_code_matches(foo(lapply(x, f), lapply(y, f)) 30 | , foo(parallel::mclapply(x, f), parallel::mclapply(y, f))) 31 | 32 | }) 33 | -------------------------------------------------------------------------------- /tests/testthat/test_columnsUsed.R: -------------------------------------------------------------------------------- 1 | # Doesn't (yet) handle: 2 | # 3 | # - created or updated columns 4 | # x[, "col"] = bar() 5 | # foo(x[, "col"]) 6 | # - different object writes over x 7 | # x = something_completely_different() 8 | # foo(x[, "col"]) 9 | # - when x is renamed 10 | # df2 = x 11 | # foo(df2[, "col1"]) 12 | # - NSE from `$` 13 | # foo(x$col) 14 | # - column names contained in a variable 15 | # v = "col1" 16 | # foo(df2[, v]) 17 | # - 18 | 19 | test_that("single literal for `[[`", { 20 | 21 | e = parse(text = ' 22 | foo(x[["col"]]) 23 | ') 24 | expect_equal(columnsUsed(e, "x"), "col") 25 | 26 | }) 27 | 28 | 29 | test_that("single literal for `[`", { 30 | 31 | e1 = parse(text = ' 32 | foo(x[, "col"]) 33 | ') 34 | expect_equal(columnsUsed(e1, "x"), "col") 35 | 36 | e2 = parse(text = ' 37 | foo(x["col"]) 38 | ') 39 | expect_equal(columnsUsed(e2, "x"), "col") 40 | 41 | }) 42 | 43 | 44 | test_that("multiple literals for `[` combined with `c()`", { 45 | 46 | e = parse(text = ' 47 | foo(x[, c("col1", "col2")]) 48 | ') 49 | expect_equal(columnsUsed(e, "x"), c("col1", "col2")) 50 | 51 | }) 52 | 53 | 54 | test_that("redefinitions based on columns", { 55 | # This is a special case when we can quit the analysis early. 56 | 57 | e = parse(text = ' 58 | x = x[, c("col1", "col2")] 59 | foo(x) 60 | ') 61 | expect_equal(columnsUsed(e, "x"), c("col1", "col2")) 62 | 63 | }) 64 | 65 | 66 | test_that("Not sure which columns are actually used", { 67 | 68 | e = parse(text = ' 69 | foo(x) 70 | ') 71 | expect_null(columnsUsed(e, "x")) 72 | 73 | e2 = parse(text = ' 74 | foo(x[, "col"]) 75 | foo(x) 76 | ') 77 | expect_null(columnsUsed(e2, "x")) 78 | 79 | }) 80 | -------------------------------------------------------------------------------- /tests/testthat/test_custom_reduce.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | 3 | # An example of a user provided custom reduce function. 4 | naiveMeanReduce = reduceFun("mean" 5 | , summary = function(data) list(length = length(data), sum = sum(data)) 6 | , combine = function(...) 7 | { 8 | args = list(...) 
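# Each element of args is one chunk's partial summary, a list(length, sum).
# Combining just totals them; the query step then divides to get the mean.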
9 | lengths = sapply(args, `[[`, "length") 10 | sums = sapply(args, `[[`, "sum") 11 | list(length = sum(lengths), sum = sum(sums)) 12 | } 13 | , query = function(s) s$sum / s$length 14 | ) 15 | 16 | 17 | files = list.files("single_numeric_vector", pattern = "*.rds", full.names = TRUE) 18 | 19 | # Can surely do this for the user 20 | sizes = file.info(files)[, "size"] 21 | 22 | x_desc = ChunkDataFiles(varName = "x0" 23 | , files = files 24 | , sizes = sizes 25 | , readFuncName = "readRDS" 26 | ) 27 | 28 | outFile = "gen/custom_reduce.R" 29 | 30 | out = makeParallel(" 31 | x = sin(x0) 32 | result = mean(x) 33 | saveRDS(result, 'gen/result_custom_reduce.rds') 34 | " 35 | , data = x_desc 36 | , scheduler = scheduleDataParallel 37 | , platform = parallelLocalCluster() 38 | , chunkFuncs = c("sin") 39 | , reduceFuncs = list(naiveMeanReduce) 40 | , outFile = outFile 41 | , overWrite = TRUE 42 | ) 43 | 44 | 45 | # Test code 46 | ############################################################ 47 | if(identical(Sys.getenv("TESTTHAT"), "true")){ 48 | 49 | rr = 'gen/result_custom_reduce.rds' 50 | unlink(rr) 51 | source(outFile) 52 | 53 | result = readRDS(rr) 54 | # A cleaner way to test this would be to test that both the serial schedules and the parallel ones get the same result. 55 | expected = readRDS("expected/result_custom_reduce.rds") 56 | 57 | expect_equal(result, expected) 58 | 59 | s = schedule(out) 60 | block_class = sapply(s@blocks, class) 61 | 62 | expect_true("ReduceBlock" %in% block_class) 63 | 64 | } 65 | -------------------------------------------------------------------------------- /tests/testthat/test_dependGraph.R: -------------------------------------------------------------------------------- 1 | library(CodeDepends) 2 | library(igraph) 3 | 4 | 5 | # TODO: define behavior for this script: 6 | # x = list(a = 1) 7 | # x$b = 2 8 | # f(x) 9 | 10 | # Could define Ops to get ==, but this is sufficient 11 | expect_samegraph = function(g, tg) 12 | { 13 | tg2 = graph_from_data_frame(tg@graph) 14 | expect_true(isomorphic(g, tg2)) 15 | } 16 | 17 | 18 | test_that("Degenerate cases, 0 or 1 nodes", { 19 | 20 | s1 = parse(text = " 21 | x = 1 22 | ") 23 | g1 = make_graph(numeric(), n = 1) 24 | gd1 = inferGraph(s1) 25 | 26 | s0 = parse(text = " 27 | ") 28 | g0 = make_empty_graph() 29 | gd0 = inferGraph(s0) 30 | expect_samegraph(g0, gd0) 31 | 32 | skip("Not that important.") 33 | expect_samegraph(g1, gd1) 34 | 35 | }) 36 | 37 | 38 | test_that("User defined functions are dependencies", { 39 | 40 | s = parse(text = " 41 | f2 = function() 2 42 | x = f2() 43 | ") 44 | 45 | desired = make_graph(c(1, 2)) 46 | actual = inferGraph(s) 47 | 48 | expect_samegraph(desired, actual) 49 | 50 | }) 51 | 52 | 53 | test_that("Self referring node does not appear", { 54 | 55 | s = parse(text = " 56 | x = 1 57 | x = x + 2 58 | ") 59 | 60 | desired = make_graph(c(1, 2)) 61 | actual = inferGraph(s) 62 | 63 | expect_samegraph(desired, actual) 64 | 65 | }) 66 | 67 | 68 | test_that("Assignment order respected", { 69 | 70 | s = parse(text = " 71 | x = 1 72 | x = 2 73 | y = 2 * x 74 | ") 75 | 76 | desired = make_graph(c(2, 3)) 77 | actual = inferGraph(s) 78 | 79 | skip("Doesn't currently work because the graph doesn't know it has 3 80 | nodes rather than 2.") 81 | 82 | expect_samegraph(desired, actual) 83 | 84 | }) 85 | 86 | 87 | test_that("Chains not too long", { 88 | 89 | s = parse(text = " 90 | x = 1:10 91 | plot(x) 92 | y = 2 * x 93 | ") 94 | 95 | desired = make_graph(c(1, 2, 1, 3)) 96 | actual = inferGraph(s) 97 | 98 | 
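# The only edges should be 1 -> 2 and 1 -> 3: both plot(x) and y = 2 * x
# read x directly, so there is no spurious chain through statement 2.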
expect_samegraph(desired, actual) 99 | 100 | }) 101 | 102 | 103 | test_that("Updates count as dependencies", { 104 | 105 | s = parse(text = " 106 | x = list() 107 | x$a = 1 108 | x$b = 2 109 | ") 110 | 111 | desired = make_graph(c(1, 2, 2, 3)) 112 | actual = inferGraph(s) 113 | 114 | expect_samegraph(desired, actual) 115 | 116 | }) 117 | 118 | 119 | test_that("$ evaluates LHS", { 120 | 121 | s = parse(text = " 122 | f = function(x) 100 123 | optimize(f, c(0, 1))$minimum 124 | ") 125 | 126 | desired = make_graph(c(1, 2)) 127 | actual = inferGraph(s) 128 | 129 | expect_samegraph(desired, actual) 130 | 131 | }) 132 | 133 | 134 | test_that("Precedence for user defined variables over base", { 135 | 136 | s = parse(text = " 137 | c = 100 138 | print(c) 139 | ") 140 | 141 | desired = make_graph(c(1, 2)) 142 | actual = inferGraph(s) 143 | 144 | expect_samegraph(desired, actual) 145 | 146 | }) 147 | -------------------------------------------------------------------------------- /tests/testthat/test_extend_platform.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | 3 | POSIXLocalCluster = setClass("POSIXLocalCluster", contains = "ParallelLocalCluster") 4 | 5 | msg = "Yay right method" 6 | 7 | setMethod("generate", signature(schedule = "DataLoadBlock", platform = "POSIXLocalCluster", data = "ChunkDataFiles"), 8 | function(schedule, platform, data) 9 | { 10 | stop(msg) 11 | }) 12 | 13 | d = ChunkDataFiles(files = c("a.csv", "b.csv"), sizes = c(1, 2), readFuncName = "read.csv", varName = "x") 14 | p = POSIXLocalCluster(nWorkers = 1L, name = "cls") 15 | 16 | expect_error( 17 | makeParallel("lapply(x, f)", data = d 18 | , platform = p 19 | , scheduler = scheduleDataParallel 20 | , chunkFuncs = "lapply" 21 | ) 22 | , regexp = msg) 23 | -------------------------------------------------------------------------------- /tests/testthat/test_forLoop.R: -------------------------------------------------------------------------------- 1 | # DO NOT add more tests here. 
Put them in CodeAnalysis 2 | 3 | test_that("for loop to lapply", { 4 | 5 | loop1 = quote(for(i in x){f(i)}) 6 | actual = forLoopToLapply(loop1) 7 | expected = quote(lapply(x, function(i){f(i)})) 8 | 9 | expect_equal(actual, expected) 10 | 11 | # Can't be parallelized 12 | loop2 = quote(for(i in x){ 13 | y = f(y) 14 | }) 15 | 16 | expect_equal(forLoopToLapply(loop2), loop2) 17 | 18 | loop3 = quote(for(i in x){ 19 | tmp = foo() 20 | f(tmp, i) 21 | }) 22 | actual = forLoopToLapply(loop3) 23 | expected = quote(lapply(x, function(i){ 24 | tmp = foo() 25 | f(tmp, i) 26 | })) 27 | 28 | expect_equal(actual, expected) 29 | 30 | }) 31 | 32 | 33 | test_that("assignment inside for loop", { 34 | 35 | loop1 = quote( 36 | for (i in 1:500){ 37 | tmp = g() 38 | output[[i]] = tmp 39 | }) 40 | 41 | expected = quote( 42 | output[1:500] <- lapply(1:500, function(i) { 43 | tmp = g() 44 | tmp 45 | })) 46 | 47 | actual = forLoopToLapply(loop1) 48 | 49 | expect_equal(actual, expected) 50 | 51 | # True dependence, can't parallelize 52 | loop2 = quote( 53 | for (i in 1:500){ 54 | tmp = g(tmp) 55 | x[[i]] = tmp 56 | }) 57 | 58 | actual = forLoopToLapply(loop2) 59 | 60 | expect_equal(actual, loop2) 61 | 62 | # True dependence, can't parallelize 63 | loop3 = quote( 64 | for (i in 2:500){ 65 | x[[i]] = g(x[[i - 1]]) 66 | }) 67 | 68 | actual = forLoopToLapply(loop3) 69 | 70 | expect_equal(actual, loop3) 71 | 72 | }) 73 | -------------------------------------------------------------------------------- /tests/testthat/test_forkSchedule.R: -------------------------------------------------------------------------------- 1 | test_that("fork schedule", { 2 | 3 | code = parse(text = " 4 | x = foo() 5 | y = bar() 6 | foobar(x, y) 7 | ") 8 | 9 | g = inferGraph(code, time = c(2, 3, 1)) 10 | 11 | s = scheduleForkSeq(g, overhead = 1e-3) 12 | 13 | # Exactly one statement should be forked and thus appear twice. 14 | expect_true(xor(sum(s == 1) == 2, sum(s == 2) == 2)) 15 | 16 | plot(s) 17 | 18 | }) 19 | 20 | 21 | test_that("fork schedule on a larger script", { 22 | 23 | codewall = parse("codewall.R") 24 | 25 | # Some times are large, some not. 
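# (Half of them get a near-zero epsilon time below, so the scheduler sees a
# mix of statements that are worth forking and statements that are not.)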
26 | set.seed(8439) 27 | n = length(codewall) 28 | times = runif(n) 29 | epsilon = 1e-4 30 | times[sample.int(n, size = floor(n/2))] = epsilon 31 | 32 | g = inferGraph(codewall, time = times) 33 | 34 | s = scheduleFork(g) 35 | 36 | # TODO: Write plot method 37 | plot(s) 38 | 39 | }) 40 | 41 | 42 | test_that("Helper functions", { 43 | 44 | schedule = c(1, 2, 3, 4, 5, 3, 6) 45 | 46 | expect_equal(forkSplit(4, schedule), 47 | list(before = c(1, 2, 3) 48 | , hasnode = c(4, 5) 49 | , after = c(3, 6) 50 | )) 51 | 52 | expect_equal(forkSplit(1, schedule), 53 | list(before = integer() 54 | , hasnode = c(1, 2) 55 | , after = c(3, 4, 5, 3, 6) 56 | )) 57 | 58 | 59 | }) 60 | -------------------------------------------------------------------------------- /tests/testthat/test_group_by.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | 3 | files = list.files("iris_csv", pattern = "*.csv", full.names = TRUE) 4 | 5 | # Can surely do this for the user 6 | sizes = file.info(files)[, "size"] 7 | 8 | x_desc = ChunkDataFiles(varName = "iris2" 9 | , files = files 10 | , sizes = sizes 11 | , readFuncName = "read.csv" 12 | ) 13 | 14 | outFile = "gen/group_by.R" 15 | 16 | out = makeParallel(" 17 | species = iris2$Species 18 | iris2split = split(x = iris2, f = species) 19 | med_petal = sapply(iris2split, function(grp) median(grp$Petal.Length)) 20 | saveRDS(med_petal, 'gen/med_petal.rds') 21 | " 22 | , data = x_desc 23 | , scheduler = scheduleDataParallel 24 | , platform = parallelLocalCluster(scratchDir = "gen") 25 | , chunkFuncs = c("sapply", "$") 26 | , outFile = outFile 27 | , overWrite = TRUE 28 | ) 29 | 30 | 31 | # Test code 32 | ############################################################ 33 | if(identical(Sys.getenv("TESTTHAT"), "true")){ 34 | 35 | rr = 'gen/med_petal.rds' 36 | unlink(rr) 37 | source(outFile) 38 | 39 | result = readRDS(rr) 40 | expected = readRDS("expected/med_petal.rds") 41 | 42 | # Equal up to ordering 43 | expect_equal(result[names(expected)], expected) 44 | 45 | s = schedule(out) 46 | block_class = sapply(s@blocks, class) 47 | 48 | expect_true("SplitBlock" %in% block_class) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /tests/testthat/test_makeParallel.R: -------------------------------------------------------------------------------- 1 | oldcode = parse(text = " 2 | v1 = 'foo1' 3 | v2 = 'foo2' 4 | x <- paste0(v1, v1) 5 | y <- paste0(v2, v2) 6 | xy <- paste0(x, y) 7 | ") 8 | 9 | 10 | test_that("Defaults for generics used in parallelize.", { 11 | 12 | g = inferGraph(oldcode) 13 | 14 | s = schedule(g) 15 | newcode = generate(s) 16 | 17 | expect_s4_class(g, "TaskGraph") 18 | expect_s4_class(s, "Schedule") 19 | expect_s4_class(newcode, "GeneratedCode") 20 | 21 | fn = tempfile() 22 | writeCode(newcode, fn) 23 | expect_true(file.exists(fn)) 24 | unlink(fn) 25 | 26 | }) 27 | 28 | 29 | test_that("runMeasure", { 30 | 31 | g = runMeasure(oldcode) 32 | 33 | expect_s4_class(g, "MeasuredTaskGraph") 34 | 35 | }) 36 | 37 | 38 | test_that("Multiple assignment in single expression", { 39 | 40 | code = parse(text = " 41 | x = y = z = 1 42 | a = b = c = 2 43 | f(x, y, z, a, b, c) 44 | ", keep.source = FALSE) 45 | 46 | out = makeParallel(expr = code, scheduler = scheduleTaskList) 47 | 48 | # This test is specific to the implementation, and may need to change. 
49 | # The first two lines will be assigned to different processors, so 50 | # three transfers should happen regardless of which processor evaluates 51 | # the last line. 52 | trans = schedule(out)@transfer 53 | 54 | expect_equal(3, nrow(trans)) 55 | 56 | }) 57 | 58 | 59 | test_that("whole workflow on files", { 60 | 61 | exfile = tempfile() 62 | oldscript = system.file("examples/mp_example.R", package = "makeParallel") 63 | file.copy(from = oldscript, to = exfile) 64 | genfile = file.path(dirname(exfile), paste0("gen_", basename(exfile))) 65 | 66 | out = makeParallel(exfile, outFile = TRUE, scheduler = scheduleTaskList, nWorkers = 3) 67 | 68 | expect_s4_class(out, "GeneratedCode") 69 | 70 | plot(schedule(out)) 71 | 72 | expect_equal(file(out), genfile) 73 | 74 | expect_true(file.exists(genfile)) 75 | 76 | # 'Catching different types of errors' - This would make a nice blog post. 77 | e = tryCatch(makeParallel(exfile, outFile = genfile), error = identity) 78 | expect_true(is(e, "FileExistsError")) 79 | 80 | makeParallel(exfile, outFile = TRUE, overWrite = TRUE) 81 | 82 | unlink(genfile) 83 | makeParallel(exfile, outFile = FALSE) 84 | expect_false(file.exists(genfile)) 85 | 86 | fname = tempfile() 87 | out = makeParallel(exfile, scheduler = scheduleTaskList, outFile = fname) 88 | expect_true(file.exists(fname)) 89 | expect_equal(fname, file(out)) 90 | 91 | out = makeParallel(exfile, outFile = TRUE, scheduler = scheduleTaskList, prefix = "GEN") 92 | fn = file.path(dirname(exfile), paste0("GEN", basename(exfile))) 93 | expect_true(file.exists(fn)) 94 | expect_equal(fn, file(out)) 95 | 96 | }) 97 | 98 | 99 | test_that("string as input", { 100 | 101 | out = makeParallel(" 102 | x = 1:10 103 | y = 2 * x 104 | ") 105 | 106 | }) 107 | -------------------------------------------------------------------------------- /tests/testthat/test_map_reduce.R: -------------------------------------------------------------------------------- 1 | # See clarkfitzthesis/tex/vectorize document to see more details for what's going on, what this is working towards. 2 | # 3 | 4 | # The code to do the actual transformation 5 | # The user of makeParallel must write something like the following: 6 | 7 | library(makeParallel) 8 | 9 | # We need the files in this order to check the load balancing works. 10 | files = paste0("single_numeric_vector/", c("small1", "big", "small2"), ".rds") 11 | 12 | # Can surely do this for the user 13 | sizes = file.info(files)[, "size"] 14 | 15 | x_desc = ChunkDataFiles(varName = "x" 16 | , files = files 17 | , sizes = sizes 18 | , readFuncName = "readRDS" 19 | ) 20 | 21 | outFile = "gen/map_reduce.R" 22 | 23 | out = makeParallel(" 24 | y = sin(x) 25 | result = min(y) 26 | saveRDS(result, 'gen/result_map_reduce.rds') 27 | " 28 | , data = x_desc 29 | , nWorkers = 2L 30 | , scheduler = scheduleDataParallel 31 | , platform = parallelLocalCluster() 32 | , chunkFuncs = "sin" 33 | , outFile = outFile 34 | , overWrite = TRUE 35 | ) 36 | 37 | 38 | # Test code 39 | ############################################################ 40 | if(identical(Sys.getenv("TESTTHAT"), "true")){ 41 | 42 | # Check that the load balancing happens. 43 | expect_equal(schedule(out)@blocks[[1]]@assignmentIndices, c(1, 2, 1)) 44 | 45 | rr = "gen/result_map_reduce.rds" 46 | unlink(rr) 47 | source(outFile) 48 | 49 | result = readRDS(rr) 50 | 51 | expect_equal(result, 0) 52 | 53 | } 54 | 55 | 56 | 57 | if(FALSE){ 58 | # Duncan's example of fusing lapply's works just fine. 
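# ('Fusing', loosely: every step below is declared chunkable via chunkFuncs,
# so the scheduler can presumably keep y, y2, and y3 as chunks on the
# workers instead of collecting between calls.)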
59 | 60 | s = makeParallel(" 61 | y = lapply(x, sin) 62 | y2 = sapply(y, function(x) x^2) 63 | #ten = 10 64 | y3 = log(y2 + 2, base = 10) 65 | result = min(y3) 66 | " 67 | , data = list(x = x_desc) 68 | , nWorkers = 2L 69 | , scheduler = scheduleDataParallel 70 | , chunkFuncs = c("lapply", "sapply", "log", "+") 71 | , outFile = "lapply_example.R" 72 | , overWrite = TRUE 73 | ) 74 | 75 | schedule(s)@vectorIndices 76 | 77 | eval(s@code) 78 | 79 | } 80 | -------------------------------------------------------------------------------- /tests/testthat/test_median_reduce.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | 3 | # Implementation note- regarding the functions in strings, the code generator can check for :: in the string and generate a call to `::` instead of a symbol. 4 | # We don't want to inline package functions in generated code because they may use package internal objects, as in this case. 5 | # It's also messy. 6 | 7 | 8 | # Compute the median given a table where the names are the values 9 | tableMedian = function(tbl){ 10 | 11 | tbl = tbl[order(as.numeric(names(tbl)))] 12 | totals = cumsum(tbl) 13 | 14 | n = totals[length(totals)] 15 | med_count = n / 2 16 | 17 | med = names(tbl)[med_count <= totals][1L] 18 | as.numeric(med) 19 | } 20 | 21 | # Could generalize this with quantile through preprocessing instead of specifying all the reduces: 22 | # Change median -> quantile, which is more general 23 | # Could also change two or more calls into one: 24 | # iqr = c(quantile(x, 0.25), mean(x), quantile(x, 0.75)) 25 | 26 | # There's a whole class of statistics that can be computed from a table. 27 | # More generally, could return and estimate statistics from a KDE 28 | 29 | # For someone to write a predicate function they need to know about how we've implemented resources and propagation. 30 | # This is bad for extensibility. 31 | # Other idea: cost 32 | 33 | medianReduce = reduceFun("median" 34 | , summary = "table" 35 | , combine = "makeParallel::combine_tables" 36 | , query = tableMedian 37 | , predicate = function(r) !is.null(r[["uniqueValueBound"]]) && r[["uniqueValueBound"]] < 1000 38 | ) 39 | 40 | # quantile(x, 0.5) 41 | # quantile(a, 0.5) # few unique values 42 | # quantile(b, 0.5) # only care about approximation 43 | # Could imagine user declaring that they only need a specified amount of precision for some calculation, and using a different implementation. 
44 | # makeParallel(..., precision = c(x = 0.99)) 45 | 46 | files = list.files("single_numeric_few_distinct", pattern = "*.rds", full.names = TRUE) 47 | 48 | # Can surely do this for the user 49 | sizes = file.info(files)[, "size"] 50 | 51 | x_desc = ChunkDataFiles(varName = "x0" 52 | , files = files 53 | , sizes = sizes 54 | , readFuncName = "readRDS" 55 | , uniqueValueBound = 500 56 | ) 57 | 58 | outFile = "gen/median_reduce.R" 59 | 60 | out = makeParallel(" 61 | x = sin(x0) 62 | result = median(x) 63 | saveRDS(result, 'gen/result_median_reduce.rds') 64 | " 65 | , data = x_desc 66 | , scheduler = scheduleDataParallel 67 | , platform = parallelLocalCluster() 68 | , chunkFuncs = c("sin") 69 | , reduceFuncs = list(medianReduce) 70 | , outFile = outFile 71 | , overWrite = TRUE 72 | ) 73 | 74 | 75 | # Test code 76 | ############################################################ 77 | if(identical(Sys.getenv("TESTTHAT"), "true")){ 78 | 79 | rr = 'gen/result_median_reduce.rds' 80 | unlink(rr) 81 | source(outFile) 82 | 83 | result = readRDS(rr) 84 | # A cleaner way to test this would be to test that both the serial schedules and the parallel ones get the same result. 85 | expected = readRDS("expected/result_median_reduce.rds") 86 | 87 | expect_equal(result, expected) 88 | 89 | s = schedule(out) 90 | block_class = sapply(s@blocks, class) 91 | 92 | expect_true("ReduceBlock" %in% block_class) 93 | 94 | } 95 | -------------------------------------------------------------------------------- /tests/testthat/test_one_text_file.R: -------------------------------------------------------------------------------- 1 | # Testing the data source inference. 2 | 3 | library(makeParallel) 4 | 5 | out = makeParallel("range_of_dates.R", scheduler = scheduleDataParallel 6 | , chunkFuncs = c("[", "as.Date") 7 | , reduceFuncs = list(reduceFun("range"))) 8 | 9 | d = dataSource(out) 10 | 11 | writeCode(out, "gen_range_of_dates.R", overWrite = TRUE) 12 | 13 | source("gen_range_of_dates.R") 14 | 15 | schedule(out)@blocks 16 | 17 | 18 | # Test code 19 | ############################################################ 20 | if(identical(Sys.getenv("TESTTHAT"), "true")){ 21 | 22 | script = " 23 | dt = read.fwf('dates.txt', widths = 10L) 24 | d = as.Date(vals[, 1]) 25 | rd = range(d) 26 | print(rd) 27 | " 28 | 29 | out = makeParallel(script, scheduler = scheduleDataParallel) 30 | 31 | # Manual specification 32 | d0 = FixedWidthFiles(varName = "dt", files = "dates.txt", widths = 10L) 33 | 34 | # Inference on a single call 35 | d1 = dataSource(quote( 36 | dt <- read.fwf('dates.txt', widths = 10L) 37 | )) 38 | 39 | # Discovered in the original code by makeParallel 40 | d2 = dataSource(out) 41 | 42 | expect_equal(d0, d1) 43 | 44 | expect_equal(d0, d2) 45 | 46 | } 47 | 48 | 49 | if(FALSE){ 50 | # This is an extension mechanism, so could test it. 51 | # Ugly though. 52 | 53 | inferDataSourceFromCall.read.csv_Call = function(expr, ...) "Boom!" 54 | 55 | tst = dataSource(quote( 56 | dt <- read.csv('dates.txt', widths = 10L) 57 | )) 58 | 59 | # This is what Duncan was talking about where you actually have to make the method available on the search path. 60 | # The package code cannot find this. 61 | # Hence the need to make the function handler list available. 62 | 63 | inferDataSourceFromCall.read.fwf_Call = function(expr, ...) "Boom Boom!" 
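# (If dispatch could see this handler, then
# dataSource(quote(dt <- read.fwf('dates.txt', widths = 10L)))
# should return "Boom Boom!" rather than a FixedWidthFiles object.)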
64 | 65 | } 66 | -------------------------------------------------------------------------------- /tests/testthat/test_order.R: -------------------------------------------------------------------------------- 1 | test_that("basic ordering", 2 | { 3 | 4 | graph <- inferGraph(code = parse(text = "x <- 1:100 5 | y <- rep(1, 100) 6 | z <- x + y"), time = c(1, 2, 5)) 7 | bl <- orderBottomLevel(graph) 8 | 9 | # 2nd statement takes longer, so it should come before 1st. 10 | expect_equal(bl, c(2, 1, 3)) 11 | 12 | 13 | graph <- inferGraph(code = parse(text = "x <- 1:100 14 | y <- f(x) 15 | g(x)"), time = c(1, 2, 5)) 16 | bl <- bottomLevel(graph) 17 | 18 | expect_equal(bl, c(6, 2, 5)) 19 | 20 | }) 21 | -------------------------------------------------------------------------------- /tests/testthat/test_scheduleTaskList.R: -------------------------------------------------------------------------------- 1 | oldcode = parse(text = " 2 | v1 = 'foo1' 3 | v2 = 'foo2' 4 | x <- paste0(v1, v1) 5 | y <- paste0(v2, v2) 6 | xy <- paste0(x, y) 7 | ") 8 | 9 | 10 | test_that("Defaults", { 11 | 12 | g = inferGraph(oldcode) 13 | 14 | expect_warning(scheduleTaskList(g), "TimedTaskGraph") 15 | 16 | }) 17 | 18 | 19 | test_that("Plotting", { 20 | 21 | code = parse(text = " 22 | x = 1:10 23 | y = sin(x) 24 | pdf('sin.pdf') 25 | plot(x, y) 26 | dev.off() 27 | ") 28 | 29 | g = inferGraph(code) 30 | 31 | s = scheduleTaskList(g) 32 | 33 | plot(s) 34 | 35 | }) 36 | -------------------------------------------------------------------------------- /tests/testthat/test_two_blocks.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | 3 | maxReduce = reduceFun("max") 4 | 5 | files = list.files("single_numeric_vector", pattern = "*.rds", full.names = TRUE) 6 | 7 | # Can surely do this for the user 8 | sizes = file.info(files)[, "size"] 9 | 10 | x_desc = ChunkDataFiles(varName = "x0" 11 | , files = files 12 | , sizes = sizes 13 | , readFuncName = "readRDS" 14 | ) 15 | 16 | outFile = "gen/two_blocks.R" 17 | 18 | out = makeParallel(" 19 | x = sin(x0) # ParallelBlock 1 20 | y = cos(x0) 21 | z = ceiling(x) 22 | mx = median(x) # SerialBlock 1 23 | x2 = x + y + z - mx # ParallelBlock 2 24 | result = max(x2) # Reduce 1 25 | saveRDS(result, 'gen/result_two_blocks.rds') # SerialBlock2 26 | " 27 | , data = x_desc 28 | , scheduler = scheduleDataParallel 29 | , platform = parallelLocalCluster() 30 | , chunkFuncs = c("sin", "cos", "+", "-", "ceiling") 31 | , reduceFuncs = list(maxReduce) 32 | , outFile = outFile 33 | , overWrite = TRUE 34 | ) 35 | 36 | 37 | # Test code 38 | ############################################################ 39 | if(identical(Sys.getenv("TESTTHAT"), "true")){ 40 | 41 | rr = "gen/result_two_blocks.rds" 42 | unlink(rr) 43 | source(outFile) 44 | 45 | result = readRDS(rr) 46 | # A cleaner way to test this would be to test that both the serial schedules and the parallel ones get the same result. 
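# (Sketch of that cleaner test: evaluate the serial input code in a fresh
# environment and compare its `result` with the parallel run, instead of
# relying on a stored expected/*.rds file.)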
47 | expected = readRDS("expected/result_two_blocks.rds") 48 | 49 | expect_equal(result, expected) 50 | 51 | s = schedule(out) 52 | block_class = sapply(s@blocks, class) 53 | 54 | expect_equal(sum(block_class == "ParallelBlock"), 2L) 55 | 56 | expect_true("ReduceBlock" %in% block_class) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /tests/testthat/test_utils.R: -------------------------------------------------------------------------------- 1 | library(makeParallel) 2 | 3 | 4 | test_that("descendant removal", { 5 | 6 | nodes = list(c(1, 2), c(1, 2, 4)) 7 | 8 | actual = hasAncestors(nodes) 9 | 10 | expect_equal(c(FALSE, TRUE), actual) 11 | 12 | }) 13 | 14 | 15 | test_that("find_var", { 16 | 17 | expr = quote(x[1:5]) 18 | 19 | expect_equal(find_var(expr, "x"), list(2)) 20 | 21 | expr = quote(y[1:5]) 22 | expect_equal(find_var(expr, "x"), list()) 23 | 24 | expr = quote(mean(x[1:5])) 25 | expect_equal(find_var(expr, "x"), list(c(2, 2))) 26 | 27 | expr = quote(plot(dframe[, "d"])) 28 | actual = find_var(expr, "dframe") 29 | 30 | expect_equal(actual, list(c(2, 2))) 31 | 32 | expr = quote(mean(x[1:5]) + x) 33 | actual = find_var(expr, "x") 34 | # I don't care about the order of the elements of this list. 35 | expect_equal(actual, list(c(2, 2, 2), 3)) 36 | 37 | # Don't match character vectors 38 | expr = quote(paste("x", "y")) 39 | expect_equal(find_var(expr, "y"), list()) 40 | 41 | expr = parse(text = ' 42 | d = read.csv("data.csv") 43 | hist(d[, 2]) 44 | ') 45 | actual = find_var(expr, "read.csv") 46 | 47 | expect_equal(actual, list(c(1, 3, 1))) 48 | 49 | 50 | expr = parse(text = ' 51 | f = function(end, start = x) area(sin, start, end) 52 | f(x) 53 | ') 54 | 55 | # expr[[c(1, 3, 2, 2)]] = as.symbol("z") 56 | # TODO: Running the above results in `expr` still printing `x`, but 57 | # code evaluates as if it were changed to `z`. 58 | # I don't know what's happening. 
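# (Likely explanation, not verified here: parse() keeps srcref attributes,
# and printing an expression shows the original source from those attributes
# while evaluation uses the modified AST. Parsing with keep.source = FALSE,
# or stripping the srcref attributes, should make the printed form match.)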

    actual = find_var(expr, "x")

    expect_equal(actual, list(c(1, 3, 2, 2), c(2, 2)))

})


test_that("find_call", {

    e0 = quote(sapply(x, f))
    expect_equal(find_call(e0, "lapply"), list())

    e1 = quote(lapply(x, f))
    expect_equal(find_call(e1, "lapply"), list(1L))

    e2 = quote(y <- lapply(x, f))
    expect_equal(find_call(e2, "lapply"), list(c(3L, 1L)))

    e3 = quote(y <- c(lapply(x, f1), lapply(x, f2)))
    expect_equal(find_call(e3, "lapply"), list(c(3L, 2L, 1L), c(3L, 3L, 1L)))

    e4 = quote(y <- lapply(lapply(x, f), g))
    expect_equal(find_call(e4, "lapply"), list(c(3L, 1L), c(3L, 2L, 1L)))

})


test_that("tree methods", {

    tree = list(list(list(1, 2, 3), 4), 5)
    actual = tree[[c(1, 1, 2)]]
    expect_equal(actual, 2)

})


test_that("symbol replacement", {

    e = parse(text = "
        bar = FOO
        BAZ
        ", keep.source = FALSE)

    actual = substitute_language(e, list(FOO = quote(foo_new), BAZ = quote(f(g(foo_new)))))

    expected = parse(text = "
        bar = foo_new
        f(g(foo_new))
        ", keep.source = FALSE)

    expect_equal(actual, expected)

    e2 = parse(text = "
        `_BODY`
        foo(bar)
        ", keep.source = FALSE)

    actual = substitute_language(e2, list(`_BODY` = e))

    expected = parse(text = "
        {
        bar = FOO
        BAZ
        }
        foo(bar)
        ", keep.source = FALSE)

    expect_equal(actual, expected)

})
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
*.R
.build.timestamp
--------------------------------------------------------------------------------
/vignettes/basic_model.dot:
--------------------------------------------------------------------------------
digraph G {
    //rankdir = LR;
    rankdir = TB;

    node [shape = "none"];
    InCode [label = "input\ncode"];
    OutCode [label = "output\ncode"];

    node [shape = "rectangle", fontname = "courier"];
    edge [fontname = "courier"];

    //label = "Rectangles represent data structures
    //Ovals represent functions
    //";
    //labeljust=right;

    InCode -> DependGraph [label = " inferGraph"]
    DependGraph -> Schedule [label = " schedule"]
    Schedule -> GeneratedCode [label = " generate"]
    GeneratedCode -> OutCode [label = " writeCode"]

}
--------------------------------------------------------------------------------
/vignettes/basic_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/vignettes/basic_model.png
--------------------------------------------------------------------------------
/vignettes/extensible.dot:
--------------------------------------------------------------------------------
digraph G {
    rankdir = LR;

    labeljust=right;

    node [shape = "rectangle", fontname = "arial"];
    //------------------------------------------------------------
    file_name [label = "file\nname"];
    parsed_expression [label = "parsed\nexpression"];

    node [shape = "rectangle", fontname = "courier"];
    //------------------------------------------------------------
    DependGraph;
    TaskSchedule;
    MapSchedule;
    SerialSchedule;
    GeneratedCode;

    node [shape = "oval", fontname = "arial"];
    //------------------------------------------------------------
    socket_code_generator [label = "socket code\ngenerator"];
    parallel_package_code_generator [label = "parallel package\ncode generator"];

    node [shape = "oval", fontname = "courier"];
    //------------------------------------------------------------
    inferGraph;
    scheduleTaskList;
    map_scheduler [label = "schedule\n(default)"];

    node [style = "filled", shape = "oval", fontname = "courier"];
    //------------------------------------------------------------
    scheduleForkJoin;
    scheduleCluster;
    ForkJoinSchedule [shape = "rectangle"];

    node [fontname = "arial"];
    hive_code_generator [label = "Apache Hive\ncode generator"];
    snow_code_generator [label = "SNOW code\ngenerator"];
    future_code_generator [label = "R future package\ncode generator"];
    mcparallel_code_generator [label = "parallel::mcparallel\ncode generator"];


    file_name -> inferGraph;
    parsed_expression -> inferGraph;
    inferGraph -> DependGraph;
    DependGraph -> scheduleTaskList;
    DependGraph -> map_scheduler;
    DependGraph -> scheduleForkJoin;
    DependGraph -> scheduleCluster;
    scheduleTaskList -> TaskSchedule;
    scheduleCluster -> TaskSchedule;
    map_scheduler -> MapSchedule;
    SerialSchedule -> GeneratedCode [label = "non op"];
    TaskSchedule -> socket_code_generator;
    MapSchedule -> parallel_package_code_generator;
    MapSchedule -> hive_code_generator;
    MapSchedule -> snow_code_generator;
    hive_code_generator -> GeneratedCode;
    snow_code_generator -> GeneratedCode;
    socket_code_generator -> GeneratedCode;
    parallel_package_code_generator -> GeneratedCode;
    scheduleForkJoin -> ForkJoinSchedule;
    ForkJoinSchedule -> future_code_generator;
    ForkJoinSchedule -> mcparallel_code_generator;
    future_code_generator -> GeneratedCode;
    mcparallel_code_generator -> GeneratedCode;

    // Bypass
    edge [style = "dotted"];
    //------------------------------------------------------------
    scheduleTaskList -> SerialSchedule;
    map_scheduler -> SerialSchedule;
    scheduleForkJoin -> SerialSchedule;
    scheduleCluster -> SerialSchedule;
}
--------------------------------------------------------------------------------
/vignettes/extensible.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clarkfitzg/makeParallel/6e43f34f51a23692907ec1563d3d47a8e189d7bf/vignettes/extensible.png
--------------------------------------------------------------------------------
/vignettes/standardized_code.md:
--------------------------------------------------------------------------------
## Standardized Code

A couple of things are becoming clear to me:

1. We need to preserve the code as it comes into the function, since we infer everything from this original code.
2. It's often easier to do the scheduling on modified code.

Therefore it would be useful to have two versions of the code: one original, and one modified.
Nick and I have spoken about this before: putting the code in 'canonical form'.
I'll call it 'standardized code', so I can use a more common word.

The main thing I want right now from the standardized code is to group plotting edges.
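
To make this concrete, here is a rough sketch of the grouping (mine, not the package's actual transformation): the device calls and the plot call become one block expression, i.e. one schedulable node with ordinary use-def edges.

```r
x = 1:10

# These three statements share the current graphics device, so a scheduler
# must not reorder or separate them, even though a pure use-def graph would
# see no connection between pdf() and dev.off().
pdf("sin.pdf")
plot(x, sin(x))
dev.off()

# Standardized form: collapse them into a single block that uses `x`.
{
    pdf("sin.pdf")
    plot(x, sin(x))
    dev.off()
}
```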
We could also change `for` loops -> `lapply` at this step (a sketch of this rewrite appears at the end of this note).
In the future I could see wanting more, for example:

- break down to subexpressions, and schedule those individually
- transform the magrittr pipe, `%>%`, into regular looking calls, e.g. `x %>% f(y)` becomes `f(x, y)`
- single static assignment

It should be the responsibility of makeParallel to standardize the code.
Otherwise, we would require the user to write their code according to our model, which is exactly what we don't want.
Also, if the package controls the standardization, then we can continue to modify it; it doesn't have to be fixed or concretely specified.

All of these things will require us to change the timing information.

For scheduling we only need the standardized code, and the graph that comes from the standardized code.

The new model then would transform in these steps:

- User code
- Standardized code
- Dependency graph (with only use-def edges)
- Schedule
- Generated code

We may use some form of dependency graph on the user code to help standardize it, but this is really just an implementation detail.
The scheduler needs to consume a dependency graph with only use-def edges, because this keeps the steps modular.
Each scheduler doesn't need to handle arbitrary types of edges.
What we can do is 'collapse' all edges which are not use-def edges down into blocks.
This collapsing step can do all the changes to the timings.

I may be overthinking it with the standardized code.
How much transparency do we need for each step?
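
To make the `for` -> `lapply` rewrite concrete, here is roughly the transformation I have in mind (a sketch of the idea; compare the package's `forLoopToLapply` for the real rewrite):

```r
# A loop where each iteration fills one slot and no iteration depends on
# another...
result = numeric(10)
for (i in 1:10) {
    result[i] = sqrt(i)
}

# ...standardizes to an lapply, which makes that independence explicit and
# can later be swapped for a parallel map such as parallel::mclapply.
result = unlist(lapply(1:10, function(i) sqrt(i)))
```
--------------------------------------------------------------------------------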