├── .gitignore
├── Makefile
├── README.md
├── paper
    ├── Makefile
    ├── jss.bst
    ├── jss.cls
    ├── jss5097.Rnw
    ├── jss5097.bib
    ├── jsslogo.jpg
    ├── notes
    │   ├── Makefile
    │   └── notes.tex
    ├── orcidlink.sty
    ├── reviews
    │   └── D- 5097 post comments.txt
    └── setup.R
└── pkg
    ├── DESCRIPTION
    ├── NAMESPACE
    ├── NEWS
    ├── R
        ├── accumulate.R
        ├── helpers.R
        ├── producers.R
        └── utils.R
    ├── data
        └── producers.rda
    ├── inst
        ├── CITATION
        └── tinytest
        │   ├── test_accumulate.R
        │   ├── test_helpers.R
        │   ├── test_internals.R
        │   ├── test_object_list.R
        │   └── test_utils.R
    ├── tests
        └── tinytest.R
    └── vignettes
        ├── introduction.md
        └── style.css


/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | *.swp
 8 | 
 9 | # Example code in package build process
10 | *-Ex.R
11 | 
12 | # Output files from R CMD build
13 | /*.tar.gz
14 | 
15 | # Output files from R CMD check
16 | /*.Rcheck/
17 | 
18 | # RStudio files
19 | .Rproj.user/
20 | *.Rproj
21 | 
22 | # produced vignettes
23 | pkg/vignettes/*.html
24 | pkg/vignettes/*.pdf
25 | 
26 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
27 | .httr-oauth
28 | 
29 | # knitr and R markdown default cache directories
30 | /*_cache/
31 | /cache/
32 | 
33 | # Temporary files created by R markdown
34 | *.utf8.md
35 | *.knit.md
36 | .Rproj.user
37 | 
38 | # Files created by roxygen
39 | pkg/man/*
40 | 
41 | # paper.pdf
42 | paper/*.pdf
43 | 
44 | # other files I frequently generate
45 | manual.pdf
46 | README.html
47 | *.csv
48 | *.toc
49 | *.aux
50 | *.log
51 | *.out
52 | *.tex
53 | *.bbl
54 | *.blg
55 | 
56 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | doc: 
 3 | 	R -s -e "pkgload::load_all('pkg');roxygen2::roxygenize('pkg')"
 4 | 
 5 | pkg: doc
 6 | 	rm -f *.tar.gz
 7 | 	R CMD build pkg
 8 | 
 9 | check: doc
10 | 	rm -rf *.tar.gz
11 | 	R CMD build pkg
12 | 	R CMD check *.tar.gz
13 | 
14 | cran: doc
15 | 	rm -rf *.tar.gz
16 | 	R CMD build --compact-vignettes="gs+qpdf" ./pkg
17 | 	R CMD check --as-cran *.tar.gz
18 | 
19 | install: doc
20 | 	rm -rf *.tar.gz
21 | 	R CMD build pkg
22 | 	R CMD INSTALL *.tar.gz
23 | 
24 | test: doc
25 | 	R -s -e "tinytest::build_install_test('pkg')"
26 | 
27 | manual: doc
28 | 	R CMD Rd2pdf --force -o manual.pdf ./pkg
29 | 
30 | revdep: pkg
31 | 	rm -rf revdep
32 | 	mkdir revdep
33 | 	mv *.tar.gz revdep
34 | 	R -s -e "out <- tools::check_packages_in_dir('revdep',reverse=list(which='most'),Ncpus=3); print(summary(out)); saveRDS(out, file='revdep/output.RDS')"
35 | 
36 | vignette: 
37 | 	mkdir -p out
38 | 	rm -rf out/*
39 | 	cp pkg/vignettes/* ./out
40 | 	cd out; R -e "simplermarkdown::mdweave_to_html('introduction.md')"
41 | 	
42 | 
43 | clean:
44 | 	rm -f pkg/vignettes/*.aux
45 | 	rm -f pkg/vignettes/*.log
46 | 	rm -f pkg/vignettes/*.out
47 | 	rm -f pkg/vignettes/using_lumberjack.pdf
48 | 	rm -f pkg/vignettes/*.toc
49 | 	rm -f pkg/vignettes/*.csv
50 | 	rm -f pkg/vignettes/*.html
51 | 	rm -rf *.Rcheck
52 | 	rm -rf revdep
53 | 	rm -f *.tar.gz
54 | 
55 | 
56 | 
57 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | [![CRAN](http://www.r-pkg.org/badges/version/accumulate)](http://cran.r-project.org/package=accumulate/)
 3 | [![Downloads](https://cranlogs.r-pkg.org/badges/accumulate)](http://cran.r-project.org/package=accumulate/)
 4 | [![status](https://tinyverse.netlify.app/badge/accumulate)](https://CRAN.R-project.org/package=accumulate)
 5 | 
 6 | # accumulate
 7 | 
 8 | Split-apply-combine aggregation with dynamic grouping. 
 9 | 
10 | The packages implements grouped aggregation, but rather than having static
11 | groups like in `stats::aggregate()` or `dplyr::group_by()` it is possible to
12 | change the grouping according to a user-defined scheme. For example, one may
13 | demand that groups contain at least _n_ records, and collapse certain groups
14 | together if this is not the case.
15 | 
16 | 
17 | ## Installing
18 | 
19 | The latest CRAN release can be installed as usual
20 | ```r
21 | install.packages("accumulate")
22 | ```
23 | 
24 | The git version can be installed by cloning the repo and using `make`.
25 | 
26 | ```bash
27 | git clone https://github.com/markvanderloo/accumulate
28 | cd accumulate
29 | make install
30 | ```
31 | 
32 | No guarantees that it will actually build, install, or give correct results.
33 | (This is after all the place where development takes place).
34 | 
35 | 
36 | ## Example
37 | 
38 | See [the introductory vignette](pkg/vignettes/introduction.md).
39 | 
40 | 
41 | ## Licence
42 | 
43 | This software is released under [EUPL](https://commission.europa.eu/content/european-union-public-licence_en).
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/paper/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | jss5097.pdf: jss5097.Rnw jss5097.bib
 3 | 	R -s -e "Sweave('jss5097.Rnw')"
 4 | 	pdflatex jss5097.tex
 5 | 	bibtex jss5097
 6 | 	pdflatex jss5097.tex
 7 | 	pdflatex jss5097.tex
 8 | 	evince jss5097.pdf&
 9 | 
10 | jss5097.R: jss5097.Rnw
11 | 	R -s -e "Stangle('jss5097.Rnw')"
12 | 
13 | clean:
14 | 	rm -f *.aux *.log *.blg *.bbl *.out
15 | 
16 | 


--------------------------------------------------------------------------------
/paper/jss.bst:
--------------------------------------------------------------------------------
   1 | %%
   2 | %% This is file `jss.bst',
   3 | %% generated with the docstrip utility.
   4 | %%
   5 | %% The original source files were:
   6 | %%
   7 | %% merlin.mbs  (with options: `ay,nat,nm-rvx,keyxyr,dt-beg,yr-par,note-yr,tit-qq,atit-u,trnum-it,vol-bf,volp-com,num-xser,pre-edn,isbn,issn,edpar,pp,ed,xedn,xand,etal-it,revdata,eprint,url,url-blk,doi,nfss')
   8 | %%
   9 | %% ** BibTeX style file for JSS publications (http://www.jstatsoft.org/)
  10 | %%
  11 | %% License: GPL-2 | GPL-3
  12 |  % ===============================================================
  13 |  % IMPORTANT NOTICE:
  14 |  % This bibliographic style (bst) file has been generated from one or
  15 |  % more master bibliographic style (mbs) files, listed above, provided
  16 |  % with kind permission of Patrick W Daly.
  17 |  %
  18 |  % This generated file can be redistributed and/or modified under the terms
  19 |  % of the General Public License (Version 2 or 3).
  20 |  % ===============================================================
  21 |  % Name and version information of the main mbs file:
  22 |  % \ProvidesFile{merlin.mbs}[2011/11/18 4.33 (PWD, AO, DPC)]
  23 |  %   For use with BibTeX version 0.99a or later
  24 |  %-------------------------------------------------------------------
  25 |  % This bibliography style file is intended for texts in ENGLISH
  26 |  % This is an author-year citation style bibliography. As such, it is
  27 |  % non-standard LaTeX, and requires a special package file to function properly.
  28 |  % Such a package is    natbib.sty   by Patrick W. Daly
  29 |  % The form of the \bibitem entries is
  30 |  %   \bibitem[Jones et al.(1990)]{key}...
  31 |  %   \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}...
  32 |  % The essential feature is that the label (the part in brackets) consists
  33 |  % of the author names, as they should appear in the citation, with the year
  34 |  % in parentheses following. There must be no space before the opening
  35 |  % parenthesis!
  36 |  % With natbib v5.3, a full list of authors may also follow the year.
  37 |  % In natbib.sty, it is possible to define the type of enclosures that is
  38 |  % really wanted (brackets or parentheses), but in either case, there must
  39 |  % be parentheses in the label.
  40 |  % The \cite command functions as follows:
  41 |  %   \citet{key} ==>>                Jones et al. (1990)
  42 |  %   \citet*{key} ==>>               Jones, Baker, and Smith (1990)
  43 |  %   \citep{key} ==>>                (Jones et al., 1990)
  44 |  %   \citep*{key} ==>>               (Jones, Baker, and Smith, 1990)
  45 |  %   \citep[chap. 2]{key} ==>>       (Jones et al., 1990, chap. 2)
  46 |  %   \citep[e.g.][]{key} ==>>        (e.g. Jones et al., 1990)
  47 |  %   \citep[e.g.][p. 32]{key} ==>>   (e.g. Jones et al., 1990, p. 32)
  48 |  %   \citeauthor{key} ==>>           Jones et al.
  49 |  %   \citeauthor*{key} ==>>          Jones, Baker, and Smith
  50 |  %   \citeyear{key} ==>>             1990
  51 |  %---------------------------------------------------------------------
  52 | 
  53 | ENTRY
  54 |   { address
  55 |     archive
  56 |     author
  57 |     booktitle
  58 |     chapter
  59 |     collaboration
  60 |     doi
  61 |     edition
  62 |     editor
  63 |     eid
  64 |     eprint
  65 |     howpublished
  66 |     institution
  67 |     isbn
  68 |     issn
  69 |     journal
  70 |     key
  71 |     month
  72 |     note
  73 |     number
  74 |     numpages
  75 |     organization
  76 |     pages
  77 |     publisher
  78 |     school
  79 |     series
  80 |     title
  81 |     type
  82 |     url
  83 |     volume
  84 |     year
  85 |   }
  86 |   {}
  87 |   { label extra.label sort.label short.list }
  88 | INTEGERS { output.state before.all mid.sentence after.sentence after.block }
  89 | FUNCTION {init.state.consts}
  90 | { #0 'before.all :=
  91 |   #1 'mid.sentence :=
  92 |   #2 'after.sentence :=
  93 |   #3 'after.block :=
  94 | }
  95 | STRINGS { s t}
  96 | FUNCTION {output.nonnull}
  97 | { 's :=
  98 |   output.state mid.sentence =
  99 |     { ", " * write$ }
 100 |     { output.state after.block =
 101 |         { add.period$ write$
 102 |           newline$
 103 |           "\newblock " write$
 104 |         }
 105 |         { output.state before.all =
 106 |             'write$
 107 |             { add.period$ " " * write$ }
 108 |           if$
 109 |         }
 110 |       if$
 111 |       mid.sentence 'output.state :=
 112 |     }
 113 |   if$
 114 |   s
 115 | }
 116 | FUNCTION {output}
 117 | { duplicate$ empty$
 118 |     'pop$
 119 |     'output.nonnull
 120 |   if$
 121 | }
 122 | FUNCTION {output.check}
 123 | { 't :=
 124 |   duplicate$ empty$
 125 |     { pop$ "empty " t * " in " * cite$ * warning$ }
 126 |     'output.nonnull
 127 |   if$
 128 | }
 129 | FUNCTION {fin.entry}
 130 | { add.period$
 131 |   write$
 132 |   newline$
 133 | }
 134 | 
 135 | FUNCTION {new.block}
 136 | { output.state before.all =
 137 |     'skip$
 138 |     { after.block 'output.state := }
 139 |   if$
 140 | }
 141 | FUNCTION {new.sentence}
 142 | { output.state after.block =
 143 |     'skip$
 144 |     { output.state before.all =
 145 |         'skip$
 146 |         { after.sentence 'output.state := }
 147 |       if$
 148 |     }
 149 |   if$
 150 | }
 151 | FUNCTION {add.blank}
 152 | {  " " * before.all 'output.state :=
 153 | }
 154 | 
 155 | FUNCTION {date.block}
 156 | {
 157 |   new.block
 158 | }
 159 | 
 160 | FUNCTION {not}
 161 | {   { #0 }
 162 |     { #1 }
 163 |   if$
 164 | }
 165 | FUNCTION {and}
 166 | {   'skip$
 167 |     { pop$ #0 }
 168 |   if$
 169 | }
 170 | FUNCTION {or}
 171 | {   { pop$ #1 }
 172 |     'skip$
 173 |   if$
 174 | }
 175 | FUNCTION {non.stop}
 176 | { duplicate$
 177 |    "}" * add.period$
 178 |    #-1 #1 substring$ "." =
 179 | }
 180 | 
 181 | STRINGS {z}
 182 | 
 183 | FUNCTION {remove.dots}
 184 | { 'z :=
 185 |    ""
 186 |    { z empty$ not }
 187 |    { z #1 #2 substring$
 188 |      duplicate$ "\." =
 189 |        { z #3 global.max$ substring$ 'z :=  * }
 190 |        { pop$
 191 |          z #1 #1 substring$
 192 |          z #2 global.max$ substring$ 'z :=
 193 |          duplicate$ "." = 'pop$
 194 |            { * }
 195 |          if$
 196 |        }
 197 |      if$
 198 |    }
 199 |    while$
 200 | }
 201 | FUNCTION {new.block.checkb}
 202 | { empty$
 203 |   swap$ empty$
 204 |   and
 205 |     'skip$
 206 |     'new.block
 207 |   if$
 208 | }
 209 | FUNCTION {field.or.null}
 210 | { duplicate$ empty$
 211 |     { pop$ "" }
 212 |     'skip$
 213 |   if$
 214 | }
 215 | FUNCTION {emphasize}
 216 | { duplicate$ empty$
 217 |     { pop$ "" }
 218 |     { "\emph{" swap$ * "}" * }
 219 |   if$
 220 | }
 221 | FUNCTION {bolden}
 222 | { duplicate$ empty$
 223 |     { pop$ "" }
 224 |     { "\textbf{" swap$ * "}" * }
 225 |   if$
 226 | }
 227 | FUNCTION {tie.or.space.prefix}
 228 | { duplicate$ text.length$ #3 <
 229 |     { "~" }
 230 |     { " " }
 231 |   if$
 232 |   swap$
 233 | }
 234 | 
 235 | FUNCTION {capitalize}
 236 | { "u" change.case$ "t" change.case$ }
 237 | 
 238 | FUNCTION {space.word}
 239 | { " " swap$ * " " * }
 240 |  % Here are the language-specific definitions for explicit words.
 241 |  % Each function has a name bbl.xxx where xxx is the English word.
 242 |  % The language selected here is ENGLISH
 243 | FUNCTION {bbl.and}
 244 | { "and"}
 245 | 
 246 | FUNCTION {bbl.etal}
 247 | { "et~al." }
 248 | 
 249 | FUNCTION {bbl.editors}
 250 | { "eds." }
 251 | 
 252 | FUNCTION {bbl.editor}
 253 | { "ed." }
 254 | 
 255 | FUNCTION {bbl.edby}
 256 | { "edited by" }
 257 | 
 258 | FUNCTION {bbl.edition}
 259 | { "edition" }
 260 | 
 261 | FUNCTION {bbl.volume}
 262 | { "volume" }
 263 | 
 264 | FUNCTION {bbl.of}
 265 | { "of" }
 266 | 
 267 | FUNCTION {bbl.number}
 268 | { "number" }
 269 | 
 270 | FUNCTION {bbl.nr}
 271 | { "no." }
 272 | 
 273 | FUNCTION {bbl.in}
 274 | { "in" }
 275 | 
 276 | FUNCTION {bbl.pages}
 277 | { "pp." }
 278 | 
 279 | FUNCTION {bbl.page}
 280 | { "p." }
 281 | 
 282 | FUNCTION {bbl.eidpp}
 283 | { "pages" }
 284 | 
 285 | FUNCTION {bbl.chapter}
 286 | { "chapter" }
 287 | 
 288 | FUNCTION {bbl.techrep}
 289 | { "Technical Report" }
 290 | 
 291 | FUNCTION {bbl.mthesis}
 292 | { "Master's thesis" }
 293 | 
 294 | FUNCTION {bbl.phdthesis}
 295 | { "Ph.D. thesis" }
 296 | 
 297 | MACRO {jan} {"January"}
 298 | 
 299 | MACRO {feb} {"February"}
 300 | 
 301 | MACRO {mar} {"March"}
 302 | 
 303 | MACRO {apr} {"April"}
 304 | 
 305 | MACRO {may} {"May"}
 306 | 
 307 | MACRO {jun} {"June"}
 308 | 
 309 | MACRO {jul} {"July"}
 310 | 
 311 | MACRO {aug} {"August"}
 312 | 
 313 | MACRO {sep} {"September"}
 314 | 
 315 | MACRO {oct} {"October"}
 316 | 
 317 | MACRO {nov} {"November"}
 318 | 
 319 | MACRO {dec} {"December"}
 320 | 
 321 | MACRO {acmcs} {"ACM Computing Surveys"}
 322 | 
 323 | MACRO {acta} {"Acta Informatica"}
 324 | 
 325 | MACRO {cacm} {"Communications of the ACM"}
 326 | 
 327 | MACRO {ibmjrd} {"IBM Journal of Research and Development"}
 328 | 
 329 | MACRO {ibmsj} {"IBM Systems Journal"}
 330 | 
 331 | MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
 332 | 
 333 | MACRO {ieeetc} {"IEEE Transactions on Computers"}
 334 | 
 335 | MACRO {ieeetcad}
 336 |  {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
 337 | 
 338 | MACRO {ipl} {"Information Processing Letters"}
 339 | 
 340 | MACRO {jacm} {"Journal of the ACM"}
 341 | 
 342 | MACRO {jcss} {"Journal of Computer and System Sciences"}
 343 | 
 344 | MACRO {scp} {"Science of Computer Programming"}
 345 | 
 346 | MACRO {sicomp} {"SIAM Journal on Computing"}
 347 | 
 348 | MACRO {tocs} {"ACM Transactions on Computer Systems"}
 349 | 
 350 | MACRO {tods} {"ACM Transactions on Database Systems"}
 351 | 
 352 | MACRO {tog} {"ACM Transactions on Graphics"}
 353 | 
 354 | MACRO {toms} {"ACM Transactions on Mathematical Software"}
 355 | 
 356 | MACRO {toois} {"ACM Transactions on Office Information Systems"}
 357 | 
 358 | MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
 359 | 
 360 | MACRO {tcs} {"Theoretical Computer Science"}
 361 | FUNCTION {bibinfo.check}
 362 | { swap$
 363 |   duplicate$ missing$
 364 |     {
 365 |       pop$ pop$
 366 |       ""
 367 |     }
 368 |     { duplicate$ empty$
 369 |         {
 370 |           swap$ pop$
 371 |         }
 372 |         { swap$
 373 |           pop$
 374 |         }
 375 |       if$
 376 |     }
 377 |   if$
 378 | }
 379 | FUNCTION {bibinfo.warn}
 380 | { swap$
 381 |   duplicate$ missing$
 382 |     {
 383 |       swap$ "missing " swap$ * " in " * cite$ * warning$ pop$
 384 |       ""
 385 |     }
 386 |     { duplicate$ empty$
 387 |         {
 388 |           swap$ "empty " swap$ * " in " * cite$ * warning$
 389 |         }
 390 |         { swap$
 391 |           pop$
 392 |         }
 393 |       if$
 394 |     }
 395 |   if$
 396 | }
 397 | FUNCTION {format.eprint}
 398 | { eprint duplicate$ empty$
 399 |     'skip$
 400 |     { "\eprint"
 401 |       archive empty$
 402 |         'skip$
 403 |         { "[" * archive * "]" * }
 404 |       if$
 405 |       "{" * swap$ * "}" *
 406 |     }
 407 |   if$
 408 | }
 409 | FUNCTION {format.url}
 410 | {
 411 |   url
 412 |   duplicate$ empty$
 413 |     { pop$ "" }
 414 |     { "\urlprefix\url{" swap$ * "}" * }
 415 |   if$
 416 | }
 417 | 
 418 | INTEGERS { nameptr namesleft numnames }
 419 | 
 420 | 
 421 | STRINGS  { bibinfo}
 422 | 
 423 | FUNCTION {format.names}
 424 | { 'bibinfo :=
 425 |   duplicate$ empty$ 'skip$ {
 426 |   's :=
 427 |   "" 't :=
 428 |   #1 'nameptr :=
 429 |   s num.names$ 'numnames :=
 430 |   numnames 'namesleft :=
 431 |     { namesleft #0 > }
 432 |     { s nameptr
 433 |       "{vv~}{ll}{ jj}{ f{}}"
 434 |       format.name$
 435 |       remove.dots
 436 |       bibinfo bibinfo.check
 437 |       't :=
 438 |       nameptr #1 >
 439 |         {
 440 |           namesleft #1 >
 441 |             { ", " * t * }
 442 |             {
 443 |               s nameptr "{ll}" format.name$ duplicate$ "others" =
 444 |                 { 't := }
 445 |                 { pop$ }
 446 |               if$
 447 |               "," *
 448 |               t "others" =
 449 |                 {
 450 |                   " " * bbl.etal emphasize *
 451 |                 }
 452 |                 { " " * t * }
 453 |               if$
 454 |             }
 455 |           if$
 456 |         }
 457 |         't
 458 |       if$
 459 |       nameptr #1 + 'nameptr :=
 460 |       namesleft #1 - 'namesleft :=
 461 |     }
 462 |   while$
 463 |   } if$
 464 | }
 465 | FUNCTION {format.names.ed}
 466 | {
 467 |   'bibinfo :=
 468 |   duplicate$ empty$ 'skip$ {
 469 |   's :=
 470 |   "" 't :=
 471 |   #1 'nameptr :=
 472 |   s num.names$ 'numnames :=
 473 |   numnames 'namesleft :=
 474 |     { namesleft #0 > }
 475 |     { s nameptr
 476 |       "{f{}~}{vv~}{ll}{ jj}"
 477 |       format.name$
 478 |       remove.dots
 479 |       bibinfo bibinfo.check
 480 |       't :=
 481 |       nameptr #1 >
 482 |         {
 483 |           namesleft #1 >
 484 |             { ", " * t * }
 485 |             {
 486 |               s nameptr "{ll}" format.name$ duplicate$ "others" =
 487 |                 { 't := }
 488 |                 { pop$ }
 489 |               if$
 490 |               "," *
 491 |               t "others" =
 492 |                 {
 493 | 
 494 |                   " " * bbl.etal emphasize *
 495 |                 }
 496 |                 { " " * t * }
 497 |               if$
 498 |             }
 499 |           if$
 500 |         }
 501 |         't
 502 |       if$
 503 |       nameptr #1 + 'nameptr :=
 504 |       namesleft #1 - 'namesleft :=
 505 |     }
 506 |   while$
 507 |   } if$
 508 | }
 509 | FUNCTION {format.key}
 510 | { empty$
 511 |     { key field.or.null }
 512 |     { "" }
 513 |   if$
 514 | }
 515 | 
 516 | FUNCTION {format.authors}
 517 | { author "author" format.names
 518 |     duplicate$ empty$ 'skip$
 519 |     { collaboration "collaboration" bibinfo.check
 520 |       duplicate$ empty$ 'skip$
 521 |         { " (" swap$ * ")" * }
 522 |       if$
 523 |       *
 524 |     }
 525 |   if$
 526 | }
 527 | FUNCTION {get.bbl.editor}
 528 | { editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ }
 529 | 
 530 | FUNCTION {format.editors}
 531 | { editor "editor" format.names duplicate$ empty$ 'skip$
 532 |     {
 533 |       " " *
 534 |       get.bbl.editor
 535 |    "(" swap$ * ")" *
 536 |       *
 537 |     }
 538 |   if$
 539 | }
 540 | FUNCTION {format.isbn}
 541 | { isbn "isbn" bibinfo.check
 542 |   duplicate$ empty$ 'skip$
 543 |     {
 544 |       new.block
 545 |       "ISBN " swap$ *
 546 |     }
 547 |   if$
 548 | }
 549 | 
 550 | FUNCTION {format.issn}
 551 | { issn "issn" bibinfo.check
 552 |   duplicate$ empty$ 'skip$
 553 |     {
 554 |       new.block
 555 |       "ISSN " swap$ *
 556 |     }
 557 |   if$
 558 | }
 559 | 
 560 | FUNCTION {format.doi}
 561 | { doi empty$
 562 |     { "" }
 563 |     {
 564 |       new.block
 565 |       "\doi{" doi * "}" *
 566 |     }
 567 |   if$
 568 | }
 569 | FUNCTION {format.note}
 570 | {
 571 |  note empty$
 572 |     { "" }
 573 |     { note #1 #1 substring$
 574 |       duplicate$ "{" =
 575 |         'skip$
 576 |         { output.state mid.sentence =
 577 |           { "l" }
 578 |           { "u" }
 579 |         if$
 580 |         change.case$
 581 |         }
 582 |       if$
 583 |       note #2 global.max$ substring$ * "note" bibinfo.check
 584 |     }
 585 |   if$
 586 | }
 587 | 
 588 | FUNCTION {format.title}
 589 | { title
 590 |   "title" bibinfo.check
 591 |   duplicate$ empty$ 'skip$
 592 |     {
 593 |       "\enquote{" swap$ *
 594 |       add.period$ "}" *
 595 |     }
 596 |   if$
 597 | }
 598 | FUNCTION {format.full.names}
 599 | {'s :=
 600 |  "" 't :=
 601 |   #1 'nameptr :=
 602 |   s num.names$ 'numnames :=
 603 |   numnames 'namesleft :=
 604 |     { namesleft #0 > }
 605 |     { s nameptr
 606 |       "{vv~}{ll}" format.name$
 607 |       't :=
 608 |       nameptr #1 >
 609 |         {
 610 |           namesleft #1 >
 611 |             { ", " * t * }
 612 |             {
 613 |               s nameptr "{ll}" format.name$ duplicate$ "others" =
 614 |                 { 't := }
 615 |                 { pop$ }
 616 |               if$
 617 |               t "others" =
 618 |                 {
 619 |                   " " * bbl.etal emphasize *
 620 |                 }
 621 |                 {
 622 |                   numnames #2 >
 623 |                     { "," * }
 624 |                     'skip$
 625 |                   if$
 626 |                   bbl.and
 627 |                   space.word * t *
 628 |                 }
 629 |               if$
 630 |             }
 631 |           if$
 632 |         }
 633 |         't
 634 |       if$
 635 |       nameptr #1 + 'nameptr :=
 636 |       namesleft #1 - 'namesleft :=
 637 |     }
 638 |   while$
 639 | }
 640 | 
 641 | FUNCTION {author.editor.key.full}
 642 | { author empty$
 643 |     { editor empty$
 644 |         { key empty$
 645 |             { cite$ #1 #3 substring$ }
 646 |             'key
 647 |           if$
 648 |         }
 649 |         { editor format.full.names }
 650 |       if$
 651 |     }
 652 |     { author format.full.names }
 653 |   if$
 654 | }
 655 | 
 656 | FUNCTION {author.key.full}
 657 | { author empty$
 658 |     { key empty$
 659 |          { cite$ #1 #3 substring$ }
 660 |           'key
 661 |       if$
 662 |     }
 663 |     { author format.full.names }
 664 |   if$
 665 | }
 666 | 
 667 | FUNCTION {editor.key.full}
 668 | { editor empty$
 669 |     { key empty$
 670 |          { cite$ #1 #3 substring$ }
 671 |           'key
 672 |       if$
 673 |     }
 674 |     { editor format.full.names }
 675 |   if$
 676 | }
 677 | 
 678 | FUNCTION {make.full.names}
 679 | { type$ "book" =
 680 |   type$ "inbook" =
 681 |   or
 682 |     'author.editor.key.full
 683 |     { type$ "proceedings" =
 684 |         'editor.key.full
 685 |         'author.key.full
 686 |       if$
 687 |     }
 688 |   if$
 689 | }
 690 | 
 691 | FUNCTION {output.bibitem}
 692 | { newline$
 693 |   "\bibitem[{" write$
 694 |   label write$
 695 |   ")" make.full.names duplicate$ short.list =
 696 |      { pop$ }
 697 |      { * }
 698 |    if$
 699 |   "}]{" * write$
 700 |   cite$ write$
 701 |   "}" write$
 702 |   newline$
 703 |   ""
 704 |   before.all 'output.state :=
 705 | }
 706 | 
 707 | FUNCTION {n.dashify}
 708 | {
 709 |   't :=
 710 |   ""
 711 |     { t empty$ not }
 712 |     { t #1 #1 substring$ "-" =
 713 |         { t #1 #2 substring$ "--" = not
 714 |             { "--" *
 715 |               t #2 global.max$ substring$ 't :=
 716 |             }
 717 |             {   { t #1 #1 substring$ "-" = }
 718 |                 { "-" *
 719 |                   t #2 global.max$ substring$ 't :=
 720 |                 }
 721 |               while$
 722 |             }
 723 |           if$
 724 |         }
 725 |         { t #1 #1 substring$ *
 726 |           t #2 global.max$ substring$ 't :=
 727 |         }
 728 |       if$
 729 |     }
 730 |   while$
 731 | }
 732 | 
 733 | FUNCTION {word.in}
 734 | { bbl.in capitalize
 735 |   " " * }
 736 | 
 737 | FUNCTION {format.date}
 738 | { year "year" bibinfo.check duplicate$ empty$
 739 |     {
 740 |       "empty year in " cite$ * "; set to ????" * warning$
 741 |        pop$ "????"
 742 |     }
 743 |     'skip$
 744 |   if$
 745 |   extra.label *
 746 |   before.all 'output.state :=
 747 |   " (" swap$ * ")" *
 748 | }
 749 | FUNCTION {format.btitle}
 750 | { title "title" bibinfo.check
 751 |   duplicate$ empty$ 'skip$
 752 |     {
 753 |       emphasize
 754 |     }
 755 |   if$
 756 | }
 757 | FUNCTION {either.or.check}
 758 | { empty$
 759 |     'pop$
 760 |     { "can't use both " swap$ * " fields in " * cite$ * warning$ }
 761 |   if$
 762 | }
 763 | FUNCTION {format.bvolume}
 764 | { volume empty$
 765 |     { "" }
 766 |     { bbl.volume volume tie.or.space.prefix
 767 |       "volume" bibinfo.check * *
 768 |       series "series" bibinfo.check
 769 |       duplicate$ empty$ 'pop$
 770 |         { swap$ bbl.of space.word * swap$
 771 |           emphasize * }
 772 |       if$
 773 |       "volume and number" number either.or.check
 774 |     }
 775 |   if$
 776 | }
 777 | FUNCTION {format.number.series}
 778 | { volume empty$
 779 |     { number empty$
 780 |         { series field.or.null }
 781 |         { series empty$
 782 |             { number "number" bibinfo.check }
 783 |             { output.state mid.sentence =
 784 |                 { bbl.number }
 785 |                 { bbl.number capitalize }
 786 |               if$
 787 |               number tie.or.space.prefix "number" bibinfo.check * *
 788 |               bbl.in space.word *
 789 |               series "series" bibinfo.check *
 790 |             }
 791 |           if$
 792 |         }
 793 |       if$
 794 |     }
 795 |     { "" }
 796 |   if$
 797 | }
 798 | 
 799 | FUNCTION {format.edition}
 800 | { edition duplicate$ empty$ 'skip$
 801 |     {
 802 |       output.state mid.sentence =
 803 |         { "l" }
 804 |         { "t" }
 805 |       if$ change.case$
 806 |       "edition" bibinfo.check
 807 |       " " * bbl.edition *
 808 |     }
 809 |   if$
 810 | }
 811 | INTEGERS { multiresult }
 812 | FUNCTION {multi.page.check}
 813 | { 't :=
 814 |   #0 'multiresult :=
 815 |     { multiresult not
 816 |       t empty$ not
 817 |       and
 818 |     }
 819 |     { t #1 #1 substring$
 820 |       duplicate$ "-" =
 821 |       swap$ duplicate$ "," =
 822 |       swap$ "+" =
 823 |       or or
 824 |         { #1 'multiresult := }
 825 |         { t #2 global.max$ substring$ 't := }
 826 |       if$
 827 |     }
 828 |   while$
 829 |   multiresult
 830 | }
 831 | FUNCTION {format.pages}
 832 | { pages duplicate$ empty$ 'skip$
 833 |     { duplicate$ multi.page.check
 834 |         {
 835 |           bbl.pages swap$
 836 |           n.dashify
 837 |         }
 838 |         {
 839 |           bbl.page swap$
 840 |         }
 841 |       if$
 842 |       tie.or.space.prefix
 843 |       "pages" bibinfo.check
 844 |       * *
 845 |     }
 846 |   if$
 847 | }
 848 | FUNCTION {format.journal.pages}
 849 | { pages duplicate$ empty$ 'pop$
 850 |     { swap$ duplicate$ empty$
 851 |         { pop$ pop$ format.pages }
 852 |         {
 853 |           ", " *
 854 |           swap$
 855 |           n.dashify
 856 |           "pages" bibinfo.check
 857 |           *
 858 |         }
 859 |       if$
 860 |     }
 861 |   if$
 862 | }
 863 | FUNCTION {format.journal.eid}
 864 | { eid "eid" bibinfo.check
 865 |   duplicate$ empty$ 'pop$
 866 |     { swap$ duplicate$ empty$ 'skip$
 867 |       {
 868 |           ", " *
 869 |       }
 870 |       if$
 871 |       swap$ *
 872 |       numpages empty$ 'skip$
 873 |         { bbl.eidpp numpages tie.or.space.prefix
 874 |           "numpages" bibinfo.check * *
 875 |           " (" swap$ * ")" * *
 876 |         }
 877 |       if$
 878 |     }
 879 |   if$
 880 | }
 881 | FUNCTION {format.vol.num.pages}
 882 | { volume field.or.null
 883 |   duplicate$ empty$ 'skip$
 884 |     {
 885 |       "volume" bibinfo.check
 886 |     }
 887 |   if$
 888 |   bolden
 889 |   number "number" bibinfo.check duplicate$ empty$ 'skip$
 890 |     {
 891 |       swap$ duplicate$ empty$
 892 |         { "there's a number but no volume in " cite$ * warning$ }
 893 |         'skip$
 894 |       if$
 895 |       swap$
 896 |       "(" swap$ * ")" *
 897 |     }
 898 |   if$ *
 899 |   eid empty$
 900 |     { format.journal.pages }
 901 |     { format.journal.eid }
 902 |   if$
 903 | }
 904 | 
 905 | FUNCTION {format.chapter.pages}
 906 | { chapter empty$
 907 |     'format.pages
 908 |     { type empty$
 909 |         { bbl.chapter }
 910 |         { type "l" change.case$
 911 |           "type" bibinfo.check
 912 |         }
 913 |       if$
 914 |       chapter tie.or.space.prefix
 915 |       "chapter" bibinfo.check
 916 |       * *
 917 |       pages empty$
 918 |         'skip$
 919 |         { ", " * format.pages * }
 920 |       if$
 921 |     }
 922 |   if$
 923 | }
 924 | 
 925 | FUNCTION {format.booktitle}
 926 | {
 927 |   booktitle "booktitle" bibinfo.check
 928 |   emphasize
 929 | }
 930 | FUNCTION {format.in.ed.booktitle}
 931 | { format.booktitle duplicate$ empty$ 'skip$
 932 |     {
 933 |       editor "editor" format.names.ed duplicate$ empty$ 'pop$
 934 |         {
 935 |           " " *
 936 |           get.bbl.editor
 937 |           "(" swap$ * "), " *
 938 |           * swap$
 939 |           * }
 940 |       if$
 941 |       word.in swap$ *
 942 |     }
 943 |   if$
 944 | }
 945 | FUNCTION {format.thesis.type}
 946 | { type duplicate$ empty$
 947 |     'pop$
 948 |     { swap$ pop$
 949 |       "t" change.case$ "type" bibinfo.check
 950 |     }
 951 |   if$
 952 | }
 953 | FUNCTION {format.tr.number}
 954 | { number "number" bibinfo.check
 955 |   type duplicate$ empty$
 956 |     { pop$ bbl.techrep }
 957 |     'skip$
 958 |   if$
 959 |   "type" bibinfo.check
 960 |   swap$ duplicate$ empty$
 961 |     { pop$ "t" change.case$ }
 962 |     { tie.or.space.prefix * * }
 963 |   if$
 964 | }
 965 | FUNCTION {format.article.crossref}
 966 | {
 967 |   word.in
 968 |   " \cite{" * crossref * "}" *
 969 | }
 970 | FUNCTION {format.book.crossref}
 971 | { volume duplicate$ empty$
 972 |     { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
 973 |       pop$ word.in
 974 |     }
 975 |     { bbl.volume
 976 |       capitalize
 977 |       swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word *
 978 |     }
 979 |   if$
 980 |   " \cite{" * crossref * "}" *
 981 | }
 982 | FUNCTION {format.incoll.inproc.crossref}
 983 | {
 984 |   word.in
 985 |   " \cite{" * crossref * "}" *
 986 | }
 987 | FUNCTION {format.org.or.pub}
 988 | { 't :=
 989 |   ""
 990 |   address empty$ t empty$ and
 991 |     'skip$
 992 |     {
 993 |       t empty$
 994 |         { address "address" bibinfo.check *
 995 |         }
 996 |         { t *
 997 |           address empty$
 998 |             'skip$
 999 |             { ", " * address "address" bibinfo.check * }
1000 |           if$
1001 |         }
1002 |       if$
1003 |     }
1004 |   if$
1005 | }
1006 | FUNCTION {format.publisher.address}
1007 | { publisher "publisher" bibinfo.warn format.org.or.pub
1008 | }
1009 | 
1010 | FUNCTION {format.organization.address}
1011 | { organization "organization" bibinfo.check format.org.or.pub
1012 | }
1013 | 
1014 | FUNCTION {article}
1015 | { output.bibitem
1016 |   format.authors "author" output.check
1017 |   author format.key output
1018 |   format.date "year" output.check
1019 |   date.block
1020 |   format.title "title" output.check
1021 |   new.block
1022 |   crossref missing$
1023 |     {
1024 |       journal
1025 |       "journal" bibinfo.check
1026 |       emphasize
1027 |       "journal" output.check
1028 |       format.vol.num.pages output
1029 |     }
1030 |     { format.article.crossref output.nonnull
1031 |       format.pages output
1032 |     }
1033 |   if$
1034 |   format.issn output
1035 |   format.doi output
1036 |   new.block
1037 |   format.note output
1038 |   format.eprint output
1039 |   format.url output
1040 |   fin.entry
1041 | }
1042 | FUNCTION {book}
1043 | { output.bibitem
1044 |   author empty$
1045 |     { format.editors "author and editor" output.check
1046 |       editor format.key output
1047 |     }
1048 |     { format.authors output.nonnull
1049 |       crossref missing$
1050 |         { "author and editor" editor either.or.check }
1051 |         'skip$
1052 |       if$
1053 |     }
1054 |   if$
1055 |   format.date "year" output.check
1056 |   date.block
1057 |   format.btitle "title" output.check
1058 |   crossref missing$
1059 |     { format.bvolume output
1060 |       new.block
1061 |       format.number.series output
1062 |       format.edition output
1063 |       new.sentence
1064 |       format.publisher.address output
1065 |     }
1066 |     {
1067 |       new.block
1068 |       format.book.crossref output.nonnull
1069 |     }
1070 |   if$
1071 |   format.isbn output
1072 |   format.doi output
1073 |   new.block
1074 |   format.note output
1075 |   format.eprint output
1076 |   format.url output
1077 |   fin.entry
1078 | }
1079 | FUNCTION {booklet}
1080 | { output.bibitem
1081 |   format.authors output
1082 |   author format.key output
1083 |   format.date "year" output.check
1084 |   date.block
1085 |   format.title "title" output.check
1086 |   new.block
1087 |   howpublished "howpublished" bibinfo.check output
1088 |   address "address" bibinfo.check output
1089 |   format.isbn output
1090 |   format.doi output
1091 |   new.block
1092 |   format.note output
1093 |   format.eprint output
1094 |   format.url output
1095 |   fin.entry
1096 | }
1097 | 
1098 | FUNCTION {inbook}
1099 | { output.bibitem
1100 |   author empty$
1101 |     { format.editors "author and editor" output.check
1102 |       editor format.key output
1103 |     }
1104 |     { format.authors output.nonnull
1105 |       crossref missing$
1106 |         { "author and editor" editor either.or.check }
1107 |         'skip$
1108 |       if$
1109 |     }
1110 |   if$
1111 |   format.date "year" output.check
1112 |   date.block
1113 |   format.btitle "title" output.check
1114 |   crossref missing$
1115 |     {
1116 |       format.bvolume output
1117 |       format.chapter.pages "chapter and pages" output.check
1118 |       new.block
1119 |       format.number.series output
1120 |       format.edition output
1121 |       new.sentence
1122 |       format.publisher.address output
1123 |     }
1124 |     {
1125 |       format.chapter.pages "chapter and pages" output.check
1126 |       new.block
1127 |       format.book.crossref output.nonnull
1128 |     }
1129 |   if$
1130 |   crossref missing$
1131 |     { format.isbn output }
1132 |     'skip$
1133 |   if$
1134 |   format.doi output
1135 |   new.block
1136 |   format.note output
1137 |   format.eprint output
1138 |   format.url output
1139 |   fin.entry
1140 | }
1141 | 
1142 | FUNCTION {incollection}
1143 | { output.bibitem
1144 |   format.authors "author" output.check
1145 |   author format.key output
1146 |   format.date "year" output.check
1147 |   date.block
1148 |   format.title "title" output.check
1149 |   new.block
1150 |   crossref missing$
1151 |     { format.in.ed.booktitle "booktitle" output.check
1152 |       format.bvolume output
1153 |       format.number.series output
1154 |       format.edition output
1155 |       format.chapter.pages output
1156 |       new.sentence
1157 |       format.publisher.address output
1158 |       format.isbn output
1159 |     }
1160 |     { format.incoll.inproc.crossref output.nonnull
1161 |       format.chapter.pages output
1162 |     }
1163 |   if$
1164 |   format.doi output
1165 |   new.block
1166 |   format.note output
1167 |   format.eprint output
1168 |   format.url output
1169 |   fin.entry
1170 | }
1171 | FUNCTION {inproceedings}
1172 | { output.bibitem
1173 |   format.authors "author" output.check
1174 |   author format.key output
1175 |   format.date "year" output.check
1176 |   date.block
1177 |   format.title "title" output.check
1178 |   new.block
1179 |   crossref missing$
1180 |     { format.in.ed.booktitle "booktitle" output.check
1181 |       format.bvolume output
1182 |       format.number.series output
1183 |       format.pages output
1184 |       new.sentence
1185 |       publisher empty$
1186 |         { format.organization.address output }
1187 |         { organization "organization" bibinfo.check output
1188 |           format.publisher.address output
1189 |         }
1190 |       if$
1191 |       format.isbn output
1192 |       format.issn output
1193 |     }
1194 |     { format.incoll.inproc.crossref output.nonnull
1195 |       format.pages output
1196 |     }
1197 |   if$
1198 |   format.doi output
1199 |   new.block
1200 |   format.note output
1201 |   format.eprint output
1202 |   format.url output
1203 |   fin.entry
1204 | }
1205 | FUNCTION {conference} { inproceedings }
1206 | FUNCTION {manual}
1207 | { output.bibitem
1208 |   format.authors output
1209 |   author format.key output
1210 |   format.date "year" output.check
1211 |   date.block
1212 |   format.btitle "title" output.check
1213 |   organization address new.block.checkb
1214 |   organization "organization" bibinfo.check output
1215 |   address "address" bibinfo.check output
1216 |   format.edition output
1217 |   format.doi output
1218 |   new.block
1219 |   format.note output
1220 |   format.eprint output
1221 |   format.url output
1222 |   fin.entry
1223 | }
1224 | 
1225 | FUNCTION {mastersthesis}
1226 | { output.bibitem
1227 |   format.authors "author" output.check
1228 |   author format.key output
1229 |   format.date "year" output.check
1230 |   date.block
1231 |   format.btitle
1232 |   "title" output.check
1233 |   new.block
1234 |   bbl.mthesis format.thesis.type output.nonnull
1235 |   school "school" bibinfo.warn output
1236 |   address "address" bibinfo.check output
1237 |   format.doi output
1238 |   new.block
1239 |   format.note output
1240 |   format.eprint output
1241 |   format.url output
1242 |   fin.entry
1243 | }
1244 | 
1245 | FUNCTION {misc}
1246 | { output.bibitem
1247 |   format.authors output
1248 |   author format.key output
1249 |   format.date "year" output.check
1250 |   date.block
1251 |   format.title output
1252 |   new.block
1253 |   howpublished "howpublished" bibinfo.check output
1254 |   format.doi output
1255 |   new.block
1256 |   format.note output
1257 |   format.eprint output
1258 |   format.url output
1259 |   fin.entry
1260 | }
1261 | FUNCTION {phdthesis}
1262 | { output.bibitem
1263 |   format.authors "author" output.check
1264 |   author format.key output
1265 |   format.date "year" output.check
1266 |   date.block
1267 |   format.btitle
1268 |   "title" output.check
1269 |   new.block
1270 |   bbl.phdthesis format.thesis.type output.nonnull
1271 |   school "school" bibinfo.warn output
1272 |   address "address" bibinfo.check output
1273 |   format.doi output
1274 |   new.block
1275 |   format.note output
1276 |   format.eprint output
1277 |   format.url output
1278 |   fin.entry
1279 | }
1280 | 
1281 | FUNCTION {proceedings}
1282 | { output.bibitem
1283 |   format.editors output
1284 |   editor format.key output
1285 |   format.date "year" output.check
1286 |   date.block
1287 |   format.btitle "title" output.check
1288 |   format.bvolume output
1289 |   format.number.series output
1290 |   new.sentence
1291 |   publisher empty$
1292 |     { format.organization.address output }
1293 |     { organization "organization" bibinfo.check output
1294 |       format.publisher.address output
1295 |     }
1296 |   if$
1297 |   format.isbn output
1298 |   format.issn output
1299 |   format.doi output
1300 |   new.block
1301 |   format.note output
1302 |   format.eprint output
1303 |   format.url output
1304 |   fin.entry
1305 | }
1306 | 
1307 | FUNCTION {techreport}
1308 | { output.bibitem
1309 |   format.authors "author" output.check
1310 |   author format.key output
1311 |   format.date "year" output.check
1312 |   date.block
1313 |   format.title
1314 |   "title" output.check
1315 |   new.block
1316 |   format.tr.number emphasize output.nonnull
1317 |   institution "institution" bibinfo.warn output
1318 |   address "address" bibinfo.check output
1319 |   format.doi output
1320 |   new.block
1321 |   format.note output
1322 |   format.eprint output
1323 |   format.url output
1324 |   fin.entry
1325 | }
1326 | 
1327 | FUNCTION {unpublished}
1328 | { output.bibitem
1329 |   format.authors "author" output.check
1330 |   author format.key output
1331 |   format.date "year" output.check
1332 |   date.block
1333 |   format.title "title" output.check
1334 |   format.doi output
1335 |   new.block
1336 |   format.note "note" output.check
1337 |   format.eprint output
1338 |   format.url output
1339 |   fin.entry
1340 | }
1341 | 
1342 | FUNCTION {default.type} { misc }
1343 | READ
1344 | FUNCTION {sortify}
1345 | { purify$
1346 |   "l" change.case$
1347 | }
1348 | INTEGERS { len }
1349 | FUNCTION {chop.word}
1350 | { 's :=
1351 |   'len :=
1352 |   s #1 len substring$ =
1353 |     { s len #1 + global.max$ substring$ }
1354 |     's
1355 |   if$
1356 | }
1357 | FUNCTION {format.lab.names}
1358 | { 's :=
1359 |   "" 't :=
1360 |   s #1 "{vv~}{ll}" format.name$
1361 |   s num.names$ duplicate$
1362 |   #2 >
1363 |     { pop$
1364 |       " " * bbl.etal emphasize *
1365 |     }
1366 |     { #2 <
1367 |         'skip$
1368 |         { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
1369 |             {
1370 |               " " * bbl.etal emphasize *
1371 |             }
1372 |             { bbl.and space.word * s #2 "{vv~}{ll}" format.name$
1373 |               * }
1374 |           if$
1375 |         }
1376 |       if$
1377 |     }
1378 |   if$
1379 | }
1380 | 
1381 | FUNCTION {author.key.label}
1382 | { author empty$
1383 |     { key empty$
1384 |         { cite$ #1 #3 substring$ }
1385 |         'key
1386 |       if$
1387 |     }
1388 |     { author format.lab.names }
1389 |   if$
1390 | }
1391 | 
1392 | FUNCTION {author.editor.key.label}
1393 | { author empty$
1394 |     { editor empty$
1395 |         { key empty$
1396 |             { cite$ #1 #3 substring$ }
1397 |             'key
1398 |           if$
1399 |         }
1400 |         { editor format.lab.names }
1401 |       if$
1402 |     }
1403 |     { author format.lab.names }
1404 |   if$
1405 | }
1406 | 
1407 | FUNCTION {editor.key.label}
1408 | { editor empty$
1409 |     { key empty$
1410 |         { cite$ #1 #3 substring$ }
1411 |         'key
1412 |       if$
1413 |     }
1414 |     { editor format.lab.names }
1415 |   if$
1416 | }
1417 | 
1418 | FUNCTION {calc.short.authors}
1419 | { type$ "book" =
1420 |   type$ "inbook" =
1421 |   or
1422 |     'author.editor.key.label
1423 |     { type$ "proceedings" =
1424 |         'editor.key.label
1425 |         'author.key.label
1426 |       if$
1427 |     }
1428 |   if$
1429 |   'short.list :=
1430 | }
1431 | 
1432 | FUNCTION {calc.label}
1433 | { calc.short.authors
1434 |   short.list
1435 |   "("
1436 |   *
1437 |   year duplicate$ empty$
1438 |   short.list key field.or.null = or
1439 |      { pop$ "" }
1440 |      'skip$
1441 |   if$
1442 |   *
1443 |   'label :=
1444 | }
1445 | 
1446 | FUNCTION {sort.format.names}
1447 | { 's :=
1448 |   #1 'nameptr :=
1449 |   ""
1450 |   s num.names$ 'numnames :=
1451 |   numnames 'namesleft :=
1452 |     { namesleft #0 > }
1453 |     { s nameptr
1454 |       "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}"
1455 |       format.name$ 't :=
1456 |       nameptr #1 >
1457 |         {
1458 |           "   "  *
1459 |           namesleft #1 = t "others" = and
1460 |             { "zzzzz" 't := }
1461 |             'skip$
1462 |           if$
1463 |           t sortify *
1464 |         }
1465 |         { t sortify * }
1466 |       if$
1467 |       nameptr #1 + 'nameptr :=
1468 |       namesleft #1 - 'namesleft :=
1469 |     }
1470 |   while$
1471 | }
1472 | 
1473 | FUNCTION {sort.format.title}
1474 | { 't :=
1475 |   "A " #2
1476 |     "An " #3
1477 |       "The " #4 t chop.word
1478 |     chop.word
1479 |   chop.word
1480 |   sortify
1481 |   #1 global.max$ substring$
1482 | }
1483 | FUNCTION {author.sort}
1484 | { author empty$
1485 |     { key empty$
1486 |         { "to sort, need author or key in " cite$ * warning$
1487 |           ""
1488 |         }
1489 |         { key sortify }
1490 |       if$
1491 |     }
1492 |     { author sort.format.names }
1493 |   if$
1494 | }
1495 | FUNCTION {author.editor.sort}
1496 | { author empty$
1497 |     { editor empty$
1498 |         { key empty$
1499 |             { "to sort, need author, editor, or key in " cite$ * warning$
1500 |               ""
1501 |             }
1502 |             { key sortify }
1503 |           if$
1504 |         }
1505 |         { editor sort.format.names }
1506 |       if$
1507 |     }
1508 |     { author sort.format.names }
1509 |   if$
1510 | }
1511 | FUNCTION {editor.sort}
1512 | { editor empty$
1513 |     { key empty$
1514 |         { "to sort, need editor or key in " cite$ * warning$
1515 |           ""
1516 |         }
1517 |         { key sortify }
1518 |       if$
1519 |     }
1520 |     { editor sort.format.names }
1521 |   if$
1522 | }
1523 | FUNCTION {presort}
1524 | { calc.label
1525 |   label sortify
1526 |   "    "
1527 |   *
1528 |   type$ "book" =
1529 |   type$ "inbook" =
1530 |   or
1531 |     'author.editor.sort
1532 |     { type$ "proceedings" =
1533 |         'editor.sort
1534 |         'author.sort
1535 |       if$
1536 |     }
1537 |   if$
1538 |   #1 entry.max$ substring$
1539 |   'sort.label :=
1540 |   sort.label
1541 |   *
1542 |   "    "
1543 |   *
1544 |   title field.or.null
1545 |   sort.format.title
1546 |   *
1547 |   #1 entry.max$ substring$
1548 |   'sort.key$ :=
1549 | }
1550 | 
1551 | ITERATE {presort}
1552 | SORT
1553 | STRINGS { last.label next.extra }
1554 | INTEGERS { last.extra.num last.extra.num.extended last.extra.num.blank number.label }
1555 | FUNCTION {initialize.extra.label.stuff}
1556 | { #0 int.to.chr$ 'last.label :=
1557 |   "" 'next.extra :=
1558 |   #0 'last.extra.num :=
1559 |   "a" chr.to.int$ #1 - 'last.extra.num.blank :=
1560 |   last.extra.num.blank 'last.extra.num.extended :=
1561 |   #0 'number.label :=
1562 | }
1563 | FUNCTION {forward.pass}
1564 | { last.label label =
1565 |     { last.extra.num #1 + 'last.extra.num :=
1566 |       last.extra.num "z" chr.to.int$ >
1567 |        { "a" chr.to.int$ 'last.extra.num :=
1568 |          last.extra.num.extended #1 + 'last.extra.num.extended :=
1569 |        }
1570 |        'skip$
1571 |       if$
1572 |       last.extra.num.extended last.extra.num.blank >
1573 |         { last.extra.num.extended int.to.chr$
1574 |           last.extra.num int.to.chr$
1575 |           * 'extra.label := }
1576 |         { last.extra.num int.to.chr$ 'extra.label := }
1577 |       if$
1578 |     }
1579 |     { "a" chr.to.int$ 'last.extra.num :=
1580 |       "" 'extra.label :=
1581 |       label 'last.label :=
1582 |     }
1583 |   if$
1584 |   number.label #1 + 'number.label :=
1585 | }
1586 | FUNCTION {reverse.pass}
1587 | { next.extra "b" =
1588 |     { "a" 'extra.label := }
1589 |     'skip$
1590 |   if$
1591 |   extra.label 'next.extra :=
1592 |   extra.label
1593 |   duplicate$ empty$
1594 |     'skip$
1595 |     { "{\natexlab{" swap$ * "}}" * }
1596 |   if$
1597 |   'extra.label :=
1598 |   label extra.label * 'label :=
1599 | }
1600 | EXECUTE {initialize.extra.label.stuff}
1601 | ITERATE {forward.pass}
1602 | REVERSE {reverse.pass}
1603 | FUNCTION {bib.sort.order}
1604 | { sort.label
1605 |   "    "
1606 |   *
1607 |   year field.or.null sortify
1608 |   *
1609 |   "    "
1610 |   *
1611 |   title field.or.null
1612 |   sort.format.title
1613 |   *
1614 |   #1 entry.max$ substring$
1615 |   'sort.key$ :=
1616 | }
1617 | ITERATE {bib.sort.order}
1618 | SORT
1619 | FUNCTION {begin.bib}
1620 | { preamble$ empty$
1621 |     'skip$
1622 |     { preamble$ write$ newline$ }
1623 |   if$
1624 |   "\begin{thebibliography}{" number.label int.to.str$ * "}" *
1625 |   write$ newline$
1626 |   "\newcommand{\enquote}[1]{``#1''}"
1627 |   write$ newline$
1628 |   "\providecommand{\natexlab}[1]{#1}"
1629 |   write$ newline$
1630 |   "\providecommand{\url}[1]{\texttt{#1}}"
1631 |   write$ newline$
1632 |   "\providecommand{\urlprefix}{URL }"
1633 |   write$ newline$
1634 |   "\expandafter\ifx\csname urlstyle\endcsname\relax"
1635 |   write$ newline$
1636 |   "  \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else"
1637 |   write$ newline$
1638 |   "  \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup \urlstyle{rm}\Url}\fi"
1639 |   write$ newline$
1640 |   "\providecommand{\eprint}[2][]{\url{#2}}"
1641 |   write$ newline$
1642 | }
1643 | EXECUTE {begin.bib}
1644 | EXECUTE {init.state.consts}
1645 | ITERATE {call.type$}
1646 | FUNCTION {end.bib}
1647 | { newline$
1648 |   "\end{thebibliography}" write$ newline$
1649 | }
1650 | EXECUTE {end.bib}
1651 | %% End of customized bst file
1652 | %%
1653 | %% End of file `jss.bst'.
1654 | 


--------------------------------------------------------------------------------
/paper/jss.cls:
--------------------------------------------------------------------------------
  1 | %%
  2 | %% This is file `jss.cls',
  3 | \def\fileversion{3.3}
  4 | \def\filename{jss}
  5 | \def\filedate{2021/05/23}
  6 | %%
  7 | %% Package `jss' to use with LaTeX2e for JSS publications (http://www.jstatsoft.org/)
  8 | %% License: GPL-2 | GPL-3
  9 | %% Copyright: (C) Achim Zeileis
 10 | %% Please report errors to Achim.Zeileis@R-project.org
 11 | %%
 12 | \NeedsTeXFormat{LaTeX2e}
 13 | \ProvidesClass{jss}[\filedate\space\fileversion\space jss class by Achim Zeileis]
 14 | %% options
 15 | \newif\if@article
 16 | \newif\if@codesnippet
 17 | \newif\if@bookreview
 18 | \newif\if@softwarereview
 19 | \newif\if@review
 20 | \newif\if@shortnames
 21 | \newif\if@nojss
 22 | \newif\if@notitle
 23 | \newif\if@noheadings
 24 | \newif\if@nofooter
 25 | 
 26 | \@articletrue
 27 | \@codesnippetfalse
 28 | \@bookreviewfalse
 29 | \@softwarereviewfalse
 30 | \@reviewfalse
 31 | \@shortnamesfalse
 32 | \@nojssfalse
 33 | \@notitlefalse
 34 | \@noheadingsfalse
 35 | \@nofooterfalse
 36 | 
 37 | \DeclareOption{article}{\@articletrue%
 38 |   \@codesnippetfalse \@bookreviewfalse \@softwarereviewfalse}
 39 | \DeclareOption{codesnippet}{\@articlefalse%
 40 |   \@codesnippettrue \@bookreviewfalse \@softwarereviewfalse}
 41 | \DeclareOption{bookreview}{\@articlefalse%
 42 |   \@codesnippetfalse \@bookreviewtrue \@softwarereviewfalse}
 43 | \DeclareOption{softwarereview}{\@articlefalse%
 44 |   \@codesnippetfalse \@bookreviewfalse \@softwarereviewtrue}
 45 | \DeclareOption{shortnames}{\@shortnamestrue}
 46 | \DeclareOption{nojss}{\@nojsstrue}
 47 | \DeclareOption{notitle}{\@notitletrue}
 48 | \DeclareOption{noheadings}{\@noheadingstrue}
 49 | \DeclareOption{nofooter}{\@nofootertrue}
 50 | 
 51 | \ProcessOptions
 52 | \LoadClass[11pt,a4paper,twoside]{article}
 53 | %% required packages
 54 | \RequirePackage{graphicx,xcolor,ae,fancyvrb}
 55 | \RequirePackage[T1]{fontenc}
 56 | \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
 57 | \IfFileExists{lmodern.sty}{\RequirePackage{lmodern}}{}
 58 | %% bibliography
 59 | \if@shortnames
 60 |   \usepackage[authoryear,round]{natbib}
 61 | \else
 62 |   \usepackage[authoryear,round,longnamesfirst]{natbib}
 63 | \fi
 64 | \bibpunct{(}{)}{;}{a}{}{,}
 65 | \bibliographystyle{jss}
 66 | %% page layout
 67 | \topmargin 0pt
 68 | \textheight 46\baselineskip
 69 | \advance\textheight by \topskip
 70 | \oddsidemargin 0.1in
 71 | \evensidemargin 0.15in
 72 | \marginparwidth 1in
 73 | \oddsidemargin 0.125in
 74 | \evensidemargin 0.125in
 75 | \marginparwidth 0.75in
 76 | \textwidth 6.125in
 77 | %% paragraphs
 78 | \setlength{\parskip}{0.7ex plus0.1ex minus0.1ex}
 79 | \setlength{\parindent}{0em}
 80 | %% for all publications
 81 | \newcommand{\Address}[1]{\def\@Address{#1}}
 82 | \newcommand{\Plaintitle}[1]{\def\@Plaintitle{#1}}
 83 | \newcommand{\Shorttitle}[1]{\def\@Shorttitle{#1}}
 84 | \newcommand{\Plainauthor}[1]{\def\@Plainauthor{#1}}
 85 | \newcommand{\Volume}[1]{\def\@Volume{#1}}
 86 | \newcommand{\Year}[1]{\def\@Year{#1}}
 87 | \newcommand{\Month}[1]{\def\@Month{#1}}
 88 | \newcommand{\Issue}[1]{\def\@Issue{#1}}
 89 | \newcommand{\Submitdate}[1]{\def\@Submitdate{#1}}
 90 | %% for articles and code snippets
 91 | \newcommand{\Acceptdate}[1]{\def\@Acceptdate{#1}}
 92 | \newcommand{\Abstract}[1]{\def\@Abstract{#1}}
 93 | \newcommand{\Keywords}[1]{\def\@Keywords{#1}}
 94 | \newcommand{\Plainkeywords}[1]{\def\@Plainkeywords{#1}}
 95 | %% for book and software reviews
 96 | \newcommand{\Reviewer}[1]{\def\@Reviewer{#1}}
 97 | \newcommand{\Booktitle}[1]{\def\@Booktitle{#1}}
 98 | \newcommand{\Bookauthor}[1]{\def\@Bookauthor{#1}}
 99 | \newcommand{\Publisher}[1]{\def\@Publisher{#1}}
100 | \newcommand{\Pubaddress}[1]{\def\@Pubaddress{#1}}
101 | \newcommand{\Pubyear}[1]{\def\@Pubyear{#1}}
102 | \newcommand{\ISBN}[1]{\def\@ISBN{#1}}
103 | \newcommand{\Pages}[1]{\def\@Pages{#1}}
104 | \newcommand{\Price}[1]{\def\@Price{#1}}
105 | \newcommand{\Plainreviewer}[1]{\def\@Plainreviewer{#1}}
106 | \newcommand{\Softwaretitle}[1]{\def\@Softwaretitle{#1}}
107 | \newcommand{\URL}[1]{\def\@URL{#1}}
108 | \newcommand{\DOI}[1]{\def\@DOI{#1}}
109 | %% for internal use
110 | \newcommand{\Seriesname}[1]{\def\@Seriesname{#1}}
111 | \newcommand{\Hypersubject}[1]{\def\@Hypersubject{#1}}
112 | \newcommand{\Hyperauthor}[1]{\def\@Hyperauthor{#1}}
113 | \newcommand{\Footername}[1]{\def\@Footername{#1}}
114 | \newcommand{\Firstdate}[1]{\def\@Firstdate{#1}}
115 | \newcommand{\Seconddate}[1]{\def\@Seconddate{#1}}
116 | \newcommand{\Reviewauthor}[1]{\def\@Reviewauthor{#1}}
117 | %% defaults
118 | \author{Firstname Lastname\\Affiliation}
119 | \title{Title}
120 | \Abstract{---!!!---an abstract is required---!!!---}
121 | \Plainauthor{\@author}
122 | \Volume{VV}
123 | \Year{YYYY}
124 | \Month{MMMMMM}
125 | \Issue{II}
126 | \Submitdate{yyyy-mm-dd}
127 | \Acceptdate{yyyy-mm-dd}
128 | \Address{
129 |   Firstname Lastname\\
130 |   Affiliation\\
131 |   Address, Country\\
132 |   E-mail: \email{name@address}\\
133 |   URL: \url{http://link/to/webpage/}
134 | }
135 | 
136 | \Reviewer{Firstname Lastname\\Affiliation}
137 | \Plainreviewer{Firstname Lastname}
138 | \Booktitle{Book Title}
139 | \Bookauthor{Book Author}
140 | \Publisher{Publisher}
141 | \Pubaddress{Publisher's Address}
142 | \Pubyear{YYY}
143 | \ISBN{x-xxxxx-xxx-x}
144 | \Pages{xv + 123}
145 | \Price{USD 69.95 (P)}
146 | \URL{http://link/to/webpage/}
147 | \DOI{10.18637/jss.v000.i00}
148 | \if@article
149 |   \Seriesname{Issue}
150 |   \Hypersubject{Journal of Statistical Software}
151 |   \Plaintitle{\@title}
152 |   \Shorttitle{\@title}
153 |   \Plainkeywords{\@Keywords}
154 | \fi
155 | 
156 | \if@codesnippet
157 |   \Seriesname{Code Snippet}
158 |   \Hypersubject{Journal of Statistical Software -- Code Snippets}
159 |   \Plaintitle{\@title}
160 |   \Shorttitle{\@title}
161 |   \Plainkeywords{\@Keywords}
162 | \fi
163 | 
164 | \if@bookreview
165 |   \Seriesname{Book Review}
166 |   \Hypersubject{Journal of Statistical Software -- Book Reviews}
167 |   \Plaintitle{\@Booktitle}
168 |   \Shorttitle{\@Booktitle}
169 |   \Reviewauthor{\@Bookauthor\\
170 |                 \@Publisher, \@Pubaddress, \@Pubyear.\\
171 |                 ISBN~\@ISBN. \@Pages~pp. \@Price.\\
172 |                 \url{\@URL}}
173 |   \Plainkeywords{}
174 |   \@reviewtrue
175 | \fi
176 | 
177 | \if@softwarereview
178 |   \Seriesname{Software Review}
179 |   \Hypersubject{Journal of Statistical Software -- Software Reviews}
180 |   \Plaintitle{\@Softwaretitle}
181 |   \Shorttitle{\@Softwaretitle}
182 |   \Booktitle{\@Softwaretitle}
183 |   \Reviewauthor{\@Publisher, \@Pubaddress. \@Price.\\
184 |                 \url{\@URL}}
185 |   \Plainkeywords{}
186 |   \@reviewtrue
187 | \fi
188 | 
189 | \if@review
190 |   \Hyperauthor{\@Plainreviewer}
191 |   \Keywords{}
192 |   \Footername{Reviewer}
193 |   \Firstdate{\textit{Published:} \@Submitdate}
194 |   \Seconddate{}
195 | \else
196 |   \Hyperauthor{\@Plainauthor}
197 |   \Keywords{---!!!---at least one keyword is required---!!!---}
198 |   \Footername{Affiliation}
199 |   \Firstdate{\textit{Submitted:} \@Submitdate}
200 |   \Seconddate{\textit{Accepted:} \@Acceptdate}
201 | \fi
202 | %% Sweave(-like)
203 | \DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl}
204 | \DefineVerbatimEnvironment{Soutput}{Verbatim}{}
205 | \DefineVerbatimEnvironment{Scode}{Verbatim}{fontshape=sl}
206 | \newenvironment{Schunk}{}{}
207 | \DefineVerbatimEnvironment{Code}{Verbatim}{}
208 | \DefineVerbatimEnvironment{CodeInput}{Verbatim}{fontshape=sl}
209 | \DefineVerbatimEnvironment{CodeOutput}{Verbatim}{}
210 | \newenvironment{CodeChunk}{}{}
211 | \setkeys{Gin}{width=0.8\textwidth}
212 | %% footer
213 | \newlength{\footerskip}
214 | \setlength{\footerskip}{2.5\baselineskip plus 2ex minus 0.5ex}
215 | 
216 | \newcommand{\makefooter}{%
217 |   \vspace{\footerskip}
218 | 
219 |   \if@nojss
220 |     \begin{samepage}
221 |     \textbf{\large \@Footername: \nopagebreak}\\[.3\baselineskip] \nopagebreak
222 |     \@Address \nopagebreak
223 |     \end{samepage}
224 |   \else
225 |     \begin{samepage}
226 |     \textbf{\large \@Footername: \nopagebreak}\\[.3\baselineskip] \nopagebreak
227 |     \@Address \nopagebreak
228 |     \vfill
229 |     \hrule \nopagebreak
230 |     \vspace{.1\baselineskip}
231 |     {\fontfamily{pzc} \fontsize{13}{15} \selectfont Journal of Statistical Software}
232 |     \hfill
233 |     \url{http://www.jstatsoft.org/}\\ \nopagebreak
234 |     published by the Foundation for Open Access Statistics
235 |     \hfill
236 |     \url{http://www.foastat.org/}\\[.3\baselineskip] \nopagebreak
237 |     {\@Month{} \@Year, Volume~\@Volume, \@Seriesname~\@Issue}
238 |     \hfill
239 |     \@Firstdate\\ \nopagebreak
240 |     {\href{https://doi.org/\@DOI}{\tt doi:\@DOI}}
241 |     \hfill
242 |     \@Seconddate  \nopagebreak
243 |     \vspace{.3\baselineskip}
244 |     \hrule
245 |     \end{samepage}
246 |   \fi
247 | }
248 | \if@nofooter
249 |   %% \AtEndDocument{\makefooter}
250 | \else
251 |   \AtEndDocument{\makefooter}
252 | \fi
253 | %% required packages
254 | \RequirePackage{hyperref}
255 | %% new \maketitle
256 | \def\@myoddhead{
257 |   {\color{white} JSS}\\[-1.42cm]
258 |   \hspace{-2em} \includegraphics[height=23mm,keepaspectratio]{jsslogo} \hfill
259 |   \parbox[b][23mm]{118mm}{\hrule height 3pt
260 |    \center{
261 |    {\fontfamily{pzc} \fontsize{28}{32} \selectfont Journal of Statistical Software}
262 |    \vfill
263 |    {\it \small \@Month{} \@Year, Volume~\@Volume, \@Seriesname~\@Issue.%
264 |             \hfill \href{https://doi.org/\@DOI}{doi:\,\@DOI}}}\\[0.1cm]
265 |      \hrule height 3pt}}
266 | \if@review
267 |   \renewcommand{\maketitle}{
268 |   \if@nojss
269 |     %% \@oddhead{\@myoddhead}\\[3\baselineskip]
270 |   \else
271 |     \@oddhead{\@myoddhead}\\[3\baselineskip]
272 |   \fi
273 |     {\large
274 |     \noindent
275 |     Reviewer: \@Reviewer
276 |     \vspace{\baselineskip}
277 |     \hrule
278 |     \vspace{\baselineskip}
279 |     \textbf{\@Booktitle}
280 |     \begin{quotation} \noindent
281 |     \@Reviewauthor
282 |     \end{quotation}
283 |     \vspace{0.7\baselineskip}
284 |     \hrule
285 |     \vspace{1.3\baselineskip}
286 |     }
287 | 
288 |     \thispagestyle{empty}
289 |     \if@nojss
290 |       \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hyperauthor}}
291 |     \else
292 |       \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}}
293 |     \fi
294 |     \pagestyle{myheadings}
295 |   }
296 | \else
297 |   \def\maketitle{
298 |   \if@nojss
299 |     %% \@oddhead{\@myoddhead} \par
300 |   \else
301 |     \@oddhead{\@myoddhead} \par
302 |   \fi
303 |    \begingroup
304 |      \def\thefootnote{\fnsymbol{footnote}}
305 |      \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}}
306 |      \long\def\@makefntext##1{\parindent 1em\noindent
307 |                               \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1}
308 |      \@maketitle \@thanks
309 |    \endgroup
310 |    \setcounter{footnote}{0}
311 | 
312 |    \if@noheadings
313 |     %% \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}}
314 |     \else
315 |      \thispagestyle{empty}
316 |       \if@nojss
317 |         \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hyperauthor}}
318 |       \else
319 |         \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}}
320 |       \fi
321 |      \pagestyle{myheadings}
322 |    \fi
323 | 
324 |    \let\maketitle\relax \let\@maketitle\relax
325 |    \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax
326 |   }
327 | 
328 |   \def\@maketitle{\vbox{\hsize\textwidth \linewidth\hsize
329 |   \if@nojss
330 |     %% \vskip 1in
331 |   \else
332 |     \vskip 1in
333 |   \fi
334 |    {\centering
335 |    {\LARGE\bf \@title\par}
336 |    \vskip 0.2in plus 1fil minus 0.1in
337 |    {
338 |        \def\and{\unskip\enspace{\rm and}\enspace}%
339 |        \def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil
340 |           \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\ignorespaces}%
341 |        \def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup
342 |           \vskip 0.1in plus 1fil minus 0.05in
343 |           \hbox to \linewidth\bgroup\rule{\z@}{10pt} \hfil\hfil
344 |           \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\ignorespaces}
345 |        \hbox to \linewidth\bgroup\rule{\z@}{10pt} \hfil\hfil
346 |        \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\@author
347 |        \end{tabular}\hss\egroup
348 |    \hfil\hfil\egroup}
349 |    \vskip 0.3in minus 0.1in
350 |    \hrule
351 |    \begin{abstract}
352 |    \@Abstract
353 |    \end{abstract}}
354 |    \textit{Keywords}:~\@Keywords.
355 |    \vskip 0.1in minus 0.05in
356 |    \hrule
357 |    \vskip 0.2in minus 0.1in
358 |   }}
359 | \fi
360 | %% sections, subsections, and subsubsections
361 | \newlength{\preXLskip}
362 | \newlength{\preLskip}
363 | \newlength{\preMskip}
364 | \newlength{\preSskip}
365 | \newlength{\postMskip}
366 | \newlength{\postSskip}
367 | \setlength{\preXLskip}{1.8\baselineskip plus 0.5ex minus 0ex}
368 | \setlength{\preLskip}{1.5\baselineskip plus 0.3ex minus 0ex}
369 | \setlength{\preMskip}{1\baselineskip plus 0.2ex minus 0ex}
370 | \setlength{\preSskip}{.8\baselineskip plus 0.2ex minus 0ex}
371 | \setlength{\postMskip}{.5\baselineskip plus 0ex minus 0.1ex}
372 | \setlength{\postSskip}{.3\baselineskip plus 0ex minus 0.1ex}
373 | 
374 | \newcommand{\jsssec}[2][default]{\vskip \preXLskip%
375 |   \pdfbookmark[1]{#1}{Section.\thesection.#1}%
376 |   \refstepcounter{section}%
377 |   \centerline{\textbf{\Large \thesection. #2}} \nopagebreak
378 |   \vskip \postMskip \nopagebreak}
379 | \newcommand{\jsssecnn}[1]{\vskip \preXLskip%
380 |   \centerline{\textbf{\Large #1}} \nopagebreak
381 |   \vskip \postMskip \nopagebreak}
382 | 
383 | \newcommand{\jsssubsec}[2][default]{\vskip \preMskip%
384 |   \pdfbookmark[2]{#1}{Subsection.\thesubsection.#1}%
385 |   \refstepcounter{subsection}%
386 |   \textbf{\large \thesubsection. #2} \nopagebreak
387 |   \vskip \postSskip \nopagebreak}
388 | \newcommand{\jsssubsecnn}[1]{\vskip \preMskip%
389 |   \textbf{\large #1} \nopagebreak
390 |   \vskip \postSskip \nopagebreak}
391 | 
392 | \newcommand{\jsssubsubsec}[2][default]{\vskip \preSskip%
393 |   \pdfbookmark[3]{#1}{Subsubsection.\thesubsubsection.#1}%
394 |   \refstepcounter{subsubsection}%
395 |   {\large \textit{#2}} \nopagebreak
396 |   \vskip \postSskip \nopagebreak}
397 | \newcommand{\jsssubsubsecnn}[1]{\vskip \preSskip%
398 |   {\textit{\large #1}} \nopagebreak
399 |   \vskip \postSskip \nopagebreak}
400 | 
401 | \newcommand{\jsssimplesec}[2][default]{\vskip \preLskip%
402 | %%  \pdfbookmark[1]{#1}{Section.\thesection.#1}%
403 |   \refstepcounter{section}%
404 |   \textbf{\large #1} \nopagebreak
405 |   \vskip \postSskip \nopagebreak}
406 | \newcommand{\jsssimplesecnn}[1]{\vskip \preLskip%
407 |   \textbf{\large #1} \nopagebreak
408 |   \vskip \postSskip \nopagebreak}
409 | 
410 | \if@review
411 |   \renewcommand{\section}{\secdef \jsssimplesec \jsssimplesecnn}
412 |   \renewcommand{\subsection}{\secdef \jsssimplesec \jsssimplesecnn}
413 |   \renewcommand{\subsubsection}{\secdef \jsssimplesec \jsssimplesecnn}
414 | \else
415 |   \renewcommand{\section}{\secdef \jsssec \jsssecnn}
416 |   \renewcommand{\subsection}{\secdef \jsssubsec \jsssubsecnn}
417 |   \renewcommand{\subsubsection}{\secdef \jsssubsubsec \jsssubsubsecnn}
418 | \fi
419 | %% colors
420 | \definecolor{Red}{rgb}{0.5,0,0}
421 | \definecolor{Blue}{rgb}{0,0,0.5}
422 | \if@review
423 |   \hypersetup{%
424 |     hyperindex = {true},
425 |     colorlinks = {true},
426 |     linktocpage = {true},
427 |     plainpages = {false},
428 |     linkcolor = {Blue},
429 |     citecolor = {Blue},
430 |     urlcolor = {Red},
431 |     pdfstartview = {Fit},
432 |     pdfpagemode = {None},
433 |     pdfview = {XYZ null null null}
434 |   }
435 | \else
436 |   \hypersetup{%
437 |     hyperindex = {true},
438 |     colorlinks = {true},
439 |     linktocpage = {true},
440 |     plainpages = {false},
441 |     linkcolor = {Blue},
442 |     citecolor = {Blue},
443 |     urlcolor = {Red},
444 |     pdfstartview = {Fit},
445 |     pdfpagemode = {UseOutlines},
446 |     pdfview = {XYZ null null null}
447 |   }
448 | \fi
449 | \if@nojss
450 |   \AtBeginDocument{
451 |     \hypersetup{%
452 |       pdfauthor = {\@Hyperauthor},
453 |       pdftitle = {\@Plaintitle},
454 |       pdfkeywords = {\@Plainkeywords}
455 |     }
456 |   }
457 | \else
458 |   \AtBeginDocument{
459 |     \hypersetup{%
460 |       pdfauthor = {\@Hyperauthor},
461 |       pdftitle = {\@Plaintitle},
462 |       pdfsubject = {\@Hypersubject},
463 |       pdfkeywords = {\@Plainkeywords}
464 |     }
465 |   }
466 | \fi
467 | \if@notitle
468 |   %% \AtBeginDocument{\maketitle}
469 | \else
470 |   \@ifundefined{AddToHook}{\AtBeginDocument{\maketitle}}{\AddToHook{begindocument}[maketitle]{\maketitle}}
471 | \fi
472 | %% commands
473 | \newcommand\code{\bgroup\@makeother\_\@makeother\~\@makeother\$\@codex}
474 | \def\@codex#1{{\normalfont\ttfamily\hyphenchar\font=-1 #1}\egroup}
475 | %%\let\code=\texttt
476 | \let\proglang=\textsf
477 | \newcommand{\pkg}[1]{{\fontseries{m}\fontseries{b}\selectfont #1}}
478 | \newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}}
479 | \ifx\csname urlstyle\endcsname\relax
480 |   \newcommand\@doi[1]{doi:\discretionary{}{}{}#1}\else
481 |   \newcommand\@doi{doi:\discretionary{}{}{}\begingroup
482 | \urlstyle{tt}\Url}\fi
483 | \newcommand{\doi}[1]{\href{https://doi.org/#1}{\normalfont\texttt{\@doi{#1}}}}
484 | \newcommand{\E}{\mathsf{E}}
485 | \newcommand{\VAR}{\mathsf{VAR}}
486 | \newcommand{\COV}{\mathsf{COV}}
487 | \newcommand{\Prob}{\mathsf{P}}
488 | \endinput
489 | %%
490 | %% End of file `jss.cls'.
491 | 


--------------------------------------------------------------------------------
/paper/jss5097.Rnw:
--------------------------------------------------------------------------------
   1 | \documentclass[article,table]{jss}
   2 | 
   3 | 
   4 | %% -- LaTeX packages and custom commands ---------------------------------------
   5 | %% recommended packages
   6 | \usepackage{orcidlink,thumbpdf,lmodern}
   7 | 
   8 | \usepackage{amsmath, amssymb}
   9 | \usepackage[ruled, linesnumbered]{algorithm2e}
  10 | \usepackage{tikz}
  11 | \usepackage{float}
  12 | % shortcut for surjection arrow, and surjection version of xrightarrow
  13 | \newcommand{\onto}{\twoheadrightarrow}
  14 | \newcommand\xonto[2][]{%
  15 |   \mathrel{\ooalign{$\xrightarrow[#1\mkern4mu]{#2\mkern4mu}$\cr%
  16 |   \hidewidth$\rightarrow\mkern4mu$}}
  17 | }
  18 | 
  19 | 
  20 | 
  21 | %% new custom commands
  22 | \newcommand{\class}[1]{`\code{#1}'}
  23 | \newcommand{\fct}[1]{\code{#1()}}
  24 | 
  25 | % This allows referencing institutional authors. 
  26 | % See: https://tex.stackexchange.com/questions/162659/formatting-of-institution-as-author-with-natbib-and-numbered-references
  27 | \newcommand{\xfnm}[1][]{\ifx!#1!\else\unskip,\space#1\fi}
  28 | 
  29 | 
  30 | %% For Sweave-based articles about R packages:
  31 | %% need no \usepackage{Sweave}
  32 | \SweaveOpts{engine = R, eps = FALSE, keep.source = TRUE}
  33 | <<preliminaries, echo=FALSE, results = hide>>=
  34 | options(prompt = "R> ", continue = "+  ", width = 70, useFancyQuotes = FALSE)
  35 | @
  36 | 
  37 | 
  38 | %% -- Article metainformation (author, title, ...) -----------------------------
  39 | 
  40 | %% - \author{} with primary affiliation (and optionally ORCID link)
  41 | %% - \Plainauthor{} without affiliations
  42 | %% - Separate authors by \And or \AND (in \author) or by comma (in \Plainauthor).
  43 | %% - \AND starts a new line, \And does not.
  44 | \author{Mark P.J. van der Loo~\orcidlink{0000-0002-9807-4686}\\
  45 |         Statistics Netherlands and Leiden University}
  46 | 
  47 | \Plainauthor{Mark P.J. van der Loo}
  48 | 
  49 | %% - \title{} in title case
  50 | %% - \Plaintitle{} without LaTeX markup (if any)
  51 | %% - \Shorttitle{} with LaTeX markup (if any), used as running title
  52 | \title{Split-Apply-Combine with Dynamic Grouping}
  53 | \Plaintitle{Split-Apply-Combine with Dynamic Grouping}
  54 | \Shorttitle{Split-Apply-Combine with Dynamic Grouping}
  55 | 
  56 | 
  57 | \Abstract{
  58 | Partitioning a data set by one or more of its attributes and computing an
  59 | aggregate for each part is one of the most common operations in data analyses.
  60 | There are use cases where the partitioning is determined dynamically by
  61 | collapsing smaller subsets into larger ones, to ensure sufficient support for
  62 | the computed aggregate. These use cases are not supported by software
  63 | implementing split-apply-combine types of operations. This paper presents the
  64 | \proglang{R} package \code{accumulate} that offers convenient interfaces for
  65 | defining grouped aggregation where the grouping itself is dynamically
  66 | determined, based on user-defined conditions on subsets, and a user-defined
  67 | subset collapsing scheme. The formal underlaying algorithm is described and
  68 | analyzed as well.
  69 | }
  70 | 
  71 | %% - \Keywords{} with LaTeX markup, at least one required
  72 | %% - \Plainkeywords{} without LaTeX markup (if necessary)
  73 | %% - Should be comma-separated and in sentence case.
  74 | \Keywords{data analysis, estimation, aggregation, \proglang{R}}
  75 | \Plainkeywords{data analysis, R}
  76 | 
  77 | %% - \Address{} of at least one author
  78 | %% - May contain multiple affiliations for each author
  79 | %%   (in extra lines, separated by \emph{and}\\).
  80 | %% - May contain multiple authors for the same affiliation
  81 | %%   (in the same first line, separated by comma).
  82 | \Address{
  83 |   Mark P.J. van der Loo~\orcidlink{0000-0002-9807-4686}\\
  84 |   Research and Development\\
  85 |   Statistics Netherlands\\
  86 |   Henri Faasdreef 312\\
  87 |   2492JP Den Haag, the Netherlands\\
  88 |   E-mail: \email{mpj.vanderloo@cbs.nl}\\
  89 |   URL: \url{https://www.markvanderloo.eu}\\
  90 |   \emph{and}\\
  91 |   Leiden Institute of Advanced Computer Science (LIACS)\\
  92 |   Leiden University\\
  93 |   P.O. Box 9512\\
  94 |   2300 RA Leiden, The Netherlands\\
  95 | }
  96 | 
  97 | \begin{document}
  98 | 
  99 | 
 100 | %% -- Introduction -------------------------------------------------------------
 101 | 
 102 | %% - In principle "as usual".
 103 | %% - But should typically have some discussion of both _software_ and _methods_.
 104 | %% - Use \proglang{}, \pkg{}, and \code{} markup throughout the manuscript.
 105 | 
 106 | 
 107 | 
 108 | \section{Introduction}
 109 | The operation of splitting a data set into non-overlapping groups, computing an
 110 | aggregate for each group, and combining the results into a new dataset is one
 111 | of the most common operations in data analyses. Indeed, any software for data
 112 | analyses includes some functionality for this. For example, the combination of
 113 | \code{split}/\code{lapply}/\code{unsplit} as well as \code{aggregate} have been
 114 | a part of the \proglang{S} \citep{becker1988new} and \proglang{R} \citep{rcore}
 115 | languages for a long time. For \proglang{R} there are several packages that
 116 | implement functionality for this, including \pkg{plyr}
 117 | \citep{wickham2011split}, it's successor \pkg{dplyr} \citep{wickham2022dplyr},
 118 | it's drop-in replacement \pkg{poorman} \citep{eastwood2022poorman}, and
 119 | performance-focused \proglang{R} packages \pkg{collapse}
 120 | \citep{krantz2022collapse} and \pkg{data.table} \citep{dowle2022datatable}. In
 121 | \proglang{Python} the \code{pandas} package implements several methods for
 122 | grouping records and aggregating over one or more columns in data frame
 123 | objects. The more recent \citet{polars2023} library for \proglang{Python} and
 124 | \proglang{Rust} also implement such features. Similarly, the \pkg{DataFrames}
 125 | package for \proglang{julia} implements split-apply-combine functionality
 126 | \citep{kaminski2022dataframes}.
 127 | 
 128 | In all packages mentioned, the calculation for each group uses data available
 129 | within the group. However, there are valid use cases where a group aggregate is
 130 | determined using attributes from out-of-group entities. One example where this
 131 | occurs is in the area of Small Area Estimation (SAE, see \emph{e.g.},
 132 | \citet{rao2015small, molina2015sae}). Here, one wishes to estimate an aggregate
 133 | for a group, for example a geographical region, or a detailed population
 134 | subset, where the number of (sampled) observations is so small that the variance
 135 | of the estimate would be unacceptably large. In small area estimation (SAE) one
 136 | trades bias for variance by `borrowing statistical strength' from out-of-group
 137 | records. The out-of-group records can be obtained, for example by combining the
 138 | original small group with a group of records that are deemed similar in certain
 139 | respects. A second area where out-of-group records play a role is in certain
 140 | hot-deck imputation methods \citep{andridge2010review}.  In
 141 | $k$-nearest-neighbours imputation for example, one finds a set of $k$ donor
 142 | records that are preferably in the same group, but this condition may be
 143 | relaxed if there are not enough records in the group.  In the \pkg{VIM} package
 144 | for \proglang{R} \citep{kowarik2016imputation}, this is controlled by a
 145 | combination of the Gower distance and setting conditions on minimal number of
 146 | donors. In practice, imputation is often performed via a fall-though scenario,
 147 | where one first tries to estimate a model within a group, but if the group is
 148 | too small for a reliable estimate of model parameters, the group is enlarged by
 149 | combining similar groups, similar to the small-area estimation scenario. 
 150 | 
 151 | 
 152 | SAE as a special case is well supported by \proglang{R} and other free
 153 | software. Methodology has for example been implemented in the \pkg{sae} package
 154 | of \cite{molina2015sae} and the \pkg{hbsae} package of
 155 | \cite{boonstra2022hbsae}.  Regarding imputation methodology, the CRAN task view
 156 | on missing data\footnote{https://cran.r-project.org/web/views/MissingData.html}
 157 | currently lists 203 \proglang{R} packages that support some form of estimating
 158 | missing data.  The \pkg{simputation} package \citep{loo2022simputation} seems
 159 | to be the only one that allows for some kind of fall-through scenario for
 160 | selecting methods, but it does not allow for dynamic grouping.
 161 | 
 162 | Summarizing, we see that on one hand there are many implementations available
 163 | for generic aggregation based on fixed groups. On the other hand there are
 164 | domain-specific implementation for methods where dynamic grouping of a set of
 165 | records plays a role. This paper presents a generic solution to
 166 | split-apply-combine aggregation where groups can be collapsed dynamically in
 167 | the form of \proglang{R} package \pkg{accumulate} \citep{loo2022accumulate}.
 168 | 
 169 | The \code{accumulate} package serves the use case where a user wishes to
 170 | compute aggregates for a certain grouping of records. However, if a certain
 171 | instance of a group does not meet user-defined quality requirements the set of
 172 | records is expanded by (recursively) collapsing the grouping according to a
 173 | user-defined scheme. For example, given some financial data on companies, one
 174 | wishes to compute the average profit to turnover ratio for each combination of
 175 | economic activity and size class. If for a certain combination of economic
 176 | activity and size class there are too few records for a reliable estimate, one
 177 | could drop size class and compute the average ratio over all records within a
 178 | certain economic activity. Or, one could choose to coarse-grain economic
 179 | activities by collapsing groups with activities that are deemed similar enough.
 180 | 
 181 | The package has been developed with the following design choices in mind.
 182 | First, the interface should be easy to learn for \proglang{R} users, and thus
 183 | should resemble existing popular interfaces where possible. Second, users
 184 | should be free to define any, possibly multi-step, collapsing scheme. Here, we
 185 | keep in mind that collapsing schemes may be constructed manually based on
 186 | domain knowledge and that users may want to experiment with several schemes
 187 | before deciding on a final solution. This calls for a certain separation of
 188 | concerns between defining collapsing schemes and applying them to data.  The
 189 | package should also support collapsing schemes that follow naturally from
 190 | hierarchical classification systems. Third, users should have the flexibility
 191 | to define any quality requirement on the grouping while common quality
 192 | requirements are supported out of the box. Common quality requirements include
 193 | a minimum number of records, or a minimum fraction of records without missing
 194 | values, or a minimum number of records with non-zero values. Finally, the
 195 | package should support simple outputs such as counts or averages, but also
 196 | compound objects such as the output of model estimates.
 197 | 
 198 | The rest of this paper is organized as follows.  In the next section we
 199 | visually explain dynamic grouping via a collapsing scheme and introduce the
 200 | running example that will be used in Section~\ref{sect:accumulate} which
 201 | introduces the \pkg{accumulate} package, the main interfaces and helper
 202 | functions. In Section~\ref{sect:accumulate} we also discuss the common case of
 203 | collapsing via a predifined hiearchical classification scheme.
 204 | Section~\ref{sect:example} demonstrates the package on a realistic synthetic
 205 | dataset. The case of complex outputs is also demonstrated in this Section.  In
 206 | Section~\ref{sect:algorithm} we derive the pseudocode that solves the problem
 207 | of aggregating with dynamically collapsing groups. We show that it is a precise
 208 | and straightforward generalization of the standard split-apply-combine problem
 209 | and analyze its time complexity. A summary and conlcusion follows in
 210 | Section~\ref{sect:conclusion}.
 211 | 
 212 | 
 213 | 
 214 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 215 | \section{Dynamic Grouping} \label{sect:dynamicgrouping}
 216 | In this Section the illustrate the concept of dynamic grouping with a minimal
 217 | worked example. This example is not very realistic, but it is constructed
 218 | to be simple enough so the whole procedure can be followed in detail. 
 219 | 
 220 | We consider a data set with three categorical variables $A$, $B$ and $B_1$, and
 221 | one numerical variable $Y$.  Variable $A$ has levels $\{1, 2,3\}$ and variable
 222 | $B$ is a hierarchical classification with levels $\{11,12,13,21,22\}$. Variable
 223 | $B_1$ is a coarse-graining of $B$: for each record the value for $B_1$ is the
 224 | first digit of $B$. Hence, $B_1$ has levels $\{1,2\}$. 
 225 | 
 226 | Our goal is to compute the mean $\mu_Y$ over $Y$, grouped by $A\times B$.  We
 227 | impose the condition that there must be at least three records in each group.
 228 | If a certain group $(a\in A,b\in B)$ has less than three records, we attempt
 229 | to compute the value for that group over records in $(a,b_1)$ where $b_1$ is
 230 | obtained by taking the first digit of $b$. If we then still have less than
 231 | three records, we take records of group $a$ to determine the value for $(a,b)$.
 232 | 
 233 | The tables in Figure~\ref{fig:example} illustrate the idea. The left table
 234 | represents the data set to be aggregated by $A$ and $B$.  The table on the
 235 | right represents the output. Colors indicate which data was used. 
 236 | %
 237 | \begin{figure}[H]
 238 | \centering
 239 | \begin{tabular}{cccl}
 240 | \multicolumn{4}{l}{Input Data}\\
 241 | \hline
 242 | $A$ & $B$ & $B_1$ & $Y$\\
 243 | \hline
 244 | \cellcolor{red!25}   1 &\cellcolor{red!25}   11 &                1       & \cellcolor{red!25}   1   \\
 245 | \cellcolor{red!25}   1 &\cellcolor{red!25}   11 &                1       & \cellcolor{red!25}   2   \\
 246 | \cellcolor{red!25}   1 &\cellcolor{red!25}   11 &                1       & \cellcolor{red!25}   3   \\
 247 | \cellcolor{green!25} 2 &                     12 & \cellcolor{green!25} 1 & \cellcolor{green!25} 4   \\
 248 | \cellcolor{green!25} 2 &                     12 & \cellcolor{green!25} 1 & \cellcolor{green!25} 5   \\
 249 | \cellcolor{green!25} 2 &                     13 & \cellcolor{green!25} 1 & \cellcolor{green!25} 6   \\
 250 | \cellcolor{blue!25}  3 &                     21 &                2       & \cellcolor{blue!25}  7   \\
 251 | \cellcolor{blue!25}  3 &                     22 &                2       & \cellcolor{blue!25}  8   \\
 252 | \cellcolor{blue!25}  3 &                     12 &                1       & \cellcolor{blue!25}  9   \\
 253 | \hline
 254 | \end{tabular}\hspace{1cm}\begin{tabular}{ccl}
 255 | \multicolumn{3}{l}{Output Aggregates}\\
 256 | \hline
 257 | $A\times B$ & \code{Level} & $\mu_Y$\\
 258 | \hline
 259 | \rowcolor{red!25}   1 11        & 0             & 2  \\
 260 | \rowcolor{green!25} 2 12        & 1             & 5  \\
 261 | \rowcolor{green!25} 2 13        & 1             & 5  \\
 262 | \rowcolor{blue!25}  3 21        & 2             & 8  \\
 263 | \rowcolor{blue!25}  3 22        & 2             & 8  \\
 264 | \rowcolor{blue!25}  3 12        & 2             & 8  \\
 265 | \hline
 266 | \end{tabular}
 267 | \label{fig:example}
 268 | \caption{Input data (left) with grouping variables $A$, $B$ and $B_1$, and
 269 | means of $Y$ per $A\times B$ after dynamic grouping (right).}
 270 | \end{figure}
 271 | %
 272 | The First row in the output represents group $(A=1,B=11)$. The collapsing level
 273 | is zero, which means that no collapsing was necessary. Indeed, in the data
 274 | table we see that there are three rows with $A=1$ and $B=11$ with $Y$ values
 275 | $1, 2$, and $3$, resulting in $\mu_Y=(1+2+3)/3=2$ for this group.
 276 | 
 277 | Next, we try to compute the total for group $(A=2,B=12)$ (in green) but find
 278 | that there are only two such rows. We now define a new group $(A=2,B_1=1)$ and
 279 | find that there are three records in that group so we get 
 280 | $\mu_Y=(4+5+6)/3=5$.  Similarly, there is only one record with $(A=2,B=13)$. 
 281 | Collapsing groups to $(A=2,B_1=1)$, yields again $\mu_y=5$.
 282 | 
 283 | Finally, for $(A=2,B=21)$ there is only a single record. Collapsing to
 284 | $(A=2,B_1=2)$ yields only two records, so we need to collapse further to $(A=2)$
 285 | and finally obtain three records.  This yields $\mu_Y=(7+8+9)/3=8$. Similarly
 286 | the groups $(A=3, B=22)$ and $(A=3,B=12)$ are collapsed to $(A=2)$.
 287 | 
 288 | 
 289 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5%%%%%%%
 290 | \section[R Package accumulate]{\proglang{R Package} \pkg{accumulate} }\label{sect:accumulate}
 291 | Grouped aggregation with a fall-though scenario based on a collapsing scheme
 292 | requires a fair amount of specification by the user. Besides the data to be
 293 | aggregated, one needs to specify the method(s) of aggregation, the collapsing
 294 | scheme, and the condition to decide whether a subset is fit for aggregation or
 295 | a next collapse is necessary. There are two main functions in \pkg{accumulate}
 296 | that offer slightly different interfaces.
 297 | \begin{Code}
 298 |   accumulate(data, collapse, test, fun, ...)
 299 |   cumulate(data, collapse, test, ...)
 300 | \end{Code}
 301 | Here \texttt{data} is a data frame holding data to be aggregated;
 302 | \texttt{collapse} represents the collapse sequence (as a \class{formula} or a
 303 | \class{data frame}), and \texttt{test} is a function that accepts a subset of
 304 | \texttt{data} and returns a boolean that indicates whether a subset is suited
 305 | for aggregation or not. In \code{accumulate()}, the parameter \code{fun}
 306 | represents an aggregation function that is applied to every column of
 307 | \code{data}, and the ellipsis (\code{...}) is for arguments that are passed as
 308 | extra argument to \code{fun}. The interface of \code{accumulate()} is somewhat
 309 | similar to that of the \code{aggregate()} function in \proglang{R}.  In
 310 | \code{cumulate()}, the ellipsis is a sequence of comma-separated
 311 | \code{name = expression} pairs in the style of \code{summarise()} from the
 312 | \code{dplyr} package.
 313 | 
 314 | The output of both functions are of the same form. The columns of the output
 315 | data frame and can schematically be represented as follows. 
 316 | \begin{Code}
 317 |   [Grouping Variables, Collapse level, Output aggregates]
 318 | \end{Code}
 319 | The first columns represent the variables that define the output grouping, the
 320 | next column is an integer that indicates the level of collapsing used to
 321 | compute the aggregate (0 indicating no collapse), and the last set of columns
 322 | store the aggregates. Output aggregates may be of a simple data type
 323 | (\code{numeric}, \code{character}, \code{logical},$\ldots$) or of a composed
 324 | type such as the output of a linear model. The latter case is demonstrated in
 325 | Section~\ref{sect:complex}.
 326 | 
 327 | Both functions support two different interfaces for specifying collapsing
 328 | schemes through the \code{collapse} parameter. The first and most general is
 329 | the \class{formula} interface, which requires that the collapsing sequence is
 330 | represented as variables in the data set to be aggregated. The second is a
 331 | tabular interface where each row starts with the one of the lowest-level group
 332 | labels, and subsequent columns contain labels of courser groups.
 333 | 
 334 | 
 335 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 336 | \subsection{The Formula Interface}\label{sect:formula}
 337 | We will use the example of Section~\ref{sect:dynamicgrouping} to illustrate
 338 | the \class{formula} interface.
 339 | <<>>=
 340 | library("accumulate")
 341 | input <- data.frame(
 342 |   A  = c( 1,  1,  1,  2,  2,  2,  3,  3,  3),
 343 |   B  = c(11, 11, 11, 12, 12, 13, 21, 22, 12),
 344 |   B1 = c( 1,  1,  1,  1,  1,  1,  2,  2,  1),
 345 |   Y  = 1:9
 346 | )
 347 | 
 348 | cumulate(input, collapse = A * B ~ A * B1 + A
 349 |         , test = function(d) nrow(d) >= 3, muY = mean(Y) )
 350 | @
 351 | %
 352 | Consider the formula \code{A * B ~ A * B1 + A} in the call to \code{cumulate()}.
 353 | The left-hand-side \code{A * B} is the target output grouping. The
 354 | right-hand-side is to be interpreted as the collapsing sequence: if an instance
 355 | of \code{A * B} does not pass the test, then collapse to \code{A * B1}, and if that
 356 | does not pass the test collapse to \code{A}. If this final grouping also does
 357 | not pass the test, the result is \code{NA}.
 358 | 
 359 | Summarizing, the \class{formula} interface is always of the following form.
 360 | \begin{Code}
 361 |   Target grouping ~ Alternative1 + Alternative2 + ... + AlternativeN
 362 | \end{Code}
 363 | 
 364 | 
 365 | 
 366 | It is possible to get the same result with \code{accumulate()}. This will cause
 367 | summation over all variables that are not used in the formula object. In the
 368 | below example we also introduce the helper function \code{min_records()}.
 369 | <<>>=
 370 | input$Y2 <- 11:19
 371 | accumulate(input, collapse = A * B ~ A * B1 + A, 
 372 |   test = min_records(3), fun = mean)
 373 | @
 374 | This means that for experimentation, users must be careful to exclude categorical
 375 | variables that are not used from the input data set. For example, if \code{B1}
 376 | is not used, we get the following.
 377 | <<>>=
 378 | accumulate(input[-3], collapse = A * B ~ A,
 379 |   test = min_records(3), fun = mean)
 380 | @
 381 | 
 382 | The \class{formula} interface allows users to quickly experiment with different
 383 | collapsing schemes. It does require that all categorical variables have been
 384 | added to the data. As an alternative, one can use the data frame specification,
 385 | which allows for a further separation between defining the collapsing scheme
 386 | and the actual data processing.
 387 | 
 388 | 
 389 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 390 | \subsection{The Data Frame Interface} \label{sect:dfint}
 391 | The data frame interface is somewhat limited because it only allows for a
 392 | single grouping variable. The advantage however setting up a collapsing scheme
 393 | in the form of a table closely connects to domain knowledge and allows
 394 | fine-grained control on how groups are collapsed.
 395 | 
 396 | In order to use the data frame interface, the input dataset must include the
 397 | most fine-grained grouping variable. In the running example this is $A\times
 398 | B$, so we need to combine that into a single variable, and remove the other
 399 | ones.
 400 | <<>>=
 401 | input1 <- input
 402 | input1$AB <- paste(input$A, input$B, sep = "-")
 403 | input1 <- input1[-(1:3)]
 404 | input1
 405 | @
 406 | We now define the collapsing scheme as follows.
 407 | <<>>=
 408 | csh <- data.frame(
 409 |   AB  = c("1-11", "2-12", "2-13", "3-21", "3-22", "3-12"),
 410 |   AB1 = c("1-1" , "2-1" , "2-1" , "3-2" , "3-2" , "3-1" ),
 411 |   A   = c("1"   , "2"   , "2"   , "3"   , "3"   , "3"   ))
 412 | csh
 413 | @
 414 | In this data frame, two consecutive columns should be read as a child-parent
 415 | relation. For example, in the first collapsing step the groups defined by
 416 | \code{AB == "2-12"} and \code{AB == "2-13"} both collapse to \code{AB1 == "2-1"}.
 417 | In this artificial example the codes do not mean anything, but in realistic
 418 | cases where codes represent a (hierarchical) classification, domain experts
 419 | usually have a good grasp of which codes can be combined. 
 420 | 
 421 | The calls to \code{cumulate()} and \code{accumulate()} now look as follows.
 422 | <<>>=
 423 | accumulate(input1, collapse = csh, test = min_records(3), mean)
 424 | cumulate(input1, collapse = csh, test = min_records(3),
 425 |   muY = mean(Y), muY2 = mean(Y2))
 426 | @
 427 | 
 428 | 
 429 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 430 | \subsection{Specifying Tests}
 431 | The \code{test} parameter of \code{cumulate()} and \code{accumulate()} accepts a
 432 | function that takes a subset of the data and returns a boolean. For common
 433 | test conditions, including requiring a minimal number of records, or a minimal
 434 | number or fraction of complete records there are helper functions available.
 435 | %
 436 | \begin{center}
 437 | \begin{tabular}{lp{9cm}}
 438 | \code{min_records(n)}        & At least $n$ records.\\ 
 439 | \code{min_complete(n, vars)}  & At least $n$ records complete for variables \code{vars}.\\
 440 | \code{frac_complete(r, vars)} & At least $100r$\% complete records for variables \code{vars}.\\
 441 | \code{from_validator(v, ...)} & Construct a testing function from a \code{validator} object
 442 |                                of \proglang{R} package \pkg{validate}.\\
 443 | \end{tabular}
 444 | \end{center}
 445 | %
 446 | Second, for multiple, possibly complex requirements on variables users can
 447 | express conditions with the \pkg{validate} package \cite{loo2021data}.  The
 448 | \pkg{validate} packages offers a domain-specific language for expressing,
 449 | manipulating, and investigating conditions on datasets. It's core concept is a
 450 | list of `data validation rules' stored as a \class{validator} object. A
 451 | \class{validator} object is constructed with the eponymous function
 452 | \code{validator()}.  For example, to demand that there are at least 3 rows in a
 453 | group, and that there are at least three records where $Y\geq 2$ we create the
 454 | following ruleset.
 455 | <<>>=
 456 | library("validate")
 457 | rules <- validator(nrow(.) >= 3, sum(Y >= 2) >= 3)
 458 | rules
 459 | @
 460 | Here the \code{.} refers to the dataset as a whole, while rules that can be
 461 | evaluated within the dataset can be written as boolean \proglang{R}
 462 | expressions.
 463 | 
 464 | We will apply these conditions to the \code{input} dataset that was 
 465 | constructed in Section~\ref{sect:formula}. As a reminder we print
 466 | the first few records.
 467 | <<>>=
 468 | head(input, 4)
 469 | @
 470 | We use \code{A * B ~ A * B1 + B1} as collapsing scheme. The function
 471 | \code{from_validator} passes the requirements as a test function to
 472 | \code{accumulate()} (or \code{cumulate()}).
 473 | <<>>=
 474 | accumulate(input, collapse = A * B ~ A * B1 + B1,
 475 |     test = from_validator(rules), fun = mean)
 476 | @
 477 | Note that for target groups $(A=3,B=21)$ and $(A=3,B=22)$ none of the available
 478 | collapsing levels lead to a group that satisfied all conditions. Therefore the
 479 | collapsing level and output variables are all missing (\code{NA}).
 480 | 
 481 | The third and most flexible way for users to express tests is to write a custom
 482 | testing function. The requirements are that it must work on any subset of a
 483 | data frame, including a dataset with zero rows. The previous example can
 484 | thus also be expressed as follows.
 485 | <<>>=
 486 | my_test <- function(d) nrow(d) >= 3 && sum(d$Y >= 2) >= 3
 487 | accumulate(input, collapse = A * B ~ A * B1 + B1,
 488 |   test = my_test, fun = mean)
 489 | @
 490 | 
 491 | It is easy to overlook some edge cases when specifying test functions.
 492 | Recall that a test function is required to return \code{TRUE} or \code{FALSE},
 493 | regardless of the data circumstances. The only thing that a test function
 494 | can assume is that the received data set is a subset of records from the
 495 | dataset to be aggregated. As a service to the user, \pkg{accumulate} exports
 496 | a function that checks a test function against common edge cases, including
 497 | the occurrence of missing values, a dataset with zero rows, and the full
 498 | dataset. The function \code{smoke_test()} checks whether the output is \code{TRUE} or \code{FALSE}
 499 | under all circumstances and also reports errors, warnings, and messages.
 500 | It accepts a (realistic) dataset and a testing function and prints
 501 | test results to the console.
 502 | <<>>=
 503 | smoke_test(input, my_test)
 504 | @
 505 | By default only failing tests are printed. In this case our test function is
 506 | not robust against missing values for $Y$. This can be remedied by passing
 507 | \code{na.rm = TRUE} as parameter to \code{sum()} in the test function.
 508 | <<>>=
 509 | my_test1 <- function(d) nrow(d) >= 3 && sum(d$Y >= 2, na.rm = TRUE) >= 3 
 510 | smoke_test(input, my_test1)
 511 | @
 512 | The smoke test is aimed at preventing complicated stack traces when errors occur
 513 | in a call to \code{accumulate()} or \code{cumulate()}.  Users should be aware
 514 | that it does not guarantee correctness of the results, only robustness against
 515 | certain edge cases.
 516 | 
 517 | 
 518 | 
 519 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 520 | \subsection{Balanced and Unbalanced Hierarchical Classifications}
 521 | Hierarchical classifications are abundant in (official) statistics.
 522 | They represent a classification of entities into non-overlapping, nested
 523 | groupings. Examples include the international standard industrial classification
 524 | of economic activities (ISIC, \citet{un2022isic}), the related 
 525 | statistical classification of economic activities in europe (NACE, \citet{eu2006nace})
 526 | and the Euroean skills, competences, qualifications and occupations classification
 527 | (ESCO, \citet{eu2022esco}).
 528 | 
 529 | Hierarchical classifications offer a natural mechanism for collapsing
 530 | fine-grained groupings into larger groups because of the parent-child
 531 | relationships. As an example consider a small piece of the NACE classification.
 532 | 
 533 | \begin{center}
 534 | \begin{tikzpicture}
 535 | \tikzstyle{level 1}=[level distance=10mm,sibling distance=40mm]
 536 | \tikzstyle{level 2}=[level distance=10mm,sibling distance=10mm]
 537 |   \node {\code{01}}
 538 |     child { node {\code{011}}
 539 |       child {node {\code{0111}}}
 540 |       child {node {\code{0112}}} 
 541 |       child {node {\code{0113}}} }
 542 |     child { node {\code{012}}
 543 |       child {node {\code{0121}}}
 544 |       child {node {\code{0122}}} 
 545 |       child {node {\code{0123}}} 
 546 |       child {node {\code{0124}}} };
 547 | \end{tikzpicture}
 548 | \end{center}
 549 | Here, the hierarchy suggests to collapse $\{0111,0112,0113\}$ into $\{011\}$
 550 | when needed, and similarly for the right branch. The second level of collapsing
 551 | would combine $\{012\}$ with $\{011\}$ into $\{01\}$. The \pkg{accumulate}
 552 | package comes with a helper function that creates the collapsing scheme from the
 553 | lowest-level digits.
 554 | <<>>=
 555 | nace <- c("0111", "0112", "0113", "0121", "0121", "0122", "0123", "0124")
 556 | csh_from_digits(nace, levels = 2)
 557 | @ 
 558 | Here, the parameter \code{levels} determines how many collapsing steps will be
 559 | computed. Since all codes are prepended with zero, there is no need to collapse
 560 | $01$ any further. The output can be used as argument to the \code{collapse}
 561 | parameter of the \code{accumulate()} or \code{cumulate()} functions.
 562 | 
 563 | The situation becomes a little more involved when hierarchical classifications
 564 | form a tree such that the distance from leave to trunk is not the same for all
 565 | leaves (unbalanced tree). This occurs in practice, for example when local
 566 | organisations create an extra level of detail for some, but not all leaves.
 567 | Below is an example of such a situation.
 568 | %
 569 | \begin{center}
 570 | \begin{tikzpicture}
 571 | \tikzstyle{level 1}=[level distance=10mm,sibling distance=50mm]
 572 | \tikzstyle{level 2}=[level distance=10mm,sibling distance=10mm]
 573 | \tikzstyle{level 2}=[level distance=10mm,sibling distance=15mm]
 574 |   \node {\code{01}}
 575 |     child { node {\code{011}}
 576 |       child {node {\code{0111}}}
 577 |       child {node {\code{0112}}} 
 578 |       child {node {\code{0113}}} }
 579 |     child { node {\code{012}}
 580 |       child {node {\code{0121}}}
 581 |       child {node {\code{0122}}} 
 582 |       child {node {\code{0123}}} 
 583 |       child {node {\code{0124}}
 584 |        child {node {\code{01241}}}
 585 |        child {node {\code{01242}}} } };
 586 | \end{tikzpicture}
 587 | \end{center}
 588 | %
 589 | In this case, not all leaves can be collapsed with the same number of
 590 | collapsing levels. This provides an issue for specifying the collapsing
 591 | sequence as it now depends on the leaf where you start how many collapsing
 592 | levels are possible. It also complicates interpretability of the result as the
 593 | collapsing level reported in the output, now means different things for
 594 | different target groups. The solution chosen in \pkg{accumulate} is to extend
 595 | the tree by making copies of the leaves that are not on the lowest level as
 596 | follows.
 597 | %
 598 | \begin{center}
 599 | \begin{tikzpicture}
 600 | \tikzstyle{level 1}=[level distance=10mm,sibling distance=70mm]
 601 | \tikzstyle{level 2}=[level distance=10mm,sibling distance=20mm]
 602 | \tikzstyle{level 3}=[level distance=10mm,sibling distance=15mm]
 603 |   \node {\code{01}}
 604 |     child { node {\code{011}}
 605 |       child {node {\code{0111}}
 606 |        child {node{\code{0111}}}} 
 607 |       child {node {\code{0112}} 
 608 |        child {node{\code{0112}}}} 
 609 |       child {node {\code{0113}}
 610 |        child {node{\code{0113}}}} } 
 611 |     child { node {\code{012}}
 612 |       child {node {\code{0121}}
 613 |        child {node {\code{0121}}}}
 614 |       child {node {\code{0122}}
 615 |        child {node{\code{0122}}}} 
 616 |       child {node {\code{0123}}
 617 |        child {node{\code{0123}}}} 
 618 |       child {node {\code{0124}}
 619 |        child {node {\code{01241}}}
 620 |        child {node {\code{01242}}} } };
 621 | \end{tikzpicture}
 622 | \end{center}
 623 | %
 624 | The tradeoff is that although there may be some extra calculations in the case
 625 | where a leaf is collapsed to itself. The gain is that the specification of the
 626 | calculation as well as the interpretation of the results are now uniform
 627 | across all hierarchical classifications. Again, using \code{csh_from_digits()}
 628 | deriving the collapsing scheme can be automated.
 629 | <<>>=
 630 | nace <- c("0111", "0112", "0113", "0121", "0122", "0123", "01241", "01242")
 631 | csh_from_digits(nace, levels = 3)
 632 | @
 633 | 
 634 | 
 635 | 
 636 | 
 637 | 
 638 | 
 639 | 
 640 | 
 641 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 642 | \section{Extensive Example: Economic Data} \label{sect:example}
 643 | In this Section we discuss three practical examples using a synthetic
 644 | dataset included with the package.
 645 | <<>>=
 646 | data("producers")
 647 | head(producers)
 648 | @
 649 | This \code{producers} dataset contains synthetic data records of various income
 650 | sources for \Sexpr{nrow(producers)} industrial produces. The records are
 651 | classified into a local version of the NACE classification called \code{sbi}
 652 | and into \code{size} classes with values in $\{5,6,7,8,9\}$. 
 653 | 
 654 | \subsection{Small Area Estimation} 
 655 | Small area estimation (SAE) is a collection of methods that are aimed at
 656 | estimating subpopulation parameters in cases where there the number of
 657 | observations in a subpopulation is so mall that direct estimation leads to
 658 | unacceptable estimation variance. Instead one may resort for example to
 659 | indirect estimation, meaning that one estimates parameters for a larger
 660 | subpopulation which are then used in the estimate for the target subpopulation.
 661 | Here, we shall be interested in estimating the average turnover from industrial
 662 | activities (\code{industrial}) by SBI and size class.
 663 | 
 664 | In the simplest case, where no auxiliary information is available or used, one
 665 | replaces the estimator of the mean over a subpopulation with the estimator of
 666 | the mean over a larger subpopulation that includes the target subpopulation
 667 | \citep[Section 3.2.1]{rao2015small}. If we assume that the dataset is obtained
 668 | by simple random sampling from the population, the mean can be estimated with
 669 | the sample mean. In this example we will demand that there are at least ten
 670 | records for which turnover has been measured. The collapsing scheme is given by
 671 | \code{sbi * size ~ sbi + sbi2 + sbi1} where \code{sbi2} and \code{sbi1} are
 672 | classification by respectively the first two SBI digits and the first SBI
 673 | digit. We first add those variables to the dataset.
 674 | <<>>=
 675 | producers$sbi2 <- substr(producers$sbi, 1, 2)
 676 | producers$sbi1 <- substr(producers$sbi, 1, 1)
 677 | head(producers, 3)
 678 | @
 679 | Using \code{cumulate()} we obtain the means. 
 680 | <<>>=
 681 | a <- cumulate(producers,
 682 |   collapse = sbi * size ~ sbi + sbi2 + sbi1,
 683 |   test = min_complete(n = 10, vars = "industrial"),
 684 |   mean_industrial = mean(industrial, na.rm = TRUE))
 685 | head(a,3)
 686 | @
 687 | 
 688 | In terms of SAE, the collapsing scheme expresses the assumption that estimates
 689 | on the level of respectively \code{sbi}, \code{sbi2} and \code{sbi1} introduce
 690 | an acceptable bias.
 691 | 
 692 | 
 693 | \subsection{Imputing Missing Values Using SAE and Ratio Imputation} \label{sect:ratio}
 694 | Our goal in this example is to impute missing values for the \code{industrial}
 695 | variable based on ratio imputation with \code{total} as predictor.  Ratio
 696 | imputation is a method where the imputed value $\hat{y}_i$ for variable $Y$ of
 697 | record $i$ is estimated as $\hat{y_i}=\hat{R}_d x_i$, where $\hat{R}_d$ is an
 698 | estimate of the ratio between $Y$ and an auxiliary variable $X$ in
 699 | subpopulation $d$. An unbiased estimate for $R_d$ is given by
 700 | $\hat{\bar{Y}}_d/\hat{\bar{X}}_d$ where $\hat{\bar{Y}}_d$ and $\hat{\bar{X}}_d$
 701 | are estimated subpopulation means. 
 702 | 
 703 | We use SAE to estimate the subpopulation ratios, and then use the \code{simputation}
 704 | package \citep{loo2022simputation} to impute the missing values. 
 705 | <<>>=
 706 | r <- cumulate(producers,
 707 |   collapse = sbi * size ~ sbi + sbi2 + sbi1,
 708 |   test = min_complete(n = 10, vars = "industrial"),
 709 |   R = mean(industrial, na.rm = TRUE)/mean(total, na.rm = TRUE))
 710 | head(r,3)
 711 | @
 712 | To impute the values, using \code{impute_proxy()} we need to merge the ratios
 713 | with the producers dataset (which automatically happens by SBI and size class).
 714 | <<>>=
 715 | library("simputation")
 716 | dat <- merge(producers, r)
 717 | dat <- impute_proxy(dat, industrial ~ R * total)
 718 | @
 719 | We can inspect the imputed values as follows.
 720 | <<>>=
 721 | iNA <- is.na(producers$industrial)
 722 | head(dat[iNA, c("sbi", "size", "level", "R", "industrial", "total")])
 723 | @
 724 | From this, we get all the information to interpret the imputed values. For
 725 | example, we see that in record 664, the ratio was estimated after collapsing
 726 | the group $(\texttt{sbi},\texttt{size})=(21122,7)$ to $\texttt{sbi2}=21$ since
 727 | the collapse \code{level} equals 2.
 728 | 
 729 | 
 730 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 731 | \subsection{Random Nearest Neighbours Imputation with Collapsing Groups}
 732 | Nearest neighbor (NN) imputation is a donor imputation method where the
 733 | imputation value is copied from a record that is (randomly) chosen from a donor
 734 | pool \citep{andridge2010review}. In this example we use the grouping variables
 735 | in \code{producers} to define the donor pools. To prevent the same donor from
 736 | being used too often, it is not uncommon to demand a minimum number of records
 737 | in the donor pool. A collapsing scheme is one way of guaranteeing this, and
 738 | below we demonstrate how this problem can be expressed in \code{accumulate}.
 739 | 
 740 | We wish to impute the variable \code{trade} in the \code{producers} dataset
 741 | using donor imputation, where donors come from the same (\code{sbi},
 742 | \code{size}) combination. We wish to sample donor values from a group of at
 743 | least 5 donors. If this is not possible, we use the same fallback scenario as
 744 | in the previous Section.
 745 | 
 746 | We first define an `aggregation' function that takes a vector 
 747 | of donors and returns a non-empty sample.
 748 | <<>>=
 749 | random_element <- function(x) sample(x[ !is.na(x)], 1)
 750 | @
 751 | We will use \code{cumulate()} to ensure that there are at least five non-empty
 752 | values in \code{x} when \code{random_element()} is called. To make sure we
 753 | obtain a donor for each record, we add an identifying column \code{id} to use
 754 | as grouping variable.  
 755 | <<>>=
 756 | producers <- cbind(id = sprintf("ID%03d", seq_len(nrow(producers)))
 757 |                   , producers)
 758 | set.seed(111)
 759 | imputations <- cumulate(producers
 760 |                  , collapse = id ~ sbi * size + sbi + sbi2 + sbi1
 761 |                  , test = min_complete(5,"trade")
 762 |                  , donor_trade = random_element(trade))
 763 | 
 764 | head(imputations, 3)
 765 | @
 766 | To use the donor imputations, we merge the imputation candidates with the
 767 | original dataset and use \code{impute_proxy()} of the \pkg{simputation} package
 768 | for imputation.
 769 | <<>>=
 770 | imputed <- merge(producers, imputations) |>
 771 |   impute_proxy(trade ~ donor_trade)
 772 | cols <- c(1:3, 9:10, 5)
 773 | head(producers[cols], 3)
 774 | head(imputed[c(cols, 11)], 3)
 775 | @
 776 | The merge operation automatically merges on the \code{id} column, which also
 777 | adds the \code{level} column to the output.  The function \code{impute_proxy}
 778 | copies values from \code{donor_trade} into trade where \code{trade} is missing.
 779 | In the last expressions we only print the columns of interest. 
 780 | 
 781 | 
 782 | 
 783 | 
 784 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 785 | \subsection{Computing Complex Aggregates} \label{sect:complex}
 786 | Until now, the aggregates have been simple (scalar) values.  With the
 787 | \code{cumulate()} function it is also possible to specify complex aggregates that
 788 | go beyond simple aggregates. Below, we estimate the following linear model
 789 | \begin{displaymath}
 790 | \texttt{total} = \beta_0 + \beta_i\texttt{industrial} + \varepsilon,
 791 | \end{displaymath}
 792 | demanding that there are at least 10 records where both predictor and predicted
 793 | variable are available.
 794 | <<>>=
 795 | r <- cumulate(producers,
 796 |   collapse = sbi * size ~ sbi + sbi2 + sbi1,
 797 |   test = min_complete(n = 10, vars = c("total","industrial")),
 798 |   model = lm(total ~ industrial))
 799 | head(r,3)
 800 | @
 801 | Here, the last column is a list of class \code{object_list}, where each
 802 | element is either an object of class \code{lm} or \code{NA}. Variables of class
 803 | \code{object_list} only differ from from a standard \proglang{R} list by their
 804 | print method.
 805 | 
 806 | 
 807 | 
 808 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 809 | \section{Formal Description and Algorithms} \label{sect:algorithm}
 810 | In this Section we give a formal description of the algorithm for aggregation
 811 | with dynamic grouping.  We start by giving an algorithm for ordinary
 812 | split-apply-combine to demonstrate how the algorithm must be generalized to
 813 | allow for a collapsing scheme.
 814 | 
 815 | \subsection{Split-Apply-Combine}
 816 | To analyse a data set group by group we need to specify a data set, a
 817 | way to split it into groups, and a function that takes a subset of data and
 818 | returns an aggregate. Let us introduce some notation for that.
 819 | 
 820 | Denote with $U$ a finite set, and let $\phi:2^U\to X$ be a function that
 821 | accepts a subset of $U$ and returns a value in some domain $X$.  Here, $U$
 822 | represents a data set, $2^U$ its power set, and $\phi$ an aggregating function.
 823 | We split $U$ into groups using the following notation. Let $A$ be finite set
 824 | that has no more elements than $U$, and let $f:U\onto A$ be a
 825 | surjective function that takes an element of $U$ and returns a value in $A$. We
 826 | can think of $A$ as a set of group labels, and $f$ as the function that assigns
 827 | a label to each element of $U$. This way, $f$ divides $U$ into non-overlapping
 828 | subsets. We say that $f:U\onto A$ is a \emph{partition} of $U$. We
 829 | also introduce the \emph{pullback along $f$}, $f^*:2^A\to 2^U$ defined as
 830 | %
 831 | \begin{equation*}
 832 | f^*(S) = \{u\in U|f(u)\in S\},
 833 | \end{equation*}
 834 | where $S$ is a subset of $A$ (See \emph{e.g.}, \citet[Section 1.4]{fong2019invitation}).
 835 | 
 836 | In this notation, any split-apply-combine operation can be computed with the
 837 | following algorithm.
 838 | 
 839 | \begin{algorithm}[H]
 840 | \caption{Split-Apply-Combine: $\textsc{SAC}(U,\phi,f)$}
 841 | \label{alg:sac}
 842 | \SetKwInOut{Input}{Input}\SetKwInOut{Output}{Output}
 843 | \Input{A finite set $U$, an aggregator $\phi: 2^U\to X$, and a partition
 844 |        $f:U\onto A$.}
 845 | \Output{$R$: the value of $\phi$ for every part of $U$ as a set of pairs
 846 |         $(a,x)\in A\times X$. }
 847 | $R = \{\}$\;
 848 | \For{$a\in A$}{
 849 |   $d = f^*(\{a\})$\tcp*{get subset of $U$}
 850 |   $R = R\cup \{(a,\phi(d))\}$\tcp*{aggregate and add to result}
 851 | }
 852 | \end{algorithm}
 853 | %
 854 | In this algorithm the output is collected in a set $R$ containing pairs from
 855 | $A\times X$: one pair for each element of $A$. (Incidently, the algorithm can be
 856 | summarized even shorter in this notation as $R=\cup_{a\in A}\{(a,(\phi\circ
 857 | f^*)(\{a\}))\}$, where $\circ$ denotes function composition).  
 858 | 
 859 | It is interesting to see how the elements $U$, $f$, and $\phi$ are implemented
 860 | in practice. Consider the signature of the \proglang{R}'s \code{aggregate()}
 861 | function (we skip arguments that are not important for the discussion).
 862 | \begin{Code}
 863 |   aggregate(x, by, FUN)
 864 | \end{Code}
 865 | Here, \code{x} is a data frame where each row represents an element of $U$. The
 866 | parameter \code{by} is a list of vectors of group labels, where each vector has
 867 | a length that equals the number of rows in \code{x}. So the function $f:U\onto
 868 | A$ is implemented by asking the user to make sure that the position of each
 869 | label in \code{by} corresponds to the correct row number in the data frame
 870 | \code{x}. The argument \code{FUN} (of class \class{function}) represents the
 871 | function $\phi$ that aggregates each subset of \code{x}. When the records in
 872 | \code{x} contain more than one variable, the aggregator is applied to each one
 873 | of them.  Here is an example of how a user might call this function from the
 874 | \proglang{R} prompt.
 875 | %
 876 | <<>>=
 877 | aggregate(iris[1:2], by = iris["Species"], FUN = mean)
 878 | @
 879 | %
 880 | Note that the correspondence in position of the \texttt{Species} label and the
 881 | record position is implemented by taking them from the same data frame. The
 882 | output also reveals in the first column the set $A$: each row corresponds to a
 883 | unique value in \texttt{Species} column.
 884 | 
 885 | 
 886 | \subsection{Split-Apply-Combine with Collapsing Groups}
 887 | \label{sect:saccg}
 888 | The goal of the algorithm is to compute a value for each part of a dataset,
 889 | possibly using values external to the part ---conditional to restrictions
 890 | placed on each part. The input of the algorithm consists again of a finite set
 891 | $U$ and an aggregation function $\phi$ that takes a subset of $U$ and returns a
 892 | value in some domain $X$.  Compared to Algorithm~\ref{alg:sac} two other inputs
 893 | are needed. First, a function must be defined that that checks whether a given
 894 | subset $d$ of $U$ is suitable for computing $\phi(d)$.  We will denote this
 895 | function $\beta: 2^U\to \mathbb{B}$, where
 896 | $\mathbb{B}=\{\texttt{True},\texttt{False}\}$.  Typical tests are checking
 897 | whether there are sufficient records available, or whether certain variables
 898 | have a low enough fraction of missing values. Second, we need a
 899 | \emph{collapsing scheme} $C$, defined as sequence of $n+1$ mappings
 900 | \begin{equation}
 901 | C\equiv U\xonto{f}A\xonto{f_1}A_1\xonto{f_2}\cdots\xonto{f_n}A_n.
 902 | \label{eq:collapsingsequence}
 903 | \end{equation}
 904 | A collapsing scheme is a sequence of partitions where each $f_i$ partitions its
 905 | domain in $|A_i|$ groups while $f$ partitions $U$ in $|A|$ groups.
 906 | 
 907 | 
 908 | Denote with $F_k:A\to A_k$, the function that  accepts a label in $A$ and
 909 | returns the corresponding label in $A_k$. In other words, $F_k$ is the
 910 | composition $f_k\circ f_{k-1}\circ\cdots \circ f_1$. Similarly we define the
 911 | pullback along $F_k$ as $F_k^* = f_1^*\circ f_2^*\circ\cdots\circ f_k^*$.  This
 912 | function accepts a set of labels in $A_k$ and returns all the labels in $A$
 913 | that are mapped to those labels via the collapsing sequence of
 914 | Equation~\ref{eq:collapsingsequence}.  With this notation we can define the
 915 | Algorithm for aggregation with dynamic grouping as follows. 
 916 | 
 917 | \begin{algorithm}[H]
 918 | \caption{Split-Apply-Combine with Collapsing Groups: $\textsc{SACCG}(U,\phi,\beta,C)$}
 919 | \label{alg:saccg}
 920 | \SetKwInOut{Input}{Input}\SetKwInOut{Output}{Output}
 921 | \Input{A finite set $U$, an aggregator $\phi: 2^U\to X$, a test function $\beta: 2^U\to \mathbb{B}$,
 922 |       and a collapsing sequence $C\equiv U\xonto{f}A\xonto{f_1}A_1\xonto{f_2}\cdots \xonto{f_n}  A_n$.}
 923 | 
 924 | \Output{$R$: the value of $\phi$ for every part of $U$, for which a suitable
 925 | collapsing group can be found, as a set of triples $(a,k,x)\in A\times
 926 | \underline{n}\times X $ where $\underline{n}=\{0,1,\ldots,n\}$.}
 927 | 
 928 | $R = \{\}$\;
 929 | \For{$a\in A$}{
 930 |   $i=0$ \tcp*{Initiate collapse level}
 931 |   $d = f^*(\{a\})$ \tcp*{Get subset of $U$}
 932 |   \While{$i<n \land \lnot\beta(d)$}{ \label{line:while}
 933 |     $i = i+1$ \tcp*{Increase collapse level} 
 934 |     $d = (f^*\circ F_i^*\circ F_i)(a)$ \tcp*{Collapse and get subset}
 935 |   }\label{line:endwhile}
 936 |   \If{$i<n \lor \beta(d)$}{ \label{line:cond}
 937 |     $R = R\cup \{(a, i,\phi(d))\}$\; \label{line:R}
 938 |   } 
 939 | }
 940 | 
 941 | \end{algorithm}
 942 | %
 943 | 
 944 | In this algorithm the collapsing level $i$ is increased until the test is
 945 | passed or the maximum collapsing level $n$ is reached
 946 | (Lines~\ref{line:while}-\ref{line:endwhile}) %Since the eventual level of
 947 | collapsing is determined dynamically by data %circumstances in $U$, the
 948 | algorithm also reports the collapsing level $i$ used.  The condition in
 949 | Line~\ref{line:cond} ensures that if no suitable dataset is found after the
 950 | whole collapsing sequence has been executed, then no answer is returned. This
 951 | means that in contrast with Algorithm~\ref{alg:sac} there is no guarantee that
 952 | a value for each member of $A$ will be found. For each member of $A$ where
 953 | an aggregate is computed, there is a triple $(a,i,\phi(d))$, where $a\in A$
 954 | is the label to which the value $\phi(d)$ pertains, and $i$ is the number of
 955 | collapses applied to reach a suitable dataset. 
 956 | 
 957 | 
 958 | Comparing Algorithms~\ref{alg:sac} and~\ref{alg:saccg}, we see that the
 959 | standard split-apply-combine algorithm has worst-case runtime complexity
 960 | $O(|A|)$, as determined by counting applications of $f^*$. This is equal to the
 961 | algorithm's best case $\Omega(|A|)$. The split-apply-combine with collapsing
 962 | groups algorithm also has best case $\Omega(|A|)$ but has worst case $O(n|A|)$
 963 | (with $n$ the number of collapsing steps). In fact, the best case for
 964 | Algorithm~\ref{alg:saccg} is achieved by setting $\beta:d\mapsto\texttt{True}$.
 965 | In this case Algorithm~\ref{alg:saccg} reduces to Algorithm~\ref{alg:sac}. In
 966 | other words, we have
 967 | %
 968 | \begin{displaymath}
 969 | \textsc{SACCG}(U,\phi,\texttt{True},C) = \textsc{SAC}(U,\phi,f).
 970 | \end{displaymath}
 971 | %
 972 | To see this, observe that in Algorithm~\ref{alg:saccg} the condition in
 973 | line~\ref{line:while} is always \code{False} and the condition in
 974 | line~\ref{line:R} is always \code{True} in this case.
 975 | 
 976 | The worst case is achieved by setting $\beta: d\mapsto
 977 | \texttt{False}$. In that case the while loop in Line~\ref{line:while} is
 978 | iterated $n-1$ times (yielding total $n$ executions of $f^*$) while
 979 | Line~\ref{line:R} is never executed.
 980 | 
 981 | The analyses above leaves open the question of how the runtime depends on
 982 | application of the pullbacks in both algorithms. First note that there needs to
 983 | be no difference in applying $f^*$ or $f^*\circ F_i^*\circ F_i$: in practice a
 984 | collapsing scheme can be represented in tabular form just like $f$.  The
 985 | pullback then comes down to a lookup of records based on matching one or more
 986 | attributes with a set of attribute values. The time complexity of such
 987 | operations are typically reduced by proper preparation of the dataset. For
 988 | example, databases can be prepared to speed up certain often-used lookup
 989 | operations. In the implementation of the \code{accumulate} package the
 990 | pullback is implemented using standard join operations as
 991 | implemented by \proglang{R}'s standard \code{merge()} function.  
 992 | 
 993 | 
 994 | 
 995 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 996 | \section{Summary and Conclusion}\label{sect:conclusion}
 997 | The \proglang{R} package \code{accumulate} introduced in this paper offers
 998 | convenient interfaces for computing grouped aggregates where the grouping is
 999 | dynamically determined based on user-defined conditions and a user-defined
1000 | group collapsing scheme. We demonstrated that these interfaces can be used in
1001 | several situations, including those where multiple grouping variables are used
1002 | , where complex collapsing schemes are applied, as well as situations where
1003 | multiple variables need to be aggregated over. Conditions on groups of records
1004 | are principally represented by a function, but the package includes several
1005 | convenience functions for defining conditions on record groups. It also
1006 | interfaces with the \pkg{validate} \proglang{R} package to facilitate cases
1007 | where mutliple test conditions must be met. Moreover, the package supports the
1008 | collection of complex aggregates, such as model outputs and offers several
1009 | service functions that aim to facilitate the definition of the collapsing
1010 | conditions.
1011 | 
1012 | The pseudocode underlying the package's main functions has been formally
1013 | analyzed and it is shown that aggregation with collapsing groups can be
1014 | interpreted as a precise generalization of standard grouped aggregation.  We
1015 | hope that this stimulates broader implementation of this algorithm in software
1016 | offering grouped aggregation.
1017 | 
1018 | 
1019 | 
1020 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1021 | \newpage
1022 | \bibliography{jss5097}
1023 | 
1024 | 
1025 | 
1026 | 
1027 | \end{document}
1028 | 


--------------------------------------------------------------------------------
/paper/jss5097.bib:
--------------------------------------------------------------------------------
  1 | 
  2 | @article{andridge2010review,
  3 |   Title={A Review of Hot Deck Imputation for Survey Non-Response},
  4 |   author={Andridge, Rebecca R and Little, Roderick JA},
  5 |   journal={International Statistical Review},
  6 |   volume={78},
  7 |   number={1},
  8 |   pages={40--64},
  9 |   year={2010},
 10 |   publisher={Wiley Online Library}
 11 | }
 12 | 
 13 | @article{becker1988new,
 14 |   title={The New \proglang{S} Language},
 15 |   author={Becker, RA and Chambers, JM and Wilks, AR},
 16 |   journal={Computer Science Series, Pacific Grove, CA},
 17 |   year={1988}
 18 | }
 19 | 
 20 | 
 21 | @manual{argus2022,
 22 |   author = {{CBS}},
 23 |   title = {\pkg{Argus}: Software for Protecting Statistical Data},
 24 |   year  = {2022},
 25 |   note = {$\tau$ Argus version 4.2.0, $\mu$ Argus version 5.1.6},
 26 |   url  = {https://research.cbs.nl/casc/index.htm}
 27 | }
 28 | 
 29 | @manual{boonstra2022hbsae,
 30 |   title = {\pkg{hbsae}: Hierarchical Bayesian Small Area Estimation},
 31 |   author = {Harm Jan Boonstra},
 32 |   year = {2022},
 33 |   note = {\proglang{R} package version 1.2},
 34 |   url = {https://CRAN.R-project.org/package=hbsae},
 35 | }
 36 | 
 37 | 
 38 | 
 39 | @manual{dowle2022datatable,
 40 |   title = {\pkg{data.table}: Extension of `\pkg{data.frame}'},
 41 |   author = {Matt Dowle and Arun Srinivasan},
 42 |   year = {2024},
 43 |   note = {\proglang{R} package version 1.15.4},
 44 |   url = {https://CRAN.R-project.org/package=data.table},
 45 | }
 46 | 
 47 | 
 48 | 
 49 | @manual{eastwood2022poorman,
 50 |   title = {\pkg{poorman}: A Poor Man's Dependency Free Recreation of '\pkg{dplyr}'},
 51 |   author = {Nathan Eastwood},
 52 |   year = {2023},
 53 |   note = {\proglang{R} package version 0.2.7},
 54 |   url = {https://CRAN.R-project.org/package=poorman},
 55 | }
 56 | 
 57 | @misc{eu2006nace,
 58 |   author={{Council of European Union}},
 59 |   title={Council Regulation ({EU}) No 1893/2006},
 60 |   year={2006},
 61 |   howpublished={\url{https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32006R1893}}
 62 | }
 63 | 
 64 | 
 65 | @misc{eu2022esco,
 66 |   author = {European Commision},
 67 |   title  ={European Skills, Competences, Qualifications, and Occupations},
 68 |   year   = {2022},
 69 |   howpublished = {https://esco.ec.europa.eu}
 70 | }
 71 | 
 72 | @book{fong2019invitation,
 73 |   title={An Invitation to Applied Category Theory: Seven Sketches in Compositionality},
 74 |   author={Fong, Brendan and Spivak, David I},
 75 |   year={2019},
 76 |   publisher={Cambridge University Press}
 77 | }
 78 | 
 79 | 
 80 | @book{hundepool2012statistical,
 81 |   title = {Statistical Disclosure Control},
 82 |   author = {Hundepool, Anco and Domingo-Ferrer, Josep and Franconi, Luisa and
 83 |             Giessing, Sarah and Nordholt, Eric Schulte and Spicer, Keith and De Wolf,
 84 |             Peter-Paul},
 85 |   volume = {2},
 86 |   year   = {2012},
 87 |   publisher = {Wiley New York}
 88 | }
 89 | 
 90 | 
 91 | @manual{kaminski2022dataframes,
 92 |   author       = {Bogumił Kamiński and
 93 |                   John Myles White and
 94 |                   Milan Bouchet-Valat and
 95 |                   powerdistribution and
 96 |                   Sean Garborg and
 97 |                   Jacob Quinn and
 98 |                   Simon Kornblith and
 99 |                   cjprybol and
100 |                   Alexey Stukalov and
101 |                   Douglas Bates and
102 |                   Tom Short and
103 |                   Chris DuBois and
104 |                   Harlan Harris and
105 |                   Kevin Squire and
106 |                   Alex Arslan and
107 |                   pdeffebach and
108 |                   David Anthoff and
109 |                   Dave Kleinschmidt and
110 |                   Andreas Noack and
111 |                   Viral B. Shah and
112 |                   Alex Mellnik and
113 |                   Takafumi Arakaki and
114 |                   Tanmay Mohapatra and
115 |                   Peter and
116 |                   Stefan Karpinski and
117 |                   Dahua Lin and
118 |                   Ronan Arraes Jardim Chagas and
119 |                   timema and
120 |                   ExpandingMan and
121 |                   Florian Oswald},
122 |   title        = {\pkg{JuliaData/DataFrames.jl}: v1.4.4},
123 |   month        = dec,
124 |   year         = 2022,
125 |   publisher    = {Zenodo},
126 |   version      = {v1.4.4}
127 | }
128 | 
129 | @manual{krantz2022collapse,
130 |   title = {\pkg{collapse}: Advanced and Fast Data Transformation},
131 |   author = {Sebastian Krantz},
132 |   year = {2024},
133 |   note = {\proglang{R} package version 2.0.13},
134 |   url = {https://CRAN.R-project.org/package=collapse},
135 | }
136 | 
137 | 
138 | 
139 | @article{kowarik2016imputation,
140 |   title = {Imputation with the \proglang{R} Package \pkg{VIM}},
141 |   author = {Alexander Kowarik and Matthias Templ},
142 |   journal = {Journal of Statistical Software},
143 |   year = {2016},
144 |   volume = {74},
145 |   number = {7},
146 |   pages = {1--16},
147 |   doi = {10.18637/jss.v074.i07},
148 | }
149 | 
150 | @article{loo2021data,
151 |    title = {Data Validation Infrastructure for \proglang{R}},
152 |    author = {Mark P. J. {van der Loo} and Edwin {de Jonge}},
153 |    journal = {Journal of Statistical Software},
154 |    year = {2021},
155 |    volume = {97},
156 |    number = {10},
157 |    pages = {1--31},
158 |    doi = {10.18637/jss.v097.i10}
159 |  }
160 | 
161 | 
162 | 
163 | @manual{loo2022accumulate,
164 |   title = {\pkg{accumulate}: Split-Apply-Combine with Collapsing Groups},
165 |   author = {Mark {van der Loo}},
166 |   year = {2023},
167 |   note = {\proglang{R} package version 0.9.3},
168 | }
169 | 
170 | 
171 | 
172 | @manual{loo2022simputation,
173 |   title = {\pkg{simputation}: Simple Imputation},
174 |   author = {Mark {van der Loo}},
175 |   year = {2022},
176 |   note = {\proglang{R} package version 0.2.8},
177 |   url = {https://github.com/markvanderloo/simputation},
178 | }
179 | 
180 | 
181 | 
182 | @manual{meindl2022sdctable,
183 |   title = {sdcTable: Methods for Statistical Disclosure Control in Tabular Data},
184 |   author = {Bernhard Meindl},
185 |   year = {2023},
186 |   note = {\proglang{R} package version 0.32.6},
187 |   url = {https://CRAN.R-project.org/package=sdcTable},
188 | }
189 | 
190 | 
191 | @article{molina2015sae,
192 |    author = {Isabel Molina and Yolanda Marhuenda},
193 |    title = {{sae}: An \proglang{R} Package for Small Area Estimation},
194 |    journal = {The R Journal},
195 |    year = {2015},
196 |    volume = {7},
197 |    number = {1},
198 |    pages = {81--98},
199 |    month = {jun},
200 |    url = {https://journal.r-project.org/archive/2015/RJ-2015-007/RJ-2015-007.pdf},
201 |  }
202 | 
203 | 
204 | 
205 | @manual{polars2023,
206 |    title = {Polars User Guide},
207 |    author = {Polars},
208 |    year = {2024},
209 |    url = {https://docs.pola.rs/}
210 | }
211 | 
212 | @book{rao2015small,
213 |   title={Small Area Estimation},
214 |   author={Rao, John NK and Molina, Isabel},
215 |   year={2015},
216 |   publisher={John Wiley \& Sons}
217 | }
218 | 
219 | 
220 | @manual{rcore,
221 |   title = {\proglang{R}: A Language and Environment for Statistical Computing},
222 |   author = {\proglang{R} Core Team},
223 |   organization = {\proglang{R} Foundation for Statistical Computing},
224 |   address = {Vienna, Austria},
225 |   year = {2024},
226 |   url = {https://www.R-project.org/}
227 | }
228 | 
229 | 
230 | @article{templ2015statistical,
231 |   title = {Statistical Disclosure Control for Micro-Data Using the \proglang{R} Package \pkg{sdcMicro}},
232 |   author = {Matthias Templ and Alexander Kowarik and Bernhard Meindl},
233 |   journal = {Journal of Statistical Software},
234 |   year = {2015},
235 |   volume = {67},
236 |   number = {4},
237 |   pages = {1--36},
238 |   doi = {10.18637/jss.v067.i04},
239 | }
240 | 
241 | @misc{un2022isic,
242 |   title  = {International Standard Industrial Classification of Economic Activities (ISIC)},
243 |   author = {{United Nations Statistical Division}},
244 |   year   = {2008},
245 |   howpublished = {\url{https://unstats.un.org/unsd/classifications/Econ/ISIC.cshtml}},
246 |   publisher  = {United Nations Department of Economic and Social Affairs}
247 | }
248 |   
249 | 
250 | @manual{wickham2022dplyr,
251 |    title = {\pkg{dplyr}: A Grammar of Data Manipulation},
252 |    author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller},
253 |    year = {2024},
254 |    note = {\proglang{R} package version 1.1.4},
255 |    url = {https://CRAN.R-project.org/package=dplyr},
256 | }
257 | 
258 | 
259 | 
260 | 
261 | @article{wickham2011split,
262 |   title={The Split-Apply-Combine Strategy for Data Analysis},
263 |   author={Wickham, Hadley},
264 |   journal={Journal of Statistical Software},
265 |   volume={40},
266 |   pages={1--29},
267 |   year={2011}
268 | }
269 | 
270 | 
271 | 
272 | 


--------------------------------------------------------------------------------
/paper/jsslogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markvanderloo/accumulate/1be9146a5f7c2e9b27660b69640290494a6fedf0/paper/jsslogo.jpg


--------------------------------------------------------------------------------
/paper/notes/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | notes.pdf: notes.tex
3 | 	pdflatex notes.tex
4 | 	evince notes.pdf &
5 | 
6 | 


--------------------------------------------------------------------------------
/paper/notes/notes.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[fleqn,11pt]{article}
  2 | \usepackage{amsmath, amssymb}
  3 | \title{notes}
  4 | 
  5 | \begin{document}
  6 | \subsection*{Notation}
  7 | Given a finite population $U=\{1,2,\ldots, N\}$ partitioned in domains
  8 | $\{D_i\}_{i=1}^M$, of sizes $N_i$.  Draw a sample $S\subseteq U$ of size
  9 | $n$ uniformly without replacement from $U$. We have
 10 | 
 11 | $$
 12 | \mathbb{P}(j\in S) = \frac{n}{N}\textrm{ for all } j\in U.
 13 | $$
 14 | 
 15 | Assume we measure a variable $Y$ with realisations $\{y_j|j\in S\}$. We shall
 16 | be interested in estimating the population total $t_Y$, the population mean
 17 | $\mu_Y$ and the totals and means for each domain, denoted $t_{Y|i}$ and $\mu_{Y|i}$
 18 | respectively.
 19 | 
 20 | \subsection*{Horwitz-Thomson estimation}
 21 | The Horwitz-Thomson estimator for the population total is defined as
 22 | $$
 23 | \hat{t}^{\textrm{HT}}_Y = \sum_{j\in S} \frac{1}{\mathbb{P}(j\in S)}y_j
 24 | =\sum_{j\in S}\frac{N}{n}y_j = N\bar{y},
 25 | $$
 26 | where $\bar{y}$ is the sample mean of $Y$.  The HT estimator for a domain $D_i$
 27 | is given by
 28 | $$
 29 | \hat{t}_{Y|i}^{\textrm{HT}} = \sum_{j\in S_i}\frac{N}{n}y_j,\textrm{ with } S_i=D_i\cap S.
 30 | $$
 31 | This estimator has the property that
 32 | $\sum_{i=1}^M\hat{t}_{Y|i}^{\textrm{HT}}=\hat{t}_{Y}^{\textrm{HT}}$, or more
 33 | generally, for any $L\subseteq \{1,2,\ldots,M\}$ we have the additivity
 34 | property $\hat{t}^{\textrm{HT}}_{Y|L} = \sum_{l\in
 35 | L}\hat{t}^{\textrm{HT}}_{Y|l}$.
 36 | 
 37 | 
 38 | For the corresponding estimator of the population mean we get the following.
 39 | $$
 40 | \hat{\mu}_{Y}^{\textrm{HT}} = \frac{1}{N}\hat{t}^{\textrm{HT}}_Y = 
 41 | \frac{1}{n}\sum_{j\in S}y_j = \bar{y}.
 42 | $$
 43 | For the domain means we define
 44 | $$
 45 | \hat{\mu}_{Y|i}^{\textrm{HT}} = \frac{1}{N_i}\hat{t}^{\textrm{HT}}_{Y|i}
 46 | = \frac{N}{N_i n}\sum_{j\in S_i}y_j.
 47 | $$
 48 | We can rewrite this by observing that $\mathbb{E}(n_i)=\frac{N_in}{N}$ to get
 49 | $$
 50 | \hat{\mu}_{Y|i}^{\textrm{HT}} = \frac{1}{\mathbb{E}(n_i)}\sum_{j\in S_i}y_j
 51 | =\frac{n_i}{\mathbb{E}(n_i)}\bar{y}_i,
 52 | $$
 53 | where $\bar{y}_i$ is the sample mean of $Y$ observed in $S_i$.
 54 | 
 55 | \subsection*{The BARE estimator for undersampled domains}
 56 | In cases where $n_i$ is small, the estimation variance may be unacceptably
 57 | large.  A variance-bias tradeoff can be made by including units outside $S_i$
 58 | in the estimates for $S_i$. This is called \emph{Small Area Estimation} (SAE).
 59 | In SAE jargon, this is called `borrowing statistical strength' from other
 60 | domains.
 61 | 
 62 | One of the simplest estimators is the Broader Area Ratio Estimator (BARE). The
 63 | idea is to estimate the total for a broader domain that includes $D_i$ and then
 64 | disaggregate the total proportional to the domain size $N_i$. Choosing the
 65 | whole population $U$ for the broader we get the following.
 66 | $$
 67 | \hat{t}^{\textrm{BARE}}_Y = \frac{N_i}{N}\hat{t}^{\textrm{HT}}_Y = N_i\bar{y}.
 68 | $$
 69 | (Note that $\hat{y}$ is the mean, taken over the whole sample $S$). And for the
 70 | mean:
 71 | $$
 72 | \hat{\mu}^{\textrm{BARE}}_Y  = \frac{1}{N_i}\hat{t}^{\textrm{BARE}}_Y  = \bar{y}.
 73 | $$
 74 | Thus, the BARE estimator for $\mu_{Y|i}$ for a domain $i$ is given by the
 75 | estimated mean of the population.
 76 | 
 77 | More generally, suppose that $L\subseteq \{1,2,\ldots,M\}$; choose a $D_i$
 78 | with $i\in L$, and write $S_L = \cup_{l\in L}{S_l}$, then
 79 | %
 80 | $$
 81 | \hat{t}^{\textrm{BARE}}_{Y|i}(L) 
 82 | = \frac{N_i}{N_L}\hat{t}^{\textrm{HT}}_{Y|L}
 83 | =\frac{N_i}{N_L}\sum_{j\in S_L}\frac{N}{n}y_l 
 84 | = \frac{N}{N_L n}N_i n_L\bar{y}_L.
 85 | $$ 
 86 | Noting furthermore that $\mathbb{E}(n_L)=N_Ln/N$ we get
 87 | $$
 88 | \hat{t}^{\textrm{BARE}}_{Y|i}(L)= N_i\frac{n_L}{\mathbb{E}(n_L)}\bar{y}_L,
 89 | = N_i \hat{\mu}^{\textrm{HT}}_{Y|L}.
 90 | $$
 91 | Finally, we get for the mean
 92 | $$
 93 | \hat{\mu}^{\textrm{BARE}}_{Y|i}(L) 
 94 | = \frac{1}{N_i}\hat{t}^{\textrm{BARE}}_{Y|i}(L)
 95 | = \hat{\mu}^{\textrm{HT}}_{Y|L}
 96 | $$
 97 | 
 98 | If we assume the approximation $n_L/\mathbb{E}(n_L)\approx 1$ we get
 99 | $$
100 | \hat{\mu}^{\textrm{BARE}}_{Y|i}(L)\approx \bar{y}_L.
101 | $$
102 | Note that in the limit where $L=\{1,2,\ldots M\}$, i.e. the case where $S_L$
103 | equals the full sample $S$, the approximation becomes an equality. It is not in
104 | true in general that the approximation improves when $L$ is increased. 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | \end{document}
118 | 


--------------------------------------------------------------------------------
/paper/orcidlink.sty:
--------------------------------------------------------------------------------
 1 | %%
 2 | %% This is file `orcidlink.sty',
 3 | %% generated with the docstrip utility.
 4 | %%
 5 | %% The original source files were:
 6 | %%
 7 | %% orcidlink.dtx  (with options: `package')
 8 | %% 
 9 | %% This is a generated file.
10 | %% 
11 | %% Copyright (C) 2020 by Leo C. Stein <leo.stein@gmail.com>
12 | %% --------------------------------------------------------------------------
13 | %% This work may be distributed and/or modified under the
14 | %% conditions of the LaTeX Project Public License, either version 1.3
15 | %% of this license or (at your option) any later version.
16 | %% The latest version of this license is in
17 | %%   http://www.latex-project.org/lppl.txt
18 | %% and version 1.3 or later is part of all distributions of LaTeX
19 | %% version 2005/12/01 or later.
20 | %% 
21 | \NeedsTeXFormat{LaTeX2e}[1994/06/01]
22 | \ProvidesPackage{orcidlink}
23 |     [2021/06/11 v1.0.4 Linked ORCiD logo macro package]
24 | 
25 | %% All I did was package up Milo's code on TeX.SE,
26 | %% see https://tex.stackexchange.com/a/445583/34063
27 | \RequirePackage{hyperref}
28 | \RequirePackage{tikz}
29 | 
30 | \ProcessOptions\relax
31 | 
32 | \usetikzlibrary{svg.path}
33 | 
34 | \definecolor{orcidlogocol}{HTML}{A6CE39}
35 | \tikzset{
36 |   orcidlogo/.pic={
37 |     \fill[orcidlogocol] svg{M256,128c0,70.7-57.3,128-128,128C57.3,256,0,198.7,0,128C0,57.3,57.3,0,128,0C198.7,0,256,57.3,256,128z};
38 |     \fill[white] svg{M86.3,186.2H70.9V79.1h15.4v48.4V186.2z}
39 |                  svg{M108.9,79.1h41.6c39.6,0,57,28.3,57,53.6c0,27.5-21.5,53.6-56.8,53.6h-41.8V79.1z M124.3,172.4h24.5c34.9,0,42.9-26.5,42.9-39.7c0-21.5-13.7-39.7-43.7-39.7h-23.7V172.4z}
40 |                  svg{M88.7,56.8c0,5.5-4.5,10.1-10.1,10.1c-5.6,0-10.1-4.6-10.1-10.1c0-5.6,4.5-10.1,10.1-10.1C84.2,46.7,88.7,51.3,88.7,56.8z};
41 |   }
42 | }
43 | 
44 | %% Reciprocal of the height of the svg whose source is above.  The
45 | %% original generates a 256pt high graphic; this macro holds 1/256.
46 | \newcommand{\@OrigHeightRecip}{0.00390625}
47 | 
48 | %% We will compute the current X height to make the logo the right height
49 | \newlength{\@curXheight}
50 | 
51 | \DeclareRobustCommand\orcidlink[1]{%
52 | \texorpdfstring{%
53 | \setlength{\@curXheight}{\fontcharht\font`X}%
54 | \href{https://orcid.org/#1}{\XeTeXLinkBox{\mbox{%
55 | \begin{tikzpicture}[yscale=-\@OrigHeightRecip*\@curXheight,
56 | xscale=\@OrigHeightRecip*\@curXheight,transform shape]
57 | \pic{orcidlogo};
58 | \end{tikzpicture}%
59 | }}}}{}}
60 | 
61 | \endinput
62 | %%
63 | %% End of file `orcidlink.sty'.
64 | 


--------------------------------------------------------------------------------
/paper/reviews/D- 5097 post comments.txt:
--------------------------------------------------------------------------------
  1 | JSS 5097:
  2 | Split-Apply-Combine with Dynamic Grouping
  3 | Mark P.J. van der Loo
  4 | ---------------------------------------------------------
  5 | For further instructions on JSS style requirements please see the Style Files (in particular section 2.1 Style Checklist of jss.pdf) and FAQ at http://www.jstatsoft.org/about/submissions.
  6 | 
  7 | For further examples please see RECENT JSS papers.
  8 | ---------------------------------------------------------
  9 | 
 10 | Manuscript style comments:
 11 | 
 12 | 
 13 | o \section, \subsection, etc. should be in sentence style (see http://www.jstatsoft.org/about/submissions), e.g.,
 14 | 2. Dynamic Grouping
 15 | 3. R Package accumulate
 16 | 4. Extensive Example: Economic Data
 17 | 5. Formal Description and Algorithms
 18 | 3.1. The Formula Interface
 19 | 3.2. The Data Frame Interface
 20 | 3.3. Specifying Tests
 21 | 3.4. Balanced and Unbalanced Hierarchical Classiﬁcations
 22 | 4.1. Small Area Estimation
 23 | 4.2. Imputing Missing Values Using SAE and Ratio Imputation
 24 | 4.3. Random Nearest Neighbours Imputation with Collapsing Groups
 25 | 4.4. Computing Complex Aggregates
 26 | 
 27 | ==> DONE
 28 | 
 29 | 
 30 | o For the code layout in R publications, we typically distinguish input/output
 31 | using Sinput/Soutput (or equivalently CodeInput/CodeOutput). Unless there are
 32 | special reasons to format it differently, the input should use the text width
 33 | (up to 76 or 77 characters) and be indented by two spaces, e.g., Do not start
 34 | each new line with a comma
 35 | 
 36 | begin{Sinput}
 37 | R> example_model <- lm(response ~ variable1 + variable2 + variable3, 
 38 | +    weights = w, data = mydata)
 39 | \end{Sinput}
 40 | 
 41 | > cumulate(input, collapse = A*B ~ A*B1 + A
 42 | +
 43 | 
 44 | >= 3,
 45 | > input$Y2 <- 11:19
 46 | 
 47 | > accumulate(input, collapse = A*B ~ A*B1 + A
 48 | +
 49 | 
 50 | > accumulate(input[-3], collapse = A*B ~ A
 51 | +
 52 | 
 53 | > input1 <-
 54 | > input1$AB <- paste(input$A, input$B, sep = "-"
 55 | > input1 <- input1[-(1:3)
 56 | > input1
 57 | 
 58 | ==> DONE
 59 | 
 60 | o Code should have enough spaces to facilitate reading.  Please include spaces before and after operators and after commas (unless spaces have syntactical meaning).
 61 | 
 62 | ==> DONE
 63 | 
 64 | o If using "e.g." and "i.e." add a comma after the period to keep LaTeX from interpreting them as the end of a sentence, i.e.: "e.g., " and "i.e., "
 65 | 
 66 | ==> DONE
 67 | 
 68 | 
 69 | o The rule for capitalizing the starting letters of Figure, Section and Table
 70 | is as follows: If you are referring to a particular figure/section/table then
 71 | capitalize the first letter, otherwise use a lower-case first letter. For
 72 | example, something shown in Section 4 vs. there are three sections in this
 73 | paper. All of your figures and tables should be in the dedicated environment,
 74 | with proper captions and labels 
 75 | 
 76 | \begin{figure}
 77 | Figure
 78 | \caption{}
 79 | \label{}
 80 | \end{figure}
 81 | 
 82 | ==> DONE. In particular, put the colorded tables in a figure environment and referred to the Figure.
 83 | 
 84 | o Figures, tables and equations should be marked with a \label and referred to by \ref, e.g., Figure~\ref{...}.
 85 | 
 86 | ==> DONE
 87 | 
 88 | o All captions should appear below the corresponding figure/table. The captions
 89 | should be in sentence style and end with a period.  No additional formatting
 90 | (such as \emph, \bf or \it) should be used for the caption.
 91 | 
 92 | ==> DONE
 93 | 
 94 | o All table row/column headers should also be in sentence style. There should
 95 | not be further footnote-style annotations in tables; these should all be placed
 96 | in the caption.
 97 | 
 98 | ==> DONE
 99 | 
100 | o Equations should be marked with a \label and referred to by either
101 | 	Equation~\ref{...} (with capitalization, without parentheses)
102 | 		or
103 | 	(\ref({...})
104 | with the former being preferred if the number of equation references is not too large.
105 | 
106 | ==> DONE (I only number equations actually referred to, the others are used in running text)
107 | 
108 | o Abbreviations should be spelled in upper-case letters without additional
109 | formatting (i.e., without periods, without small caps, italics, etc.).  All
110 | abbreviations should be introduced with their expansion where the expansion
111 | should not be capitalized.
112 | 
113 | ==> DONE (NACE, ISIC and ESCO)
114 | 
115 | o As a reminder, please make sure that:
116 |   - \proglang, \pkg and \code have been used for highlighting throughout the
117 |     paper (including titles and references), except where explicitly escaped.
118 | 
119 | ==> DONE
120 | 
121 | References:
122 | 
123 | 
124 | 
125 | o As a reminder,
126 | 	- Please make sure that all software packages are \cite{}'d properly.
127 | 
128 | 	- All references should be in title style.
129 | 
130 | 	- See FAQ for specific reference instructions.
131 | 
132 | ==> DONE
133 | 
134 | Code:
135 | 
136 | o As a reminder, please make sure that the files needed to replicate all
137 | code/examples within the manuscript are included in a standalone replication
138 | script.
139 | 
140 | ==> DONE
141 | 


--------------------------------------------------------------------------------
/paper/setup.R:
--------------------------------------------------------------------------------
1 | # to setup your code before running article.R, use on the R prompt
2 | install.packages("accumulate_0.9.0.tar.gz")
3 | 


--------------------------------------------------------------------------------
/pkg/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: accumulate
 2 | Type: Package
 3 | Title: Split-Apply-Combine with Dynamic Groups
 4 | Version: 1.0.0
 5 | Authors@R: c( person("Mark", "van der Loo", role=c("aut","cre")
 6 |              , email="mark.vanderloo@gmail.com"
 7 |              , comment= c(ORCID="0000-0002-9807-4686")))
 8 | Maintainer: Mark van der Loo <mark.vanderloo@gmail.com>
 9 | Description: Estimate group aggregates, where one can set user-defined conditions
10 |    that each group of records must satisfy to be suitable for aggregation. If
11 |    a group of records is not suitable, it is expanded using a collapsing scheme
12 |    defined by the user. A paper on this package was published in the Journal
13 |    of Statistical Software <doi:10.18637/jss.v112.i04>.
14 | License: EUPL
15 | URL: https://github.com/markvanderloo/accumulate
16 | LazyData: TRUE
17 | VignetteBuilder: simplermarkdown
18 | Depends: R (>= 3.5.0)
19 | Suggests: tinytest, simplermarkdown, validate
20 | Encoding: UTF-8
21 | RoxygenNote: 7.3.2
22 | 


--------------------------------------------------------------------------------
/pkg/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method("[",object_list)
 4 | S3method(format,object_list)
 5 | S3method(print,object_list)
 6 | export(accumulate)
 7 | export(csh_from_digits)
 8 | export(cumulate)
 9 | export(frac_complete)
10 | export(from_validator)
11 | export(min_complete)
12 | export(min_records)
13 | export(object_list)
14 | export(smoke_test)
15 | 


--------------------------------------------------------------------------------
/pkg/NEWS:
--------------------------------------------------------------------------------
 1 | version 1.0.0
 2 | - updated style file for markdown vignette
 3 | - added citation for JSS article doi:10.18637/jss.v112.i04
 4 | 
 5 | version 0.9.3
 6 | - Fixed example for 'from_validator' in documentation.
 7 | - Better checking of formula specification of collapsing scheme.
 8 | - Improvements in documentation (thanks to the JSS editorial 
 9 |   team for pointing them out).
10 | 
11 | vesion 0.9.0
12 | - Package now supports complex objects as aggregation output
13 | - Internal: complete code reorganization
14 | 
15 | version 0.8.1
16 | - Added URL to description
17 | - Fixed typo in DESCRIPTION (thanks to CRAN team)
18 | 
19 | version 0.8.0
20 | - First CRAN release attempt
21 | 
22 | 


--------------------------------------------------------------------------------
/pkg/R/accumulate.R:
--------------------------------------------------------------------------------
  1 | #' Split-Apply-Combine with Collapsing Groups
  2 | #'
  3 | #' Compute grouped aggregates. If a group does not satisfy certain user-defined
  4 | #' conditions (such as too many missings, or not enough records) then the group
  5 | #' is expanded according to a user-defined 'collapsing' scheme.  This happens
  6 | #' recursively until either the group satisfies all conditions and the
  7 | #' aggregate is computed, or we run out of collapsing possibilities and the
  8 | #' \code{NA} is returned for that group. 
  9 | #' \itemize{
 10 | #'  \item{\code{accumulate} aggregates over all non-grouping variables defined in 
 11 | #'  \code{collapse}} 
 12 | #' \item{\code{cumulate} uses a syntax akin to \code{dplyr::summarise}}
 13 | #' }
 14 | #'
 15 | #'
 16 | #' @param data \code{[data.frame]} The data to aggregate by (collapsing) groups.
 17 | #' @param collapse \code{[formula|data.frame]} representing a group collapsing sequence.
 18 | #'        See below for details on how to specify each option.
 19 | #' @param test \code{[function]} A function that takes a subset of \code{data} and returns
 20 | #'             \code{TRUE} if it is suitable for computing the desired aggregates and 
 21 | #'             \code{FALSE} if a collapsing step is necessary.
 22 | #' @param fun \code{[function]} A scalar function that will be applied to all columns
 23 | #'            of \code{data}.
 24 | #' @param ... For \code{accumulate}, extra arguments to be passed to \code{fun}. For
 25 | #'            \code{cumulate}, a comma-separated list of \code{name=expression}, 
 26 | #'            where \code{expression} defines the aggregating operation.
 27 | #'
 28 | #' @section Using a formula to define the collapsing sequence:
 29 | #'
 30 | #' If all combinations of collapsing options are stored as columns in
 31 | #' \code{data}, the \code{formula} interface can be used. An example is the
 32 | #' easiest way to see how it works.  Suppose that \code{collapse = A*B ~ A1*B +
 33 | #' B} This means:
 34 | #' \itemize{
 35 | #'  \item{Compute output for groups defined by variables A and B}
 36 | #'  \item{If for a certain combination \code{(a,b)} in \code{AxB} the data does not
 37 | #'        pass the \code{test}, use \code{(a1,b)} in \code{A1xB} as alternative combination to compute
 38 | #'        a value for \code{(a,b)} (\code{A1xB} must yield larger groups than \code{AxB}).}
 39 | #'  \item{If that does not work, use only \code{B} as a grouping variable to compute
 40 | #'        a value for \code{(a,b)}}.
 41 | #'  \item{If that does not work, return \code{NA} for that particular combination \code{(a,b)}.}
 42 | #' }
 43 | #' Generally, the \code{formula} must be of the form \code{X0 ~ X1 + X2 + ... +
 44 | #' Xn} where each \code{Xi} is a (product of) grouping variable(s) in the data set.
 45 | #'
 46 | #' @section Using a data frame to define the collapsing scheme:
 47 | #'
 48 | #' In this case \code{collapse} is a data frame with columns \code{[A0, A1,
 49 | #' ..., An]}.  The variable \code{A0} represents the most fine-grained
 50 | #' grouping and must also be present in \code{data}. Aggregation works
 51 | #' as follows.
 52 | #' \itemize{
 53 | #'   \item{Compute output for groups defined by variable \code{A0}}
 54 | #'   \item{If for a certain \code{a0} in \code{A0} the corresponding selected
 55 | #'         data does not pass the \code{test}, use the larger dataset corresponding to
 56 | #'         \code{a1} in \code{A1} to compute output for \code{a1}}.
 57 | #'   \item{Repeat the second step until either the \code{test} is passed or 
 58 | #'         no more collapsing is possible. In the latter case, return \code{NA}
 59 | #'         for that particular value of \code{a0}}.
 60 | #' }
 61 | #' 
 62 | #' 
 63 | #'
 64 | #' @return
 65 | #' A data frame where each row represents a (multivariate) group.  The first
 66 | #' columns contain the grouping variables. The next column is called
 67 | #' \code{level} and indicates to what level collapsing was necessary to compute
 68 | #' a value, where 0 means that no collapsing was necessary. The following
 69 | #' colummns contain the aggregates defined in the \code{...} argument.  If no
 70 | #' amount of collapsing yields a data set that is satisfactory according to
 71 | #' \code{test}, then for that row, the \code{level} and subsequent columns are
 72 | #' \code{NA}.
 73 | #'
 74 | #' @references
 75 | #' MPJ van der Loo (2025) \emph{Split-Apply-Combine with Dynamic Grouping}
 76 | #' Journal of Statistical Software \code{doi:10.18637/jss.v112.i04}. 
 77 | #' 
 78 | #' @examples
 79 | #'
 80 | #' ## Example of data frame defining collapsing scheme, using accumulate
 81 | #'
 82 | #' input    <- data.frame(Y1 = 2^(0:8), Y2 = 2^(0:8))
 83 | #' input$Y2[c(1,4,7)] <- NA
 84 | #' # make sure that the input data also has the most fine-graind (target)
 85 | #' # grouping variable
 86 | #' input$A0 <- c(123,123,123,135,136,137,212,213,225)
 87 | #'
 88 | #' # define collapsing sequence
 89 | #' collapse <- data.frame(
 90 | #'      A0   = c(123, 135, 136, 137, 212, 213, 225)
 91 | #'    , A1   = c(12 , 13 , 13 , 13 , 21 , 21 , 22 )
 92 | #'    , A2   = c(1  , 1  , 1  , 1  , 2  , 2  , 2  )
 93 | #' )
 94 | #'
 95 | #' accumulate(input
 96 | #'  , collapse
 97 | #'  , test = function(d) nrow(d)>=3
 98 | #'  , fun  = sum, na.rm=TRUE)
 99 | #'
100 | #'
101 | #' ## Example of formula defining collapsing scheme, using cumulate
102 | #' input <- data.frame(
103 | #'    A  = c(1,1,1,2,2,2,3,3,3)
104 | #'  , B  = c(11,11,11,12,12,13,21,22,12)
105 | #'  , B1 = c(1,1,1,1,1,1,2,2,1)
106 | #'  , Y  = 2^(0:8)
107 | #' )
108 | #' cumulate(input, collapse=A*B ~ A*B1 + A
109 | #'         , test = function(d) nrow(d) >= 3
110 | #'         , tY = sum(Y))
111 | #'
112 | #'
113 | #' ## Example with formula defining collapsing scheme, using accumulate
114 | #' # The collapsing scheme must be represented by variables in the 
115 | #' # data. All columns not part of the collapsing scheme will be aggregated
116 | #' # over.
117 | #'
118 | #' input <- data.frame(
119 | #'     A  = c(1,1,1,2,2,2,3,3,3)
120 | #'   , B  = c(11,11,11,12,12,13,21,22,12)
121 | #'   , B1 = c(1,1,1,1,1,1,2,2,1)
122 | #'   , Y1 = 2^(0:8)
123 | #'   , Y2 = 2^(0:8)
124 | #' )
125 | #'
126 | #' input$Y2[c(1,4,7)] <- NA
127 | #'
128 | #' accumulate(input
129 | #'  , collapse = A*B ~ A*B1 + A
130 | #'  , test=function(a) nrow(a)>=3
131 | #'  , fun = sum, na.rm=TRUE)
132 | #'
133 | #'
134 | #'
135 | #' ## Example with data.frame defining collapsing scheme, using cumulate
136 | #' dat <- data.frame(A0 = c("11","12","11","22"), Y = c(2,4,6,8))
137 | #' # collapsing scheme
138 | #' csh <- data.frame(
139 | #'    A0 = c("11","12","22")
140 | #'  , A1 = c("1" ,"1", "2") 
141 | #' )
142 | #' cumulate(data = dat
143 | #'    , collapse = csh
144 | #'    , test     = function(d) if (nrow(d)<2) FALSE else TRUE
145 | #'    , mn = mean(Y, na.rm=TRUE)
146 | #'    , md = median(Y, na.rm=TRUE)
147 | #' )
148 | #'
149 | #' @export
150 | accumulate <- function(data, collapse, test, fun, ...){
151 |   compute   <- get_ag(collapse, fun, names(data), ...)
152 |   work(data, collapse, test, compute)
153 | }
154 | 
155 | 
156 | #' @rdname accumulate
157 | #' @export
158 | cumulate <- function(data, collapse, test, ...){
159 |   exprs   <- as.list(substitute(list(...))[-1])
160 |   compute <- get_ag(collapse, exprs, names(data))
161 |   work(data, collapse, test, compute)
162 | }
163 | 
164 | 
165 | work <- function(data, collapse, test, compute){
166 | 
167 |   pullback  <- get_pb(collapse, data)
168 |   jmax      <- max_collapse(collapse)
169 |   grpvars   <- groups(collapse)
170 | 
171 |   out       <- output_backbone(collapse, data)
172 |   R         <- output_template(nrow(out), collapse, compute)
173 |   for ( ia in seq_len(nrow(out)) ){
174 |     j <- 0
175 |     out_level <- out[ia, grpvars, drop=FALSE]
176 |     d <- pullback(out_level, j)
177 |     while ( j < jmax && !test(d) ){
178 |       j <- j + 1
179 |       d <- pullback(out_level,j)
180 |     }
181 |     if ( j < jmax || test(d) ){
182 |       R[[ia]] <- compute(d)
183 |       out$level[ia] <- j
184 |     }
185 |   }
186 |   combine(out, R, compute)
187 | }
188 | 
189 | 
190 | # check if the argument is of a basic R type and of length 1.
191 | is_scalar <- function(x){
192 |   length(x) == 1 && (
193 |   is.numeric(x) || 
194 |   is.logical(x) || 
195 |   is.character(x) || 
196 |   is.factor(x) || 
197 |   is.ordered(x) || 
198 |   is.raw(x) || 
199 |   is.complex(x))
200 | }
201 | 
202 | 
203 | # In 'work()', results are created in the form of a list, with one entry
204 | # per output group. Each entry may be a vector of aggregates, if the aggregates
205 | # are all 'atomic' and of the same type. Otherwise, each entry is a list.
206 | # 
207 | # The purpose of this function is to take such a row-wise list of the form
208 | # [
209 | #  [X = x1, Y = y1]
210 | #  [X = x2, Y = y2]
211 | # ]
212 | # to a data.frame of theform
213 | # X   Y
214 | # x1  y1
215 | # x2  y2
216 | #
217 | # where the columns may be lists, when the objects in them are more
218 | # complicated then simple scalars (see is_atomic).
219 | #
220 | # Input:
221 | # backbone: a data.frame where each row contains output group labels
222 | # results : a list of results per group: nrow(backbone) == length(results)
223 | # ag      : the agregation function used to create the values in 'results'
224 | #
225 | # Output:
226 | # A data frame, of the form [backbone, results].
227 | #
228 | combine <- function(backbone, results, ag){
229 |   # The simple case: aggregates are atomic; this probably covers
230 |   # most cases.
231 |   if (!any(sapply(results, is.list))){
232 |     return( cbind(backbone, do.call("rbind", results)) )
233 |   }
234 | 
235 |   # The complexer case: (some) aggregates are lists. 
236 |   L <- lapply(attr(ag,"outnames"), function(var){
237 |     # check if a column can be simplified, if so: do that
238 |     if ( all(sapply(results, function(x) is_scalar(x[[var]]))) ) {
239 |       sapply(results, `[[`,var)
240 |     # otherwise, combine in a list
241 |     } else {
242 |       lapply(results, `[[`,var)
243 |     }
244 |   })
245 |   names(L) <- names(results[[1]])
246 |   # make listcol objects of columns that are lists, so they are 
247 |   # printed better.
248 |   L <- lapply(L, function(x) if (!is.list(x)) x else object_list(x))
249 |   for ( x in names(L) ) backbone[[x]] <- L[[x]]
250 |   backbone
251 | }
252 | 
253 | 
254 | 
255 | ##### Entrails ####
256 | 
257 | # get_pb: create pullback function.
258 | # x  : formula or data.frame
259 | # dat: data to create inverse mapping for.
260 | # out: function accepting a single data frame row with one or more group labels 
261 | #      and a collapse level. It returns a subset of 'dat'.
262 | get_pb <- function(x, dat){
263 |   # Poor man's dispatch since we don't want to export this function
264 |   t <- last(class(x))
265 |   switch(t
266 |     , "data.frame" =  get_pb.data.frame(x,dat)
267 |     , "formula"    =  get_pb.formula(x, dat)
268 |     , stop("'x' must be a data frame or a formula")
269 |   )
270 | 
271 | }
272 | 
273 | 
274 | get_pb.data.frame <- function(x, dat){
275 |   group_labels <- dat[,colnames(x)[1]]
276 |   rownames(x) <- x[,1]
277 |   
278 |   # group: a 1x1 data frame with a single group label 
279 |   # level: collapse level (column index-1 in x)
280 |   function(group, level){ 
281 |     i <- group_labels %in% x[x[,level+1] == x[as.character(unclass(group)), level+1],1]
282 |     dat[i, , drop=FALSE]
283 |   }
284 | 
285 | }
286 | 
287 | get_pb.formula <- function(x, dat){
288 |   
289 |   collapse <- c(x[[2]], get_collapse(x[[3]]))
290 | 
291 |   labels <- dat[all.vars(x)]
292 |   # add column with row numbers; make sure column name does not exist already.
293 |   iname <- "__idx__"
294 |   n <- 0
295 |   while( iname %in% names(labels) ){ 
296 |       n <- n + 1
297 |       iname <- sprintf("%s%i",iname,n)
298 |   }
299 |   labels[,iname] <- seq_len(nrow(labels))
300 |  
301 |   function(group, level){
302 |     d <- merge(group, labels, by=names(group))
303 |     collapse_value <- d[1, all.vars(collapse[[level+1]]),drop=FALSE]
304 |     i <- merge(labels, collapse_value, by=names(collapse_value))[,iname]
305 |     dat[sort(i),,drop=FALSE]
306 |   }
307 | 
308 | }
309 | 
310 | # Accepts a formula 'e' of the form
311 | # P1 ~ P2 + P3 + ... + Pn, where each Pj is a single variable
312 | # name or a product of variable names. 
313 | # The return value is a list [P2, P3,...,Pn] of subformulae.
314 | get_collapse <- function(e, L = list()){
315 |   if (length(e) == 1 || e[[1]]=="*") return(append(L, e))
316 |   c(get_collapse(e[[2]],L), get_collapse(e[[3]],L))
317 | }
318 | 
319 | 
320 | # Ellipsis currying.
321 | # Accepts a function fun, and returns a 1-argument function
322 | # that has all arguments in ... fixed.
323 | curry <- function(fun,...){
324 |   L <- list(NULL, ...)
325 |   function(x) {
326 |     L[[1]] <- x
327 |     do.call(fun,L)
328 |   }
329 | }
330 | 
331 | # Create aggregator function
332 | # cps: collapsing scheme (data frame or formula)
333 | # x: aggregating function or object of class 'expression'
334 | # dnames: names of the input data.frame
335 | # ...: extra arguments to be passed to 'x', if it is a function.
336 | #
337 | # Output:
338 | # If x is a function:
339 | #   A function f(dat,..._ that applies x to every non-grouping column 
340 | #   in 'dat'  and returs a named vector with results.
341 | # If x is of type 'expression':
342 | #   A function f(dat) that evaluates each espression in the context of
343 | #   dat and returns a named vector with results.
344 | #
345 | # The output function has an attribute 'outnames' containing 
346 | # the names of the output variables.
347 | get_ag <- function(cps, x, dnames, ...){
348 |   if ( inherits(cps,"formula") && !ok_formula(cps) ){
349 |     stopf("Invalid formula: '%s' ", deparse(cps))
350 |   }
351 |   ag <- if ( is.function(x) ){
352 |     f <- curry(x, ...)
353 |     # grouping variables 'gv' are not to be aggregated over
354 |     gv <- if(inherits(cps,"formula")) all.vars(cps) else colnames(cps)
355 |     function(dat) sapply(dat[ ,!colnames(dat) %in% gv, drop=FALSE], f)
356 |   } else {
357 |     function(dat) lapply(x, function(e) with(dat,eval(e)))
358 |   }
359 |   outnames <- if (!is.function(x)) names(x)
360 |               else if (inherits(cps,"formula")) dnames[!dnames %in% all.vars(cps)]
361 |               else dnames[!dnames %in% names(cps)]
362 |   attr(ag, "outnames") <- outnames 
363 |   ag
364 | }
365 | 
366 | 
367 | # get relevant group combinations
368 | output_backbone <- function(cps, dat){
369 |   out <- if (inherits(cps,"formula")){
370 |     unique(dat[all.vars(cps[[2]])])
371 |   } else { # cps is a data.frame
372 |     unique(dat[,names(cps)[1],drop=FALSE])
373 |   }
374 |   out$level <- NA_integer_
375 |   out
376 | }
377 | 
378 | # get maximum number of collapsing steps (base 0).
379 | max_collapse <- function(cps){
380 |   if (inherits(cps,"formula")) length(get_collapse(cps[[3]])) else ncol(cps) - 1
381 | }
382 | 
383 | # Get the variable names for the desired grouping from the collapsing scheme.
384 | groups <- function(cps) {
385 |   if (inherits(cps,"formula")) all.vars(cps[[2]]) else names(cps)[1]
386 | }
387 | 
388 | # set up a list where we can store the output aggregate vectors. Seed with 
389 | # NA's for groups that after jmax collapses do not pass the test (and hence
390 | # will not yield an aggregate)
391 | # 
392 | # Input:
393 | # n       : number of output records
394 | # cps     : collapsing scheme (formula or data.frame)
395 | # outnames: column names of output variables
396 | output_template <- function(n, cps, ag){
397 |   vars <- attr(ag,"outnames")
398 |   template <- rep(NA, length(attr(ag,"outnames")))
399 |   names(template) <- vars
400 |   lapply(seq_len(n), function(i) template)
401 | }
402 | 
403 | 
404 | 
405 | 
406 | 
407 | 


--------------------------------------------------------------------------------
/pkg/R/helpers.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Demand minimal number of records 
  4 | #' 
  5 | #' @param n Minimal number of records in a group.
  6 | #'
  7 | #' @examples
  8 | #'
  9 | #' min_records(5)(women)
 10 | #' min_records(200)(women)
 11 | #'
 12 | #' @return a function that accepts a data frame and returns \code{TRUE}
 13 | #'         when the number of records is larger than or equal to \code{n}
 14 | #'         and otherwise \code{FALSE}.
 15 | #'
 16 | #' @family helpers
 17 | #'
 18 | #' @export
 19 | min_records <- function(n){
 20 |   stopifnot(is.numeric(n), n>=0)
 21 |   min_rec <- n
 22 |   function(d) if (nrow(d) < n) FALSE else TRUE
 23 | }
 24 | 
 25 | #' Demand minimal number of complete records
 26 | #'
 27 | #' @param n Minimal number of records that must be complete
 28 | #' @param vars \code{[TRUE|column index]} Column index into the data to
 29 | #'        be tested (e.g. a character vectod with variable names or
 30 | #'        a numeric vector with column positions). The indexed columns
 31 | #'        will be testsed for completeness (absence of \code{NA}). Be default
 32 | #'        \code{vars=TRUE} meaning that all columns are taken into account.
 33 | #'
 34 | #' @return a function that accepts a data frame and returns \code{TRUE}
 35 | #'         when the number of complete records is larger than or equal to \code{n}
 36 | #'         and otherwise \code{FALSE}.
 37 | #'
 38 | #' @family helpers
 39 | #'
 40 | #' @examples
 41 | #'
 42 | #' f <- min_complete(20)
 43 | #' f(women)  # FALSE (15 records)
 44 | #' f(mtcars) # TRUE (32 records)
 45 | #'
 46 | #' @export
 47 | min_complete <- function(n, vars=TRUE){
 48 |   stopifnot(is.numeric(n), n>=0
 49 |           , isTRUE(vars) || 
 50 |             is.character(vars) || 
 51 |             is.numeric(vars) || 
 52 |             is.logical(vars))
 53 |   min_compl <- n
 54 |   vars <- vars
 55 |   function(d) sum(complete_cases(d[vars])) >= min_compl
 56 | }
 57 | 
 58 | #' Demand minimal fraction of complete records
 59 | #'
 60 | #' @param r Minimal fraction of records that must be complete.
 61 | #' @param vars \code{[TRUE|column index]} Column index into the data to
 62 | #'        be tested (e.g. a character vectod with variable names or
 63 | #'        a numeric vector with column positions). The indexed columns
 64 | #'        will be testsed for completeness (absence of \code{NA}). Be default
 65 | #'        \code{vars=TRUE} meaning that all columns are taken into account.
 66 | #'
 67 | #' @return a function that accepts a data frame and returns \code{TRUE} when the
 68 | #'         fraction of complete records is larger than or equal to \code{n} and
 69 | #'         otherwise \code{FALSE}.
 70 | #'
 71 | #' @family helpers
 72 | #'
 73 | #' @examples
 74 | #'
 75 | #' f <- frac_complete(0.1)
 76 | #' f(mtcars) # TRUE (all complete)
 77 | #' mt <- mtcars
 78 | #' mt[1:5,1] <- NA
 79 | #' f(mt)     # FALSE (5/32 incomplete)
 80 | #'
 81 | #' @export
 82 | frac_complete <- function(r, vars=TRUE){
 83 |   stopifnot(is.numeric(r), 0<=r, r<=1
 84 |           , isTRUE(vars) || 
 85 |             is.character(vars) || 
 86 |             is.numeric(vars) || 
 87 |             is.logical(vars))
 88 |   min_frac <- r
 89 |   vars <- vars
 90 |   function(d) mean(complete_cases(d[vars])) >= r
 91 | }
 92 | 
 93 | #' Use a validate::validator object to define a test
 94 | #'
 95 | #' Create a test function that accepts a data.frame, and returns \code{TRUE}
 96 | #' when the data passes all checks defined in the \code{validator} object, and
 97 | #' otherwise \code{FALSE}.
 98 | #'
 99 | #' @param v \code{[validator]} a validator object from the
100 | #'        \code{validate} package.
101 | #' @param ... options passed to \code{validate::confront}
102 | #'
103 | #' @note
104 | #' Requires the \code{validate} package to be installed.
105 | #'
106 | #' @return a function that accepts a data fram and returns \code{TRUE}
107 | #'         when the data passes all checks in \code{v} and otherwise
108 | #'         \code{FALSE}.
109 | #'
110 | #'
111 | #' @references
112 | #' Mark P. J. van der Loo, Edwin de Jonge (2021). Data Validation
113 | #' Infrastructure for R. Journal of Statistical Software, 97(10), 1-31.
114 | #' doi:10.18637/jss.v097.i10
115 | #'
116 | #' @examples
117 | #'
118 | #' if (requireNamespace("validate", quietly=TRUE)){
119 | #'  v <- validate::validator(height >= 0, weight >= 0)
120 | #'  f <- from_validator(v)
121 | #'  f(women)  # TRUE (all heights and weights are nonnegative)
122 | #' }
123 | #'
124 | #'
125 | #' @export
126 | from_validator <- function(v,...){
127 |   if (!requireNamespace('validate', quietly=TRUE)){
128 |     stop("Could not load the 'validate' package.")
129 |   }
130 |   args <- list(x=v,...)
131 |   function(d) all( do.call(validate::confront, append(list(dat=d),args) ) )
132 | 
133 | }
134 | 
135 | 
136 | 
137 | #' Derive collapsing scheme from a hierarchical classification
138 | #'
139 | #' Derive a collapsing scheme where group labels collapse to their
140 | #' parents in the hierarchy.
141 | #'
142 | #' @param x \code{[character|integer]} labels in a hierarchical classification (lowest level)
143 | #' @param levels \code{[integer >=0]} how many collapsing levels to include. Zero means
144 | #'        only include the original labels.
145 | #'
146 | #' @return A data frame where each consecitive pair of columns represents
147 | #'         one collapsing step induced by the hierarchical classification
148 | #'         encoded by the digits in \code{x}.
149 | #'
150 | #' @examples
151 | #' # balanced hierarchical classification
152 | #' csh_from_digits(c("111","112","121","122","123"))
153 | #' csh_from_digits(c("111","112","121","122","123"),levels=1)
154 | #' 
155 | #' # unbalanced hierarchical classification
156 | #' csh_from_digits(c("111","112","121","122","1221","1222"))
157 | #' csh_from_digits(c("111","112","121","122","1221","1222"),levels=2)
158 | #'
159 | #' @export
160 | csh_from_digits <- function(x, levels=max(nchar(x))-1){
161 |   stopifnot(levels>=0
162 |            , levels < max(nchar(x)))
163 |   x <- as.character(x)
164 |   nlevels <- max(nchar(x))
165 | 
166 |   A <- matrix(NA_character_,nrow=length(x), ncol=nlevels)
167 |   for ( i in seq_len(nlevels)){
168 |     A[,i] <- substr(x,1,nlevels+1-i)
169 |   }
170 |   colnames(A) <- sprintf("A%d",seq_len(nlevels)-1)
171 |   as.data.frame(A)[1:(levels+1)]
172 | }
173 | 
174 | #' Check your testing function against common edge cases
175 | #'
176 | #' Writing a testing function that works on any subset of records of a
177 | #' dataframe can be quite subtle. This function tries the testing function on a
178 | #' number of common (edge) cases that are easily overlooked.  It is \emph{not}
179 | #' a unit test: a smoke test will not tell you whether your output is correct.
180 | #' It only checks the output data type (must be \code{TRUE} or \code{FALSE} and
181 | #' reports if errors, warnings, or messages occur.
182 | #'
183 | #' @param dat an example dataset. For example the full dataset
184 | #'        to be fed into \code{\link{accumulate}} or \code{\link{cumulate}}.
185 | #' @param test A testing function to be passed as argument to \code{\link{accumulate}}
186 | #'        or \code{\link{cumulate}}.
187 | #' @param verbose \code{[logical]} If \code{TRUE}, all results (including
188 | #'        passed tests) are printed.  If \code{FALSE} only failed tests are printed.
189 | #' @param halt \code{[logical]} toggle stopping when an error is thrown
190 | #'
191 | #' @return \code{NULL}, invisibly. This function has as side-effect that test
192 | #' results are printed to screen.
193 | #'
194 | #'
195 | #' @examples
196 | #' dat <- data.frame(x = 1:5, y=(-2):2)
197 | #' smoke_test(dat, function(d) y > 0)   #error: Y not found
198 | #' smoke_test(dat, function(d) d$y > 0) # issue: output too long, not robust against NA
199 | #' smoke_test(dat, function(d) sum(d$y > 0) > 2) # issue: not robust against NA
200 | #' smoke_test(dat, function(d) sum(d$y > 0, na.rm=TRUE) > 2) # OK
201 | #'
202 | #' @export
203 | smoke_test <- function(dat, test, verbose=FALSE, halt=TRUE){
204 |   try_this(dat, test, verbose, info="full dataset") || !halt || return(invisible())
205 |   try_this(dat[1,,drop=FALSE], test, verbose, info="first record") || !halt || return(invisible())
206 |   try_this(dat[0,,drop=FALSE], test, verbose, info="zero records") || !halt || return(invisible())
207 | 
208 |   vars <- colnames(dat)
209 |   for (var in vars){
210 |     d <- dat[1,,drop=FALSE]
211 |     d[1,var] <- NA
212 |     try_this(d, test, verbose
213 |      , info=sprintf("first record and %s is NA",var)) || !halt || return(invisible())
214 |   } 
215 | 
216 |   d <- dat[1,,drop=FALSE]
217 |   d[1,] <- NA
218 |   try_this(d, test, verbose
219 |     , info="first record and all values NA") || !halt || return(invisible()) 
220 | 
221 |   for (var in vars){
222 |     d <- dat
223 |     d[,var] <- NA
224 |     try_this(d, test, verbose
225 |             , info=sprintf("full dataset and %s is NA for all records",var)) || 
226 |       !halt || return(invisible())
227 |   }
228 | 
229 | 
230 |   catf("\n")
231 |   invisible(NULL)
232 |  
233 | }
234 | 
235 | try_this <- function(d, f, verbose, info){
236 |   msg <- character()
237 |   wrn <- character()
238 |   err <- character()
239 |   out <- NULL
240 |   out <- tryCatch(withCallingHandlers(f(d)
241 |           , message = function(m){ msg <<- append(msg, m$message); invokeRestart("muffleMessage")}
242 |           , warning = function(w){ wrn <<- append(wrn, w$message);  invokeRestart("muffleWarning")}
243 |          )
244 |     , error   = function(e) err <<- append(err, e$message)
245 |   )
246 |   print_smoke(list(result=out, msg=msg, wrn=wrn, err=err, info=info), verbose=verbose)
247 |   invisible(length(err) == 0)
248 | }
249 | 
250 | 
251 | print_smoke <- function(x, verbose){
252 |   if (isTRUE(x$result)|| isFALSE(x$result)){
253 |     if(verbose) catf("\nTest with %s: OK", x$info)
254 |   } else {
255 |     rep <- character(0)
256 |     if (length(x$err) == 0){ 
257 |       if (!is.logical(x$result)){
258 |         rep <- c(rep,sprintf("Output is of class %s (must be 'logical')", class(x$result)))
259 |       }
260 |       if ( length(x$result) != 1){
261 |         rep <- c(rep,sprintf("Output has length %d (must be 1)", length(x$result)))
262 |       }
263 |       if ( any(is.na(x$result)) ){
264 |         rep <- c(rep, sprintf("NA detected in output (must be TRUE or FALSE)"))
265 |       }
266 |     }
267 |     if (length(x$msg)>0){
268 |       rep <- c(rep, paste(sprintf("MSG: %s",trimws(x$msg)), collapse="\n") )
269 |     }
270 |     if (length(x$wrn)>0){
271 |       rep <- c(rep, paste(sprintf("WRN: %s",trimws(x$wrn)), collapse="\n") )
272 |     }
273 |     if (length(x$err)>0){
274 |       rep <- c(rep, paste(sprintf("ERR: %s",trimws(x$err)), collapse="\n") )
275 |     }
276 |     report <- paste(rep, collapse="\n  ")
277 |     catf("\nTest with %s raised issues.\n\r   %s",x$info, report)
278 |   }
279 | }
280 | 
281 | 
282 | 
283 | 
284 | 


--------------------------------------------------------------------------------
/pkg/R/producers.R:
--------------------------------------------------------------------------------
 1 | #' @name producers
 2 | #' @title Synthetic data on producers
 3 | #' @description 
 4 | #'   A synthetic dataset listing several sources of turnover
 5 | #'   and other income for producers. The producers are classified
 6 | #'   in size classes and SBI (a refinement of NACE). Load with \code{data(producers)}.
 7 | #'
 8 | #'
 9 | #' \itemize{
10 | #'  \item  sbi: Classification of economic activity (refinement of NACE2008)
11 | #'  \item  size: Size class in 0 (smallest) to 9.
12 | #'  \item  industrial: Turnover from industrial activities.
13 | #'  \item  trade: Turnover from trade
14 | #'  \item  other: Turnover from other activities
15 | #'  \item  other_income: Income not from turnover (e.g. from financial transactions)
16 | #'  \item  total: Rowwise sum of indistrial, trade, and other turnover and other income.
17 | #'}
18 | #'
19 | #' @family datasets
20 | #'
21 | #' @docType data
22 | #' @format A \code{.rda} file, one producer per row.
23 | NULL
24 | 
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/pkg/R/utils.R:
--------------------------------------------------------------------------------
 1 | 
 2 | catf  <- function(fmt,...) cat(sprintf(fmt,...))
 3 | stopf <- function(fmt,...) stop(sprintf(fmt,...), call.=FALSE)
 4 | 
 5 | last <- function(x) x[length(x)]
 6 | 
 7 | complete_cases <- function(d) !Reduce(`|`, lapply(d, is.na))
 8 | 
 9 | #' Create a classed list
10 | #'
11 | #' Classed lists are used to pretty-print a list that is stored
12 | #' in a data frame.
13 | #'
14 | #' @param x a list
15 | #'
16 | #' @keywords internal
17 | #'
18 | #' @examples
19 | #' object_list(list(lm(speed ~ dist, data=cars)))
20 | #'
21 | #' @export
22 | #' @keywords internal
23 | object_list <- function(x) structure(x, class=c("object_list","list"))
24 | 
25 | #' @rdname object_list
26 | #' @export
27 | #' @keywords internal
28 | format.object_list <- function(x,...){
29 |   sapply(x, function(u) sprintf("<%s>",paste(class(u),collapse=",")))
30 | }
31 | 
32 | #' @rdname object_list
33 | #' @export
34 | #' @keywords internal
35 | print.object_list <- function(x,...) print(format.object_list(x,...))
36 | 
37 | #' @rdname object_list
38 | #' @export
39 | #' @keywords internal
40 | `[.object_list` <- function(x,i,j,...,drop=TRUE){
41 |   object_list(unclass(x)[i])
42 | }
43 | 
44 | 
45 | # check whether formula follows the allowed syntax
46 | ok_formula <- function(x){
47 |   x[[1]] == "~" && is_product(x[[2]]) && is_sum(x[[3]])
48 | }
49 | 
50 | is_sum <- function(x){
51 |   length(x)==1 || is_product(x) || (x[[1]] == "+" &&
52 |     (length(x[[2]]) == 1 || is_sum(x[[2]]) || is_product(x[[2]])) &&
53 |     (length(x[[3]]) == 1 || is_sum(x[[3]]) || is_product(x[[3]])))
54 | }
55 | 
56 | is_product <- function(x){
57 |   length(x) == 1 || (x[[1]] == "*" &&
58 |   (length(x[[2]] == 1) || is_product(x[[2]])) &&
59 |   (length(x[[3]] == 1) || is_product(x[[3]])) )
60 | }
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/pkg/data/producers.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markvanderloo/accumulate/1be9146a5f7c2e9b27660b69640290494a6fedf0/pkg/data/producers.rda


--------------------------------------------------------------------------------
/pkg/inst/CITATION:
--------------------------------------------------------------------------------
 1 | bibentry(bibtype = "Article",
 2 |   title        = "Split-Apply-Combine with Dynamic Grouping",
 3 |   author       = person(given = c("Mark", "P.", "J."),
 4 |                         family = "van der Loo",
 5 |                         email = "mpj.vanderloo@cbs.nl"),
 6 |   journal      = "Journal of Statistical Software",
 7 |   year         = "2025",
 8 |   volume       = "112",
 9 |   number       = "4",
10 |   pages        = "1--21",
11 |   doi          = "10.18637/jss.v112.i04",
12 |   header       = "To cite accumulate in publications use:"
13 | )
14 | 
15 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_accumulate.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #source("../../R/rcollapse.R")
  3 | 
  4 | ## Accumulate with collapsing sequence as data frame
  5 | input    <- data.frame(Y1 = 2^(0:8), Y2 = 2^(0:8))
  6 | input$Y2[c(1,4,7)] <- NA
  7 | 
  8 | input$A0 <- c(123,123,123,135,136,137,212,213,225)
  9 | 
 10 | 
 11 | collapse <- data.frame(
 12 |       A0   = c(123, 135, 136, 137, 212, 213, 225)
 13 |     , A1   = c(12 , 13 , 13 , 13 , 21 , 21 , 22 )
 14 |     , A2   = c(1  , 1  , 1  , 1  , 2  , 2  , 2  )
 15 | )
 16 | 
 17 | out <- accumulate(input
 18 |          , collapse
 19 |          , test = function(d) nrow(d)>=3
 20 |          , fun  = sum, na.rm=TRUE)
 21 | 
 22 | expect_equal(out[,1], unique(collapse[,1]))
 23 | expect_equal(out[,2], c(0, 1 ,  1,  1,   2,   2, 2))
 24 | expect_equal(out[,3], c(7, 56, 56, 56, 448, 448, 448))
 25 | expect_equal(out[,4], c(6, 48, 48, 48, 384, 384, 384))
 26 | 
 27 | # With NA in result (case where no subset passes test())
 28 | out <- accumulate(input, collapse
 29 |           , test=function(d) nrow(d) >= 10
 30 |           , fun = sum, na.rm=TRUE)
 31 | 
 32 | expect_equal(out[,3], rep(NA,7))
 33 | expect_equal(out[,4], rep(NA,7))
 34 | 
 35 |  
 36 | 
 37 | ## Accumulate with collapsing scheme as formula
 38 | input <- data.frame(
 39 |     A  = c(1,1,1,2,2,2,3,3,3)
 40 |   , B  = c(11,11,11,12,12,13,21,22,12)
 41 |   , B1 = c(1,1,1,1,1,1,2,2,1)
 42 |   , Y1 = 2^(0:8)
 43 |   , Y2 = 2^(0:8)
 44 | )
 45 | 
 46 | input$Y2[c(1,4,7)] <- NA
 47 | 
 48 | out <- accumulate(input
 49 |   , collapse = A*B ~ A*B1 + A
 50 |   , test=function(a) nrow(a)>=3
 51 |   , fun = sum, na.rm=TRUE)
 52 | 
 53 | 
 54 | expect_equal( out[,1], c( 1,  2,  2,   3,   3,   3) )
 55 | expect_equal( out[,2], c(11, 12, 13,  21,  22,  12) )
 56 | expect_equal( out[,3], c( 0,  1,  1,   2,   2,   2) )
 57 | expect_equal( out[,4], c( 7, 56, 56, 448, 448, 448) )
 58 | expect_equal( out[,5], c( 6, 48, 48, 384, 384, 384) )
 59 | 
 60 | 
 61 | d <- data.frame(A0 = as.character(c(11,11,12)), Y = c(4,8,6))
 62 | A <- data.frame(A0 = as.character(c(11,12)), A1 = as.character(c(1,1)))
 63 | 
 64 | out <- cumulate(data = d
 65 |   , collapse  = A
 66 |   , test      = function(d) if (nrow(d)>=2) TRUE else FALSE
 67 |   , mn = mean(Y, na.rm=TRUE)
 68 |   , md = median(Y,na.rm=TRUE) )
 69 | 
 70 | expect_equal(out[,1],c("11","12"))
 71 | expect_equal(out[,2],c(0,1))
 72 | expect_equal(out[,3],c(6,6))
 73 | expect_equal(out[,4],c(6,6))
 74 | 
 75 | input <- data.frame(
 76 |     A  = c(1,1,1,2,2,2,3,3,3)
 77 |   , B  = c(11,11,11,12,12,13,21,22,12)
 78 |   , B1 = c(1,1,1,1,1,1,2,2,1)
 79 |   , Y  = 2^(0:8)
 80 | )
 81 | 
 82 | output <- data.frame(
 83 |     A = c(1,2,2,3,3,3)
 84 |   , B = c(11,12,13,21,22,12)
 85 |   , level = c(0,1,1,2,2,2)
 86 |   , tY    = c(7,56,56,448,448,448)
 87 | )
 88 | 
 89 | out <- cumulate(data=input
 90 |         , collapse=A*B ~ A*B1 + A
 91 |         , test=function(d) nrow(d)>=3
 92 |         , tY = sum(Y) )
 93 | 
 94 | expect_equivalent(out, output)
 95 | 
 96 | # Case where the aggregate is an object (not a scalar)
 97 | out <- cumulate(data=input
 98 |         , collapse = A*B ~ A*B1 + A
 99 |         , test = function(d) nrow(d) >= 3
100 |         , model = lm(Y ~ 1)
101 |        )
102 | 
103 | expect_inherits(out$model, "object_list")
104 | 
105 | # with extra columns
106 | out <- cumulate(data=input
107 |         , collapse = A*B ~ A*B1 + A
108 |         , test = function(d) nrow(d) >= 3
109 |         , model = lm(Y ~ 1)
110 |         , mean   = mean(Y)
111 |        )
112 | 
113 | expect_equivalent(sapply(out$model, coef), out$mean)
114 | 
115 | 
116 | 
117 | 
118 | ## test connection with 'validator' 
119 | if ( !requireNamespace("validate", quietly=TRUE) ){ 
120 |   exit_file("validate not installed")
121 | }
122 | 
123 | rules <- validate::validator(nrow(.) >= 3, sum(Y >= 2) >= 3)
124 | 
125 | input <- data.frame(
126 |      A  = c(1,1,1,2,2,2,3,3,3)
127 |    , B  = c(11,11,11,12,12,13,21,22,12)
128 |    , B1 = c(1,1,1,1,1,1,2,2,1)
129 |    , Y  = 2^(0:8)
130 | )
131 | 
132 | expect_silent(accumulate(input, collapse = A*B ~ A*B1 + B1,
133 |     test = from_validator(rules), fun=sum))
134 | 
135 | 
136 | # test formula-checking
137 | expect_error(accumulate(input, collapse=A*B~A~B, test=min_records(3), fun=mean))
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_helpers.R:
--------------------------------------------------------------------------------
 1 | 
 2 | expect_error(accumulate:::stopf("foo %g",bar))
 3 | expect_equal(accumulate:::last(1:3),3)
 4 | 
 5 | 
 6 | d <- data.frame(x=c(1:5,rep(NA,5)), y=1:10)
 7 | expect_equal(accumulate:::complete_cases(d)
 8 |            , c(rep(TRUE,5), rep(FALSE,5)))
 9 | 
10 | 
11 | 
12 | expect_true( min_records(8)(d))
13 | expect_false(min_records(12)(d))
14 | expect_error(min_records(-1))
15 | expect_error(min_records("foo"))
16 | 
17 | expect_true( min_complete(3)(d))
18 | expect_false(min_complete(8)(d))
19 | expect_true( min_complete(8,vars="y")(d))
20 | expect_false(min_complete(8,vars="x")(d))
21 | expect_error(min_complete(n="foo"))
22 | 
23 | 
24 | expect_true( frac_complete(0.4)(d))
25 | expect_false(frac_complete(0.8)(d))
26 | expect_true( frac_complete(0.8, vars="y")(d))
27 | expect_false(frac_complete(0.8, vars="x")(d))
28 | expect_error(min_complete(r="foo"))
29 | 
30 | 
31 | 
32 | 
33 | csh <- csh_from_digits(c("11","12","123"))
34 | expect_equal(csh_from_digits(c("11","12"))
35 |             , data.frame(A0=c("11","12"), A1=c("1","1")))
36 | expect_equal(csh_from_digits(c("11","12"), levels=0)
37 |             , data.frame(A0=c("11","12")))
38 | 
39 | expect_error(csh_from_digits(c(11,12), levels=-1))
40 | expect_error(csh_from_digits(c(11,12), levels= 2))
41 | 
42 | #from_validator
43 | 
44 | if (!requireNamespace('validate', quietly=TRUE)) exit_file("validate not installed")
45 | 
46 | expect_true(from_validator( validate::validator(sum(is.na(x)) < 8) )(d))
47 | expect_false(from_validator( validate::validator(sum(is.na(x)) < 4) )(d))
48 | 
49 | 
50 | dat <- data.frame(x = 1:5, y=(-2):2)
51 | expect_stdout(smoke_test(dat, function(d) y > 0), pattern="ERR")  
52 | expect_stdout(smoke_test(dat, function(d) d$y > 0),pattern="length") 
53 | expect_stdout(smoke_test(dat, function(d) sum(d$y > 0) > 2), pattern="NA detected")
54 | expect_stdout(smoke_test(dat, function(d) sum(d$y > 0, na.rm=TRUE) > 2),pattern="")
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_internals.R:
--------------------------------------------------------------------------------
 1 | #source("../../R/acc.R")
 2 | #library(tinytest)
 3 | # 
 4 | #
 5 | input    <- data.frame(Y1 = 2^(0:8), Y2 = 2^(0:8))
 6 | input$Y2[c(1,4,7)] <- NA
 7 | input$A0 <- c(123,123,123,135,136,137,212,213,225)
 8 | 
 9 | 
10 | collapse <- data.frame(
11 |       A0   = c(123, 135, 136, 137, 212, 213, 225)
12 |     , A1   = c(12 , 13 , 13 , 13 , 21 , 21 , 22 )
13 |     , A2   = c(1  , 1  , 1  , 1  , 2  , 2  , 2  )
14 | )
15 | 
16 | pullback <- accumulate:::get_pb(collapse, input)
17 | 
18 | expect_equal(pullback(123, 0), input[1:3,])
19 | expect_equal(pullback(135, 1), input[4:6,])
20 | expect_equal(pullback(212, 2), input[7:9,])
21 | 
22 | 
23 | input <- data.frame(
24 |     A  = c(1,1,1,2,2,2,3,3,3)
25 |   , B  = c(11,11,11,12,12,13,21,22,12)
26 |   , B1 = c(1,1,1,1,1,1,2,2,1)
27 |   , Y1 = 2^(0:8)
28 |   , Y2 = 2^(0:8)
29 | )
30 | 
31 | input$Y2[c(1,4,7)] <- NA
32 | 
33 | pullback <- accumulate:::get_pb(A*B ~ A*B1 + A, input)
34 | 
35 | expect_equal(pullback(data.frame(A=1,B=11), level=0), input[1:3,])
36 | expect_equal(pullback(data.frame(A=3,B=21), level=1), input[7:8,])
37 | expect_equal(pullback(data.frame(A=2,B=13), level=2), input[4:6,])
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_object_list.R:
--------------------------------------------------------------------------------
 1 | # objects of type 'object_list' are typed lists that only differ from ordenary
 2 | # lists in the way that they are printed. Also, selecting a subset with `[`
 3 | # returns an 'object_list'. 
 4 | 
 5 | 
 6 | # trivial check on constructor
 7 | expect_inherits(object_list(1:3),"object_list")
 8 | 
 9 | # check subsetting method
10 | expect_inherits(object_list(1:3)[1], "object_list")
11 | 
12 | 
13 | # only the type of each element is printed
14 | expect_true(grepl("^<integer>$", format(object_list(1L))))
15 | expect_stdout(print(object_list(1L)), "<integer>")
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/pkg/inst/tinytest/test_utils.R:
--------------------------------------------------------------------------------
 1 | 
 2 | expect_true(accumulate:::ok_formula(a ~ b))
 3 | expect_true(accumulate:::ok_formula(a*b ~ b))
 4 | expect_true(accumulate:::ok_formula(a*b*c ~ a*b + a))
 5 | expect_true(accumulate:::ok_formula(a*b*c*d ~ a*b*c + a*c + a + c))
 6 | expect_false(accumulate:::ok_formula(a+b ~ b))
 7 | 
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/pkg/tests/tinytest.R:
--------------------------------------------------------------------------------
1 | 
2 | if ( requireNamespace("tinytest", quietly=TRUE) ){
3 |   tinytest::test_package("accumulate")
4 | }
5 | 
6 | 


--------------------------------------------------------------------------------
/pkg/vignettes/introduction.md:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteEngine{simplermarkdown::mdweave_to_html}
  3 | %\VignetteIndexEntry{Introduction to accumulate}
  4 | -->
  5 | 
  6 | ---
  7 | title: Introduction to `accumulate`
  8 | author: Mark P.J. van der Loo
  9 | css: "style.css"
 10 | ---
 11 | 
 12 | Package version `packageVersion("accumulate")`{.R}. 
 13 | 
 14 | Use `citation('accumulate')` to cite the package.
 15 | 
 16 | ## Introduction
 17 | 
 18 | `Accumulate` is a package for grouped aggregation, where the groups can be
 19 | dynamically collapsed into larger groups. When this collapsing takes place and
 20 | how collapsing takes place is user-defined.
 21 | 
 22 | 
 23 | ## Installation
 24 | 
 25 | The latest CRAN release can be installed as follows.
 26 | ```
 27 | install.packages("accumulate")
 28 | ```
 29 | Next, the package can be loaded. You can use `packageVersion` (from base R) to
 30 | check which version you have installed.
 31 | ```{#load_package .R}
 32 | library(accumulate)
 33 | # check the package version
 34 | packageVersion("accumulate")
 35 | ```
 36 | 
 37 | ## A first example
 38 | 
 39 | We will use a built-in dataset as example. 
 40 | ```{#loading_data .R}
 41 | data(producers)
 42 | head(producers)
 43 | ```
 44 | This synthetic dataset contains information on various sources of turnover from
 45 | producers, that are labeled with an economic activity classification (`sbi`)
 46 | and a `size` class (0-9). 
 47 | 
 48 | We wish to find a group mean by `sbi x size`. However, we demand that the group has
 49 | at least five records, otherwise we combine the size classes of a single `sbi` group.
 50 | This can be done as follows.
 51 | ```{#first_example .R}
 52 | a <- accumulate(producers
 53 |               , collapse = sbi*size ~ sbi
 54 |               , test = min_records(5)
 55 |               , fun  = mean, na.rm=TRUE)
 56 | head(round(a))
 57 | ```
 58 | The accumulate function does the following:
 59 | 
 60 | - For each combination of `sbi` and `size` occurring in the data, it checks whether
 61 |   `test` is satisfied. Here, it tests whether there are at least five records.
 62 |     - If the test is satisfied, the mean is computed for each non-grouping variable
 63 |       in the data. The output column `level` is set to 0 (no collapsing took place).
 64 |     - If the test is _not_ satisfied, it will only use `sbi` as grouping variable
 65 |       for the current combination of `sbi` and `size`. Then, if there are enough
 66 |       records, the mean is computed for each variable and the output variable `level`
 67 |       is set to 1 (first level of collapsing has been used). 
 68 |     - If the test is still not satisfied, no computation is possible
 69 |       and all outputs are `NA` for the current `sbi` and `size` combination.
 70 |  
 71 | Explicitly, for this example we see that for `(sbi,size)==(2752,5)` no
 72 | satisfactory group of records was found under the current collapsing scheme.
 73 | Therefore the `level` variable equals `NA` and all aggregated variables are
 74 | missing as well.  For `(sbi,size)==(2840,7)` there are sufficient records, and
 75 | since `level=0` no collapsing was necessary. For the group
 76 | `(sbi,size)=(3410,8)` there were not enough records to compute a mean, but
 77 | taking all records in `sbi==3410` gave enough records. This is signified by
 78 | `level=1`, meaning that one collapsing step has taken place (from `sbi x size`
 79 | to `sbi`).
 80 | 
 81 |  
 82 | Let us see how we specified this call to `accumulate`
 83 | 
 84 | - The first argument is the data to be aggregated.
 85 | - The second argument is a formula of the form `target groups ~ collapsing scheme`.
 86 |   The output is always at the level of the target groups. The collapsing scheme determines
 87 |   which records are used to compute a value for the target groups if the `test` is not
 88 |   satisfied.
 89 | - The third argument, called `test` is a function that should accept any subset of 
 90 |   records of `producers` and return `TRUE` or `FALSE`. In this case we used the convenience
 91 |   function `min_records(5)` provided by `accumulate`. The function `min_records()` creates
 92 |   a testing function for us that we can pass as testing function.
 93 | - Finally, the argument `fun` is the aggregation function that will be applied to each
 94 |   group. 
 95 | 
 96 | Observe that the accumulate function is similar to R's built-in `aggregate` function (this is
 97 | by design). There is a second function called `cumulate` that has an interface that
 98 | is similar to `dplyr::summarise`.
 99 | 
100 | ```{#cumulate_formula .R}
101 | a <- cumulate(producers, collapse = sbi*size ~ sbi
102 |       , test = function(d) nrow(d) >= 5
103 |       , mu_industrial = mean(industrial, na.rm=TRUE)
104 |       , sd_industrial = sd(industrial, na.rm=TRUE))
105 | 
106 | head(round(a))
107 | ```
108 | 
109 | Notice that here, we wrote our own test function.
110 | 
111 | 
112 | ### Exercises
113 | 
114 | 1. How many combinations of `(sbi, size)` could not be computed, even when 
115 |    collapsing to `sbi`? (You need to run the code and investigate the output).
116 | 2. Compute the trimmed mean of all numeric variables where you trim
117 |    5% of each side the distribution. See `?mean` on how to compute trimmed
118 |    means.
119 | 
120 | ## The formula interface for specifying collapsing schemes
121 | 
122 | A collapsing scheme can be defined in a data frame or with a
123 | formula of the form 
124 | ```
125 | target grouping ~ collapse1 + collapse2 + ... + collapseN
126 | ```
127 | Here, the `target grouping` is a variable or product of variables.  Each
128 | `collapse` term is also a variable or product of variables. Each subsequent
129 | term defines the next collapsing step. Let us show the idea with a 
130 | more involved example.
131 | 
132 | The `sbi` variable in the `producers` dataset encodes a hierarchical classification
133 | where longer digit sequences indicate higher level of detail. Hence we can collapse
134 | to lower levels of detail by deleting digits at the end. Let us enrich the
135 | `producers` dataset with extra grouping levels.
136 | 
137 | ```{#derive_sbi_levels .R}
138 | producers$sbi3 <- substr(producers$sbi,1,3)
139 | producers$sbi2 <- substr(producers$sbi,1,2)
140 | head(producers,3)
141 | ```
142 | 
143 | We can now use a more involved collapsing scheme as follows.
144 | ```{#accumulate_formula .R}
145 | a <- accumulate(producers, collapse = sbi*size ~ sbi + sbi3 + sbi2
146 |                , test = min_records(5), fun = mean, na.rm=TRUE)
147 | head(round(a))
148 | ```
149 | For `(sbi,size) == (2752,5)` we have 2 levels of collapsing. In other
150 | words, for that aggregate, all records in `sbi3 == 275` were used.
151 | 
152 | ### Exercises
153 | 
154 | 1. Compute standard deviation for `trade` and `total` using the `cumulate` function
155 |    under the same collapsing scheme as defined above.
156 | 2. What is the maximum collapsing level in the collapsing scheme above?
157 | 3. Find out how many combinations of `(sbi,size)` have been collapsed to 
158 |    level 0, 1, 2, or 3. Tabulate them.
159 | 4. Define a collapsing scheme that ends with a single-digit `sbi` code and compute
160 |    the means of all variables.
161 | 
162 | 
163 | ## The data frame interface for defining collapsing schemes
164 | 
165 | Collapsing schemes can be represented in data frames that have the
166 | form
167 | 
168 | ```
169 | [target group, parent of target group, parent of parent of target group,...].
170 | ```
171 | The package comes with a helper function that creates such a scheme
172 | from hierarchical classifications that are encoded as digits.
173 | 
174 | For the `sbi` example we can do the following to derive a collapsing scheme.
175 | ```{#dataframe_construction .R}
176 | sbi <- unique(producers$sbi)
177 | csh <- csh_from_digits(sbi)
178 | names(csh)[1] <- "sbi"
179 | head(csh)
180 | ```
181 | Here, the column `sbi` denotes the original (maximally) 5-digit codes,
182 | `A1` the 4-digit codes, and so on. It is important that the name of
183 | the first column matches a column in the data to be agregated.
184 | Both `cumlate` and `accumulate` accept such a data frame as an argument.
185 | Here is an example with `cumulate`.
186 | 
187 | ```{#dataframe_cumulate .R}
188 | a <- cumulate(producers, collapse = csh, test = function(d) nrow(d) >= 5
189 |        , mu_total = mean(total, na.rm=TRUE)
190 |        , sd_total = sd(total, na.rm=TRUE))
191 | head(a)
192 | ```
193 | 
194 | In this representation is is not possible to use multiple grouping
195 | variables, unless you combine multiple grouping variables into a single
196 | one, for example by pasting them together.
197 | 
198 | The advantage of this representation is that it allows users to externally
199 | define a (manually edited) collapsing scheme.
200 | 
201 | ### Exercises
202 | 
203 | 1. Use `csh` to compute the median of all numerical variables of
204 |    the `producers` dataset with `accumulate` (hint: you need to remove
205 |    the `size` variable).
206 | 
207 | 
208 | ## Convenience functions to define tests
209 | 
210 | There are several options to define test on groups of records:
211 | 
212 | 1. Use one of the built-in functions to specify common test conditions:
213 |    `min_records()`, `min_complete()`, or `frac_complete()`.
214 | 2. Use a ruleset defined with the [validate](https://cran.r-project.org/package=validate)
215 |    package, with the `from_validator()` function.
216 | 3. Write your own custom test function. 
217 | 
218 | 
219 | Let us look at a small example for each case. For comparison we will
220 | always test that there are a minimum of five records.
221 | 
222 | 
223 | ```{#helpers .R}
224 | # load the data again to loose columns 'sbi2' and 'sbi3' and work
225 | # with the original data.
226 | data(producers)
227 | 
228 | # 1. using a helper function
229 | a <- accumulate(producers, collapse = sbi*size ~ sbi
230 |                , test = min_records(5)
231 |                , fun  = mean, na.rm=TRUE)
232 | 
233 | # 2. using a 'validator' object
234 | rules <- validate::validator(nrow(.) >= 5)
235 | a <- accumulate(producers, collapse = sbi*size ~ sbi
236 |                , test = from_validator(rules)
237 |                , fun  = mean, na.rm=TRUE)
238 | 
239 | # 3. using a custom function
240 | a <- accumulate(producers, collapse=sbi*size ~ sbi
241 |                , test = function(d) nrow(d) >= 5
242 |                , fun  = mean, na.rm=TRUE)
243 | ```
244 | 
245 | ## Complex aggregates
246 | 
247 | An aggregate may be something more complex than a scalar. The `accumulate`
248 | package also supports complex aggregates such as linear models.
249 | 
250 | ```{#complex .R}
251 | a <- cumulate(producers, collapse = sbi*size ~ sbi
252 |                        , test = min_complete(5, c("other_income","trade"))
253 |                        , model = lm(other_income ~ trade)
254 |                        , mean_other = mean(other_income, na.rm=TRUE))
255 | 
256 | head(a)
257 | ```
258 | Here, we demand that there are at least five records available for estimating 
259 | the model.
260 | 
261 | The linear models are stored in a `list` of type `object_list`. Subsets or individual
262 | elements can be accessed as usual with data frames.
263 | ```{#objlist .R}
264 | a$model[[1]]
265 | a$model[[2]]
266 | ```
267 | 
268 | 
269 | 
270 | 
271 | 
272 | ### Smoke-testing your test function
273 | 
274 | If you write your own test function from scratch, it is easy to overlook some
275 | edge cases like the occurrence of missing data, a column that is completely
276 | `NA`, or receiving zero records. The function `smoke_test()` accepts a data set
277 | and a test function and runs the test function on several common edge cases
278 | based on the dataset. It does _not_ check whether the test function works as
279 | expected, but it checks that the output is `TRUE` or `FALSE` in all cases and
280 | reports errors, warnings and mesages if they occur.
281 | 
282 | 
283 | As an example we construct a test function that checks whether one
284 | of the variables has sufficient non-zero values.
285 | ```{#smoketest1 .R}
286 | my_test <- function(d) sum(other != 0) > 3
287 | smoke_test(producers, my_test)
288 | ```
289 | Oops, we forgot to refer to the data set. Let's try it again.
290 | ```{#smoketest2 .R}
291 | my_test <- function(d) sum(d$other != 0) > 3
292 | smoke_test(producers, my_test)
293 | ```
294 | Our function is not robust against occurrence of `NA`. Here's a third attempt.
295 | ```{#smoketest3 .R}
296 | my_test <- function(d) sum(d$other != 0,na.rm=TRUE) > 3
297 | smoke_test(producers, my_test)
298 | ```
299 | 
300 | 
301 | 
302 | ### Exercises
303 | 
304 | 1. Compute the mean of all variables using `sbi*size ~ sbi1 + sbi2` as collapsing
305 |    scheme. Make sure there are at least 10 records in each group.
306 | 2. Compute the mean of the ratio between `industrial` and `total`, but demand
307 |    that there are not more than 20% zeros in `other`. Use `csh` as collapsing scheme.
308 | 
309 | 
310 | 
311 | 
312 | 
313 | 
314 | 
315 | 
316 | 
317 | 
318 | 
319 | 
320 | 
321 | 
322 | 
323 | 


--------------------------------------------------------------------------------
/pkg/vignettes/style.css:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | body {
 4 |   max-width: 50rem;
 5 |   margin-left: auto;
 6 |   margin-right: auto;
 7 |   font-family: system-ui;
 8 |   color: black;
 9 | /*  background: #555555;*/
10 | }
11 | figure {
12 |   margin: 0;
13 |   padding: 0;
14 | }
15 | p, li {
16 |   text-align: justify;
17 | }
18 | pre {
19 |   background: #F8F9F9;
20 |   padding: 0.5rem;
21 | }
22 | h1, h2, h3, h4, h5 {
23 |   color: orange;
24 |   font-weight: bold;
25 | }
26 | h4, h5 {
27 |   color: inherit;
28 | }
29 | h2 {
30 |   margin-top: 3rem;
31 | }
32 | a {
33 |   color: orange;
34 | }
35 | a:hover {
36 |   color: lightblue;
37 | }
38 | 
39 | /* Specific styling */
40 | p.author {
41 |   font-weight : bold;
42 |   color: orange;
43 | }
44 | 
45 | 


--------------------------------------------------------------------------------