├── .Rbuildignore
├── .covrignore
├── .github
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── ISSUE_TEMPLATE.md
└── workflows
│ ├── R-CMD-check.yaml
│ ├── pkgdown.yaml
│ ├── render-readme.yaml
│ ├── rhub.yaml
│ └── test-coverage.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
├── b_summaries.R
├── brolgar-package.R
├── data-pisa.R
├── data-wages-ts.R
├── data-world-height.R
├── facet-sample.R
├── facet-strata.R
├── features.R
├── index-helpers.R
├── key_slope.R
├── keys_near.R
├── monotonics.R
├── n-obs.R
├── nearests.R
├── reexports.R
├── sample-frac-n-keys.R
├── stratify-keys.R
├── utils.R
└── zzz.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── brolgar.Rproj
├── codecov.yml
├── cran-comments.md
├── data-raw
├── Height_Compact.xlsx
├── compress-pisa-data.R
├── tidy-world-heights.R
├── wages-original.rds
└── wages-ts.R
├── data
├── heights.rda
├── pisa.rda
└── wages.rda
├── inst
├── CITATION
├── WORDLIST
└── examples
│ ├── max_if.R
│ ├── new-api.R
│ └── test.R
├── man
├── add_n_obs.Rd
├── b_summaries.Rd
├── brolgar-features.Rd
├── brolgar-package.Rd
├── facet_sample.Rd
├── facet_strata.Rd
├── figures
│ ├── README-demo-brolgar-1.png
│ ├── README-demo-why-brolgar-1.png
│ ├── README-facet-sample-1.png
│ ├── README-facet-sample-3-by-20-1.png
│ ├── README-facet-strata-1.png
│ ├── README-facet-strata-20-1.png
│ ├── README-features-left-join-1.png
│ ├── README-plot-filter-sample-n-keys-1.png
│ ├── README-plot-sample-n-keys-1.png
│ ├── README-plot-strata-1.png
│ ├── README-show-monotonic-1.png
│ ├── README-show-spaghetti-1.png
│ ├── README-show-spaghetti-2.png
│ ├── README-show-wages-lg-1.png
│ ├── README-summarise-n-obs-1.png
│ ├── README-use-gg-highlight-1.png
│ └── brolgar-logo.png
├── heights.Rd
├── index_summary.Rd
├── key_slope.Rd
├── keys_near.Rd
├── keys_near.data.frame.Rd
├── keys_near.tbl_ts.Rd
├── l_funs.Rd
├── monotonic.Rd
├── n_obs.Rd
├── near_between.Rd
├── near_middle.Rd
├── near_quantile.Rd
├── nearests.Rd
├── pipe.Rd
├── pisa.Rd
├── reexports.Rd
├── sample-n-frac-keys.Rd
├── stratify_keys.Rd
└── wages.Rd
├── revdep
├── README.md
├── cran.md
├── failures.md
└── problems.md
├── tests
├── spelling.R
├── testthat.R
└── testthat
│ ├── _snaps
│ ├── facet-sample
│ │ ├── gg-facet-sample-alt.svg
│ │ └── gg-facet-sample.svg
│ ├── facet-strata
│ │ ├── gg-facet-strata-along.svg
│ │ └── gg-facet-strata.svg
│ ├── keys-near.md
│ └── keys-near
│ │ └── stat-plot.svg
│ ├── test-add-longnostic.R
│ ├── test-b-funs.R
│ ├── test-facet-sample.R
│ ├── test-facet-strata.R
│ ├── test-feature-diff-summary.R
│ ├── test-feature-five-num.R
│ ├── test-feature-monotonic.R
│ ├── test-feature-n-obs.R
│ ├── test-feature-three-num.R
│ ├── test-features-brolgar.R
│ ├── test-features-ranges.R
│ ├── test-features-spread.R
│ ├── test-filter-n-obs.R
│ ├── test-if_tsibble.R
│ ├── test-index-helpers.R
│ ├── test-key-slope.R
│ ├── test-keys-near.R
│ ├── test-longnostics.R
│ ├── test-monotonics.R
│ ├── test-nearests.R
│ ├── test-sample-n-frac-keys.R
│ ├── test-sample-n-frac-multiple-keys.R
│ ├── test-stratify-key.R
│ └── test-utils.R
└── vignettes
├── .gitignore
├── exploratory-modelling.Rmd
├── finding-features.Rmd
├── getting-started.Rmd
├── id-interesting-obs.Rmd
├── longitudinal-data-structures.Rmd
├── mixed-effects-models.Rmd
└── visualisation-gallery.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^pkgdown$
2 | ^_pkgdown\.yml$
3 | ^docs$
4 | ^data-raw$
5 | ^codecov\.yml$
6 | ^README\.Rmd$
7 | ^LICENSE\.md$
8 | ^.*\.Rproj$
9 | ^\.Rproj\.user$
10 | ^\.pre-commit-config\.yaml$
11 | ^\.github$
12 | ^\.covrignore$
13 | ^\.github/workflows/pkgdown\.yaml$
14 | ^\.github/workflows/R-CMD-check\.yaml$
15 | ^\.github/workflows/render-readme\.yaml$
16 | ^cran-comments\.md$
17 | ^CRAN-RELEASE$
18 | ^revdep$
19 | ^CRAN-SUBMISSION$
20 |
--------------------------------------------------------------------------------
/.covrignore:
--------------------------------------------------------------------------------
1 | R/deprec-*.R
2 | R/compat-*.R
3 |
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | As contributors and maintainers of this project, we pledge to respect all people who
4 | contribute through reporting issues, posting feature requests, updating documentation,
5 | submitting pull requests or patches, and other activities.
6 |
7 | We are committed to making participation in this project a harassment-free experience for
8 | everyone, regardless of level of experience, gender, gender identity and expression,
9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 |
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 |
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed
18 | from the project team.
19 |
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by
21 | opening an issue or contacting one or more of the project maintainers.
22 |
23 | This Code of Conduct is adapted from the Contributor Covenant
24 | (https://www.contributor-covenant.org), version 1.0.0, available at
25 | https://contributor-covenant.org/version/1/0/0/.
26 |
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to brolgar
2 |
3 | This outlines how to propose a change to brolgar. For more detailed
4 | info about contributing to this, and other tidyverse packages, please see the
5 | [**development contributing guide**](https://rstd.io/tidy-contrib).
6 |
7 | ### Fixing typos
8 |
9 | Small typos or grammatical errors in documentation may be edited directly using
10 | the GitHub web interface, so long as the changes are made in the _source_ file.
11 |
12 | * YES: you edit a roxygen comment in a `.R` file below `R/`.
13 | * NO: you edit an `.Rd` file below `man/`.
14 |
15 | ### Prerequisites
16 |
17 | Before you make a substantial pull request, you should always file an issue and
18 | make sure someone from the team agrees that it’s a problem. If you’ve found a
19 | bug, create an associated issue and illustrate the bug with a minimal
20 | [reprex](https://www.tidyverse.org/help/#reprex).
21 |
22 | ### Pull request process
23 |
24 | * We recommend that you create a Git branch for each pull request (PR).
25 | * Look at the GitHub Actions build status before and after making changes.
26 | The `README` should contain badges for any continuous integration services used
27 | by the package.
28 | * New code should follow the tidyverse [style guide](https://style.tidyverse.org).
29 | You can use the [styler](https://CRAN.R-project.org/package=styler) package to
30 | apply these styles, but please don't restyle code that has nothing to do with
31 | your PR.
32 | * We use [roxygen2](https://cran.r-project.org/package=roxygen2), with
33 | [Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/markdown.html),
34 | for documentation.
35 | * We use [testthat](https://cran.r-project.org/package=testthat). Contributions
36 | with test cases included are easier to accept.
37 | * For user-facing changes, add a bullet to the top of `NEWS.md` below the
38 | current development version header describing the changes made followed by your
39 | GitHub username, and links to relevant issue(s)/PR(s).
40 |
41 | ### Code of Conduct
42 |
43 | Please note that the brolgar project is released with a
44 | [Contributor Code of Conduct](CODE_OF_CONDUCT.md). By contributing to this
45 | project you agree to abide by its terms.
46 |
47 | See the tidyverse [development contributing guide](https://rstd.io/tidy-contrib)
48 | for further details.
49 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Please briefly describe your problem and what output you expect. If you have a question, please don't use this form. Instead, ask on <https://stackoverflow.com/> or <https://community.rstudio.com/>.
2 |
3 | Please include a minimal reproducible example (AKA a reprex). If you've never heard of a [reprex](https://reprex.tidyverse.org/) before, start by reading <https://www.tidyverse.org/help/#reprex>.
4 |
5 | ---
6 |
7 | Brief description of the problem
8 |
9 | ```r
10 | # insert reprex here
11 | ```
12 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 |
8 | name: R-CMD-check.yaml
9 |
10 | permissions: read-all
11 |
12 | jobs:
13 | R-CMD-check:
14 | runs-on: ${{ matrix.config.os }}
15 |
16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
17 |
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | config:
22 | - {os: macos-latest, r: 'release'}
23 | - {os: windows-latest, r: 'release'}
24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
25 | - {os: ubuntu-latest, r: 'release'}
26 | - {os: ubuntu-latest, r: 'oldrel-1'}
27 |
28 | env:
29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
30 | R_KEEP_PKG_SOURCE: yes
31 |
32 | steps:
33 | - uses: actions/checkout@v4
34 |
35 | - uses: r-lib/actions/setup-pandoc@v2
36 |
37 | - uses: r-lib/actions/setup-r@v2
38 | with:
39 | r-version: ${{ matrix.config.r }}
40 | http-user-agent: ${{ matrix.config.http-user-agent }}
41 | use-public-rspm: true
42 |
43 | - uses: r-lib/actions/setup-r-dependencies@v2
44 | with:
45 | extra-packages: any::rcmdcheck
46 | needs: check
47 |
48 | - uses: r-lib/actions/check-r-package@v2
49 | with:
50 | upload-snapshots: true
51 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
52 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | release:
8 | types: [published]
9 | workflow_dispatch:
10 |
11 | name: pkgdown.yaml
12 |
13 | permissions: read-all
14 |
15 | jobs:
16 | pkgdown:
17 | runs-on: ubuntu-latest
18 | # Only restrict concurrency for non-PR jobs
19 | concurrency:
20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
21 | env:
22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
23 | permissions:
24 | contents: write
25 | steps:
26 | - uses: actions/checkout@v4
27 |
28 | - uses: r-lib/actions/setup-pandoc@v2
29 |
30 | - uses: r-lib/actions/setup-r@v2
31 | with:
32 | use-public-rspm: true
33 |
34 | - uses: r-lib/actions/setup-r-dependencies@v2
35 | with:
36 | extra-packages: any::pkgdown, local::.
37 | needs: website
38 |
39 | - name: Build site
40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
41 | shell: Rscript {0}
42 |
43 | - name: Deploy to GitHub pages 🚀
44 | if: github.event_name != 'pull_request'
45 | uses: JamesIves/github-pages-deploy-action@v4.5.0
46 | with:
47 | clean: false
48 | branch: gh-pages
49 | folder: docs
50 |
--------------------------------------------------------------------------------
/.github/workflows/render-readme.yaml:
--------------------------------------------------------------------------------
1 | on:
2 | push:
3 | paths:
4 | - examples/README.Rmd
5 | - examples/*yaml
6 |
7 | name: Render README
8 |
9 | jobs:
10 | render:
11 | name: Render README
12 | runs-on: macOS-latest
13 | steps:
14 | - uses: actions/checkout@v2
15 | - uses: r-lib/actions/setup-r@v2
16 | - uses: r-lib/actions/setup-pandoc@v2
17 | - name: Install rmarkdown
18 | run: Rscript -e 'install.packages("rmarkdown")'
19 | - name: Render README
20 | run: Rscript -e 'rmarkdown::render("examples/README.Rmd")'
21 | - name: Commit results
22 | run: |
23 | git commit examples/README.md -m 'Re-build README.Rmd' || echo "No changes to commit"
24 | git push https://${{github.actor}}:${{secrets.GITHUB_TOKEN}}@github.com/${{github.repository}}.git HEAD:${{ github.ref }} || echo "No changes to commit"
25 |
--------------------------------------------------------------------------------
/.github/workflows/rhub.yaml:
--------------------------------------------------------------------------------
1 | # R-hub's generic GitHub Actions workflow file. Its canonical location is at
2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml
3 | # You can update this file to a newer version using the rhub2 package:
4 | #
5 | # rhub::rhub_setup()
6 | #
7 | # It is unlikely that you need to modify this file manually.
8 |
9 | name: R-hub
10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}"
11 |
12 | on:
13 | workflow_dispatch:
14 | inputs:
15 | config:
16 | description: 'A comma separated list of R-hub platforms to use.'
17 | type: string
18 | default: 'linux,windows,macos'
19 | name:
20 | description: 'Run name. You can leave this empty now.'
21 | type: string
22 | id:
23 | description: 'Unique ID. You can leave this empty now.'
24 | type: string
25 |
26 | jobs:
27 |
28 | setup:
29 | runs-on: ubuntu-latest
30 | outputs:
31 | containers: ${{ steps.rhub-setup.outputs.containers }}
32 | platforms: ${{ steps.rhub-setup.outputs.platforms }}
33 |
34 | steps:
35 | # NO NEED TO CHECKOUT HERE
36 | - uses: r-hub/actions/setup@v1
37 | with:
38 | config: ${{ github.event.inputs.config }}
39 | id: rhub-setup
40 |
41 | linux-containers:
42 | needs: setup
43 | if: ${{ needs.setup.outputs.containers != '[]' }}
44 | runs-on: ubuntu-latest
45 | name: ${{ matrix.config.label }}
46 | strategy:
47 | fail-fast: false
48 | matrix:
49 | config: ${{ fromJson(needs.setup.outputs.containers) }}
50 | container:
51 | image: ${{ matrix.config.container }}
52 |
53 | steps:
54 | - uses: r-hub/actions/checkout@v1
55 | - uses: r-hub/actions/platform-info@v1
56 | with:
57 | token: ${{ secrets.RHUB_TOKEN }}
58 | job-config: ${{ matrix.config.job-config }}
59 | - uses: r-hub/actions/setup-deps@v1
60 | with:
61 | token: ${{ secrets.RHUB_TOKEN }}
62 | job-config: ${{ matrix.config.job-config }}
63 | - uses: r-hub/actions/run-check@v1
64 | with:
65 | token: ${{ secrets.RHUB_TOKEN }}
66 | job-config: ${{ matrix.config.job-config }}
67 |
68 | other-platforms:
69 | needs: setup
70 | if: ${{ needs.setup.outputs.platforms != '[]' }}
71 | runs-on: ${{ matrix.config.os }}
72 | name: ${{ matrix.config.label }}
73 | strategy:
74 | fail-fast: false
75 | matrix:
76 | config: ${{ fromJson(needs.setup.outputs.platforms) }}
77 |
78 | steps:
79 | - uses: r-hub/actions/checkout@v1
80 | - uses: r-hub/actions/setup-r@v1
81 | with:
82 | job-config: ${{ matrix.config.job-config }}
83 | token: ${{ secrets.RHUB_TOKEN }}
84 | - uses: r-hub/actions/platform-info@v1
85 | with:
86 | token: ${{ secrets.RHUB_TOKEN }}
87 | job-config: ${{ matrix.config.job-config }}
88 | - uses: r-hub/actions/setup-deps@v1
89 | with:
90 | job-config: ${{ matrix.config.job-config }}
91 | token: ${{ secrets.RHUB_TOKEN }}
92 | - uses: r-hub/actions/run-check@v1
93 | with:
94 | job-config: ${{ matrix.config.job-config }}
95 | token: ${{ secrets.RHUB_TOKEN }}
96 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 |
8 | name: test-coverage.yaml
9 |
10 | permissions: read-all
11 |
12 | jobs:
13 | test-coverage:
14 | runs-on: ubuntu-latest
15 | env:
16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
17 |
18 | steps:
19 | - uses: actions/checkout@v4
20 |
21 | - uses: r-lib/actions/setup-r@v2
22 | with:
23 | use-public-rspm: true
24 |
25 | - uses: r-lib/actions/setup-r-dependencies@v2
26 | with:
27 | extra-packages: any::covr, any::xml2
28 | needs: coverage
29 |
30 | - name: Test coverage
31 | run: |
32 | cov <- covr::package_coverage(
33 | quiet = FALSE,
34 | clean = FALSE,
35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
36 | )
37 | print(cov)
38 | covr::to_cobertura(cov)
39 | shell: Rscript {0}
40 |
41 | - uses: codecov/codecov-action@v5
42 | with:
43 | # Fail if error if not on PR, or if on PR and token is given
44 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
45 | files: ./cobertura.xml
46 | plugins: noop
47 | disable_search: true
48 | token: ${{ secrets.CODECOV_TOKEN }}
49 |
50 | - name: Show testthat output
51 | if: always()
52 | run: |
53 | ## --------------------------------------------------------------------
54 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
55 | shell: bash
56 |
57 | - name: Upload test results
58 | if: failure()
59 | uses: actions/upload-artifact@v4
60 | with:
61 | name: coverage-test-failures
62 | path: ${{ runner.temp }}/package
63 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | wages.R
6 | inst/doc
7 | .DS_Store
8 | docs/
9 | revdep/checks.noindex/
10 | revdep/data.sqlite
11 | revdep/library.noindex/
12 | docs
13 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/lorenzwalthert/pre-commit-hooks
3 | rev: v0.0.0.9024
4 | hooks:
5 | - id: parsable-R
6 | - id: no-browser-statement
7 | - id: readme-rmd-rendered
8 | - id: use-tidy-description
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: brolgar
2 | Title: Browse Over Longitudinal Data Graphically and Analytically in R
3 | Version: 1.0.1.9000
4 | Authors@R:
5 | c(person(given = "Nicholas",
6 | family = "Tierney",
7 | role = c("aut", "cre"),
8 | email = "nicholas.tierney@gmail.com",
9 | comment = c(ORCID = "https://orcid.org/0000-0003-1460-8722")),
10 | person(given = "Di",
11 | family = "Cook",
12 | role = "aut",
13 | email = "dicook@monash.edu",
14 | comment = c(ORCID = "https://orcid.org/0000-0002-3813-7155")),
15 | person(given = "Tania",
16 | family = "Prvan",
17 | role = "aut",
18 | email = "tania.prvan@mq.edu.au"),
19 | person(given = "Stuart",
20 | family = "Lee",
21 | role = "ctb"),
22 | person(given = "Earo",
23 | family = "Wang",
24 | role = "ctb"))
25 | Description: Provides a framework of tools to summarise, visualise, and explore
26 | longitudinal data. It builds upon the tidy time series data frames used in the
27 | 'tsibble' package, and is designed to integrate within the 'tidyverse', and
28 | 'tidyverts' (for time series) ecosystems. The methods implemented include
29 | calculating features for understanding longitudinal data, including
30 | calculating summary statistics such as quantiles, medians, and numeric ranges,
31 | sampling individual series, identifying individual series representative of a
32 | group, and extending the facet system in 'ggplot2' to facilitate exploration of samples of data. These methods are
33 | fully described in the paper "brolgar: An R package to Browse Over
34 | Longitudinal Data Graphically and Analytically in R", Nicholas Tierney,
35 | Dianne Cook, Tania Prvan (2020) <doi:10.32614/RJ-2022-023>.
36 | License: MIT + file LICENSE
37 | URL: https://github.com/njtierney/brolgar, https://brolgar.njtierney.com/, http://brolgar.njtierney.com/
38 | BugReports: https://github.com/njtierney/brolgar/issues
39 | Depends:
40 | R (>= 3.5.0)
41 | Imports:
42 | dplyr (>= 0.8.3),
43 | fabletools,
44 | ggplot2 (>= 3.2.0),
45 | glue (>= 1.3.1),
46 | magrittr (>= 1.5),
47 | purrr (>= 0.3.2),
48 | rlang (>= 0.4.0),
49 | stats,
50 | tibble (>= 2.1.3),
51 | tidyr (>= 0.8.3),
52 | tsibble (>= 0.8.2),
53 | vctrs
54 | Suggests:
55 | gapminder,
56 | gghighlight (>= 0.1.0),
57 | knitr (>= 1.23),
58 | Matrix (>= 1.6-5),
59 | lme4,
60 | modelr,
61 | rmarkdown (>= 1.14),
62 | spelling (>= 2.1),
63 | testthat (>= 3.0.0),
64 | tsibbledata,
65 | vdiffr (>= 0.3.1)
66 | VignetteBuilder:
67 | knitr
68 | Encoding: UTF-8
69 | Language: en-US
70 | LazyData: true
71 | Roxygen: list(markdown = TRUE)
72 | RoxygenNote: 7.3.2
73 | Config/testthat/edition: 3
74 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Nicholas Tierney
3 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2019 Nicholas Tierney, Di Cook, Tania Prvan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(index_regular,data.frame)
4 | S3method(index_regular,tbl_ts)
5 | S3method(index_summary,data.frame)
6 | S3method(index_summary,tbl_ts)
7 | S3method(key_slope,tbl_ts)
8 | S3method(keys_near,data.frame)
9 | S3method(keys_near,default)
10 | S3method(keys_near,tbl_ts)
11 | S3method(sample_frac_keys,tbl_ts)
12 | S3method(sample_n_keys,tbl_ts)
13 | S3method(stratify_keys,tbl_ts)
14 | export("%>%")
15 | export(add_key_slope)
16 | export(add_key_slope.default)
17 | export(add_n_obs)
18 | export(as_tsibble)
19 | export(b_diff_iqr)
20 | export(b_diff_max)
21 | export(b_diff_mean)
22 | export(b_diff_median)
23 | export(b_diff_min)
24 | export(b_diff_q25)
25 | export(b_diff_q75)
26 | export(b_diff_sd)
27 | export(b_diff_var)
28 | export(b_iqr)
29 | export(b_mad)
30 | export(b_max)
31 | export(b_mean)
32 | export(b_median)
33 | export(b_min)
34 | export(b_q25)
35 | export(b_q75)
36 | export(b_range)
37 | export(b_range_diff)
38 | export(b_sd)
39 | export(b_var)
40 | export(decreasing)
41 | export(facet_sample)
42 | export(facet_strata)
43 | export(feat_brolgar)
44 | export(feat_diff_summary)
45 | export(feat_five_num)
46 | export(feat_monotonic)
47 | export(feat_ranges)
48 | export(feat_spread)
49 | export(feat_three_num)
50 | export(features)
51 | export(features_all)
52 | export(features_at)
53 | export(features_if)
54 | export(increasing)
55 | export(index_regular)
56 | export(index_summary)
57 | export(key_slope)
58 | export(keys_near)
59 | export(l_five_num)
60 | export(l_three_num)
61 | export(monotonic)
62 | export(n_keys)
63 | export(n_obs)
64 | export(near_between)
65 | export(near_middle)
66 | export(near_quantile)
67 | export(nearest_lgl)
68 | export(nearest_qt_lgl)
69 | export(sample_frac_keys)
70 | export(sample_n_keys)
71 | export(stratify_keys)
72 | export(unvarying)
73 | import(ggplot2)
74 | importFrom(fabletools,features)
75 | importFrom(fabletools,features_all)
76 | importFrom(fabletools,features_at)
77 | importFrom(fabletools,features_if)
78 | importFrom(magrittr,"%>%")
79 | importFrom(rlang,":=")
80 | importFrom(rlang,.data)
81 | importFrom(rlang,enquo)
82 | importFrom(rlang,enquos)
83 | importFrom(rlang,ensym)
84 | importFrom(rlang,quo)
85 | importFrom(rlang,quo_name)
86 | importFrom(rlang,quos)
87 | importFrom(rlang,sym)
88 | importFrom(rlang,syms)
89 | importFrom(tsibble,as_tsibble)
90 | importFrom(tsibble,n_keys)
91 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # brolgar (development version)
2 |
3 | # brolgar 1.0.1 "Conservation status: Least Concern" (2024-05-07)
4 |
5 | * This is a patch release that fixed a couple of small issues that got it removed from CRAN
6 |
7 | # brolgar 1.0.0 "On the QLD coat of arms since 1977" (2023-02-06)
8 |
9 | This is a major release to signify that this version is associated with a
10 | publication (woo!) for this paper in the R Journal. However, this release only
11 | represents minor changes, summarised below:
12 |
13 | * small fix for dev dplyr [113](https://github.com/njtierney/brolgar/pull/113)
14 | * minor updates to GH actions and website.
15 |
16 | # brolgar 0.1.2 "Also known as the 'Australian Crane'" (2021-08-06)
17 |
18 | ## Minor Improvements
19 |
20 | * Uses Testthat Third edition, and implements snapshot testing
21 |
22 | # brolgar 0.1.1 "Formerly known as the 'native companion'" (2021-05-27)
23 |
24 | ## Bug fixes
25 |
26 | * Address bug with not creating equal strata
27 | * Address warnings when using b_diff_summary, which now returns NA if there is
28 | only one observation, as we can't take the difference of one observation, and
29 | a difference of 0 in these cases would be misleading.
30 |
31 | # brolgar 0.1.0 "Antigone rubicunda"
32 |
33 | * fix warning bug in `keys_near` related to factors
34 | * Add `feat_diff_summary()` functions to help summarise diff(). Useful for exploring the time gaps in the `index`. (#100)
35 | * sample functions now work with multiple keys (#85, #89) (Thanks to @earowang and @deanmarchiori for their help with this.)
36 | * `facet_sample()` now has a default of 3 per plot
37 | * resolve features(data ,.key, n_obs) error (#71)
38 | * For `near_quantile()`, the `tol` argument now defaults to 0.01.
39 | * provide an S3 generic for `tbl_ts` objects for `keys_near()` - #76
40 | * Add new dataset, `pisa` containing a short summary of the PISA dataset from
41 | https://github.com/ropenscilabs/learningtower for three (of 99) countries
42 | * add helper functions `index_regular()` and `index_summary()` to help identify
43 | index variables
44 |
45 | # brolgar 0.0.4.9000
46 |
47 | * remove `feasts` from dependencies as the functions required in `brolgar` are
48 | actually in `fabletools`.
49 | * add `nearest_lgl` and `nearest_qt_lgl`
50 | * Gave more verbose names to the `wages_ts` data.
51 | * renamed `sample_n_obs()` to `sample_n_keys()` and `sample_frac_keys()`
52 | * renamed `add_k_groups()` to `stratify_keys()`
53 | * removed many of the `l_` functions in favour of the `features` approach.
54 | * rename `l_summarise_fivenum` to `l_summarise`, and have an option to pass a list of functions.
55 | * rename `l_n_obs()` to `n_key_obs()`
56 | * rename `l_slope()` to `key_slope()`
57 | * added `monotonic` summaries and `feat_monotonic`
58 | * rename `l_summarise()` to `keys_near()`
59 | * make monotonic functions return FALSE if length == 1.
60 | * add `monotonic` function, which returns TRUE if increasing or decreasing, and false otherwise.
61 | * re export `as_tsibble()` and `n_keys()` from `tsibble`
62 | * Data `world_heights` gains a continent column
63 | * Implement `facet_strata()` to create a random group of size `n_strata` to put the data into (#32). Add support for `along`, and `fun`.
64 | * Implement `facet_sample()` to create facetted plots with a set number of keys inside each facet. (#32).
65 | * `add_` functions now return a `tsibble()` (#49).
66 | * Fixed bug where `stratify_keys()` didn't assign an equal number of keys per strata (#55)
67 | * Update `wages_ts` dataset to now just be `wages` data, and remove previous `tibble()` version of `wages` (#39).
68 | * Add `top_n` argument to `keys_near` to provide control over the number of observations near a stat that are returned.
69 | * change `world_heights` to `heights`.
70 | * remove function `n_key_obs()` in favour of using `n_obs()` (#62)
71 | * remove function `filter_n_obs()` in favour of cleaner workflow with `add_n_obs()` (#63)
72 |
73 | # brolgar 0.0.1.9000
74 |
75 | * Made brolgar integrate with `tsibble`.
76 |
77 | # brolgar 0.0.0.9990
78 |
79 | * Added the `world_heights` dataset, which contains average male height in
80 | centimetres for many countries. #28
81 | * created `near_` family of functions to find values near to a quantile or percentile. So far there are `near_quantile()`, `near_middle()`, and `near_between()` (#11).
82 | * `near_quantile()` Specify some quantile and then find those values around
83 | it (within some specified tolerance).
84 | * `near_middle()` Specify some middle percentile value and find values
85 | within given percentiles.
86 | * `near_between()` Extract percentile values from a given percentile to
87 | another percentile.
88 | * Create `add_k_groups()` (#20) to randomly split the data into groups to
89 | explore the data.
90 | * Add `sample_n_obs()` and `sample_frac_obs()` (#19) to select a random group
91 | of ids.
92 | * Add `filter_n_obs()` to filter the data by the number of observations #15
93 | * Remove unnecessary use of `var`, in `l_n_obs()`, since it only needs
94 | information on the `id`. Also gets a nice 5x speedup with simpler code
95 | * calculate all longnostics (#4)
96 | * use the word `longnostic` instead of `lognostic` (#9)
97 | * `l_slope` now returns `l_intercept` and `l_slope` instead of `intercept` and
98 | `slope`.
99 | * `l_slope` now takes bare variable names
100 | * Renamed `l_d1` to `l_diff` and added a lag argument. This makes `l_diff` more
101 | flexible and the function more clearly describes its purpose.
102 | * Rename `l_length` to `l_n_obs` to more clearly indicate that this counts the
103 | number of observations.
104 | * Create `longnostic` function to create longnostic functions to package up
105 | reproduced code inside the `l_` functions.
106 | * Added a `NEWS.md` file to track changes to the package.
107 |
--------------------------------------------------------------------------------
/R/b_summaries.R:
--------------------------------------------------------------------------------
#' Brolgar summaries (b_summaries)
#'
#' Customised summaries of vectors with appropriate defaults for longitudinal
#' data. The functions are prefixed with `b_` to assist with autocomplete.
#' It uses `na.rm = TRUE` for all, and for calculations
#' involving quantiles, `type = 8` and `names = FALSE`. Summaries include:
#' * b_min: The minimum
#' * b_max: The maximum
#' * b_median: The median
#' * b_mean: The mean
#' * b_q25: The 25th quantile
#' * b_q75: The 75th quantile
#' * b_range: The range
#' * b_range_diff: difference in range (max - min)
#' * b_sd: The standard deviation
#' * b_var: The variance
#' * b_mad: The median absolute deviation
#' * b_iqr: The Inter-quartile range
#' * b_diff_var: The variance of diff()
#' * b_diff_sd: The standard deviation of diff()
#' * b_diff_mean: The mean of diff()
#' * b_diff_median: The median of diff()
#' * b_diff_q25: The q25 of diff()
#' * b_diff_q75: The q75 of diff()
#'
#' @param x a vector
#' @param ... other arguments to pass to the underlying summary function
#' @rdname b_summaries
#' @examples
#'
#' x <- c(1:5, NA, 5:1)
#' min(x)
#' b_min(x)
#' max(x)
#' b_max(x)
#' median(x)
#' b_median(x)
#' mean(x)
#' b_mean(x)
#' range(x)
#' b_range(x)
#' var(x)
#' b_var(x)
#' sd(x)
#' b_sd(x)
#'
#' @export
b_min <- function(x, ...) {
  # Minimum with missing values removed.
  min(x, na.rm = TRUE, ...)
}
51 |
#' @name b_summaries
#' @export
b_max <- function(x, ...) {
  # Maximum, ignoring missing values.
  max(x, ..., na.rm = TRUE)
}

#' @name b_summaries
#' @export
b_median <- function(x, ...) {
  # Median, ignoring missing values.
  stats::median(x, na.rm = TRUE, ...)
}

#' @name b_summaries
#' @export
b_mean <- function(x, ...) {
  # Mean, ignoring missing values.
  mean(x, ..., na.rm = TRUE)
}
69 |
#' @name b_summaries
#' @export
b_q25 <- function(x, ...) {
  # 25th percentile, quantile type 8, unnamed, missing values removed.
  stats::quantile(x, probs = 0.25, na.rm = TRUE, names = FALSE, type = 8, ...)
}

#' @name b_summaries
#' @export
b_q75 <- function(x, ...) {
  # 75th percentile, quantile type 8, unnamed, missing values removed.
  stats::quantile(x, probs = 0.75, na.rm = TRUE, names = FALSE, type = 8, ...)
}
81 |
#' @name b_summaries
#' @export
b_range <- function(x, ...) {
  # Range (minimum, maximum), ignoring missing values.
  range(x, ..., na.rm = TRUE)
}

#' @name b_summaries
#' @export
b_range_diff <- function(x, ...) {
  # Width of the range: max - min.
  rng <- b_range(x, ...)
  rng[2L] - rng[1L]
}
94 |
#' @name b_summaries
#' @export
b_sd <- function(x, ...) {
  # Standard deviation, with missing values removed.
  stats::sd(x, na.rm = TRUE, ...)
}

#' @name b_summaries
#' @export
b_var <- function(x, ...) {
  # Variance, with missing values removed.
  stats::var(x, na.rm = TRUE, ...)
}

#' @name b_summaries
#' @export
b_mad <- function(x, ...) {
  # Median absolute deviation, with missing values removed.
  stats::mad(x, na.rm = TRUE, ...)
}

#' @name b_summaries
#' @export
b_iqr <- function(x, ...) {
  # Inter-quartile range using quantile type 8, with missing values removed.
  stats::IQR(x, na.rm = TRUE, type = 8, ...)
}
118 |
#' @name b_summaries
#' @export
b_diff_var <- function(x, ...) {
  # Variance of the successive differences of x.
  x <- stats::na.omit(x)
  # With fewer than two observations there is no difference to take, so
  # return NA - a difference of 0 would be misleading. (This also covers
  # the previously unguarded length-0 case.)
  if (length(x) < 2) {
    return(NA)
  }
  # NAs were removed above, so var() needs no na.rm here. Previously
  # `na.rm = TRUE` was passed to diff(), which has no such argument and
  # silently ignored it.
  stats::var(diff(x, ...))
}
128 |
#' @name b_summaries
#' @export
b_diff_sd <- function(x, ...) {
  # Standard deviation of the successive differences of x, NAs dropped.
  vals <- stats::na.omit(x)
  # A single observation has no differences; NA avoids a misleading 0.
  if (length(vals) == 1) NA else b_sd(diff(vals, ...))
}

#' @name b_summaries
#' @export
b_diff_mean <- function(x, ...) {
  # Mean of the successive differences of x, NAs dropped.
  vals <- stats::na.omit(x)
  if (length(vals) == 1) NA else b_mean(diff(vals, ...))
}
148 |
#' @name b_summaries
#' @export
b_diff_median <- function(x, ...) {
  # Median of the successive differences of x, NAs dropped.
  vals <- stats::na.omit(x)
  # A single observation has no differences; NA avoids a misleading 0.
  if (length(vals) == 1) NA else b_median(diff(vals, ...))
}

#' @name b_summaries
#' @export
b_diff_q25 <- function(x, ...) {
  # 25th percentile of the successive differences of x, NAs dropped.
  vals <- stats::na.omit(x)
  if (length(vals) == 1) NA else b_q25(diff(vals, ...))
}
168 |
#' @name b_summaries
#' @export
b_diff_q75 <- function(x, ...) {
  # 75th percentile of the successive differences of x, NAs dropped.
  vals <- stats::na.omit(x)
  # A single observation has no differences; NA avoids a misleading 0.
  if (length(vals) == 1) NA else b_q75(diff(vals, ...))
}

#' @name b_summaries
#' @export
b_diff_max <- function(x, ...) {
  # Maximum of the successive differences of x, NAs dropped.
  vals <- stats::na.omit(x)
  if (length(vals) == 1) NA else b_max(diff(vals, ...))
}
188 |
#' @name b_summaries
#' @export
b_diff_min <- function(x, ...) {
  # Minimum of the successive differences of x, NAs dropped.
  vals <- stats::na.omit(x)
  # Return early if there is only one observation, as we can't take the
  # difference of that - and a difference of 0 would be misleading.
  if (length(vals) == 1) NA else b_min(diff(vals, ...))
}

#' @name b_summaries
#' @export
b_diff_iqr <- function(x, ...) {
  # Inter-quartile range of the successive differences of x, NAs dropped.
  vals <- stats::na.omit(x)
  if (length(vals) == 1) NA else b_iqr(diff(vals, ...))
}
210 |
# Notes on related summaries from the older `l_*` (longnostic) API:
# * `l_n_obs()` Number of observations
# * `l_slope()` Slope and intercept (given some linear model formula)
213 |
--------------------------------------------------------------------------------
/R/brolgar-package.R:
--------------------------------------------------------------------------------
#' @details
#' `brolgar` stands for: **BR**owse over **L**ongitudinal data **G**raphically
#' and **A**nalytically in **R**.
#'
#' @keywords internal
"_PACKAGE"

#' @importFrom rlang .data quo quos enquo enquos quo_name sym ensym syms :=

# Declare variables used via non-standard (tidy) evaluation so that
# R CMD check does not flag them as undefined globals.
# utils::globalVariables() is available from R 2.15.1, hence the guard.
if (getRversion() >= "2.15.1") utils::globalVariables(c("."))
# Qualified with utils:: (previously a bare call) so this does not depend
# on utils being attached when the package code is evaluated.
utils::globalVariables(c(
  "intercept",
  "is",
  "slope",
  "estimate",
  "na.rm",
  "stat",
  "stat_diff",
  "stat_value",
  ".rows",
  "quantile",
  "coef_tbl",
  "n_obs",
  "colorRampPalette"
))
26 |
--------------------------------------------------------------------------------
/R/data-pisa.R:
--------------------------------------------------------------------------------
#' Student data from 2000-2018 PISA OECD data
#'
#' A subset of PISA data, containing scores and other information
#' from the triennial testing of 15 year olds around
#' the globe. Original data available from
#' \url{https://www.oecd.org/pisa/data/}. Data derived from
#' \url{https://github.com/kevinwang09/learningtower}.
#'
#' @format A tibble of the following variables
#' \itemize{
#' \item year the year of measurement
#' \item country the three letter country code. This data contains Australia,
#' New Zealand, and Indonesia. The full data from learningtower contains
#' 99 countries.
#' \item school_id The unique school identification number
#' \item student_id The student identification number
#' \item gender recorded gender - 1 female or 2 male or missing
#' \item math Simulated score in mathematics
#' \item read Simulated score in reading
#' \item science Simulated score in science
#' \item stu_wgt The final survey weight score for the student score
#' }
#'
#' Understanding a bit more about the PISA data, the `school_id` and
#' `student_id` are not unique across time. This means the longitudinal element
#' is the country within a given year.
#'
#' We can cast `pisa` as a `tsibble`, but we need to aggregate the data to each
#' year and country. In doing so, it is important that we provide some summary
#' statistics of each of the scores - we want to include the mean, and minimum
#' and maximum of the math, reading, and science scores, so that we do not lose
#' the information of the individuals.
#'
#' The example code below does this, first grouping by year and country, then
#' calculating the weighted mean for math, reading, and science. This can be
#' done using the student weight variable `stu_wgt`, to get the survey weighted
#' mean. The minimum and maximum are then calculated.
#'
#' @docType data
#' @name pisa
#' @keywords datasets
#' @examples
#' pisa
#'
#' library(dplyr)
#' # Let's identify
#'
#' #1. The **key**, the individual, who would have repeated measurements.
#' #2. The **index**, the time component.
#' #3. The **regularity** of the time interval (index).
#'
#' # Since student_id and school_id are not unique across time, the key
#' # here is the country.
#'
#' # And the index is year, so we would write the following
#'
#' as_tsibble(pisa,
#'            key = country,
#'            index = year)
#'
#' # We can assess the regularity of the year like so:
#'
#' index_regular(pisa, year)
#' index_summary(pisa, year)
#'
#' # We can now convert this into a `tsibble`:
#'
#' pisa_ts <- as_tsibble(pisa,
#'                       key = country,
#'                       index = year,
#'                       regular = TRUE)
#'
#' pisa_ts
#' pisa_ts_au_nz <- pisa_ts %>% filter(country %in% c("AUS", "NZL", "QAT"))
#'
#' library(ggplot2)
#' ggplot(pisa_ts_au_nz,
#'        aes(x = year,
#'            y = math_mean,
#'            group = country,
#'            colour = country)) +
#'   geom_ribbon(aes(ymin = math_min,
#'                   ymax = math_max),
#'               fill = "grey70") +
#'   geom_line(size = 1) +
#'   lims(y = c(0, 1000)) +
#'   labs(y = "math") +
#'   facet_wrap(~country)
"pisa"
90 |
--------------------------------------------------------------------------------
/R/data-wages-ts.R:
--------------------------------------------------------------------------------
#' Wages data from National Longitudinal Survey of Youth (NLSY)
#'
#' This data contains measurements on hourly wages by years in
#' the workforce, with education and race as covariates. The population
#' measured was male high-school dropouts, aged between 14 and 17 years
#' when first measured. `wages` is a time series `tsibble`.
#' It comes from J. D. Singer and J. B. Willett.
#' Applied Longitudinal Data Analysis.
#' Oxford University Press, Oxford, UK, 2003.
#' \url{https://stats.idre.ucla.edu/stat/r/examples/alda/data/wages_pp.txt}
#'
#' @format A `tsibble` data frame with 6402 rows and 9 variables:
#' \describe{
#'   \item{id}{1-888, for each subject. This forms the `key` of the data}
#'   \item{ln_wages}{natural log of wages, adjusted for inflation,
#'     to 1990 dollars.}
#'   \item{xp}{Experience - the length of time in the workforce (in years).
#'     This is treated as the time variable, with t0 for each subject starting
#'     on their first day at work. The number of time points and values of time
#'     points for each subject can differ. This forms the `index` of the data}
#'   \item{ged}{when/if a graduate equivalency diploma is obtained.}
#'   \item{xp_since_ged}{change in experience since getting a ged (if they get one)}
#'   \item{black}{categorical indicator of race = black.}
#'   \item{hispanic}{categorical indicator of race = hispanic.}
#'   \item{high_grade}{highest grade completed}
#'   \item{unemploy_rate}{unemployment rates in the local geographic region
#'     at each measurement time}
#' }
#'
#' @docType data
#' @name wages
#' @keywords datasets
#' @examples
#' # show the data
#' wages
#' library(ggplot2)
#' # set seed so that the plots stay the same
#' set.seed(2019-7-15-1300)
#' # explore a sample of five individuals
#' wages %>%
#'   sample_n_keys(size = 5) %>%
#'   ggplot(aes(x = xp,
#'              y = ln_wages,
#'              group = id)) +
#'   geom_line()
#'
#' # Explore many samples with `facet_sample()`
#' ggplot(wages,
#'        aes(x = xp,
#'            y = ln_wages,
#'            group = id)) +
#'   geom_line() +
#'   facet_sample()
#'
#' # explore the five number summary of ln_wages with `features`
#' wages %>%
#'   features(ln_wages, feat_five_num)
#'
"wages"
60 |
--------------------------------------------------------------------------------
/R/data-world-height.R:
--------------------------------------------------------------------------------
#' World Height Data
#'
#' Average male heights in 144 countries from 1810-1989, with a
#' smaller number of countries from 1500-1800. Data has been filtered to
#' only include countries with more than one observation.
#'
#' `heights` is stored as a time series `tsibble` object. It contains
#' the variables:
#'
#' * country: The Country. This forms the identifying `key`.
#' * year: Year. This forms the time `index`.
#' * height_cm: Average male height in centimeters.
#' * continent: continent extracted from country name using `countrycode`
#'   package (https://joss.theoj.org/papers/10.21105/joss.00848).
#'
#' For more information, see the article: "Why are you tall while others are
#' short? Agricultural production and other proximate determinants of global
#' heights", Joerg Baten and Matthias Blum, European Review of Economic
#' History 18 (2014), 144-165. Data accessed via the Clio Infra website.
#'
#' @keywords datasets
#' @examples
#' # show the data
#' heights
#'
#' # show the spaghetti plot (ugh!)
#' library(ggplot2)
#' ggplot(heights,
#'        aes(x = year,
#'            y = height_cm,
#'            group = country)) +
#'   geom_line()
#'
#' # Explore all samples with `facet_strata()`
#' ggplot(heights,
#'        aes(x = year,
#'            y = height_cm,
#'            group = country)) +
#'   geom_line() +
#'   facet_strata()
#'
#' # Explore the heights over each continent
#' ggplot(heights,
#'        aes(x = year,
#'            y = height_cm,
#'            group = country)) +
#'   geom_line() +
#'   facet_wrap(~continent)
#'
#' # explore the five number summary of height_cm with `features`
#' heights %>%
#'   features(height_cm, feat_five_num)
"heights"
55 |
--------------------------------------------------------------------------------
/R/facet-sample.R:
--------------------------------------------------------------------------------
#' Facet data into groups to facilitate exploration
#'
#' This function requires a `tbl_ts` object, which can be created with
#' `tsibble::as_tsibble()`. Under the hood, `facet_sample` is powered by
#' [stratify_keys()] and [sample_n_keys()].
#'
#' @param n_per_facet Number of keys per facet you want to plot. Default is 3.
#' @param n_facets Number of facets to create. Default is 12
#' @inheritParams ggplot2::facet_wrap
#' @import ggplot2
#'
#' @return a ggproto `FacetSample` object, to be added to a ggplot
#' @export
#'
#' @examples
#' library(ggplot2)
#' ggplot(heights,
#'        aes(x = year,
#'            y = height_cm,
#'            group = country)) +
#'   geom_line() +
#'   facet_sample()
#'
#' ggplot(heights,
#'        aes(x = year,
#'            y = height_cm,
#'            group = country)) +
#'   geom_line() +
#'   facet_sample(n_per_facet = 1,
#'                n_facets = 12)
facet_sample <- function(
  n_per_facet = 3,
  n_facets = 12,
  nrow = NULL,
  ncol = NULL,
  scales = "fixed",
  shrink = TRUE,
  strip.position = "top"
) {
  # Build a regular facet_wrap() spec over the `.strata` column (created at
  # plot-build time by FacetSample$map_data), then carry the sampling
  # settings in its params for FacetSample to use.
  facet <- facet_wrap(
    ~.strata,
    nrow = nrow,
    ncol = ncol,
    scales = scales,
    shrink = shrink,
    strip.position = strip.position
  )

  facet$params$n <- n_facets
  facet$params$n_per_facet <- n_per_facet

  ggproto(NULL, FacetSample, shrink = shrink, params = facet$params)
}
54 |
# A FacetWrap subclass: at plot-build time it samples keys from the data
# and spreads them across a fixed number of panels (see facet_sample()).
FacetSample <- ggproto(
  "FacetSample",
  FacetWrap,
  # Lay out exactly params$n panels in a wrapped grid. The layout is
  # independent of the data: panels map to strata created in map_data().
  compute_layout = function(data, params) {
    id <- seq_len(params$n)
    dims <- wrap_dims(n = params$n, nrow = params$nrow, ncol = params$ncol)

    layout <- data.frame(PANEL = factor(id))

    # Row index: top-to-bottom for as.table, bottom-to-top otherwise.
    if (params$as.table) {
      layout$ROW <- as.integer((id - 1L) %/% dims[2] + 1L)
    } else {
      layout$ROW <- as.integer(dims[1] - (id - 1L) %/% dims[2])
    }

    layout$COL <- as.integer((id - 1L) %% dims[2] + 1L)

    layout <- layout[order(layout$PANEL), , drop = FALSE]

    rownames(layout) <- NULL

    # Add scale identification
    layout$SCALE_X <- if (params$free$x) id else 1L
    layout$SCALE_Y <- if (params$free$y) id else 1L

    cbind(layout, .strata = id)
  },

  # Sample n * n_per_facet keys, assign each sampled key to one of n strata
  # via stratify_keys(), and use the stratum number as the panel.
  map_data = function(data, layout, params) {
    if (is.null(data) || nrow(data) == 0) {
      return(cbind(data, PANEL = integer(0)))
    }

    new_data <- data %>%
      sample_n_keys(size = params$n * params$n_per_facet) %>%
      stratify_keys(n_strata = params$n)

    new_data$PANEL = new_data$.strata

    return(new_data)
  }
)
97 |
--------------------------------------------------------------------------------
/R/facet-strata.R:
--------------------------------------------------------------------------------
#' Facet data into groups to facilitate exploration
#'
#' This function requires a `tbl_ts` object, which can be created with
#' `tsibble::as_tsibble()`. Under the hood, `facet_strata` is powered by
#' [stratify_keys()].
#'
#' @inheritParams stratify_keys
#' @inheritParams ggplot2::facet_wrap
#' @import ggplot2
#'
#' @return a ggproto `FacetStrata` object, to be added to a ggplot
#' @export
#'
#' @examples
#' library(ggplot2)
#' ggplot(heights,
#'        aes(x = year,
#'            y = height_cm,
#'            group = country)) +
#'   geom_line() +
#'   facet_strata()
#'
#'
#' ggplot(heights,
#'        aes(x = year,
#'            y = height_cm,
#'            group = country)) +
#'   geom_line() +
#'   facet_wrap(~continent)
#'
#' ggplot(heights,
#'        aes(x = year,
#'            y = height_cm,
#'            group = country)) +
#'   geom_line() +
#'   facet_strata(along = year)
#'
#' \donttest{
#' library(dplyr)
#' heights %>%
#'   key_slope(height_cm ~ year) %>%
#'   right_join(heights, ., by = "country") %>%
#'   ggplot(aes(x = year,
#'              y = height_cm)) +
#'   geom_line(aes(group = country)) +
#'   geom_smooth(method = "lm") +
#'   facet_strata(along = .slope_year)
#' }

facet_strata <- function(
  n_strata = 12,
  along = NULL,
  fun = mean,
  nrow = NULL,
  ncol = NULL,
  scales = "fixed",
  shrink = TRUE,
  strip.position = "top"
) {
  # Build a regular facet_wrap() spec over the `.strata` column (created at
  # plot-build time by FacetStrata$map_data), then carry the stratification
  # settings in its params for FacetStrata to use.
  facet <- facet_wrap(
    ~.strata,
    nrow = nrow,
    ncol = ncol,
    scales = scales,
    shrink = shrink,
    strip.position = strip.position
  )

  facet$params$n <- n_strata
  # `along` is captured as a quosure so it can be evaluated lazily, with
  # tidy evaluation, inside map_data().
  facet$params$along <- rlang::enquo(along)
  facet$params$fun <- fun

  ggproto(NULL, FacetStrata, shrink = shrink, params = facet$params)
}
75 |
# A FacetWrap subclass: at plot-build time it stratifies the keys of the
# data into a fixed number of groups (see facet_strata()).
FacetStrata <- ggproto(
  "FacetStrata",
  FacetWrap,
  # Lay out exactly params$n panels in a wrapped grid. The layout is
  # independent of the data: panels map to strata created in map_data().
  compute_layout = function(data, params) {
    id <- seq_len(params$n)
    dims <- wrap_dims(n = params$n, nrow = params$nrow, ncol = params$ncol)

    layout <- data.frame(PANEL = factor(id))

    # Row index: top-to-bottom for as.table, bottom-to-top otherwise.
    if (params$as.table) {
      layout$ROW <- as.integer((id - 1L) %/% dims[2] + 1L)
    } else {
      layout$ROW <- as.integer(dims[1] - (id - 1L) %/% dims[2])
    }

    layout$COL <- as.integer((id - 1L) %% dims[2] + 1L)

    layout <- layout[order(layout$PANEL), , drop = FALSE]

    rownames(layout) <- NULL

    # Add scale identification
    layout$SCALE_X <- if (params$free$x) id else 1L
    layout$SCALE_Y <- if (params$free$y) id else 1L

    cbind(layout, .strata = id)
  },

  # Assign every key to one of n strata (optionally ordered along a
  # variable), and use the stratum number as the panel.
  map_data = function(data, layout, params) {
    if (is.null(data) || nrow(data) == 0) {
      return(cbind(data, PANEL = integer(0)))
    }

    new_data <- brolgar::stratify_keys(
      .data = data,
      n_strata = params$n,
      along = !!params$along,
      fun = params$fun
    )

    new_data$PANEL = new_data$.strata

    return(new_data)
  }
)
121 |
--------------------------------------------------------------------------------
/R/features.R:
--------------------------------------------------------------------------------
#' Calculate features of a `tsibble` object in conjunction with [features()]
#'
#' You can calculate a series of summary statistics (features) of a given
#' variable for a dataset. For example, a three number summary, the minimum,
#' median, and maximum, can be calculated for a given variable. This is
#' designed to work with the [features()] function shown in the examples.
#' Other available features in `brolgar` include:
#'
#' - [feat_three_num()] - minimum, median, maximum
#' - [feat_five_num()] - minimum, q25, median, q75, maximum.
#' - [feat_ranges()] - min, max, range difference, interquartile range.
#' - [feat_spread()] - variance, standard deviation, median absolute deviation,
#' and interquartile range
#' - [feat_monotonic()] - is it always increasing, decreasing, or unvarying?
#' - [feat_diff_summary()] - the summary statistics of the differences
#' amongst a value, including the five number summary, as well as the
#' standard deviation and variance. Returns NA if there is only one
#' observation, as we can't take the difference of one observation, and a
#' difference of 0 in these cases would be misleading.
#'
#' - [feat_brolgar()] all features in brolgar.
#'
#' @param x A vector to extract features from.
#' @param ... Further arguments passed to other functions.
#' @name brolgar-features
#' @examples
#'
#' # You can use any of the features `feat_*` in conjunction with `features`
#' # like so:
#' heights %>%
#'   features(height_cm, # variable you want to explore
#'            feat_three_num) # the feature summarisation you want to perform

#' @rdname brolgar-features
#' @export
feat_three_num <- function(x, ...) {
  # Three number summary: minimum, median, maximum (all with na.rm = TRUE).
  c(min = b_min(x, ...), med = b_median(x, ...), max = b_max(x, ...))
}
39 |
#' @rdname brolgar-features
#' @export
feat_five_num <- function(x, ...) {
  # Five number summary: min, lower quartile, median, upper quartile, max,
  # computed with the b_* summaries (na.rm = TRUE, quantile type 8).
  values <- c(
    b_min(x, ...),
    b_q25(x, ...),
    b_median(x, ...),
    b_q75(x, ...),
    b_max(x, ...)
  )
  stats::setNames(values, c("min", "q25", "med", "q75", "max"))
}
51 |
#' @rdname brolgar-features
#' @export
feat_ranges <- function(x, ...) {
  # Range-based summaries: min, max, range width, inter-quartile range.
  values <- c(
    b_min(x, ...),
    b_max(x, ...),
    b_range_diff(x, ...),
    b_iqr(x, ...)
  )
  stats::setNames(values, c("min", "max", "range_diff", "iqr"))
}
62 |
#' @rdname brolgar-features
#' @export
feat_spread <- function(x, ...) {
  # Spread summaries: variance, standard deviation, median absolute
  # deviation, and inter-quartile range.
  values <- c(
    b_var(x, ...),
    b_sd(x, ...),
    b_mad(x, ...),
    b_iqr(x, ...)
  )
  stats::setNames(values, c("var", "sd", "mad", "iqr"))
}
73 |
#' @rdname brolgar-features
#' @export
feat_monotonic <- function(x, ...) {
  # Monotonicity flags: always increasing, always decreasing, unvarying,
  # or monotonic (any of the above).
  values <- c(
    increasing(x, ...),
    decreasing(x, ...),
    unvarying(x, ...),
    monotonic(x, ...)
  )
  stats::setNames(values, c("increase", "decrease", "unvary", "monotonic"))
}
84 |
#' @rdname brolgar-features
#' @export
feat_brolgar <- function(x, ...) {
  # All brolgar features combined into one named vector.
  # NOTE(review): unlike feat_monotonic(), the combined `monotonic` flag is
  # not included here despite "all features" - confirm whether intentional.
  c(
    min = b_min(x, ...),
    max = b_max(x, ...),
    median = b_median(x, ...),
    mean = b_mean(x, ...),
    q25 = b_q25(x, ...),
    q75 = b_q75(x, ...),
    # b_range() returns length 2; c() names the elements range1, range2.
    range = b_range(x, ...),
    range_diff = b_range_diff(x, ...),
    sd = b_sd(x, ...),
    var = b_var(x, ...),
    mad = b_mad(x, ...),
    iqr = b_iqr(x, ...),
    increase = increasing(x, ...),
    decrease = decreasing(x, ...),
    unvary = unvarying(x, ...),
    # feat_diff_summary() returns a named vector; c() splices it in,
    # keeping its diff_* names.
    feat_diff_summary(x, ...)
  )
}
107 |
#' @rdname brolgar-features
#' @export
feat_diff_summary <- function(x, ...) {
  # Summaries of the successive differences of x: five number summary plus
  # variance, standard deviation, and IQR. Each b_diff_* returns NA when
  # there is only one observation.
  values <- c(
    b_diff_min(x, ...),
    b_diff_q25(x, ...),
    b_diff_median(x, ...),
    b_diff_mean(x, ...),
    b_diff_q75(x, ...),
    b_diff_max(x, ...),
    b_diff_var(x, ...),
    b_diff_sd(x, ...),
    b_diff_iqr(x, ...)
  )
  stats::setNames(
    values,
    c(
      "diff_min", "diff_q25", "diff_median", "diff_mean", "diff_q75",
      "diff_max", "diff_var", "diff_sd", "diff_iqr"
    )
  )
}
123 |
--------------------------------------------------------------------------------
/R/index-helpers.R:
--------------------------------------------------------------------------------
#' Index summaries
#'
#' These functions check if the index is regular (`index_regular()`), and
#' summarise the index variable (`index_summary()`). This can be useful
#' to check your index variables.
#'
#' @param .data data.frame or tsibble
#' @param index the proposed index variable
#' @param ... extra arguments
#' @name index_summary
#'
#' @return For `index_regular()`, a logical: TRUE means the index is regular,
#'   FALSE means it is not. For `index_summary()`, a `summary()` of the
#'   distinct index values.
#'
#' @examples
#' # a tsibble
#' index_regular(heights)
#'
#' # some data frames
#' index_regular(pisa, year)
#' index_regular(airquality, Month)
#'
#' # a tsibble
#' index_summary(heights)
#' # some data frames
#' index_summary(pisa, year)
#' index_summary(airquality, Month)
#' index_summary(airquality, Day)
#' @export
index_regular <- function(.data, ...) {
  # S3 generic; methods exist for tbl_ts and data.frame.
  UseMethod("index_regular")
}
32 |
33 |
#' @rdname index_summary
#' @export
index_regular.tbl_ts <- function(.data, ...) {
  # The index is regular when the gaps between consecutive distinct,
  # sorted index values are all identical.
  index_values <- dplyr::pull(.data, tsibble::index(.data))
  unvarying(diff(sort(unique(index_values))))
}
44 |
#' @rdname index_summary
#' @export
index_regular.data.frame <- function(.data, index, ...) {
  # The proposed index is regular when the gaps between consecutive
  # distinct, sorted values are all identical.
  index_values <- dplyr::pull(.data, {{ index }})
  unvarying(diff(sort(unique(index_values))))
}
55 |
#' @rdname index_summary
#' @export
index_summary <- function(.data, ...) {
  # S3 generic; methods exist for tbl_ts and data.frame.
  UseMethod("index_summary")
}
61 |
62 |
#' @rdname index_summary
#' @export
index_summary.tbl_ts <- function(.data, ...) {
  # Summarise the distinct values of the tsibble's index variable.
  index_values <- dplyr::pull(.data, tsibble::index(.data))
  summary(unique(index_values))
}
71 |
#' @rdname index_summary
#' @export
index_summary.data.frame <- function(.data, index, ...) {
  # Summarise the distinct values of the proposed index variable.
  index_values <- dplyr::pull(.data, {{ index }})
  summary(unique(index_values))
}
80 |
--------------------------------------------------------------------------------
/R/key_slope.R:
--------------------------------------------------------------------------------
#' Fit linear model for each key
#'
#' Using `key_slope` you can fit a linear model to each key in the `tsibble`.
#' `add_key_slope` adds this slope information back to the data, and returns
#' the full dimension `tsibble`.
#'
#' @param .data tsibble
#' @param formula formula of the form `response ~ time`, passed to `lm()`
#' @param ... extra arguments
#'
#' @return tibble with coefficient information: one row per key, with an
#'   `.intercept` column and one `.slope_<var>` column per right-hand-side
#'   variable of the formula.
#'
#' @name key_slope
#'
#' @examples
#' key_slope(heights, height_cm ~ year)
#'
#' @export
key_slope <- function(.data, formula, ...) {
  # Validate the formula up front (package-internal checks defined
  # elsewhere) before dispatching on the class of .data.
  test_if_tilde(formula)
  test_if_formula(formula)
  UseMethod("key_slope")
}
24 |
#' @export
key_slope.tbl_ts <- function(.data, formula, ...) {
  # Capture the formula as a quosure; it is re-created from its label inside
  # summarise() below, so lm() can see the masked columns of .data.
  quo_formula <- rlang::enquo(formula)
  # Variables on the right-hand side of the formula, used to name the
  # slope columns as .slope_<var>.
  f_rhs_vars <- all.vars(
    rlang::f_rhs(
      stats::as.formula(
        rlang::as_label(quo_formula)
      )
    )
  )
  # Output column names: key variables, then .intercept, then one
  # .slope_<var> per RHS variable.
  coef_tbl_vars <- c(
    tsibble::key_vars(.data),
    ".intercept",
    paste0(".slope_", f_rhs_vars)
  )
  # Fit one lm() per key, store its coefficients as a one-row data frame,
  # then unnest to key + .intercept + .slope_* columns.
  .data %>%
    tibble::as_tibble() %>%
    dplyr::group_by(!!!tsibble::key(.data)) %>%
    dplyr::summarise(
      coef_tbl = list(
        as.data.frame(
          t(stats::coef(stats::lm(
            stats::as.formula(
              rlang::as_label(quo_formula)
            )
          )))
        )
      )
    ) %>%
    tidyr::unnest(cols = c(coef_tbl)) %>%
    rlang::set_names(coef_tbl_vars)
}
57 |
#' @rdname key_slope
#' @export
add_key_slope <- function(.data, formula) {
  # NOTE(review): an add_key_slope.default method exists in this file, but
  # this is a plain function (no UseMethod()), so S3 dispatch never happens
  # - confirm whether it was meant to be a generic.
  test_if_null(.data)
  test_if_null(formula)
  test_if_tsibble(.data)

  # Key variable names as strings, used as the join keys below.
  str_key <- purrr::map_chr(tsibble::key(.data), rlang::as_label)

  # Compute per-key coefficients, join them back onto the full data, and
  # order columns as key, index, the new `.`-prefixed columns, then the rest.
  key_slope(.data = .data, formula = {{ formula }}) %>%
    dplyr::left_join(.data, ., by = str_key) %>%
    dplyr::select(
      !!!tsibble::key(.data),
      !!tsibble::index(.data),
      dplyr::starts_with("."),
      dplyr::everything()
    )
}
76 |
#' @rdname key_slope
#' @export
add_key_slope.default <- function(.data, formula) {
  # NOTE(review): add_key_slope() is not an S3 generic, so this default
  # method is currently unreachable via dispatch - confirm intent.
  stop(
    "Currently there is no method to deal with .data, which is of class ",
    class(.data),
    "."
  )
}
86 |
--------------------------------------------------------------------------------
/R/keys_near.R:
--------------------------------------------------------------------------------
#' Return keys nearest to a given statistics or summary.
#'
#' @param .data tsibble
#' @param ... extra arguments to pass to `mutate_at` when performing the summary
#'   as given by `funs`.
#'
#' @return data.frame containing keys closest to a given statistic.
#'
#' @examples
#' keys_near(heights, height_cm)
#'
#' @export
keys_near <- function(.data, ...) {
  # S3 generic; methods exist for tbl_ts and data.frame, plus a default
  # that errors informatively.
  UseMethod("keys_near")
}
16 |
#' @title Return keys nearest to a given statistics or summary.
#' @inheritParams keys_near
#' @param var variable to summarise
#' @param top_n top number of closest observations to return - default is 1, which will also return ties.
#' @param funs named list of functions to summarise by. Default is a given
#' list of the five number summary, `l_five_num`.
#' @param stat_as_factor coerce `stat` variable into a factor? Default is TRUE.
#' @export
#' @examples
#'
#' # Return observations closest to the five number summary of height_cm
#' heights %>%
#'   keys_near(var = height_cm)
#'

keys_near.tbl_ts <- function(
  .data,
  var,
  top_n = 1,
  funs = l_five_num,
  stat_as_factor = TRUE,
  ...
) {
  key <- tsibble::key_vars(.data)

  # Compute every summary in `funs` of `var`, reshape to long form (one row
  # per key x statistic), and keep the top_n rows whose value of `var` is
  # closest to each statistic (ties are kept by top_n()).
  data_keys_near <- .data %>%
    tibble::as_tibble() %>%
    dplyr::mutate_at(
      .vars = dplyr::vars({{ var }}),
      .funs = funs,
      ...
    ) %>%
    dplyr::select(dplyr::all_of(key), {{ var }}, dplyr::any_of(names(funs))) %>%
    tidyr::pivot_longer(
      cols = -c(dplyr::all_of(key), {{ var }}),
      names_to = "stat",
      values_to = "stat_value"
    ) %>%
    dplyr::mutate(stat_diff = abs({{ var }} - stat_value)) %>%
    dplyr::group_by(stat) %>%
    dplyr::top_n(-top_n, wt = stat_diff) %>%
    dplyr::ungroup()

  # Optionally order `stat` as a factor by the names of `funs`. Any value of
  # `stat_as_factor` other than TRUE leaves `stat` as character; the data is
  # always returned (the previous if/else-if chain returned invisible NULL
  # for non-logical values and errored on NA).
  if (isTRUE(stat_as_factor)) {
    data_keys_near <- dplyr::mutate(
      data_keys_near,
      stat = factor(x = stat, levels = names(funs))
    )
  }
  data_keys_near
}
68 |
69 | #' @title Return keys nearest to a given statistics or summary.
70 | #' @param .data data.frame
71 | #' @param key key, which identifies unique observations.
72 | #' @param var variable to summarise
73 | #' @param top_n top number of closest observations to return - default is 1, which will also return ties.
74 | #' @param funs named list of functions to summarise by. Default is a given
75 | #' list of the five number summary, `l_five_num`.
76 | #' @param ... extra arguments to pass to `mutate_at` when performing the summary
77 | #' as given by `funs`.
78 | #' @examples
79 | #' heights %>%
80 | #' key_slope(height_cm ~ year) %>%
81 | #' keys_near(key = country,
82 | #' var = .slope_year)
83 | #' # Specify your own list of summaries
84 | #' l_ranges <- list(min = b_min,
85 | #' range_diff = b_range_diff,
86 | #' max = b_max,
87 | #' iqr = b_iqr)
88 | #'
89 | #' heights %>%
90 | #' key_slope(formula = height_cm ~ year) %>%
91 | #' keys_near(key = country,
92 | #' var = .slope_year,
93 | #' funs = l_ranges)
94 | #' @export
keys_near.data.frame <- function(
  .data,
  key,
  var,
  top_n = 1,
  funs = l_five_num,
  ...
) {
  # Step 1: compute each summary in `funs` of `var`, keeping only the key,
  # the measured variable, and the new summary columns.
  with_summaries <- .data %>%
    tibble::as_tibble() %>%
    dplyr::mutate_at(
      .vars = dplyr::vars({{ var }}),
      .funs = funs,
      ...
    ) %>%
    dplyr::select({{ key }}, {{ var }}, dplyr::all_of(names(funs)))

  # Step 2: reshape to one row per (observation, statistic) pair.
  long_summaries <- tidyr::pivot_longer(
    with_summaries,
    cols = -c({{ key }}, {{ var }}),
    names_to = "stat",
    values_to = "stat_value"
  )

  # Step 3: for each statistic, keep the `top_n` observations closest to it
  # (ties kept), with `stat` ordered as the functions appear in `funs`.
  long_summaries %>%
    dplyr::mutate(stat_diff = abs({{ var }} - stat_value)) %>%
    dplyr::group_by(stat) %>%
    dplyr::top_n(-top_n, wt = stat_diff) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(stat = factor(x = stat, levels = names(funs)))
}
122 |
123 | #' @rdname keys_near
124 | #' @param ... extra arguments to pass to `mutate_at` when performing the summary
125 | #' as given by `funs`.
126 | #' @export
keys_near.default <- function(.data, ...) {
  # Fallback for unsupported classes. Previously `.data` itself was the first
  # argument to stop(), which pasted the object's *contents* into the error
  # message; report only its class instead.
  stop(
    "Input must be a data.frame or tsibble, but is of class: ",
    paste(class(.data), collapse = "/"),
    call. = FALSE
  )
}
130 |
131 | #' A named list of the five number summary
132 | #'
133 | #' Designed for use with the [keys_near()] function.
134 | #' @name l_funs
135 | #' @examples
136 | #' # Specify your own list of summaries
137 | #' l_ranges <- list(min = b_min,
138 | #' range_diff = b_range_diff,
139 | #' max = b_max,
140 | #' iqr = b_iqr)
141 | #'
142 | #' heights %>%
143 | #' key_slope(formula = height_cm ~ year) %>%
144 | #' keys_near(key = country,
145 | #' var = .slope_year,
#'              funs = l_ranges)
#' @export
l_five_num <- list(
  # NOTE: element order is load-bearing — keys_near() uses names(funs) as the
  # factor levels of `stat`, so results print/plot in five-number order.
  min = b_min,
  q_25 = b_q25,
  med = b_median,
  q_75 = b_q75,
  max = b_max
)
156 |
157 | #' @rdname l_funs
158 | #' @export
l_three_num <- list(
  # Three-number summary (min, median, max); order sets the factor levels
  # used by keys_near().
  min = b_min,
  med = b_median,
  max = b_max
)
164 |
--------------------------------------------------------------------------------
/R/monotonics.R:
--------------------------------------------------------------------------------
1 | #' Are values monotonic? Always increasing, decreasing, or unvarying?
2 | #'
3 | #' These provides three families of functions to tell you if values are always
4 | #' increasing, decreasing, or unvarying, with the functions, `increasing()`,
5 | #' `decreasing()`, or `unvarying()`. Under the hood it uses `diff` to find
6 | #' differences, so if you like you can pass extra arguments to `diff`.
7 | #'
8 | #' @param x numeric or integer
9 | #' @param ... extra arguments to pass to diff
10 | #'
11 | #' @return logical TRUE or FALSE
12 | #' @name monotonic
13 | #' @export
14 | #'
15 | #' @examples
16 | #' vec_inc <- c(1:10)
#' vec_dec <- c(10:1)
18 | #' vec_ran <- c(sample(1:10))
19 | #' vec_flat <- rep.int(1,10)
20 | #'
21 | #' increasing(vec_inc)
22 | #' increasing(vec_dec)
23 | #' increasing(vec_ran)
24 | #' increasing(vec_flat)
25 | #'
26 | #' decreasing(vec_inc)
27 | #' decreasing(vec_dec)
28 | #' decreasing(vec_ran)
29 | #' decreasing(vec_flat)
30 | #'
31 | #' unvarying(vec_inc)
32 | #' unvarying(vec_dec)
33 | #' unvarying(vec_ran)
34 | #' unvarying(vec_flat)
35 | #'
36 | #' library(ggplot2)
37 | #' library(gghighlight)
38 | #' library(dplyr)
39 | #'
40 | #' heights_mono <- heights %>%
41 | #' features(height_cm, feat_monotonic) %>%
42 | #' left_join(heights, by = "country")
43 | #'
44 | #' ggplot(heights_mono,
45 | #' aes(x = year,
46 | #' y = height_cm,
47 | #' group = country)) +
48 | #' geom_line() +
49 | #' gghighlight(increase)
50 | #'
51 | #' ggplot(heights_mono,
52 | #' aes(x = year,
53 | #' y = height_cm,
54 | #' group = country)) +
55 | #' geom_line() +
56 | #' gghighlight(decrease)
57 | #'
58 | #' heights_mono %>%
59 | #' filter(monotonic) %>%
60 | #' ggplot(aes(x = year,
61 | #' y = height_cm,
62 | #' group = country)) +
63 | #' geom_line()
64 | #'
65 | #' heights_mono %>%
66 | #' filter(increase) %>%
67 | #' ggplot(aes(x = year,
68 | #' y = height_cm,
69 | #' group = country)) +
70 | #' geom_line()
71 | #'
increasing <- function(x, ...) {
  # Fewer than two values cannot establish a direction. This also covers
  # length zero, which previously returned TRUE vacuously via all() over an
  # empty diff() — inconsistent with the explicit FALSE for length one.
  if (length(x) < 2) {
    return(FALSE)
  }

  # Strictly increasing: every successive difference is positive.
  all(diff(x, ...) > 0)
}
79 |
80 | #' @rdname monotonic
81 | #' @export
decreasing <- function(x, ...) {
  # Fewer than two values cannot establish a direction (length zero
  # previously returned TRUE vacuously via all() over an empty diff()).
  if (length(x) < 2) {
    return(FALSE)
  }

  # Strictly decreasing: every successive difference is negative.
  all(diff(x, ...) < 0)
}
89 |
90 | #' @rdname monotonic
91 | #' @export
unvarying <- function(x, ...) {
  # Fewer than two values cannot establish constancy (length zero previously
  # returned TRUE vacuously via all() over an empty diff()).
  if (length(x) < 2) {
    return(FALSE)
  }

  # Unvarying: every successive difference is exactly zero.
  all(diff(x, ...) == 0)
}
99 |
100 | #' @rdname monotonic
101 | #' @export
monotonic <- function(x, ...) {
  # Fewer than two values cannot establish a direction (length zero
  # previously came out TRUE via the vacuous all() inside increasing()).
  if (length(x) < 2) {
    return(FALSE)
  }

  # Compute the differences once, rather than twice via increasing() and
  # decreasing(). Monotonic here means strictly increasing OR strictly
  # decreasing; a flat series is not monotonic by this definition.
  changes <- diff(x, ...)
  all(changes > 0) || all(changes < 0)
}
109 |
--------------------------------------------------------------------------------
/R/n-obs.R:
--------------------------------------------------------------------------------
1 | #' Return the number of observations
2 | #'
3 | #' Returns the number of observations of a vector or data.frame. It uses
4 | #' `vctrs::vec_size()` under the hood.
5 | #'
6 | #' @param x vector or data.frame
7 | #' @param names logical; If TRUE the result is a named vector named "n_obs", else
8 | #' it is just the number of observations.
9 | #'
10 | #' @note You cannot use `n_obs` with `features` counting the key variable like
11 | #' so - `features(heights, country, n_obs)`. Instead, use any other variable.
12 | #'
13 | #' @return number of observations
14 | #' @export
15 | #'
16 | #' @examples
17 | #' n_obs(iris)
18 | #' n_obs(1:10)
19 | #' add_n_obs(heights)
20 | #' heights %>%
21 | #' features(height_cm, n_obs) # can be any variable except id, the key.
n_obs <- function(x, names = TRUE) {
  # Guard: `names` must be a logical flag.
  vctrs::vec_assert(names, logical())

  # vec_size() counts rows for data frames and elements for vectors.
  size <- vctrs::vec_size(x)

  if (names) {
    # Return a named scalar, so features() columns pick up the name "n_obs".
    size <- c(n_obs = size)
  }

  size
}
37 |
38 | #' Add the number of observations for each key in a `tsibble`
39 | #'
40 | #' Here, we are not counting the number of rows in the dataset, but rather
#' we are counting the number of observations for each key in the data.
42 | #'
43 | #' @param .data tsibble
44 | #' @param ... extra arguments
45 | #'
46 | #' @return tsibble with `n_obs`, the number of observations per key added.
47 | #' @export
48 | #'
49 | #' @examples
50 | #' library(dplyr)
51 | #' # you can explore the data to see those cases that have exactly two
52 | #' # observations:
53 | #' heights %>%
54 | #' add_n_obs() %>%
55 | #' filter(n_obs == 2)
add_n_obs <- function(.data, ...) {
  # NOTE(review): `...` is accepted but not used in this body — confirm it is
  # intentional (e.g. reserved for future extension).
  test_if_null(.data)
  test_if_tsibble(.data)

  # Count rows within each key, then reorder columns so the key, the index,
  # and the new n_obs column come first.
  .data %>%
    tsibble::group_by_key() %>%
    dplyr::mutate(n_obs = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::select(
      !!!tsibble::key(.data),
      !!tsibble::index(.data),
      n_obs,
      dplyr::everything()
    )
}
71 |
--------------------------------------------------------------------------------
/R/nearests.R:
--------------------------------------------------------------------------------
1 | #' Return the middle x percent of values
2 | #'
3 | #' @param x numeric vector
4 | #' @param middle percentage you want to center around
5 | #' @param within percentage around center
6 | #' @return logical vector
7 | #' @export
8 | #'
9 | #' @examples
10 | #' x <- runif(20)
11 | #' near_middle(x = x,
12 | #' middle = 0.5,
13 | #' within = 0.2)
14 | #'
15 | #' library(dplyr)
16 | #' heights %>% features(height_cm, list(min = min)) %>%
17 | #' filter(near_middle(min, 0.5, 0.1))
18 | #'
near_middle <- function(x, middle, within) {
  # `within` is a total window width; split it evenly either side of
  # `middle` on the percentile-rank scale.
  half_window <- within / 2
  ranks <- dplyr::percent_rank(x)

  dplyr::between(
    ranks,
    left = middle - half_window,
    right = middle + half_window
  )
}
28 |
29 | #' Return x percent to y percent of values
30 | #'
31 | #' @param x numeric vector
32 | #' @param from the lower bound of percentage
33 | #' @param to the upper bound of percentage
34 | #'
35 | #' @return logical vector
36 | #' @export
37 | #'
38 | #' @examples
39 | #' x <- runif(20)
40 | #'
41 | #' near_middle(x = x,
42 | #' middle = 0.5,
43 | #' within = 0.2)
44 | #'
45 | #' library(dplyr)
46 | #' heights %>% features(height_cm, list(min = min)) %>%
47 | #' filter(near_between(min, 0.1, 0.9))
48 | #'
49 | #' near_quantile(x = x,
50 | #' probs = 0.5,
51 | #' tol = 0.01)
52 | #'
53 | #' near_quantile(x, c(0.25, 0.5, 0.75), 0.05)
54 | #'
55 | #' heights %>%
56 | #' features(height_cm, l_five_num) %>%
57 | #' mutate_at(vars(min:max),
58 | #' .funs = near_quantile,
59 | #' 0.5,
60 | #' 0.01) %>%
61 | #' filter(min)
62 | #'
63 | #' heights %>%
64 | #' features(height_cm, list(min = min)) %>%
65 | #' mutate(min_near_q3 = near_quantile(min, c(0.25, 0.5, 0.75), 0.01)) %>%
66 | #' filter(min_near_q3)
67 | #'
68 | #' heights %>%
69 | #' features(height_cm, list(min = min)) %>%
70 | #' filter(near_between(min, 0.1, 0.9))
71 | #'
72 | #' heights %>%
73 | #' features(height_cm, list(min = min)) %>%
74 | #' filter(near_middle(min, 0.5, 0.1))
near_between <- function(x, from, to) {
  # TRUE for values whose percentile rank falls in [from, to].
  ranks <- dplyr::percent_rank(x)
  dplyr::between(ranks, left = from, right = to)
}
78 |
79 | #' Which values are nearest to any given quantiles
80 | #'
81 | #' @param x vector
82 | #' @param probs quantiles to calculate
83 | #' @param tol tolerance in terms of x that you will accept near to the
84 | #' quantile. Default is 0.01.
85 | #'
86 | #' @return logical vector of TRUE/FALSE if number is close to a quantile
87 | #' @examples
88 | #' x <- runif(20)
89 | #' near_quantile(x, 0.5, 0.05)
90 | #' near_quantile(x, c(0.25, 0.5, 0.75), 0.05)
91 | #'
92 | #' library(dplyr)
93 | #' heights %>%
94 | #' features(height_cm, list(min = min)) %>%
95 | #' mutate(min_near_median = near_quantile(min, 0.5, 0.01)) %>%
96 | #' filter(min_near_median)
97 | #' heights %>%
98 | #' features(height_cm, list(min = min)) %>%
99 | #' mutate(min_near_q3 = near_quantile(min, c(0.25, 0.5, 0.75), 0.01)) %>%
100 | #' filter(min_near_q3)
101 | #' @export
102 |
near_quantile <- function(x, probs, tol = 0.01) {
  x <- as.numeric(x)

  # Named quantile values, one per requested probability.
  quant <- qtl(x, probs = probs)

  # A value is "near" when it lies within +/- tol of ANY requested quantile.
  # The previous implementation tested rowSums() == 1, which wrongly returned
  # FALSE for a value falling inside TWO overlapping tolerance windows.
  near_each_quantile <- purrr::map(
    quant,
    function(q) dplyr::between(x, left = q - tol, right = q + tol)
  )

  # OR the per-quantile masks together; `.init` makes empty `probs` return
  # all-FALSE instead of erroring.
  purrr::reduce(
    near_each_quantile,
    `|`,
    .init = rep(FALSE, length(x))
  )
}
116 |
117 |
118 | #' Is x nearest to y?
119 | #'
120 | #' @description Returns TRUE if x is nearest to y.
121 | #' There are two implementations. `nearest_lgl()` returns a logical vector
122 | #' when an element of the first argument is nearest to an element of the
123 | #' second argument. `nearest_qt_lgl()` is similar to `nearest_lgl()`, but
124 | #' instead determines if an element of the first argument is nearest to
125 | #' some value of the given quantile probabilities. See example for more
126 | #' detail.
127 | #'
128 | #' @param x a numeric vector
129 | #' @param y a numeric vector
130 | #' @param ... (if used) arguments to pass to `quantile()`.
131 | #'
132 | #' @return logical vector of `length(y)`
133 | #' @name nearests
134 | #' @export
135 | #'
136 | #' @examples
137 | #'
138 | #' x <- 1:10
139 | #' y <- 5:14
140 | #' z <- 16:25
141 | #' a <- -1:-5
142 | #' b <- -1
143 | #'
144 | #' nearest_lgl(x, y)
145 | #' nearest_lgl(y, x)
146 | #'
147 | #' nearest_lgl(x, z)
148 | #' nearest_lgl(z, x)
149 | #'
150 | #' nearest_lgl(x, a)
151 | #' nearest_lgl(a, x)
152 | #'
153 | #' nearest_lgl(x, b)
154 | #' nearest_lgl(b, x)
155 | #'
156 | #' library(dplyr)
157 | #' heights_near_min <- heights %>%
158 | #' filter(nearest_lgl(min(height_cm), height_cm))
159 | #'
160 | #' heights_near_fivenum <- heights %>%
161 | #' filter(nearest_lgl(fivenum(height_cm), height_cm))
162 | #'
163 | #' heights_near_qt_1 <- heights %>%
164 | #' filter(nearest_qt_lgl(height_cm, c(0.5)))
165 | #'
166 | #' heights_near_qt_3 <- heights %>%
167 | #' filter(nearest_qt_lgl(height_cm, c(0.1, 0.5, 0.9)))
168 | #'
nearest_lgl <- function(x, y) {
  # Strict casts: error (rather than silently coerce) on non-numeric input.
  x <- vctrs::vec_cast(x, to = double())
  y <- vctrs::vec_cast(y, to = double())

  # For each element of x, the position in y closest to it. which.min()
  # already returns an integer index; base vapply() with integer(1) keeps it
  # integer, avoiding purrr::map_dbl()'s needless coercion to double before
  # subscripting.
  closest_idx <- vapply(
    x,
    function(value) which.min(abs(y - value)),
    integer(1)
  )

  out <- logical(length(y))
  out[closest_idx] <- TRUE
  out
}
176 |
177 | #' @export
178 | #' @rdname nearests
nearest_qt_lgl <- function(y, ...) {
  # Quantile targets; `...` is forwarded to quantile() (e.g. `probs =`).
  targets <- stats::quantile(y, ...)

  # Position in y nearest each target. which.min() already returns an integer
  # index; base vapply() with integer(1) keeps it integer, avoiding
  # purrr::map_dbl()'s needless coercion to double before subscripting.
  nearest_idx <- vapply(
    targets,
    function(value) which.min(abs(y - value)),
    integer(1)
  )

  out <- logical(length(y))
  out[nearest_idx] <- TRUE
  out
}
185 |
186 | # it is not clear to me how this is significantly different to nearest_qt_lgl
187 | # nearest_qt <- function(y, ...){
188 | # x <- stats::quantile(y, ...)
189 | # purrr::map(x, function(x) which.min(abs(y - x)))
190 | # out <- logical(length(y))
191 | # out[purrr::map_dbl(x, function(x) which.min(abs(y - x)))] <- TRUE
192 | # out
193 | # }
194 | #
195 | #
196 | # # Mitch's notes
197 | # stat_near_quant(min, 0.5, 0.1)(map(seq_len(100), rnorm))
198 | #
199 | #
200 | # min_near_quant <- stat_near_quant(min, 0.5, 0.1)
201 | #
202 | # min_near_quant(rnorm(1000))
203 | #
204 | # stat_near_quant <- function(fn, qt, tol){
205 | # function(lst, ...){
206 | # # ??? magic quantile stuff ???
207 | # fn_out <- map_dbl(lst, fn, ...)
208 | #
209 | # quantile(fn_out, probs = qt)
210 | # #
211 | # # x_near <- map(qt_out, ~ x[dplyr::near(x, .x, tol)])
212 | # #
213 | # # map(x_near, fn, ...)
214 | # }
215 | # }
216 | #
217 | # ?dplyr::ntile()
218 | #
219 | # # does it return a lgl (is it near to a quantile? TRUE/FALSE)
220 | # # or a character? (q25, or q50, say - or NA if not near)
221 |
--------------------------------------------------------------------------------
/R/reexports.R:
--------------------------------------------------------------------------------
1 | #' Pipe operator
2 | #'
3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
4 | #'
5 | #' @name %>%
6 | #' @rdname pipe
7 | #' @keywords internal
8 | #' @export
9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 |
13 |
# Re-export the features*() verbs from {fabletools} so brolgar users can call
# them without attaching fabletools explicitly.
#' @importFrom fabletools features features_at features_if features_all
#' @export
fabletools::features

#' @export
fabletools::features_at

#' @export
fabletools::features_if

#' @export
fabletools::features_all

# Re-export tsibble helpers used throughout brolgar's examples.
#' @importFrom tsibble as_tsibble n_keys
#' @export
tsibble::as_tsibble

#' @export
tsibble::n_keys
33 |
--------------------------------------------------------------------------------
/R/sample-frac-n-keys.R:
--------------------------------------------------------------------------------
1 | #' Sample a number or fraction of keys to explore
2 | #'
3 | #' @param .data tsibble object
4 | #' @param size The number or fraction of observations, depending on the
5 | #' function used. In `sample_n_keys`, it is a number > 0, and in
6 | #' `sample_frac_keys` it is a fraction, between 0 and 1.
7 | #'
8 | #' @return tsibble with fewer observations of key
9 | #' @name sample-n-frac-keys
10 | #' @export
11 | #' @examples
12 | #' library(ggplot2)
13 | #' sample_n_keys(heights,
14 | #' size = 10) %>%
15 | #' ggplot(aes(x = year,
16 | #' y = height_cm,
17 | #' group = country)) +
18 | #' geom_line()
sample_n_keys <- function(.data, size) {
  # Check NULL first: previously a NULL input reached test_if_tsibble() and
  # produced the misleading error "Input must inherit from tsibble".
  test_if_null(.data)
  test_if_tsibble(.data)
  UseMethod("sample_n_keys")
}
24 |
25 | #' @export
sample_n_keys.tbl_ts <- function(.data, size) {
  # Sample whole series: draw `size` of the per-key row-index vectors, then
  # keep every row belonging to the sampled keys.
  rows_per_key <- tsibble::key_rows(.data)
  sampled_rows <- sample(rows_per_key, size)
  dplyr::slice(.data, vctrs::vec_c(!!!sampled_rows))
}
31 |
32 | #' @rdname sample-n-frac-keys
33 | #' @examples
34 | #' library(ggplot2)
35 | #' sample_frac_keys(wages,
36 | #' 0.1) %>%
37 | #' ggplot(aes(x = xp,
38 | #' y = unemploy_rate,
39 | #' group = id)) +
40 | #' geom_line()
41 | #' @export
sample_frac_keys <- function(.data, size) {
  # Check NULL first: previously a NULL input reached test_if_tsibble() and
  # produced the misleading error "Input must inherit from tsibble".
  test_if_null(.data)
  test_if_tsibble(.data)
  UseMethod("sample_frac_keys")
}
47 |
48 | #' @inheritParams sample-n-frac-keys
49 | #' @export
sample_frac_keys.tbl_ts <- function(.data, size) {
  # Validate the fraction. The old guard (`size > 1 & size > 0`) reduces to
  # `size > 1`, so negative sizes slipped straight through to
  # sample_n_keys(); also fixes the "must between" message typo. `||` is the
  # correct scalar operator here.
  if (size < 0 || size > 1) {
    stop(
      "sample size for `sample_frac_keys` must be between 0 and 1, the size given was `",
      size,
      "`"
    )
  }

  # Convert the fraction of keys into a whole number of keys to sample.
  sample_n_keys(.data, size = round(size * tsibble::n_keys(.data)))
}
61 |
--------------------------------------------------------------------------------
/R/stratify-keys.R:
--------------------------------------------------------------------------------
1 | #' Stratify the keys into groups to facilitate exploration
2 | #'
3 | #' To look at as much of the raw data as possible, it can be helpful to
4 | #' stratify the data into groups for plotting. You can `stratify` the
5 | #' `keys` using the `stratify_keys()` function, which adds the column,
6 | #' `.strata`. This allows the user to create facetted plots showing a more
7 | #' of the raw data.
8 | #'
9 | #' @param .data data.frame to explore
10 | #' @param n_strata number of groups to create
11 | #' @param along variable to stratify along. This groups by each `key` and then
12 | #' takes a summary statistic (by default, the mean). It then arranges by the
13 | #' mean value for each `key` and assigns the `n_strata` groups.
14 | #' @param fun summary function. Default is mean.
15 | #' @param ... extra arguments
16 | #'
17 | #' @return data.frame with column, `.strata` containing `n_strata` groups
18 | #' @export
19 | #' @examples
20 | #' library(ggplot2)
21 | #' library(brolgar)
22 | #'
23 | #' heights %>%
24 | #' sample_frac_keys(size = 0.1) %>%
25 | #' stratify_keys(10) %>%
26 | #' ggplot(aes(x = height_cm,
27 | #' y = year,
28 | #' group = country)) +
29 | #' geom_line() +
30 | #' facet_wrap(~.strata)
31 | #'
32 | #' # now facet along some feature
33 | #' library(dplyr)
34 | #' heights %>%
35 | #' key_slope(height_cm ~ year) %>%
36 | #' right_join(heights, ., by = "country") %>%
37 | #' stratify_keys(n_strata = 12,
38 | #' along = .slope_year,
39 | #' fun = median) %>%
40 | #' ggplot(aes(x = year,
41 | #' y = height_cm,
42 | #' group = country)) +
43 | #' geom_line() +
44 | #' facet_wrap(~.strata)
45 | #'
46 | #'
47 | #' heights %>%
48 | #' stratify_keys(n_strata = 12,
49 | #' along = height_cm) %>%
50 | #' ggplot(aes(x = year,
51 | #' y = height_cm,
52 | #' group = country)) +
53 | #' geom_line() +
54 | #' facet_wrap(~.strata)
stratify_keys <- function(.data, n_strata, along = NULL, fun = mean, ...) {
  # Check NULL first: previously a NULL input reached test_if_tsibble() and
  # produced the misleading error "Input must inherit from tsibble".
  test_if_null(.data)
  test_if_tsibble(.data)
  UseMethod("stratify_keys")
}
60 |
61 | #' @export
stratify_keys.tbl_ts <- function(
  .data,
  n_strata,
  along = NULL,
  fun = mean,
  ...
) {
  # Capture `along` unevaluated so we can detect whether it was supplied.
  q_along <- rlang::enquo(along)
  q_keys <- tsibble::key(.data)
  keys_chr <- tsibble::key_vars(.data)

  if (rlang::quo_is_null(q_along)) {
    # No `along` given: assign keys to strata completely at random.
    # stratify(.data, n_strata)
    # could just return some vector of numbers?
    # perhaps that is the difference btn strata and stratify
    # strata returns a vector, stratify adds this to the data?

    id_w_strata <- full_strata_df(.data, n_strata)

    # One row per key with its assigned stratum
    only_strata <- dplyr::distinct(id_w_strata)

    data_strata <- dplyr::left_join(.data, only_strata, by = keys_chr)

    return(data_strata)
  }

  if (!rlang::quo_is_null(q_along)) {
    # `along` given: summarise each key with `fun`, order keys by that
    # summary, and deal strata out in sorted order so strata reflect the
    # distribution of the summarised variable.
    # stratify_along(.data,
    #                n_strata,
    #                along, - aka q_along
    #                keys) - aka q_keys (can we generate keys_chr from q_keys?)

    # NOTE(review): this local shadows the helper function possible_strata()
    # after this line — works, but renaming the local would be clearer.
    possible_strata <- possible_strata(.data, n_strata)

    data_strata <- .data %>%
      tibble::as_tibble() %>%
      dplyr::group_by(!!!q_keys) %>%
      # NOTE(review): assumes `fun` accepts na.rm — confirm for custom funs.
      dplyr::summarise(stat = fun(!!q_along, na.rm = TRUE)) %>%
      dplyr::arrange(-stat) %>%
      # sort() aligns the shuffled strata ids with the arranged keys
      dplyr::mutate(.strata = sort(possible_strata)) %>%
      dplyr::select(-stat) %>%
      dplyr::right_join(.data, ., by = keys_chr)

    return(data_strata)
  }
}
108 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
pct <- function(x) {
  # Label a proportion as "q_<percent>", e.g. 0.25 -> "q_25".
  paste0("q_", scales::percent(x, accuracy = 1, suffix = ""))
}
4 |
qtl <- function(x, probs) {
  # Type-7 quantiles (R's default), labelled "q_<percent>" via pct().
  values <- stats::quantile(x = x, probs = probs, type = 7, names = FALSE)
  stats::setNames(values, pct(probs))
}
10 |
test_if_tsibble <- function(x) {
  # Validation helper: silently returns NULL for tsibbles, errors otherwise.
  if (inherits(x, "tbl_ts")) {
    return(invisible(NULL))
  }
  stop("Input must inherit from tsibble", call. = FALSE)
}
16 |
test_if_dataframe <- function(x) {
  # Validation helper: silently returns NULL for data frames (including
  # tibbles and tsibbles), errors otherwise.
  if (inherits(x, "data.frame")) {
    return(invisible(NULL))
  }
  stop("Input must inherit from data.frame", call. = FALSE)
}
22 |
test_if_null <- function(x, message = "Input must not be NULL") {
  # Validation helper: silently returns NULL for non-NULL input, errors
  # (with a customisable message) otherwise.
  if (!is.null(x)) {
    return(invisible(NULL))
  }
  stop(message, call. = FALSE)
}
28 |
test_if_tilde <- function(x) {
  # Error if no element of (the character form of) x contains a tilde.
  # Message punctuation fixed: previously read "a tilde ,`~`, we see, '...'
  # of classformula." with a misplaced comma and a missing space.
  contains_tilde <- grepl("~", x)
  if (!any(contains_tilde)) {
    stop(
      "Input x must be a formula with a tilde, `~`; we see '",
      x,
      "' of class ",
      class(x),
      ".",
      call. = FALSE
    )
  }
}
42 |
test_if_formula <- function(x) {
  # `inherits()` (not methods::is()) keeps this consistent with the other
  # test_if_* helpers and avoids requiring the methods package.
  if (!inherits(x, "formula")) {
    stop(
      "Input x must be a formula, we see, '",
      x,
      "' of class ",
      class(x),
      ".",
      " Formula should be specified with something on the left hand side of ~ and the right hand side.",
      " For more details on formula in R, see `?formula`.",
      call. = FALSE
    )
  }
}
57 |
58 | classes <- function(x) purrr::map_chr(x, class)
59 |
possible_strata <- function(.data, n_strata) {
  n_keys_data <- tsibble::n_keys(.data)
  # Repeat 1..n_strata across the keys so strata sizes stay as even as
  # possible, then shuffle so stratum membership is random.
  strata_ids <- rep(seq_len(n_strata), length.out = n_keys_data)
  sample(strata_ids)
}
67 |
full_strata_df <- function(.data, n_strata) {
  # One random stratum id per key, expanded to one row per observation.
  possible_strata <- possible_strata(.data, n_strata)
  tsibble::key_data(.data) %>%
    dplyr::mutate(.strata = possible_strata) %>%
    # key_data() stores row indices in the `.rows` list-column; unnest to one
    # row per underlying observation, then drop the indices.
    tidyr::unnest_longer(col = c(.rows)) %>%
    dplyr::select(-.rows)
}
75 |
full_strata <- function(.data, n_strata) {
  # Bare vector variant of full_strata_df(): one stratum id per key,
  # repeated once for each of that key's rows (in key order).
  possible_strata(.data, n_strata) %>%
    rep.int(times = lengths(my_key_rows(.data)))
}
80 |
my_key_data <- function(.data) {
  # Like tsibble::key_data(), but ordered by each key's first appearance in
  # `.data` (the left join preserves `.data`'s row order; distinct() then
  # keeps the first occurrence of each key).
  .data %>%
    dplyr::left_join(
      tsibble::key_data(.data),
      by = tsibble::key_vars(.data)
    ) %>%
    tibble::as_tibble() %>%
    # NOTE(review): selecting with a bare character vector relies on
    # tidyselect's deprecated external-vector behaviour; all_of() would be
    # the modern spelling.
    dplyr::select(tsibble::key_vars(.data), .rows) %>%
    dplyr::distinct()
}
91 |
my_key_rows <- function(.data) {
  # Row-index vectors (one per key), in the order given by my_key_data().
  key_data <- my_key_data(.data)
  key_data[[".rows"]]
}
95 |
skip_on_gh_actions <- function() {
  # Skip the calling test when running under GitHub Actions; otherwise
  # return invisibly so the test proceeds.
  on_gh_actions <- identical(Sys.getenv("GITHUB_ACTIONS"), "true")
  if (on_gh_actions) {
    testthat::skip("On GitHub Actions")
  }
  invisible(TRUE)
}
102 |
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
# Runs when the package namespace is loaded: register brolgar's feature
# functions with fabletools, tagging each so it can be discovered by tag
# (e.g. "summary", "monotonic") through fabletools' feature registry.
.onLoad <- function(...) {
  fabletools::register_feature(feat_three_num, c("summary"))
  fabletools::register_feature(feat_five_num, c("summary"))
  fabletools::register_feature(feat_ranges, c("summary", "range", "spread"))
  fabletools::register_feature(feat_spread, c("summary", "spread"))
  fabletools::register_feature(feat_brolgar, c("summary", "all"))
  fabletools::register_feature(feat_monotonic, c("summary", "monotonic"))
  invisible()
}
10 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: http://brolgar.njtierney.com/
2 | template:
3 | bootstrap: 5
4 | bootswatch: flatly
5 | search:
6 | exclude: news/index.html
7 | home:
8 | links:
9 | - text: Learn more
10 | href: https://github.com/njtierney/brolgar
11 | reference:
12 | - title: Data
13 | desc: |
14 | Datasets that come with `brolgar`
15 | contents:
16 | - wages
17 | - heights
18 | - pisa
19 | - title: ggplot helpers
20 | desc: |
21 | ggplot2 functions to help explore your data
22 | contents:
23 | - facet_sample
24 | - facet_strata
25 | - title: Sampling helpers
26 | desc: |
27 | Functions to help sampling series from the data
28 | contents:
29 | - starts_with("sample_")
30 | - stratify_keys
31 | - title: Features
32 | desc: |
33 | Identify features in your data:
34 | contents: starts_with("feat_")
35 | - title: Exploratory tools (Experimental)
36 | desc: |
37 | Tools to help with exploratory modelling of your data
38 | contents:
39 | - key_slope
40 | - add_key_slope
41 | - starts_with("keys_near")
42 | - title: Find observations near a value
43 | desc: |
44 | Identify observations near some summary value
45 | contents: starts_with("near")
46 | - title: Summaries
47 | desc: |
48 | Functions that are used to power the summary functions in `brolgar`
49 | contents:
50 | - starts_with("b_")
51 | - starts_with("l_")
52 | - title: Helpers
53 | desc: |
54 | Helper functions to assist summarising data
55 | contents:
56 | - n_obs
57 | - add_n_obs
58 | - n_keys
59 | - increasing
60 | - decreasing
61 | - unvarying
62 | - monotonic
63 | - index_summary
64 | navbar:
65 | left:
66 | - text: Getting Started
67 | href: articles/getting-started.html
68 | - text: Gallery
69 | href: articles/visualisation-gallery.html
70 | - text: Articles
71 | menu:
72 | - text: Data Structures
73 | href: articles/longitudinal-data-structures.html
74 | - text: Exploratory Modelling
75 | href: articles/exploratory-modelling.html
76 | - text: Finding Features
77 | href: articles/finding-features.html
78 | - text: Identify Interesting Observations
79 | href: articles/id-interesting-obs.html
80 | - text: Exploring Mixed Effects Models
81 | href: articles/mixed-effects-models.html
82 | - text: Reference
83 | href: reference/index.html
84 | - text: News
85 | href: news/index.html
86 | destination: docs
87 |
88 |
--------------------------------------------------------------------------------
/brolgar.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 | ProjectId: 6873fc21-71d8-4495-a7c7-f613fce2e8f2
3 |
4 | RestoreWorkspace: Default
5 | SaveWorkspace: Default
6 | AlwaysSaveHistory: Default
7 |
8 | EnableCodeIndexing: Yes
9 | UseSpacesForTab: Yes
10 | NumSpacesForTab: 2
11 | Encoding: UTF-8
12 |
13 | RnwWeave: Sweave
14 | LaTeX: pdfLaTeX
15 |
16 | BuildType: Package
17 | PackageUseDevtools: Yes
18 | PackageInstallArgs: --no-multiarch --with-keep.source
19 | PackageRoxygenize: rd,collate,namespace
20 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 | status:
5 | project:
6 | default:
7 | target: auto
8 | threshold: 1%
9 | patch:
10 | default:
11 | target: auto
12 | threshold: 1%
13 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Test environments
2 |
3 | * local OS X install, R 4.4.0
4 | * github actions testing for devel, release, and ubuntu, windows, and macOSX
5 | * win-builder (devel and release)
6 |
7 | ## R CMD check results
8 |
0 errors | 0 warnings | 1 note
10 |
11 | ```
12 | * New submission
13 | * Package was archived on CRAN
14 | ```
15 |
We have addressed the problems resulting in archival in this release.
17 |
18 | ```
19 | Possibly misspelled words in DESCRIPTION:
20 | Prvan (35:22)
21 | Tierney (34:66)
22 | brolgar (33:33)
23 | quantiles (30:42)
24 | ```
25 |
26 | These spellings are all correct
27 |
28 | ```
29 | Found the following (possibly) invalid URLs:
30 | URL: https://www.oecd.org/pisa/data/
31 | From: man/pisa.Rd
32 | Status: 403
33 | Message: Forbidden
34 | ```
35 |
36 | I'm really not sure how to check this one - I've used {urlchecker} and the website opens for me, should I just remove this?
37 |
38 | ## revdepcheck results
39 |
40 | There are no reverse dependencies as this package is not on CRAN
--------------------------------------------------------------------------------
/data-raw/Height_Compact.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/data-raw/Height_Compact.xlsx
--------------------------------------------------------------------------------
/data-raw/compress-pisa-data.R:
--------------------------------------------------------------------------------
# install.packages("remotes")
# remotes::install_github("ropenscilabs/learningtower")
library(learningtower)
library(brolgar)
library(tsibble)
library(tidyverse)

# Compress the full {learningtower} student-level data into one row per
# country-year: survey-weighted means plus min/max for each subject.
pisa <- student %>%
  select(year:student_id, gender, math:stu_wgt) %>%
  group_by(country, year) %>%
  summarise(
    math_mean = weighted.mean(math, stu_wgt, na.rm = TRUE),
    read_mean = weighted.mean(read, stu_wgt, na.rm = TRUE),
    science_mean = weighted.mean(science, stu_wgt, na.rm = TRUE),
    math_max = max(math, na.rm = TRUE),
    read_max = max(read, na.rm = TRUE),
    science_max = max(science, na.rm = TRUE),
    math_min = min(math, na.rm = TRUE),
    read_min = min(read, na.rm = TRUE),
    science_min = min(science, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # year arrives as a factor; go via character so the actual year survives
  mutate(country = fct_drop(country), year = as.integer(as.character(year))) %>%
  relocate(math_min, math_max, .after = math_mean) %>%
  relocate(read_min, read_max, .after = read_mean) %>%
  relocate(science_min, science_max, .after = science_mean)

# Interactive sanity checks on the compressed data
pisa

pryr::object_size(pisa)

unique(pisa$country)

barplot(table(pisa$country))

plot(table(pisa$year))

# xz compression keeps the shipped .rda small
usethis::use_data(pisa, compress = "xz", overwrite = TRUE)
--------------------------------------------------------------------------------
/data-raw/tidy-world-heights.R:
--------------------------------------------------------------------------------
library(readxl)
library(here)
library(dplyr)
library(tsibble)
library(countrycode)
library(brolgar)

# Tidy the "Height_Compact" spreadsheet into a country-by-year tsibble,
# keeping only countries observed more than once.
heights <-
  read_excel(here("data-raw", "Height_Compact.xlsx"), sheet = 2) %>%
  rename(country = country.name, height_cm = value) %>%
  select(-ccode) %>%
  arrange(country) %>%
  # Look up each country's continent from its name
  mutate(
    continent = countrycode(
      sourcevar = country,
      origin = "country.name",
      destination = "continent"
    )
  ) %>%
  # Measurements occur at irregular years, hence regular = FALSE
  as_tsibble(key = country, index = year, regular = FALSE) %>%
  add_n_obs() %>%
  filter(n_obs > 1) %>%
  select(country, continent, year, height_cm)

heights

usethis::use_data(heights, overwrite = TRUE)
28 |
--------------------------------------------------------------------------------
/data-raw/wages-original.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/data-raw/wages-original.rds
--------------------------------------------------------------------------------
/data-raw/wages-ts.R:
--------------------------------------------------------------------------------
library(dplyr)
library(brolgar)
library(tsibble)

# The original `wages` data from ggobi: http://ggobi.org/book/index.html
# but downloaded from https://github.com/tprvan/brolgar/tree/master/data
# and converted to `rds` object
wages_og <- readr::read_rds("data-raw/wages-original.rds")

wages_og

# Drop the derived/centred columns, key by person (`id`) with experience
# (`exper`) as an irregular time index, and give the remaining columns
# readable names.
wages <- wages_og %>%
  dplyr::select(-hgc.9, -ue.7, -ue.centert1, -ue.mean, -ue.person.cen, -ue1) %>%
  as_tsibble(x = ., key = id, index = exper, regular = FALSE) %>%
  rename(
    ln_wages = lnw,
    xp = exper,
    high_grade = hgc,
    xp_since_ged = postexp,
    unemploy_rate = uerate
  )

usethis::use_data(wages, overwrite = TRUE)
24 |
--------------------------------------------------------------------------------
/data/heights.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/data/heights.rda
--------------------------------------------------------------------------------
/data/pisa.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/data/pisa.rda
--------------------------------------------------------------------------------
/data/wages.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/data/wages.rda
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | bibentry(
2 | bibtype = "Article",
3 | title = "The R Journal: brolgar: An R package to BRowse Over Longitudinal Data Graphically and Analytically in R",
4 | author = c(person(given = "Nicholas", family = "Tierney", email = "nicholas.tierney@gmail.com"),
5 | person(given = "Di", family = "Cook"),
6 | person(given = "Tania", family = "Prvan")),
7 | journal = "The R Journal",
8 | year = "2022",
9 | volume = "14",
10 | issue = "2",
11 | pages = "6-25",
12 | note = "https://doi.org/10.32614/RJ-2022-023",
13 | issn = "2073-4859",
14 | doi = "10.32614/RJ-2022-023",
15 | header = "To cite brolgar in publications use:"
16 | )
--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
1 | Acknowledgements
2 | BRowse
3 | Baten
4 | Blum
5 | CMD
6 | Codecov
7 | Customised
8 | GH
9 | Joerg
10 | Laa
11 | Lifecycle
12 | Modelling
13 | NLSY
14 | ORCID
15 | QLD
16 | Swihart
17 | Testthat
18 | Visualisation
19 | Visualise
20 | Willett
21 | alda
22 | amongst
23 | analysed
24 | arXiv
25 | ata
26 | autocomplete
27 | br
28 | centimetres
29 | dev
30 | dplyr
31 | edu
32 | eries
33 | facetted
34 | favour
35 | gapminder
36 | ged
37 | gghighlight
38 | ggplot
39 | helpfile
40 | hispanic
41 | https
42 | idre
43 | ime
44 | internalise
45 | iqr
46 | joss
47 | learningtower
48 | longnostic
49 | longnostics
50 | modelling
51 | nalytically
52 | olds
53 | ongitudinal
54 | owse
55 | quartile
56 | raphically
57 | rubicunda
58 | sd
59 | stu
60 | summarisation
61 | summarise
62 | summarised
63 | th
64 | theoj
65 | tibble
66 | tidyverse
67 | tidyverts
68 | tion
69 | tprvan
70 | tsibble
71 | ucla
72 | ver
73 | visualisations
74 | visualise
75 | wgt
76 | xp
77 |
--------------------------------------------------------------------------------
/inst/examples/max_if.R:
--------------------------------------------------------------------------------
1 | #' Calculate maximum and return NA if input is empty
2 | #'
3 | #' This is a helper function that returns NA if length(x) == 0 (e.g., is
4 | #' numeric(0)), otherwise calculates the maximum
5 | #'
6 | #' @param x numeric
7 | #' @param ... additional arguments for max
8 | #'
9 | #' @return either NA or the maximum value
10 | #' @noRd
11 | #' @examples
12 | #' \dontrun{
13 | #' max_if(numeric(0))
14 | #' }
15 | #' @name safe_minima
16 | max_if <- function(x, ...) {
17 | ifelse(test = length(x) == 0, yes = NA, no = max(x, ...))
18 | }
19 |
--------------------------------------------------------------------------------
/inst/examples/new-api.R:
--------------------------------------------------------------------------------
1 | library(tsibble)
2 | library(brolgar)
3 |
4 | world_heights %>%
5 | filter(nearest_qt(height_cm))
6 |
7 | # we need a way to add a label of the quantiles so we can plot them on top of
8 | # the data.
9 |
10 | heights <- as_tsibble(
11 | x = world_heights,
12 | key = country,
13 | index = year,
14 | regular = FALSE
15 | )
16 |
17 | library(feasts)
18 |
19 | heights %>%
20 | features(features = count)
21 |
22 | heights_qs <- heights %>%
23 | filter(nearest_qt(height_cm)) %>%
24 | semi_join(heights, ., by = "country")
25 |
26 | autoplot(heights_qs, .vars = height_cm)
27 |
28 | library(ggplot2)
29 | library(gghighlight)
30 | ggplot(heights, aes(x = year, y = height_cm, group = country)) +
31 | geom_line() +
32 | gghighlight()
33 |
34 | wages_ts <- as_tsibble(
35 | x = wages,
36 | key = id, # the thing that identifies each distinct series
37 | index = exper, # the time part
38 | regular = FALSE
39 | ) # important for longitudinal data
40 |
41 | wages_ts
42 |
43 | heights <- as_tsibble(
44 | x = world_heights,
45 | key = country,
46 | index = year,
47 | regular = FALSE
48 | )
49 |
50 | l_fivenum <- list(
51 | min = b_min,
52 | max = b_max,
53 | median = b_median,
54 | q1 = b_q25,
55 | q3 = b_q75
56 | )
57 |
58 | heights %>%
 59 |   add_l_slope(id) %>%
60 | summarise_at(vars(height_cm), l_fivenum)
61 |
62 | wages_lm <- lm(lnw ~ exper, wages_ts)
63 |
64 | wages %>%
65 | l_slope(id = id, formula = lnw ~ exper)
66 |
67 | library(feasts)
68 | library(tsibbledata)
69 |
70 | slope <- function(x, ...) {
71 | setNames(coef(lm(x ~ seq_along(x))), c("int", "slope"))
72 | }
73 |
74 | library(dplyr)
75 | aus_retail %>%
76 | features(Turnover, features_stl) %>%
77 | filter(seasonal_strength_year %in% range(seasonal_strength_year)) %>%
78 | semi_join(aus_retail, ., by = c("State", "Industry")) %>%
79 | autoplot(Turnover)
80 |
81 |
82 | aus_retail %>%
83 | features(Turnover, crossing_points) %>%
84 | filter(nearest_qt(seasonal_strength.year, type = 8)) %>%
85 | semi_join(aus_retail, ., by = c("State", "Industry")) %>%
86 | autoplot(Turnover)
87 | # summarise(seas_strength = list(as_tibble(as.list(quantile(seasonal_strength.year, type = 8))))) %>%
88 | # tidyr::unnest()
89 |
90 | library(fable)
91 | .resid <- aus_retail %>%
92 | model(SNAIVE(Turnover)) %>%
93 | residuals()
94 |
95 |
96 | .resid %>%
97 | filter(!is.na(.resid), length(.resid) > 24) %>%
98 | features(.resid, slope) %>%
99 | filter(nearest_qt(slope, type = 8)) %>%
100 | semi_join(aus_retail, ., by = c("State", "Industry")) %>%
101 | autoplot(Turnover)
102 |
103 |
104 | aus_retail %>%
105 | filter(Industry == "Other specialised food retailing") %>%
106 | autoplot(Turnover)
107 |
--------------------------------------------------------------------------------
/inst/examples/test.R:
--------------------------------------------------------------------------------
1 | # Since Nick took over
2 | library(tidyverse)
3 | library(brolgar)
4 |
5 | lwage <- longnostic_all(wages, id = id, var = lnw, formula = lnw ~ exper)
6 | lwage <- lwage %>% filter(l_n_obs > 2)
7 |
8 | library(tourr)
9 | quartz()
10 | animate_xy(lwage[, -1], axes = "bottomleft")
11 |
12 | # Old code
13 | library(gghighlight)
14 |
15 | data(wages)
16 | sl <- l_slope(wages, "id", "lnw~exper")
17 | ns <- l_length(wages, "id", "lnw")
18 | md <- l_median(wages, "id", "lnw")
19 | min1 <- l_min(wages, "id", "lnw")
20 | wages_lg <- wages %>%
21 | left_join(sl, by = "id") %>%
22 | left_join(ns, by = "id")
23 |
24 | # Without gghighlight
25 | wages_lg_enough <- wages_lg %>% filter(n > 5)
26 | ggplot(wages_lg_enough, aes(x = exper, y = lnw, group = id)) +
27 | geom_line(colour = "grey80", alpha = 0.5) +
28 | geom_line(
29 | data = filter(wages_lg_enough, slope < (-0.5)),
30 | aes(colour = factor(id))
31 | ) +
32 | scale_colour_brewer("", palette = "Dark2") +
33 | xlab("Years in workforce") +
34 | ylab("Log_e hourly wages")
35 |
36 | ggplot(wages_lg_enough, aes(x = exper, y = lnw, group = id)) +
37 | geom_line(colour = "grey80", alpha = 0.5) +
38 | geom_line(
39 | data = filter(wages_lg_enough, slope > 0.3),
40 | aes(colour = factor(id))
41 | ) +
42 | scale_colour_brewer("", palette = "Dark2") +
43 | xlab("Years in workforce") +
44 | ylab("Log_e hourly wages")
45 |
46 | # Code with gghighlight
47 | wages_lg %>%
48 | filter(n > 5) %>%
49 | ggplot(aes(x = exper, y = lnw, group = id)) +
50 | geom_line() +
51 | gghighlight(slope < (-0.5), use_direct_label = FALSE)
52 |
53 | ggplot(wages_lg, aes(x = exper, y = lnw, group = id)) +
54 | geom_line() +
55 | gghighlight(slope > 2, use_direct_label = FALSE)
56 |
57 | ggplot(sl, aes(x = slope)) + geom_histogram()
58 |
59 | # Stats on stats
60 | wages_enough <- wages %>%
61 | left_join(ns, by = "id") %>%
62 | filter(n > 4)
63 | sl <- l_slope(wages_enough, "id", "lnw~exper")
64 | m <- l_median(wages_enough, "id", "lnw")
65 | # min1<-l_min(wages_enough,"id","lnw")
66 | n <- m %>% filter(!is.na(m)) %>% nrow()
67 | med_n <- ifelse(n %% 2 == 1, (n + 1) / 2, c(n / 2, n / 2 + 1))
68 | indx <- m %>%
69 | arrange(m, m) %>%
70 | filter(between(row_number(), med_n[1], med_n[2]))
71 | indx_m <- m %>% arrange(m, m) %>% filter(row_number() == med_n)
72 | wages_md <- wages_enough %>% filter(id %in% indx_m$id)
73 | ggplot(wages_enough, aes(x = exper, y = lnw, group = id)) +
74 | geom_point(alpha = 0.1) +
75 | geom_line(data = wages_md, colour = "orange", size = 2)
76 |
77 | indx_q <- m %>%
78 | arrange(m, m) %>%
79 | filter(between(row_number(), floor(n * 0.25), ceiling(n * 0.75)))
80 | wages_q <- wages_enough %>% filter(id %in% indx_q$id)
81 | ggplot(wages_enough, aes(x = exper, y = lnw, group = id)) +
82 | geom_point(alpha = 0.1) +
83 | geom_line(data = wages_q, colour = "red", alpha = 0.5) +
84 | geom_line(data = wages_md, colour = "orange", size = 2)
85 |
86 | qrt <- c(1, floor(n * 0.25), round(n * 0.5, 0), ceiling(n * 0.75), n)
87 | indx <- m %>% arrange(m, m) %>% filter(row_number() %in% qrt)
88 | wages_q <- wages_enough %>% filter(id %in% indx$id)
89 | wages_q$id <- factor(wages_q$id, levels = indx$id)
90 | ggplot() +
91 | geom_point(data = wages_enough, aes(x = exper, y = lnw), alpha = 0.1) +
92 | geom_line(
93 | data = wages_q,
94 | aes(x = exper, y = lnw, group = id, colour = id),
95 | size = 2
96 | ) +
97 | scale_colour_viridis_d()
98 |
99 | # For min
100 | #
101 | # sl <- l_slope(wages_enough, "id", "lnw~exper")
102 | # m <- l_median(wages_enough, "id", "lnw")
103 | min1 <- l_min(wages_enough, "id", "lnw")
104 |
105 |
106 | min1_n <- which(min1[, 2] == min(min1[, 2]))
107 | min1_id <- min1[min1_n, 1]
108 |
109 | min1_nmax <- which(min1[, 2] == max(min1[, 2]))
110 | min1_idmax <- min1[min1_nmax, 1]
111 |
112 | # test<-filter(wages_enough,id==min1_id[[1]]) #now works
113 |
114 | ggplot(wages_enough, aes(x = exper, y = lnw, group = id)) +
115 | geom_point(alpha = 0.1) +
116 | geom_line(
117 | data = filter(wages_enough, id == min1_id[[1]]),
118 | colour = "blue",
119 | size = 2
120 | ) +
121 | geom_line(
122 | data = filter(wages_enough, id == min1_idmax[[1]]),
123 | colour = "red",
124 | size = 2
125 | )
126 |
127 |
128 | max1 <- l_max(wages_enough, "id", "lnw")
129 | max1_nmin <- which(max1[, 2] == min(max1[, 2]))
130 | max1_idmin <- max1[max1_nmin, 1]
131 |
132 | max1_nmax <- which(max1[, 2] == max(max1[, 2]))
133 | max1_idmax <- max1[max1_nmax, 1]
134 |
135 | ggplot(wages_enough, aes(x = exper, y = lnw, group = id)) +
136 | geom_point(alpha = 0.1) +
137 | geom_line(
138 | data = filter(wages_enough, id == max1_idmin[[1]]),
139 | colour = "blue",
140 | size = 2
141 | ) +
142 | geom_line(
143 | data = filter(wages_enough, id == max1_idmax[[1]]),
144 | colour = "red",
145 | size = 2
146 | )
147 |
148 |
149 | indx_q1min1 <- min1 %>%
150 | arrange(min1, m) %>%
151 | filter(between(row_number(), floor(n * 0.25), ceiling(n * 0.75))) #doesn't work
152 | wages_q1min1 <- wages_enough %>% filter(id %in% indx_q1min1$id)
153 | ggplot(wages_enough, aes(x = exper, y = lnw, group = id)) +
154 | geom_point(alpha = 0.1) +
155 | geom_line(data = wages_q, colour = "red", alpha = 0.5) +
156 | geom_line(data = wages_md, colour = "orange", size = 2)
157 |
158 | qrt <- c(1, floor(n * 0.25), round(n * 0.5, 0), ceiling(n * 0.75), n)
159 | indx <- min1 %>% arrange(min1, m) %>% filter(row_number() %in% qrt) ### not changed yet
160 | wages_q <- wages_enough %>% filter(id %in% indx$id)
161 | wages_q$id <- factor(wages_q$id, levels = indx$id)
162 | ggplot() +
163 | geom_point(data = wages_enough, aes(x = exper, y = lnw), alpha = 0.1) +
164 | geom_line(
165 | data = wages_q,
166 | aes(x = exper, y = lnw, group = id, colour = id),
167 | size = 2
168 | ) +
169 | scale_colour_viridis_d()
170 |
--------------------------------------------------------------------------------
/man/add_n_obs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/n-obs.R
3 | \name{add_n_obs}
4 | \alias{add_n_obs}
5 | \title{Add the number of observations for each key in a \code{tsibble}}
6 | \usage{
7 | add_n_obs(.data, ...)
8 | }
9 | \arguments{
10 | \item{.data}{tsibble}
11 |
12 | \item{...}{extra arguments}
13 | }
14 | \value{
15 | tsibble with \code{n_obs}, the number of observations per key added.
16 | }
17 | \description{
 18 | Here, we are not counting the number of rows in the dataset, but rather
 19 | we are counting the number of observations for each key in the data.
20 | }
21 | \examples{
22 | library(dplyr)
23 | # you can explore the data to see those cases that have exactly two
24 | # observations:
25 | heights \%>\%
26 | add_n_obs() \%>\%
27 | filter(n_obs == 2)
28 | }
29 |
--------------------------------------------------------------------------------
/man/b_summaries.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/b_summaries.R
3 | \name{b_min}
4 | \alias{b_min}
5 | \alias{b_summaries}
6 | \alias{b_max}
7 | \alias{b_median}
8 | \alias{b_mean}
9 | \alias{b_q25}
10 | \alias{b_q75}
11 | \alias{b_range}
12 | \alias{b_range_diff}
13 | \alias{b_sd}
14 | \alias{b_var}
15 | \alias{b_mad}
16 | \alias{b_iqr}
17 | \alias{b_diff_var}
18 | \alias{b_diff_sd}
19 | \alias{b_diff_mean}
20 | \alias{b_diff_median}
21 | \alias{b_diff_q25}
22 | \alias{b_diff_q75}
23 | \alias{b_diff_max}
24 | \alias{b_diff_min}
25 | \alias{b_diff_iqr}
26 | \title{Brolgar summaries (b_summaries)}
27 | \usage{
28 | b_min(x, ...)
29 |
30 | b_max(x, ...)
31 |
32 | b_median(x, ...)
33 |
34 | b_mean(x, ...)
35 |
36 | b_q25(x, ...)
37 |
38 | b_q75(x, ...)
39 |
40 | b_range(x, ...)
41 |
42 | b_range_diff(x, ...)
43 |
44 | b_sd(x, ...)
45 |
46 | b_var(x, ...)
47 |
48 | b_mad(x, ...)
49 |
50 | b_iqr(x, ...)
51 |
52 | b_diff_var(x, ...)
53 |
54 | b_diff_sd(x, ...)
55 |
56 | b_diff_mean(x, ...)
57 |
58 | b_diff_median(x, ...)
59 |
60 | b_diff_q25(x, ...)
61 |
62 | b_diff_q75(x, ...)
63 |
64 | b_diff_max(x, ...)
65 |
66 | b_diff_min(x, ...)
67 |
68 | b_diff_iqr(x, ...)
69 | }
70 | \arguments{
71 | \item{x}{a vector}
72 |
73 | \item{...}{other arguments to pass}
74 | }
75 | \description{
76 | Customised summaries of vectors with appropriate defaults for longitudinal
77 | data. The functions are prefixed with \code{b_} to assist with autocomplete.
78 | It uses \code{na.rm = TRUE} for all, and for calculations
79 | involving quantiles, \code{type = 8} and \code{names = FALSE}. Summaries include:
80 | * b_min: The minimum
81 | * b_max: The maximum
82 | * b_median: The median
83 | * b_mean: The mean
84 | * b_q25: The 25th quantile
85 | * b_q75: The 75th quantile
86 | * b_range: The range
87 | * b_range_diff: difference in range (max - min)
88 | * b_sd: The standard deviation
89 | * b_var: The variance
 90 | * b_mad: The median absolute deviation
91 | * b_iqr: The Inter-quartile range
92 | * b_diff_var: The variance diff()
93 | * b_diff_sd: The standard deviation of diff()
94 | * b_diff_mean: The mean of diff()
95 | * b_diff_median: The median of diff()
96 | * b_diff_q25: The q25 of diff()
97 | * b_diff_q75: The q75 of diff()
98 | }
99 | \examples{
100 |
101 | x <- c(1:5, NA, 5:1)
102 | min(x)
103 | b_min(x)
104 | max(x)
105 | b_max(x)
106 | median(x)
107 | b_median(x)
108 | mean(x)
109 | b_mean(x)
110 | range(x)
111 | b_range(x)
112 | var(x)
113 | b_var(x)
114 | sd(x)
115 | b_sd(x)
116 |
117 | }
118 |
--------------------------------------------------------------------------------
/man/brolgar-features.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{brolgar-features}
4 | \alias{brolgar-features}
5 | \alias{feat_three_num}
6 | \alias{feat_five_num}
7 | \alias{feat_ranges}
8 | \alias{feat_spread}
9 | \alias{feat_monotonic}
10 | \alias{feat_brolgar}
11 | \alias{feat_diff_summary}
12 | \title{Calculate features of a \code{tsibble} object in conjunction with \code{\link[=features]{features()}}}
13 | \usage{
14 | feat_three_num(x, ...)
15 |
16 | feat_five_num(x, ...)
17 |
18 | feat_ranges(x, ...)
19 |
20 | feat_spread(x, ...)
21 |
22 | feat_monotonic(x, ...)
23 |
24 | feat_brolgar(x, ...)
25 |
26 | feat_diff_summary(x, ...)
27 | }
28 | \arguments{
29 | \item{x}{A vector to extract features from.}
30 |
31 | \item{...}{Further arguments passed to other functions.}
32 | }
33 | \description{
34 | You can calculate a series of summary statistics (features) of a given
35 | variable for a dataset. For example, a three number summary, the minimum,
36 | median, and maximum, can be calculated for a given variable. This is
37 | designed to work with the \code{\link[=features]{features()}} function shown in the examples.
38 | Other available features in \code{brolgar} include:
39 | }
40 | \details{
41 | \itemize{
42 | \item \code{\link[=feat_three_num]{feat_three_num()}} - minimum, median, maximum
43 | \item \code{\link[=feat_five_num]{feat_five_num()}} - minimum, q25, median, q75, maximum.
44 | \item \code{\link[=feat_ranges]{feat_ranges()}} - min, max, range difference, interquartile range.
 45 | \item \code{\link[=feat_spread]{feat_spread()}} - variance, standard deviation, median absolute deviation,
46 | and interquartile range
47 | \item \code{\link[=feat_monotonic]{feat_monotonic()}} - is it always increasing, decreasing, or unvarying?
48 | \item \code{\link[=feat_diff_summary]{feat_diff_summary()}} - the summary statistics of the differences
49 | amongst a value, including the five number summary, as well as the
50 | standard deviation and variance. Returns NA if there is only one
51 | observation, as we can't take the difference of one observation, and a
52 | difference of 0 in these cases would be misleading.
53 | \item \code{\link[=feat_brolgar]{feat_brolgar()}} all features in brolgar.
54 | }
55 | }
56 | \examples{
57 |
58 | # You can use any of the features `feat_*` in conjunction with `features`
59 | # like so:
60 | heights \%>\%
61 | features(height_cm, # variable you want to explore
62 | feat_three_num) # the feature summarisation you want to perform
63 | }
64 |
--------------------------------------------------------------------------------
/man/brolgar-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/brolgar-package.R
3 | \docType{package}
4 | \name{brolgar-package}
5 | \alias{brolgar}
6 | \alias{brolgar-package}
7 | \title{brolgar: Browse Over Longitudinal Data Graphically and Analytically in R}
8 | \description{
  9 | Provides a framework of tools to summarise, visualise, and explore longitudinal data. It builds upon the tidy time series data frames used in the 'tsibble' package, and is designed to integrate within the 'tidyverse', and 'tidyverts' (for time series) ecosystems. The methods implemented include calculating features for understanding longitudinal data, including calculating summary statistics such as quantiles, medians, and numeric ranges, sampling individual series, identifying individual series representative of a group, and extending the facet system in 'ggplot2' to facilitate exploration of samples of data. These methods are fully described in the paper "brolgar: An R package to Browse Over Longitudinal Data Graphically and Analytically in R", Nicholas Tierney, Dianne Cook, Tania Prvan (2022) \doi{10.32614/RJ-2022-023}.
10 | }
11 | \details{
12 | \code{brolgar} stands for: \strong{BR}owse over \strong{L}ongitudinal data \strong{G}raphically
13 | and \strong{A}nalytically in \strong{R}.
14 | }
15 | \seealso{
16 | Useful links:
17 | \itemize{
18 | \item \url{https://github.com/njtierney/brolgar}
19 | \item \url{https://brolgar.njtierney.com/}
20 | \item \url{http://brolgar.njtierney.com/}
21 | \item Report bugs at \url{https://github.com/njtierney/brolgar/issues}
22 | }
23 |
24 | }
25 | \author{
26 | \strong{Maintainer}: Nicholas Tierney \email{nicholas.tierney@gmail.com} (\href{https://orcid.org/0000-0003-1460-8722}{ORCID})
27 |
28 | Authors:
29 | \itemize{
30 | \item Di Cook \email{dicook@monash.edu} (\href{https://orcid.org/0000-0002-3813-7155}{ORCID})
31 | \item Tania Prvan \email{tania.prvan@mq.edu.au}
32 | }
33 |
34 | Other contributors:
35 | \itemize{
36 | \item Stuart Lee [contributor]
37 | \item Earo Wang [contributor]
38 | }
39 |
40 | }
41 | \keyword{internal}
42 |
--------------------------------------------------------------------------------
/man/facet_sample.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/facet-sample.R
3 | \name{facet_sample}
4 | \alias{facet_sample}
5 | \title{Facet data into groups to facilitate exploration}
6 | \usage{
7 | facet_sample(
8 | n_per_facet = 3,
9 | n_facets = 12,
10 | nrow = NULL,
11 | ncol = NULL,
12 | scales = "fixed",
13 | shrink = TRUE,
14 | strip.position = "top"
15 | )
16 | }
17 | \arguments{
18 | \item{n_per_facet}{Number of keys per facet you want to plot. Default is 3.}
19 |
20 | \item{n_facets}{Number of facets to create. Default is 12}
21 |
22 | \item{nrow, ncol}{Number of rows and columns.}
23 |
24 | \item{scales}{Should scales be fixed (\code{"fixed"}, the default),
25 | free (\code{"free"}), or free in one dimension (\code{"free_x"},
26 | \code{"free_y"})?}
27 |
28 | \item{shrink}{If \code{TRUE}, will shrink scales to fit output of
29 | statistics, not raw data. If \code{FALSE}, will be range of raw data
30 | before statistical summary.}
31 |
32 | \item{strip.position}{By default, the labels are displayed on the top of
33 | the plot. Using \code{strip.position} it is possible to place the labels on
34 | either of the four sides by setting \code{strip.position = c("top",
35 | "bottom", "left", "right")}}
36 | }
37 | \value{
38 | a ggplot object
39 | }
40 | \description{
41 | This function requires a \code{tbl_ts} object, which can be created with
42 | \code{tsibble::as_tsibble()}. Under the hood, \code{facet_strata} is powered by
43 | \code{\link[=stratify_keys]{stratify_keys()}} and \code{\link[=sample_n_keys]{sample_n_keys()}}.
44 | }
45 | \examples{
46 | library(ggplot2)
47 | ggplot(heights,
48 | aes(x = year,
49 | y = height_cm,
50 | group = country)) +
51 | geom_line() +
52 | facet_sample()
53 |
54 | ggplot(heights,
55 | aes(x = year,
56 | y = height_cm,
57 | group = country)) +
58 | geom_line() +
59 | facet_sample(n_per_facet = 1,
60 | n_facets = 12)
61 | }
62 |
--------------------------------------------------------------------------------
/man/facet_strata.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/facet-strata.R
3 | \name{facet_strata}
4 | \alias{facet_strata}
5 | \title{Facet data into groups to facilitate exploration}
6 | \usage{
7 | facet_strata(
8 | n_strata = 12,
9 | along = NULL,
10 | fun = mean,
11 | nrow = NULL,
12 | ncol = NULL,
13 | scales = "fixed",
14 | shrink = TRUE,
15 | strip.position = "top"
16 | )
17 | }
18 | \arguments{
19 | \item{n_strata}{number of groups to create}
20 |
21 | \item{along}{variable to stratify along. This groups by each \code{key} and then
22 | takes a summary statistic (by default, the mean). It then arranges by the
23 | mean value for each \code{key} and assigns the \code{n_strata} groups.}
24 |
25 | \item{fun}{summary function. Default is mean.}
26 |
27 | \item{nrow, ncol}{Number of rows and columns.}
28 |
29 | \item{scales}{Should scales be fixed (\code{"fixed"}, the default),
30 | free (\code{"free"}), or free in one dimension (\code{"free_x"},
31 | \code{"free_y"})?}
32 |
33 | \item{shrink}{If \code{TRUE}, will shrink scales to fit output of
34 | statistics, not raw data. If \code{FALSE}, will be range of raw data
35 | before statistical summary.}
36 |
37 | \item{strip.position}{By default, the labels are displayed on the top of
38 | the plot. Using \code{strip.position} it is possible to place the labels on
39 | either of the four sides by setting \code{strip.position = c("top",
40 | "bottom", "left", "right")}}
41 | }
42 | \value{
43 | a ggplot object
44 | }
45 | \description{
46 | This function requires a \code{tbl_ts} object, which can be created with
47 | \code{tsibble::as_tsibble()}. Under the hood, \code{facet_strata} is powered by
48 | \code{\link[=stratify_keys]{stratify_keys()}}.
49 | }
50 | \examples{
51 | library(ggplot2)
52 | ggplot(heights,
53 | aes(x = year,
54 | y = height_cm,
55 | group = country)) +
56 | geom_line() +
57 | facet_strata()
58 |
59 |
60 | ggplot(heights,
61 | aes(x = year,
62 | y = height_cm,
63 | group = country)) +
64 | geom_line() +
65 | facet_wrap(~continent)
66 |
67 | ggplot(heights,
68 | aes(x = year,
69 | y = height_cm,
70 | group = country)) +
71 | geom_line() +
72 | facet_strata(along = year)
73 |
74 | \donttest{
75 | library(dplyr)
76 | heights \%>\%
77 | key_slope(height_cm ~ year) \%>\%
78 | right_join(heights, ., by = "country") \%>\%
79 | ggplot(aes(x = year,
80 | y = height_cm)) +
81 | geom_line(aes(group = country)) +
82 | geom_smooth(method = "lm") +
83 | facet_strata(along = .slope_year)
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/man/figures/README-demo-brolgar-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-demo-brolgar-1.png
--------------------------------------------------------------------------------
/man/figures/README-demo-why-brolgar-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-demo-why-brolgar-1.png
--------------------------------------------------------------------------------
/man/figures/README-facet-sample-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-facet-sample-1.png
--------------------------------------------------------------------------------
/man/figures/README-facet-sample-3-by-20-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-facet-sample-3-by-20-1.png
--------------------------------------------------------------------------------
/man/figures/README-facet-strata-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-facet-strata-1.png
--------------------------------------------------------------------------------
/man/figures/README-facet-strata-20-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-facet-strata-20-1.png
--------------------------------------------------------------------------------
/man/figures/README-features-left-join-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-features-left-join-1.png
--------------------------------------------------------------------------------
/man/figures/README-plot-filter-sample-n-keys-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-plot-filter-sample-n-keys-1.png
--------------------------------------------------------------------------------
/man/figures/README-plot-sample-n-keys-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-plot-sample-n-keys-1.png
--------------------------------------------------------------------------------
/man/figures/README-plot-strata-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-plot-strata-1.png
--------------------------------------------------------------------------------
/man/figures/README-show-monotonic-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-show-monotonic-1.png
--------------------------------------------------------------------------------
/man/figures/README-show-spaghetti-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-show-spaghetti-1.png
--------------------------------------------------------------------------------
/man/figures/README-show-spaghetti-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-show-spaghetti-2.png
--------------------------------------------------------------------------------
/man/figures/README-show-wages-lg-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-show-wages-lg-1.png
--------------------------------------------------------------------------------
/man/figures/README-summarise-n-obs-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-summarise-n-obs-1.png
--------------------------------------------------------------------------------
/man/figures/README-use-gg-highlight-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/README-use-gg-highlight-1.png
--------------------------------------------------------------------------------
/man/figures/brolgar-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/njtierney/brolgar/3b2e04c68b7296f0a53f999ca2377f88fd03af6f/man/figures/brolgar-logo.png
--------------------------------------------------------------------------------
/man/heights.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data-world-height.R
3 | \docType{data}
4 | \name{heights}
5 | \alias{heights}
6 | \title{World Height Data}
7 | \format{
8 | An object of class \code{tbl_ts} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 1490 rows and 4 columns.
9 | }
10 | \usage{
11 | heights
12 | }
13 | \description{
14 | Average male heights in 144 countries from 1810-1989, with a
15 | smaller number of countries from 1500-1800. Data has been filtered to
16 | only include countries with more than one observation.
17 | }
18 | \details{
19 | \code{heights} is stored as a time series \code{tsibble} object. It contains
20 | the variables:
21 | \itemize{
22 | \item country: The Country. This forms the identifying \code{key}.
23 | \item year: Year. This forms the time \code{index}.
24 | \item height_cm: Average male height in centimeters.
25 | \item continent: continent extracted from country name using \code{countrycode}
26 | package (https://joss.theoj.org/papers/10.21105/joss.00848).
27 | }
28 |
29 | For more information, see the article: "Why are you tall while others are
30 | short? Agricultural production and other proximate determinants of global
31 | heights", Joerg Baten and Matthias Blum, European Review of Economic
32 | History 18 (2014), 144–165. Data available from
33 | \url{https://datasets.iisg.amsterdam/dataset.xhtml?persistentId=hdl:10622/IAEKLA}, accessed via the Clio Infra website.
34 | }
35 | \examples{
36 | # show the data
37 | heights
38 |
39 | # show the spaghetti plot (ugh!)
40 | library(ggplot2)
41 | ggplot(heights,
42 | aes(x = year,
43 | y = height_cm,
44 | group = country)) +
45 | geom_line()
46 |
47 | # Explore all samples with `facet_strata()`
48 | ggplot(heights,
49 | aes(x = year,
50 | y = height_cm,
51 | group = country)) +
52 | geom_line() +
53 | facet_strata()
54 |
55 | # Explore the heights over each continent
56 | ggplot(heights,
57 | aes(x = year,
58 | y = height_cm,
59 | group = country)) +
60 | geom_line() +
61 | facet_wrap(~continent)
62 |
63 | # explore the five number summary of height_cm with `features`
64 | heights \%>\%
65 | features(height_cm, feat_five_num)
66 | }
67 | \keyword{datasets}
68 |
--------------------------------------------------------------------------------
/man/index_summary.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/index-helpers.R
3 | \name{index_summary}
4 | \alias{index_summary}
5 | \alias{index_regular}
6 | \alias{index_regular.tbl_ts}
7 | \alias{index_regular.data.frame}
8 | \alias{index_summary.tbl_ts}
9 | \alias{index_summary.data.frame}
10 | \title{Index summaries}
11 | \usage{
12 | index_regular(.data, ...)
13 |
14 | \method{index_regular}{tbl_ts}(.data, ...)
15 |
16 | \method{index_regular}{data.frame}(.data, index, ...)
17 |
18 | index_summary(.data, ...)
19 |
20 | \method{index_summary}{tbl_ts}(.data, ...)
21 |
22 | \method{index_summary}{data.frame}(.data, index, ...)
23 | }
24 | \arguments{
25 | \item{.data}{data.frame or tsibble}
26 |
27 | \item{...}{extra arguments}
28 |
29 | \item{index}{the proposed index variable}
30 | }
31 | \value{
32 | logical TRUE means it is regular, FALSE means not
33 | }
34 | \description{
35 | These functions check if the index is regular (\code{index_regular()}), and
36 | summarise the index variable (\code{index_summary()}). This can be useful
37 | to check your index variables.
38 | }
39 | \examples{
40 | # a tsibble
41 | index_regular(heights)
42 |
43 | # some data frames
44 | index_regular(pisa, year)
45 | index_regular(airquality, Month)
46 |
47 | # a tsibble
48 | index_summary(heights)
49 | # some data frames
50 | index_summary(pisa, year)
51 | index_summary(airquality, Month)
52 | index_summary(airquality, Day)
53 | }
54 |
--------------------------------------------------------------------------------
/man/key_slope.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/key_slope.R
3 | \name{key_slope}
4 | \alias{key_slope}
5 | \alias{add_key_slope}
6 | \alias{add_key_slope.default}
7 | \title{Fit linear model for each key}
8 | \usage{
9 | key_slope(.data, formula, ...)
10 |
11 | add_key_slope(.data, formula)
12 |
13 | add_key_slope.default(.data, formula)
14 | }
15 | \arguments{
16 | \item{.data}{tsibble}
17 |
18 | \item{formula}{formula}
19 |
20 | \item{...}{extra arguments}
21 | }
22 | \value{
23 | tibble with coefficient information
24 | }
25 | \description{
26 | Using \code{key_slope} you can fit a linear model to each key in the \code{tsibble}.
27 | \code{add_key_slope} adds this slope information back to the data, and returns
28 | the full dimension \code{tsibble}.
29 | }
30 | \examples{
31 | key_slope(heights, height_cm ~ year)
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/man/keys_near.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/keys_near.R
3 | \name{keys_near}
4 | \alias{keys_near}
5 | \alias{keys_near.default}
 6 | \title{Return keys nearest to a given statistic or summary.}
7 | \usage{
8 | keys_near(.data, ...)
9 |
10 | \method{keys_near}{default}(.data, ...)
11 | }
12 | \arguments{
13 | \item{.data}{tsibble}
14 |
15 | \item{...}{extra arguments to pass to \code{mutate_at} when performing the summary
16 | as given by \code{funs}.}
17 | }
18 | \value{
19 | data.frame containing keys closest to a given statistic.
20 | }
21 | \description{
22 | Return keys nearest to a given statistic or summary.
23 | }
24 | \examples{
25 | keys_near(heights, height_cm)
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/man/keys_near.data.frame.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/keys_near.R
3 | \name{keys_near.data.frame}
4 | \alias{keys_near.data.frame}
 5 | \title{Return keys nearest to a given statistic or summary.}
6 | \usage{
7 | \method{keys_near}{data.frame}(.data, key, var, top_n = 1, funs = l_five_num, ...)
8 | }
9 | \arguments{
10 | \item{.data}{data.frame}
11 |
12 | \item{key}{key, which identifies unique observations.}
13 |
14 | \item{var}{variable to summarise}
15 |
16 | \item{top_n}{top number of closest observations to return - default is 1, which will also return ties.}
17 |
18 | \item{funs}{named list of functions to summarise by. Default is a given
19 | list of the five number summary, \code{l_five_num}.}
20 |
21 | \item{...}{extra arguments to pass to \code{mutate_at} when performing the summary
22 | as given by \code{funs}.}
23 | }
24 | \description{
25 | Return keys nearest to a given statistic or summary.
26 | }
27 | \examples{
28 | heights \%>\%
29 | key_slope(height_cm ~ year) \%>\%
30 | keys_near(key = country,
31 | var = .slope_year)
32 | # Specify your own list of summaries
33 | l_ranges <- list(min = b_min,
34 | range_diff = b_range_diff,
35 | max = b_max,
36 | iqr = b_iqr)
37 |
38 | heights \%>\%
39 | key_slope(formula = height_cm ~ year) \%>\%
40 | keys_near(key = country,
41 | var = .slope_year,
42 | funs = l_ranges)
43 | }
44 |
--------------------------------------------------------------------------------
/man/keys_near.tbl_ts.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/keys_near.R
3 | \name{keys_near.tbl_ts}
4 | \alias{keys_near.tbl_ts}
 5 | \title{Return keys nearest to a given statistic or summary.}
6 | \usage{
7 | \method{keys_near}{tbl_ts}(.data, var, top_n = 1, funs = l_five_num, stat_as_factor = TRUE, ...)
8 | }
9 | \arguments{
10 | \item{.data}{tsibble}
11 |
12 | \item{var}{variable to summarise}
13 |
14 | \item{top_n}{top number of closest observations to return - default is 1, which will also return ties.}
15 |
16 | \item{funs}{named list of functions to summarise by. Default is a given
17 | list of the five number summary, \code{l_five_num}.}
18 |
19 | \item{stat_as_factor}{coerce \code{stat} variable into a factor? Default is TRUE.}
20 |
21 | \item{...}{extra arguments to pass to \code{mutate_at} when performing the summary
22 | as given by \code{funs}.}
23 | }
24 | \description{
25 | Return keys nearest to a given statistic or summary.
26 | }
27 | \examples{
28 |
29 | # Return observations closest to the five number summary of height_cm
30 | heights \%>\%
31 | keys_near(var = height_cm)
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/man/l_funs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/keys_near.R
3 | \docType{data}
4 | \name{l_funs}
5 | \alias{l_funs}
6 | \alias{l_five_num}
7 | \alias{l_three_num}
8 | \title{A named list of the five number summary}
9 | \format{
10 | An object of class \code{list} of length 5.
11 |
12 | An object of class \code{list} of length 3.
13 | }
14 | \usage{
15 | l_five_num
16 |
17 | l_three_num
18 | }
19 | \description{
20 | Designed for use with the \code{\link[=keys_near]{keys_near()}} function.
21 | }
22 | \examples{
23 | # Specify your own list of summaries
24 | l_ranges <- list(min = b_min,
25 | range_diff = b_range_diff,
26 | max = b_max,
27 | iqr = b_iqr)
28 |
29 | heights \%>\%
30 | key_slope(formula = height_cm ~ year) \%>\%
31 | keys_near(key = country,
32 | var = .slope_year,
33 | funs = l_ranges)
34 | }
35 | \keyword{datasets}
36 |
--------------------------------------------------------------------------------
/man/monotonic.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/monotonics.R
3 | \name{monotonic}
4 | \alias{monotonic}
5 | \alias{increasing}
6 | \alias{decreasing}
7 | \alias{unvarying}
8 | \title{Are values monotonic? Always increasing, decreasing, or unvarying?}
9 | \usage{
10 | increasing(x, ...)
11 |
12 | decreasing(x, ...)
13 |
14 | unvarying(x, ...)
15 |
16 | monotonic(x, ...)
17 | }
18 | \arguments{
19 | \item{x}{numeric or integer}
20 |
21 | \item{...}{extra arguments to pass to diff}
22 | }
23 | \value{
24 | logical TRUE or FALSE
25 | }
26 | \description{
27 | These provide three families of functions to tell you if values are always
28 | increasing, decreasing, or unvarying, with the functions, \code{increasing()},
29 | \code{decreasing()}, or \code{unvarying()}. Under the hood it uses \code{diff} to find
30 | differences, so if you like you can pass extra arguments to \code{diff}.
31 | }
32 | \examples{
33 | vec_inc <- c(1:10)
34 | vec_dec <- c(10:1)
35 | vec_ran <- c(sample(1:10))
36 | vec_flat <- rep.int(1,10)
37 |
38 | increasing(vec_inc)
39 | increasing(vec_dec)
40 | increasing(vec_ran)
41 | increasing(vec_flat)
42 |
43 | decreasing(vec_inc)
44 | decreasing(vec_dec)
45 | decreasing(vec_ran)
46 | decreasing(vec_flat)
47 |
48 | unvarying(vec_inc)
49 | unvarying(vec_dec)
50 | unvarying(vec_ran)
51 | unvarying(vec_flat)
52 |
53 | library(ggplot2)
54 | library(gghighlight)
55 | library(dplyr)
56 |
57 | heights_mono <- heights \%>\%
58 | features(height_cm, feat_monotonic) \%>\%
59 | left_join(heights, by = "country")
60 |
61 | ggplot(heights_mono,
62 | aes(x = year,
63 | y = height_cm,
64 | group = country)) +
65 | geom_line() +
66 | gghighlight(increase)
67 |
68 | ggplot(heights_mono,
69 | aes(x = year,
70 | y = height_cm,
71 | group = country)) +
72 | geom_line() +
73 | gghighlight(decrease)
74 |
75 | heights_mono \%>\%
76 | filter(monotonic) \%>\%
77 | ggplot(aes(x = year,
78 | y = height_cm,
79 | group = country)) +
80 | geom_line()
81 |
82 | heights_mono \%>\%
83 | filter(increase) \%>\%
84 | ggplot(aes(x = year,
85 | y = height_cm,
86 | group = country)) +
87 | geom_line()
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/man/n_obs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/n-obs.R
3 | \name{n_obs}
4 | \alias{n_obs}
5 | \title{Return the number of observations}
6 | \usage{
7 | n_obs(x, names = TRUE)
8 | }
9 | \arguments{
10 | \item{x}{vector or data.frame}
11 |
12 | \item{names}{logical; If TRUE the result is a named vector named "n_obs", else
13 | it is just the number of observations.}
14 | }
15 | \value{
16 | number of observations
17 | }
18 | \description{
19 | Returns the number of observations of a vector or data.frame. It uses
20 | \code{vctrs::vec_size()} under the hood.
21 | }
22 | \note{
23 | You cannot use \code{n_obs} with \code{features} counting the key variable like
24 | so - \code{features(heights, country, n_obs)}. Instead, use any other variable.
25 | }
26 | \examples{
27 | n_obs(iris)
28 | n_obs(1:10)
29 | add_n_obs(heights)
30 | heights \%>\%
31 | features(height_cm, n_obs) # can be any variable except id, the key.
32 | }
33 |
--------------------------------------------------------------------------------
/man/near_between.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/nearests.R
3 | \name{near_between}
4 | \alias{near_between}
5 | \title{Return x percent to y percent of values}
6 | \usage{
7 | near_between(x, from, to)
8 | }
9 | \arguments{
10 | \item{x}{numeric vector}
11 |
12 | \item{from}{the lower bound of percentage}
13 |
14 | \item{to}{the upper bound of percentage}
15 | }
16 | \value{
17 | logical vector
18 | }
19 | \description{
20 | Return x percent to y percent of values
21 | }
22 | \examples{
23 | x <- runif(20)
24 |
25 | near_middle(x = x,
26 | middle = 0.5,
27 | within = 0.2)
28 |
29 | library(dplyr)
30 | heights \%>\% features(height_cm, list(min = min)) \%>\%
31 | filter(near_between(min, 0.1, 0.9))
32 |
33 | near_quantile(x = x,
34 | probs = 0.5,
35 | tol = 0.01)
36 |
37 | near_quantile(x, c(0.25, 0.5, 0.75), 0.05)
38 |
39 | heights \%>\%
40 | features(height_cm, l_five_num) \%>\%
41 | mutate_at(vars(min:max),
42 | .funs = near_quantile,
43 | 0.5,
44 | 0.01) \%>\%
45 | filter(min)
46 |
47 | heights \%>\%
48 | features(height_cm, list(min = min)) \%>\%
49 | mutate(min_near_q3 = near_quantile(min, c(0.25, 0.5, 0.75), 0.01)) \%>\%
50 | filter(min_near_q3)
51 |
52 | heights \%>\%
53 | features(height_cm, list(min = min)) \%>\%
54 | filter(near_between(min, 0.1, 0.9))
55 |
56 | heights \%>\%
57 | features(height_cm, list(min = min)) \%>\%
58 | filter(near_middle(min, 0.5, 0.1))
59 | }
60 |
--------------------------------------------------------------------------------
/man/near_middle.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/nearests.R
3 | \name{near_middle}
4 | \alias{near_middle}
5 | \title{Return the middle x percent of values}
6 | \usage{
7 | near_middle(x, middle, within)
8 | }
9 | \arguments{
10 | \item{x}{numeric vector}
11 |
12 | \item{middle}{percentage you want to center around}
13 |
14 | \item{within}{percentage around center}
15 | }
16 | \value{
17 | logical vector
18 | }
19 | \description{
20 | Return the middle x percent of values
21 | }
22 | \examples{
23 | x <- runif(20)
24 | near_middle(x = x,
25 | middle = 0.5,
26 | within = 0.2)
27 |
28 | library(dplyr)
29 | heights \%>\% features(height_cm, list(min = min)) \%>\%
30 | filter(near_middle(min, 0.5, 0.1))
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/man/near_quantile.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/nearests.R
3 | \name{near_quantile}
4 | \alias{near_quantile}
5 | \title{Which values are nearest to any given quantiles}
6 | \usage{
7 | near_quantile(x, probs, tol = 0.01)
8 | }
9 | \arguments{
10 | \item{x}{vector}
11 |
12 | \item{probs}{quantiles to calculate}
13 |
14 | \item{tol}{tolerance in terms of x that you will accept near to the
15 | quantile. Default is 0.01.}
16 | }
17 | \value{
18 | logical vector of TRUE/FALSE if number is close to a quantile
19 | }
20 | \description{
21 | Which values are nearest to any given quantiles
22 | }
23 | \examples{
24 | x <- runif(20)
25 | near_quantile(x, 0.5, 0.05)
26 | near_quantile(x, c(0.25, 0.5, 0.75), 0.05)
27 |
28 | library(dplyr)
29 | heights \%>\%
30 | features(height_cm, list(min = min)) \%>\%
31 | mutate(min_near_median = near_quantile(min, 0.5, 0.01)) \%>\%
32 | filter(min_near_median)
33 | heights \%>\%
34 | features(height_cm, list(min = min)) \%>\%
35 | mutate(min_near_q3 = near_quantile(min, c(0.25, 0.5, 0.75), 0.01)) \%>\%
36 | filter(min_near_q3)
37 | }
38 |
--------------------------------------------------------------------------------
/man/nearests.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/nearests.R
3 | \name{nearests}
4 | \alias{nearests}
5 | \alias{nearest_lgl}
6 | \alias{nearest_qt_lgl}
7 | \title{Is x nearest to y?}
8 | \usage{
9 | nearest_lgl(x, y)
10 |
11 | nearest_qt_lgl(y, ...)
12 | }
13 | \arguments{
14 | \item{x}{a numeric vector}
15 |
16 | \item{y}{a numeric vector}
17 |
18 | \item{...}{(if used) arguments to pass to \code{quantile()}.}
19 | }
20 | \value{
21 | logical vector of \code{length(y)}
22 | }
23 | \description{
24 | Returns TRUE if x is nearest to y.
25 | There are two implementations. \code{nearest_lgl()} returns a logical vector
26 | when an element of the first argument is nearest to an element of the
27 | second argument. \code{nearest_qt_lgl()} is similar to \code{nearest_lgl()}, but
28 | instead determines if an element of the first argument is nearest to
29 | some value of the given quantile probabilities. See example for more
30 | detail.
31 | }
32 | \examples{
33 |
34 | x <- 1:10
35 | y <- 5:14
36 | z <- 16:25
37 | a <- -1:-5
38 | b <- -1
39 |
40 | nearest_lgl(x, y)
41 | nearest_lgl(y, x)
42 |
43 | nearest_lgl(x, z)
44 | nearest_lgl(z, x)
45 |
46 | nearest_lgl(x, a)
47 | nearest_lgl(a, x)
48 |
49 | nearest_lgl(x, b)
50 | nearest_lgl(b, x)
51 |
52 | library(dplyr)
53 | heights_near_min <- heights \%>\%
54 | filter(nearest_lgl(min(height_cm), height_cm))
55 |
56 | heights_near_fivenum <- heights \%>\%
57 | filter(nearest_lgl(fivenum(height_cm), height_cm))
58 |
59 | heights_near_qt_1 <- heights \%>\%
60 | filter(nearest_qt_lgl(height_cm, c(0.5)))
61 |
62 | heights_near_qt_3 <- heights \%>\%
63 | filter(nearest_qt_lgl(height_cm, c(0.1, 0.5, 0.9)))
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/reexports.R
3 | \name{\%>\%}
4 | \alias{\%>\%}
5 | \title{Pipe operator}
6 | \usage{
7 | lhs \%>\% rhs
8 | }
9 | \description{
10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/pisa.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data-pisa.R
3 | \docType{data}
4 | \name{pisa}
5 | \alias{pisa}
6 | \title{Student data from 2000-2018 PISA OECD data}
7 | \format{
8 | A tibble of the following variables
9 | \itemize{
10 | \item year the year of measurement
11 | \item country the three letter country code. This data contains Australia,
12 | New Zealand, and Indonesia. The full data from learningtower contains
13 | 99 countries.
14 | \item school_id The unique school identification number
15 | \item student_id The student identification number
16 | \item gender recorded gender - 1 female or 2 male or missing
17 | \item math Simulated score in mathematics
18 | \item read Simulated score in reading
19 | \item science Simulated score in science
20 | \item stu_wgt The final survey weight score for the student score
21 | }
22 |
23 | Understanding a bit more about the PISA data, the \code{school_id} and
24 | \code{student_id} are not unique across time. This means the longitudinal element
25 | is the country within a given year.
26 |
27 | We can cast \code{pisa} as a \code{tsibble}, but we need to aggregate the data to each
28 | year and country. In doing so, it is important that we provide some summary
29 | statistics of each of the scores - we want to include the mean, and minimum
30 | and maximum of the math, reading, and science scores, so that we do not lose
31 | the information of the individuals.
32 |
33 | The example code below does this, first grouping by year and country, then
34 | calculating the weighted mean for math, reading, and science. This can be
35 | done using the student weight variable \code{stu_wgt}, to get the survey weighted
36 | mean. The minimum and maximum are then calculated.
37 | }
38 | \usage{
39 | pisa
40 | }
41 | \description{
42 | A subset of PISA data, containing scores and other information
43 | from the triennial testing of 15 year olds around
44 | the globe. Original data available from
45 | \url{https://www.oecd.org/pisa/data/}. Data derived from
46 | \url{https://github.com/kevinwang09/learningtower}.
47 | }
48 | \examples{
49 | pisa
50 |
51 | library(dplyr)
52 | # Let's identify
53 |
54 | #1. The **key**, the individual, who would have repeated measurements.
55 | #2. The **index**, the time component.
56 | #3. The **regularity** of the time interval (index).
57 |
58 | # Here it looks like the key is the student_id, which is nested within
59 | # school_id and country,
60 |
61 | # And the index is year, so we would write the following
62 |
63 | as_tsibble(pisa,
64 | key = country,
65 | index = year)
66 |
67 | # We can assess the regularity of the year like so:
68 |
69 | index_regular(pisa, year)
70 | index_summary(pisa, year)
71 |
72 | # We can now convert this into a `tsibble`:
73 |
74 | pisa_ts <- as_tsibble(pisa,
75 | key = country,
76 | index = year,
77 | regular = TRUE)
78 |
79 | pisa_ts
80 | pisa_ts_au_nz <- pisa_ts \%>\% filter(country \%in\% c("AUS", "NZL", "QAT"))
81 |
82 | library(ggplot2)
83 | ggplot(pisa_ts_au_nz,
84 | aes(x = year,
85 | y = math_mean,
86 | group = country,
87 | colour = country)) +
88 | geom_ribbon(aes(ymin = math_min,
89 | ymax = math_max),
90 | fill = "grey70") +
91 | geom_line(size = 1) +
92 | lims(y = c(0, 1000)) +
93 | labs(y = "math") +
94 | facet_wrap(~country)
95 | }
96 | \keyword{datasets}
97 |
--------------------------------------------------------------------------------
/man/reexports.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/reexports.R
3 | \docType{import}
4 | \name{reexports}
5 | \alias{reexports}
6 | \alias{features}
7 | \alias{features_at}
8 | \alias{features_if}
9 | \alias{features_all}
10 | \alias{as_tsibble}
11 | \alias{n_keys}
12 | \title{Objects exported from other packages}
13 | \keyword{internal}
14 | \description{
15 | These objects are imported from other packages. Follow the links
16 | below to see their documentation.
17 |
18 | \describe{
19 | \item{fabletools}{\code{\link[fabletools]{features}}, \code{\link[fabletools:features]{features_all}}, \code{\link[fabletools:features]{features_at}}, \code{\link[fabletools:features]{features_if}}}
20 |
21 | \item{tsibble}{\code{\link[tsibble:as-tsibble]{as_tsibble}}, \code{\link[tsibble:key-data]{n_keys}}}
22 | }}
23 |
24 |
--------------------------------------------------------------------------------
/man/sample-n-frac-keys.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sample-frac-n-keys.R
3 | \name{sample-n-frac-keys}
4 | \alias{sample-n-frac-keys}
5 | \alias{sample_n_keys}
6 | \alias{sample_frac_keys}
7 | \title{Sample a number or fraction of keys to explore}
8 | \usage{
9 | sample_n_keys(.data, size)
10 |
11 | sample_frac_keys(.data, size)
12 | }
13 | \arguments{
14 | \item{.data}{tsibble object}
15 |
16 | \item{size}{The number or fraction of observations, depending on the
17 | function used. In \code{sample_n_keys}, it is a number > 0, and in
18 | \code{sample_frac_keys} it is a fraction, between 0 and 1.}
19 | }
20 | \value{
21 | tsibble with fewer observations of key
22 | }
23 | \description{
24 | Sample a number or fraction of keys to explore
25 | }
26 | \examples{
27 | library(ggplot2)
28 | sample_n_keys(heights,
29 | size = 10) \%>\%
30 | ggplot(aes(x = year,
31 | y = height_cm,
32 | group = country)) +
33 | geom_line()
34 | library(ggplot2)
35 | sample_frac_keys(wages,
36 | 0.1) \%>\%
37 | ggplot(aes(x = xp,
38 | y = unemploy_rate,
39 | group = id)) +
40 | geom_line()
41 | }
42 |
--------------------------------------------------------------------------------
/man/stratify_keys.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/stratify-keys.R
3 | \name{stratify_keys}
4 | \alias{stratify_keys}
5 | \title{Stratify the keys into groups to facilitate exploration}
6 | \usage{
7 | stratify_keys(.data, n_strata, along = NULL, fun = mean, ...)
8 | }
9 | \arguments{
10 | \item{.data}{data.frame to explore}
11 |
12 | \item{n_strata}{number of groups to create}
13 |
14 | \item{along}{variable to stratify along. This groups by each \code{key} and then
15 | takes a summary statistic (by default, the mean). It then arranges by the
16 | mean value for each \code{key} and assigns the \code{n_strata} groups.}
17 |
18 | \item{fun}{summary function. Default is mean.}
19 |
20 | \item{...}{extra arguments}
21 | }
22 | \value{
23 | data.frame with column, \code{.strata} containing \code{n_strata} groups
24 | }
25 | \description{
26 | To look at as much of the raw data as possible, it can be helpful to
27 | stratify the data into groups for plotting. You can \code{stratify} the
28 | \code{keys} using the \code{stratify_keys()} function, which adds the column,
29 | \code{.strata}. This allows the user to create facetted plots showing more
30 | of the raw data.
31 | }
32 | \examples{
33 | library(ggplot2)
34 | library(brolgar)
35 |
36 | heights \%>\%
37 | sample_frac_keys(size = 0.1) \%>\%
38 | stratify_keys(10) \%>\%
39 | ggplot(aes(x = height_cm,
40 | y = year,
41 | group = country)) +
42 | geom_line() +
43 | facet_wrap(~.strata)
44 |
45 | # now facet along some feature
46 | library(dplyr)
47 | heights \%>\%
48 | key_slope(height_cm ~ year) \%>\%
49 | right_join(heights, ., by = "country") \%>\%
50 | stratify_keys(n_strata = 12,
51 | along = .slope_year,
52 | fun = median) \%>\%
53 | ggplot(aes(x = year,
54 | y = height_cm,
55 | group = country)) +
56 | geom_line() +
57 | facet_wrap(~.strata)
58 |
59 |
60 | heights \%>\%
61 | stratify_keys(n_strata = 12,
62 | along = height_cm) \%>\%
63 | ggplot(aes(x = year,
64 | y = height_cm,
65 | group = country)) +
66 | geom_line() +
67 | facet_wrap(~.strata)
68 | }
69 |
--------------------------------------------------------------------------------
/man/wages.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data-wages-ts.R
3 | \docType{data}
4 | \name{wages}
5 | \alias{wages}
6 | \title{Wages data from National Longitudinal Survey of Youth (NLSY)}
7 | \format{
8 | A \code{tsibble} data frame with 6402 rows and 8 variables:
9 | \describe{
10 | \item{id}{1–888, for each subject. This forms the \code{key} of the data}
11 | \item{ln_wages}{natural log of wages, adjusted for inflation,
12 | to 1990 dollars.}
13 | \item{xp}{Experience - the length of time in the workforce (in years).
14 | This is treated as the time variable, with t0 for each subject starting
15 | on their first day at work. The number of time points and values of time
16 | points for each subject can differ. This forms the \code{index} of the data}
17 | \item{ged}{when/if a graduate equivalency diploma is obtained.}
18 | \item{xp_since_ged}{change in experience since getting a ged (if they get one)}
19 | \item{black}{categorical indicator of race = black.}
20 | \item{hispanic}{categorical indicator of race = hispanic.}
21 | \item{high_grade}{highest grade completed}
22 | \item{unemploy_rate}{unemployment rates in the local geographic region
23 | at each measurement time}
24 | }
25 | }
26 | \usage{
27 | wages
28 | }
29 | \description{
30 | This data contains measurements on hourly wages by years in
31 | the workforce, with education and race as covariates. The population
32 | measured was male high-school dropouts, aged between 14 and 17 years
33 | when first measured. \code{wages} is a time series \code{tsibble}.
34 | It comes from J. D. Singer and J. B. Willett.
35 | Applied Longitudinal Data Analysis.
36 | Oxford University Press, Oxford, UK, 2003.
37 | https://stats.idre.ucla.edu/stat/r/examples/alda/data/wages_pp.txt
38 | }
39 | \examples{
40 | # show the data
41 | wages
42 | library(ggplot2)
43 | # set seed so that the plots stay the same
44 | set.seed(2019-7-15-1300)
45 | # explore a sample of five individuals
46 | wages \%>\%
47 | sample_n_keys(size = 5) \%>\%
48 | ggplot(aes(x = xp,
49 | y = ln_wages,
50 | group = id)) +
51 | geom_line()
52 |
53 | # Explore many samples with `facet_sample()`
54 | ggplot(wages,
55 | aes(x = xp,
56 | y = ln_wages,
57 | group = id)) +
58 | geom_line() +
59 | facet_sample()
60 |
61 | # explore the five number summary of ln_wages with `features`
62 | wages \%>\%
63 | features(ln_wages, feat_five_num)
64 |
65 | }
66 | \keyword{datasets}
67 |
--------------------------------------------------------------------------------
/revdep/README.md:
--------------------------------------------------------------------------------
1 | # Platform
2 |
3 | |field |value |
4 | |:--------|:-------------------------------------------------------------------------------------------|
5 | |version |R version 4.2.2 (2022-10-31) |
6 | |os |macOS Ventura 13.2 |
7 | |system |aarch64, darwin20 |
8 | |ui |RStudio |
9 | |language |(EN) |
10 | |collate |en_US.UTF-8 |
11 | |ctype |en_US.UTF-8 |
12 | |tz |Australia/Hobart |
13 | |date |2023-02-06 |
14 | |rstudio |2023.03.0-daily+282 Cherry Blossom (desktop) |
15 | |pandoc |2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown) |
16 |
17 | # Dependencies
18 |
19 | |package |old |new |Δ |
20 | |:-------|:-----|:-----|:--|
21 | |brolgar |0.1.2 |1.0.0 |* |
22 |
23 | # Revdeps
24 |
25 |
--------------------------------------------------------------------------------
/revdep/cran.md:
--------------------------------------------------------------------------------
1 | ## revdepcheck results
2 |
3 | We checked 1 reverse dependency, comparing R CMD check results across CRAN and dev versions of this package.
4 |
5 | * We saw 0 new problems
6 | * We failed to check 0 packages
7 |
8 |
--------------------------------------------------------------------------------
/revdep/failures.md:
--------------------------------------------------------------------------------
1 | *Wow, no problems at all. :)*
--------------------------------------------------------------------------------
/revdep/problems.md:
--------------------------------------------------------------------------------
1 | *Wow, no problems at all. :)*
--------------------------------------------------------------------------------
/tests/spelling.R:
--------------------------------------------------------------------------------
# Run a spell check over the package documentation (vignettes included)
# when the spelling package is available. Misspellings are reported but
# do not fail the check, and the whole test is skipped on CRAN.
if (requireNamespace("spelling", quietly = TRUE)) {
  spelling::spell_check_test(
    vignettes = TRUE,
    error = FALSE,
    skip_on_cran = TRUE
  )
}
7 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
# Standard testthat bootstrap: attach the test framework and the package
# under test, then discover and run everything in tests/testthat/.
library(testthat)
library(brolgar)

test_check("brolgar")
5 |
--------------------------------------------------------------------------------
/tests/testthat/_snaps/keys-near.md:
--------------------------------------------------------------------------------
1 | # keys_near returns the same dimension and names etc
2 |
3 | Code
4 | summarise_ln_wages
5 | Output
6 | # A tibble: 71 x 5
7 | id ln_wages stat stat_value stat_diff
8 |
9 | 1 223 2.14 q_75 2.14 0
10 | 2 304 1.84 med 1.84 0
11 | 3 470 1.84 med 1.84 0
12 | 4 537 1.84 med 1.84 0
13 | 5 630 1.84 med 1.84 0
14 | 6 700 1.59 q_25 1.59 0
15 | 7 735 4.30 max 4.30 0
16 | 8 767 1.84 med 1.84 0
17 | 9 871 1.84 med 1.84 0
18 | 10 1150 1.84 med 1.84 0
19 | # i 61 more rows
20 |
21 | ---
22 |
23 | Code
24 | summarise_slope
25 | Output
26 | # A tibble: 5 x 5
27 | id .slope_xp stat stat_value stat_diff
28 |
29 | 1 2594 -0.00768 q_25 -0.00769 0.0000127
30 | 2 7918 -4.58 min -4.58 0
31 | 3 10380 0.0479 med 0.0480 0.0000498
32 | 4 12178 0.0946 q_75 0.0947 0.0000579
33 | 5 12455 13.2 max 13.2 0
34 |
35 |
--------------------------------------------------------------------------------
/tests/testthat/test-add-longnostic.R:
--------------------------------------------------------------------------------
library(dplyr)

# Helper: the column names expected after an add_* function inserts new
# columns between the tsibble index and the measured variables.
add_new_names <- function(.data, x) {
  c(
    tsibble::key_vars(.data),
    tsibble::index_var(.data),
    x,
    tsibble::measured_vars(.data)
  )
}

wages_test <- sample_frac_keys(wages, 0.05)

df_add_l_mean_1 <- wages_test %>%
  features(ln_wages, list(mean = mean)) %>%
  left_join(wages_test, by = "id", multiple = "all")
df_add_n_obs <- add_n_obs(wages_test)
df_add_key_slope <- add_key_slope(wages_test, ln_wages ~ xp)
df_add_key_slope_multi <- add_key_slope(wages_test, ln_wages ~ xp + ged)

# Adding a single summary column keeps the row count and adds one column.
updated_dim <- c(nrow(wages_test), ncol(wages_test) + 1)

test_that("add_* funs return a tsibble", {
  expect_s3_class(df_add_n_obs, class = c("tbl_ts"))
  expect_s3_class(df_add_key_slope, class = c("tbl_ts"))
  expect_s3_class(df_add_key_slope_multi, class = c("tbl_ts"))
})


test_that("longnostics returns the right dimensions", {
  expect_equal(dim(df_add_l_mean_1), updated_dim)
  expect_equal(dim(df_add_n_obs), updated_dim)
  expect_equal(dim(df_add_key_slope), c(nrow(wages_test), ncol(wages_test) + 2))
  expect_equal(
    dim(df_add_key_slope_multi),
    c(nrow(wages_test), ncol(wages_test) + 3)
  )
})

test_that("longnostic returns the right names", {
  expect_equal(
    names(df_add_l_mean_1),
    c(
      names(wages_test)[1],
      "mean",
      names(wages_test)[2:length(names(wages_test))]
    )
  )
  expect_equal(names(df_add_n_obs), add_new_names(wages_test, "n_obs"))
  expect_equal(
    names(df_add_key_slope),
    add_new_names(wages_test, c(".intercept", ".slope_xp"))
  )
  expect_equal(
    names(df_add_key_slope_multi),
    add_new_names(wages_test, c(".intercept", ".slope_xp", ".slope_ged"))
  )
})

test_that("longnostic returns a tibble", {
  expect_s3_class(df_add_l_mean_1, class = c("tbl"))
  expect_s3_class(df_add_n_obs, class = c("tbl"))
  expect_s3_class(df_add_key_slope, class = c("tbl"))
  expect_s3_class(df_add_key_slope_multi, class = c("tbl"))
})

# Map each column to its class name, for class-checking expectations.
classes <- function(x) purrr::map_chr(x, class)

test_that("longnostic returns correct classes", {
  expect_equal(classes(df_add_l_mean_1)[["mean"]], "numeric")
  expect_equal(classes(df_add_n_obs)[["n_obs"]], "integer")
  expect_equal(classes(df_add_key_slope)[[".intercept"]], "numeric")
  expect_equal(classes(df_add_key_slope)[[".slope_xp"]], "numeric")
  expect_equal(classes(df_add_key_slope_multi)[[".intercept"]], "numeric")
  expect_equal(classes(df_add_key_slope_multi)[[".slope_xp"]], "numeric")
  expect_equal(classes(df_add_key_slope_multi)[[".slope_ged"]], "numeric")
})

# NOTE: a duplicate `library(dplyr)` call was removed here; dplyr is
# already attached at the top of this file.
test_that("add-key-slope returns different slopes and intercepts", {
  expect_gte(n_distinct(df_add_key_slope$.intercept), 2)
  expect_gte(n_distinct(df_add_key_slope$.slope_xp), 2)
})
85 |
--------------------------------------------------------------------------------
/tests/testthat/test-b-funs.R:
--------------------------------------------------------------------------------
# A symmetric vector with an NA in the middle: every b_* summary should
# still return a non-missing value despite the NA.
x_na <- c(1:5, NA, 5:1)

test_that("b functions return non NA value", {
  expect_equal(b_diff_max(x_na), 1)
  expect_equal(b_diff_mean(x_na), 0)
  expect_equal(b_diff_median(x_na), 0)
  expect_equal(b_diff_min(x_na), -1)
  expect_equal(b_diff_q25(x_na), -1)
  expect_equal(b_diff_q75(x_na), 1)
  expect_equal(b_diff_sd(x_na), 1)
  expect_equal(b_diff_var(x_na), 1)
  expect_equal(b_iqr(x_na), (2 + 1 / 6))
  expect_equal(b_mad(x_na), 1.4826)
  expect_equal(b_max(x_na), 5)
  expect_equal(b_mean(x_na), 3)
  expect_equal(b_median(x_na), 3)
  expect_equal(b_min(x_na), 1)
  expect_equal(b_q25(x_na), (1.9 + (0.0001 / 0.006)))
  expect_equal(round(b_q75(x_na), 6), 4.083333)
  expect_equal(b_range(x_na), c(1, 5))
  expect_equal(b_range_diff(x_na), 4)
  expect_equal(round(b_sd(x_na), 6), 1.490712)
  expect_equal(round(b_var(x_na), 6), 2.222222)
})
25 |
--------------------------------------------------------------------------------
/tests/testthat/test-facet-sample.R:
--------------------------------------------------------------------------------
library(ggplot2)

# Seeds pin the sampled keys so the vdiffr snapshots stay stable.
set.seed(2019 - 07 - 23 - 1900)
p_sample_default <- ggplot(
  heights,
  aes(x = year, y = height_cm, group = country)
) +
  geom_line() +
  facet_sample()

set.seed(2019 - 07 - 23 - 1901)
p_sample_custom <- ggplot(
  heights,
  aes(x = year, y = height_cm, group = country)
) +
  geom_line(colour = "red") +
  facet_sample(n_per_facet = 4, n_facets = 6)

test_that("facet_sample works", {
  skip_on_cran()
  skip_on_ci()
  vdiffr::expect_doppelganger("gg_facet_sample", p_sample_default)
})


test_that("facet_sample works with different options", {
  skip_on_cran()
  skip_on_ci()
  vdiffr::expect_doppelganger("gg_facet_sample_alt", p_sample_custom)
})
30 |
--------------------------------------------------------------------------------
/tests/testthat/test-facet-strata.R:
--------------------------------------------------------------------------------
library(ggplot2)
library(dplyr)

# Seeds pin the stratification so the vdiffr snapshots stay stable.
set.seed(2019 - 07 - 23 - 1835)
p_strata <- ggplot(
  heights,
  aes(x = year, y = height_cm, group = country)
) +
  geom_line() +
  facet_strata()

set.seed(2019 - 07 - 23 - 1836)
p_strata_along <- wages %>%
  sample_frac_keys(0.1) %>%
  key_slope(ln_wages ~ xp) %>%
  right_join(wages, ., by = "id") %>%
  ggplot(aes(x = xp, y = ln_wages)) +
  geom_line(aes(group = id)) +
  geom_smooth(method = "lm") +
  facet_strata(along = .slope_xp)

test_that("facet_strata works without along", {
  skip_on_cran()
  skip_on_ci()
  vdiffr::expect_doppelganger("gg_facet_strata", p_strata)
})


test_that("facet_strata works with along", {
  skip_on_cran()
  skip_on_ci()
  vdiffr::expect_doppelganger("gg_facet_strata_along", p_strata_along)
})
33 |
--------------------------------------------------------------------------------
/tests/testthat/test-feature-diff-summary.R:
--------------------------------------------------------------------------------
diff_summary_tbl <- heights %>%
  features(year, feat_diff_summary)

# The key column plus one column per difference summary, in order.
expected_cols <- c(
  "country",
  "diff_min", "diff_q25", "diff_median", "diff_mean",
  "diff_q75", "diff_max", "diff_var", "diff_sd", "diff_iqr"
)

test_that("feat_diff_summary returns the right names", {
  expect_equal(names(diff_summary_tbl), expected_cols)
})

test_that("feat_diff_summary returns the right dimensions", {
  expect_equal(dim(diff_summary_tbl), c(144, 10))
})

test_that("feat_diff_summary returns all ids", {
  expect_equal(dplyr::n_distinct(diff_summary_tbl$country), 144)
})
29 |
--------------------------------------------------------------------------------
/tests/testthat/test-feature-five-num.R:
--------------------------------------------------------------------------------
# Five-number summary of ln_wages, one row per key.
five_num_tbl <- wages %>%
  features(ln_wages, feat_five_num)

test_that("feat_five_num returns the right names", {
  expect_equal(
    names(five_num_tbl),
    c("id", "min", "q25", "med", "q75", "max")
  )
})

test_that("feat_five_num returns the right dimensions", {
  expect_equal(dim(five_num_tbl), c(888, 6))
})

library(dplyr)
test_that("feat_five_num returns all ids", {
  expect_equal(n_distinct(five_num_tbl$id), 888)
})
19 |
--------------------------------------------------------------------------------
/tests/testthat/test-feature-monotonic.R:
--------------------------------------------------------------------------------
# Monotonicity indicators for each key's ln_wages series.
monotonic_tbl <- wages %>%
  features(ln_wages, feat_monotonic)

test_that("feat_monotonic returns the right names", {
  expect_equal(
    names(monotonic_tbl),
    c("id", "increase", "decrease", "unvary", "monotonic")
  )
})


test_that("feat_monotonic returns the right dimensions", {
  expect_equal(dim(monotonic_tbl), c(888, 5))
})

library(dplyr)
test_that("feat_monotonic returns all ids", {
  expect_equal(n_distinct(monotonic_tbl$id), 888)
})
20 |
--------------------------------------------------------------------------------
/tests/testthat/test-feature-n-obs.R:
--------------------------------------------------------------------------------
wages_test <- sample_frac_keys(wages, 0.05)
df_n_obs <- features(wages_test, ln_wages, n_obs)
# One row per key: the key column plus the n_obs column.
new_dims <- c(n_keys(wages_test), 2)

# Defined locally: this file previously relied on `classes()` being
# defined by another test file, but testthat test files do not share
# objects, so the helper must live here.
classes <- function(x) purrr::map_chr(x, class)

test_that("feature returns the right dimensions", {
  expect_equal(dim(df_n_obs), new_dims)
})

test_that("longnostic returns the right names", {
  expect_equal(names(df_n_obs), c("id", "n_obs"))
})

test_that("longnostic returns a tbl_df", {
  expect_s3_class(df_n_obs, class = c("tbl"))
})

test_that("longnostic returns correct classes", {
  expect_equal(classes(df_n_obs), c(id = "integer", n_obs = "integer"))
})
20 |
--------------------------------------------------------------------------------
/tests/testthat/test-feature-three-num.R:
--------------------------------------------------------------------------------
# Three-number summary (min / median / max) of ln_wages per key.
three_num_tbl <- wages %>%
  features(ln_wages, feat_three_num)

test_that("feat_three_num returns the right names", {
  expect_equal(names(three_num_tbl), c("id", "min", "med", "max"))
})

test_that("feat_three_num returns the right dimensions", {
  expect_equal(dim(three_num_tbl), c(888, 4))
})

library(dplyr)
test_that("feat_three_num returns all ids", {
  expect_equal(n_distinct(three_num_tbl$id), 888)
})
16 |
--------------------------------------------------------------------------------
/tests/testthat/test-features-brolgar.R:
--------------------------------------------------------------------------------
# The full feature set computed by feat_brolgar, one row per key.
brolgar_feats <- wages %>%
  features(ln_wages, feat_brolgar)

# Every column feat_brolgar is expected to produce, in order.
expected_feat_names <- c(
  "id",
  "min", "max", "median", "mean", "q25", "q75",
  "range1", "range2", "range_diff",
  "sd", "var", "mad", "iqr",
  "increase", "decrease", "unvary",
  "diff_min", "diff_q25", "diff_median", "diff_mean",
  "diff_q75", "diff_max", "diff_var", "diff_sd", "diff_iqr"
)

test_that("feat_brolgar returns the right names", {
  expect_equal(names(brolgar_feats), expected_feat_names)
})


test_that("feat_brolgar returns the right dimensions", {
  expect_equal(dim(brolgar_feats), c(888, 26))
})

library(dplyr)
test_that("feat_brolgar returns all ids", {
  expect_equal(n_distinct(brolgar_feats$id), 888)
})
47 |
--------------------------------------------------------------------------------
/tests/testthat/test-features-ranges.R:
--------------------------------------------------------------------------------
# Range-based summaries of ln_wages per key.
ranges_tbl <- wages %>%
  features(ln_wages, feat_ranges)

test_that("feat_ranges returns the right names", {
  expect_equal(names(ranges_tbl), c("id", "min", "max", "range_diff", "iqr"))
})

test_that("feat_ranges returns the right dimensions", {
  expect_equal(dim(ranges_tbl), c(888, 5))
})

library(dplyr)
test_that("feat_ranges returns all ids", {
  expect_equal(n_distinct(ranges_tbl$id), 888)
})
16 |
--------------------------------------------------------------------------------
/tests/testthat/test-features-spread.R:
--------------------------------------------------------------------------------
# Spread (variability) summaries of ln_wages per key.
spread_tbl <- wages %>%
  features(ln_wages, feat_spread)

test_that("feat_spread returns the right names", {
  expect_equal(names(spread_tbl), c("id", "var", "sd", "mad", "iqr"))
})

test_that("feat_spread returns the right dimensions", {
  expect_equal(dim(spread_tbl), c(888, 5))
})

library(dplyr)
test_that("feat_spread returns all ids", {
  expect_equal(n_distinct(spread_tbl$id), 888)
})
16 |
--------------------------------------------------------------------------------
/tests/testthat/test-filter-n-obs.R:
--------------------------------------------------------------------------------
# Helper: the column names expected once a new column sits between the
# tsibble index and the measured variables.
names_with_added <- function(data, x) {
  c(
    tsibble::key_vars(data),
    tsibble::index_var(data),
    x,
    tsibble::measured_vars(data)
  )
}

wages_with_n <- wages %>% add_n_obs()

# Filtered variants used across the expectations below.
gt_10 <- filter(wages_with_n, n_obs > 10)
et_2 <- filter(wages_with_n, n_obs == 2)
gte_10 <- filter(wages_with_n, n_obs >= 10)
lte_2 <- filter(wages_with_n, n_obs <= 2)


test_that("n_obs works with names", {
  expect_equal(as.numeric(n_obs(wages)), nrow(wages))
  expect_equal(n_obs(wages), c(n_obs = nrow(wages)))
  expect_equal(n_obs(wages, names = FALSE), nrow(wages))
})

test_that("correct number of observations are returned", {
  expect_equal(nrow(gt_10), 1105)
  expect_equal(nrow(et_2), 78)
  expect_equal(nrow(gte_10), 2235)
  expect_equal(nrow(lte_2), 116)
})

test_that("correct number of columns are returned", {
  expect_equal(ncol(gt_10), ncol(wages) + 1)
  expect_equal(ncol(et_2), ncol(wages) + 1)
  expect_equal(ncol(gte_10), ncol(wages) + 1)
  expect_equal(ncol(lte_2), ncol(wages) + 1)
})

test_that("n_obs is added to the dataframe", {
  expect_equal(names(gt_10), names_with_added(wages, "n_obs"))
})

test_that("is a tibble", {
  expect_s3_class(et_2, "tbl")
  expect_s3_class(gt_10, "tbl")
  expect_s3_class(gte_10, "tbl")
  expect_s3_class(lte_2, "tbl")
})
51 |
--------------------------------------------------------------------------------
/tests/testthat/test-if_tsibble.R:
--------------------------------------------------------------------------------
# test_if_tsibble() should pass silently for a tsibble (wages) and
# error for a plain data.frame (iris).
test_that("test_if_tsibble works", {
  expect_silent(test_if_tsibble(wages))
  expect_error(test_if_tsibble(iris))
})
5 |
--------------------------------------------------------------------------------
/tests/testthat/test-index-helpers.R:
--------------------------------------------------------------------------------
test_that("index regular returns logical", {
  regular_flag <- index_regular(pisa, year)
  expect_type(regular_flag, "logical")
})

test_that("index regular fails when non data.frame provided", {
  expect_error(index_regular(matrix(0), 1))
})

test_that("index regular fails when column doesn't exist", {
  expect_error(index_regular(pisa, wat))
})

test_that("index summary returns summaryDefault", {
  year_summary <- index_summary(pisa, year)
  expect_s3_class(year_summary, "summaryDefault")
  expect_type(year_summary, "double")
})

test_that("index summary fails when non data.frame provided", {
  expect_error(index_summary(matrix(0), 1))
})

test_that("index summary fails when column doesn't exist", {
  expect_error(index_summary(pisa, wat))
})
25 |
--------------------------------------------------------------------------------
/tests/testthat/test-key-slope.R:
--------------------------------------------------------------------------------
# A character string is not a formula and should be rejected, not parsed.
test_that("key_slope does not handle character input", {
  expect_error(key_slope(heights, "height_cm ~ year"))
})

# With a valid two-sided formula the result is a tibble of per-key fits.
test_that("key_slope does not fail", {
  expect_s3_class(key_slope(heights, height_cm ~ year), "tbl_df")
})

# Malformed strings and bare expressions should all error.
test_that("key_slope fails when given non-formula", {
  expect_error(key_slope(heights, "height_cm year"))
  expect_error(key_slope(heights, height_cm + .))
  expect_error(key_slope(heights, height_cm))
})

# add_key_slope() should mirror key_slope()'s input validation.
test_that("add_key_slope does not handle character input", {
  expect_error(add_key_slope(heights, "height_cm ~ year"))
})

test_that("add_key_slope does not fail", {
  expect_s3_class(add_key_slope(heights, height_cm ~ year), "tbl_df")
})


test_that("add_key_slope fails when given non-formula", {
  expect_error(add_key_slope(heights, "height_cm year"))
  expect_error(add_key_slope(heights, height_cm + .))
  expect_error(add_key_slope(heights, height_cm))
})
29 |
--------------------------------------------------------------------------------
/tests/testthat/test-keys-near.R:
--------------------------------------------------------------------------------
# ggplot2 is used directly below; attach it here rather than relying on
# another test file having attached it (test files must stand alone).
library(ggplot2)

# Return observations closest to the five number summary of ln_wages
summarise_ln_wages <- keys_near(.data = wages, var = ln_wages)

# Keys whose fitted slope is closest to the five number summary of slopes.
summarise_slope <- wages %>%
  key_slope(formula = ln_wages ~ xp) %>%
  keys_near(key = id, var = .slope_xp)

# NOTE: an unused `l_ranges` list and two stray top-level prints of the
# summaries were removed; they produced noise in the test log only.

test_that("keys_near returns the same dimension and names etc", {
  skip_on_cran()
  skip_on_os("linux")
  expect_snapshot(summarise_ln_wages)
  expect_snapshot(summarise_slope)
})

# Join the full data back on so the selected keys can be drawn in context.
summarise_slope_data <- dplyr::left_join(
  summarise_slope,
  wages,
  by = "id"
)

plot_stat <- ggplot(
  summarise_slope_data,
  aes(
    x = xp,
    y = ln_wages,
    group = id
  )
) +
  geom_line(
    data = wages,
    colour = "grey50",
    alpha = 0.5
  ) +
  geom_line(
    aes(colour = stat)
  )

test_that("keys_near returns a similar plot", {
  skip_on_ci()
  vdiffr::expect_doppelganger(
    "stat_plot",
    plot_stat
  )
})
56 |
--------------------------------------------------------------------------------
/tests/testthat/test-longnostics.R:
--------------------------------------------------------------------------------
# Feature-based summaries ("longnostics") on a sample of the wages data.
wages_test <- sample_frac_keys(wages, 0.05)

df_l_range_1 <- features(
  wages_test,
  ln_wages,
  list(range = ~ setNames(b_range(.), c("min", "max")))
)
df_l_max <- features(wages_test, ln_wages, c(max = b_max))
df_l_mean <- features(wages_test, ln_wages, c(mean = b_mean))
df_l_median <- features(wages_test, ln_wages, c(median = b_median))
df_l_min <- features(wages_test, ln_wages, c(min = b_min))
df_l_q1 <- features(wages_test, ln_wages, c(q25 = b_q25))
df_l_q3 <- features(wages_test, ln_wages, c(q75 = b_q75))
df_l_sd <- features(wages_test, ln_wages, c(sd = b_sd))
df_l_slope <- key_slope(wages_test, ln_wages ~ xp)
df_l_slope_multi <- key_slope(wages_test, ln_wages ~ xp + ged)

# Each single-feature result has one row per key and two columns.
new_dims <- c(n_keys(wages_test), 2)

test_that("longnostics returns the right dimensions", {
  expect_equal(dim(df_l_range_1), c(new_dims[1], new_dims[2] + 1))
  expect_equal(dim(df_l_max), new_dims)
  expect_equal(dim(df_l_mean), new_dims)
  expect_equal(dim(df_l_median), new_dims)
  expect_equal(dim(df_l_min), new_dims)
  expect_equal(dim(df_l_q1), new_dims)
  expect_equal(dim(df_l_q3), new_dims)
  expect_equal(dim(df_l_sd), new_dims)
  expect_equal(dim(df_l_slope), c(n_keys(wages_test), 3))
  expect_equal(dim(df_l_slope_multi), c(n_keys(wages_test), 4))
})

test_that("longnostic returns the right names", {
  expect_equal(names(df_l_range_1), c("id", "range_min", "range_max"))
  expect_equal(names(df_l_max), c("id", "max"))
  expect_equal(names(df_l_mean), c("id", "mean"))
  expect_equal(names(df_l_median), c("id", "median"))
  expect_equal(names(df_l_min), c("id", "min"))
  expect_equal(names(df_l_q1), c("id", "q25"))
  expect_equal(names(df_l_q3), c("id", "q75"))
  expect_equal(names(df_l_sd), c("id", "sd"))
  expect_equal(names(df_l_slope), c("id", ".intercept", ".slope_xp"))
  expect_equal(
    names(df_l_slope_multi),
    c("id", ".intercept", ".slope_xp", ".slope_ged")
  )
})

test_that("longnostic returns a tbl_df", {
  expect_s3_class(df_l_range_1, class = c("tbl"))
  expect_s3_class(df_l_max, class = c("tbl"))
  expect_s3_class(df_l_mean, class = c("tbl"))
  expect_s3_class(df_l_median, class = c("tbl"))
  expect_s3_class(df_l_min, class = c("tbl"))
  expect_s3_class(df_l_q1, class = c("tbl"))
  expect_s3_class(df_l_q3, class = c("tbl"))
  expect_s3_class(df_l_sd, class = c("tbl"))
  expect_s3_class(df_l_slope, class = c("tbl"))
  expect_s3_class(df_l_slope_multi, class = c("tbl"))
})

# Map each column to its class name, for class-checking expectations.
classes <- function(x) purrr::map_chr(x, class)

test_that("longnostic returns correct classes", {
  expect_equal(
    classes(df_l_range_1),
    c(id = "integer", range_min = "numeric", range_max = "numeric")
  )
  expect_equal(classes(df_l_max), c(id = "integer", max = "numeric"))
  expect_equal(classes(df_l_mean), c(id = "integer", mean = "numeric"))
  expect_equal(classes(df_l_median), c(id = "integer", median = "numeric"))
  expect_equal(classes(df_l_min), c(id = "integer", min = "numeric"))
  expect_equal(classes(df_l_q1), c(id = "integer", q25 = "numeric"))
  expect_equal(classes(df_l_q3), c(id = "integer", q75 = "numeric"))
  expect_equal(classes(df_l_sd), c(id = "integer", sd = "numeric"))
  expect_equal(
    classes(df_l_slope),
    c(id = "integer", .intercept = "numeric", .slope_xp = "numeric")
  )
  expect_equal(
    classes(df_l_slope_multi),
    c(
      id = "integer",
      .intercept = "numeric",
      .slope_xp = "numeric",
      .slope_ged = "numeric"
    )
  )
})

test_that("add-key-slope returns different slopes and intercepts", {
  # dplyr:: is used explicitly: this file never attaches dplyr, so a bare
  # n_distinct() only worked when another test file had attached it first,
  # making the test order-dependent.
  expect_gte(dplyr::n_distinct(df_l_slope$.intercept), 2)
  expect_gte(dplyr::n_distinct(df_l_slope$.slope_xp), 2)
})
95 |
--------------------------------------------------------------------------------
/tests/testthat/test-monotonics.R:
--------------------------------------------------------------------------------
# Fixture vectors covering each monotonicity case.
vec_inc <- 1:10
vec_dec <- 10:1
# A fixed, clearly non-monotone, non-constant permutation. The original
# used an unseeded sample(1:10), which is not reproducible and could
# (with small probability) yield a monotone ordering, making the tests
# below flaky.
vec_ran <- c(4, 10, 2, 7, 1, 9, 3, 6, 8, 5)
vec_flat <- rep.int(1, 10)

test_that("increasing correctly guesses", {
  expect_true(increasing(vec_inc))
  expect_false(increasing(vec_dec))
  expect_false(increasing(vec_ran))
  expect_false(increasing(vec_flat))
  expect_false(increasing(1))
})

test_that("decreasing correctly guesses", {
  expect_false(decreasing(vec_inc))
  expect_true(decreasing(vec_dec))
  expect_false(decreasing(vec_ran))
  expect_false(decreasing(vec_flat))
  expect_false(decreasing(1))
})

test_that("unvarying correctly guesses", {
  expect_false(unvarying(vec_inc))
  expect_false(unvarying(vec_dec))
  expect_false(unvarying(vec_ran))
  expect_true(unvarying(vec_flat))
  expect_false(unvarying(1))
})

test_that("monotonic correctly guesses", {
  expect_true(monotonic(vec_inc))
  expect_true(monotonic(vec_dec))
  expect_false(monotonic(vec_ran))
  expect_false(monotonic(vec_flat))
  expect_false(monotonic(1))
})

wages_monotonic <- wages %>% features(ln_wages, feat_monotonic)

# Each key should be flagged as at most one of increase / decrease / unvary.
test_that("wages_monotonic produces output 0 or 1", {
  expect_equal(max(rowSums(wages_monotonic[, 2:4])), 1)
  expect_false(any(rowSums(wages_monotonic[, 2:4]) > 1))
})
44 |
--------------------------------------------------------------------------------
/tests/testthat/test-nearests.R:
--------------------------------------------------------------------------------
library(dplyr)

# The shared test vector. (A dead `x <- 1:100` that was immediately
# overwritten, and a redundant re-assignment later in the file, were
# removed.)
x <- 1:10

test_that("`near_between()` works on a vector", {
  expect_equal(
    near_between(x, from = 0.4, to = 0.6),
    c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 4))
  )
  expect_equal(
    near_between(x, from = 0.1, to = 0.3),
    c(FALSE, TRUE, TRUE, rep(FALSE, 7))
  )
})

wages_feat <- wages %>%
  features(ln_wages, list(min = min))

wages_feat_near_btn <- wages_feat %>%
  filter(near_between(x = min, from = 0.4, to = 0.6))

test_that("`near_between()` works on a data.frame", {
  expect_s3_class(wages_feat_near_btn, "data.frame")
  expect_lte(nrow(wages_feat_near_btn), nrow(wages_feat))
  expect_equal(ncol(wages_feat_near_btn), ncol(wages_feat))
})

test_that("`near_middle()` works on a vector", {
  expect_equal(
    near_middle(x = x, middle = 0.5, within = 0.2),
    c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 4))
  )
  expect_equal(
    near_middle(x = x, middle = 0.2, within = 0.1),
    c(FALSE, FALSE, TRUE, rep(FALSE, 7))
  )
})

wages_feat_near_middle <- wages_feat %>%
  filter(near_middle(x = min, middle = 0.5, within = 0.2))

test_that("`near_middle()` works on a data.frame", {
  expect_s3_class(wages_feat_near_middle, "data.frame")
  expect_lte(nrow(wages_feat_near_middle), nrow(wages_feat))
  expect_equal(ncol(wages_feat_near_middle), ncol(wages_feat))
})


test_that("`near_quantile()` works on a vector", {
  expect_equal(near_quantile(x, probs = 0.5, tol = 0.01), rep(FALSE, 10))
  expect_equal(
    near_quantile(x, probs = 0.5, tol = 0.5),
    c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 4))
  )
  expect_equal(near_quantile(x, probs = 0.25, tol = 0.01), rep(FALSE, 10))
  expect_equal(
    near_quantile(x, probs = 0.25, tol = 0.5),
    c(rep(FALSE, 2), TRUE, rep(FALSE, 7))
  )
  expect_equal(
    near_quantile(x, probs = c(0.25, 0.75), tol = 0.01),
    rep(FALSE, 10)
  )
  expect_equal(
    near_quantile(x, probs = c(0.25, 0.75), tol = 0.5),
    c(rep(FALSE, 2), TRUE, rep(FALSE, 4), TRUE, FALSE, FALSE)
  )
})

wages_q1 <- wages %>%
  features(ln_wages, list(min = min)) %>%
  filter(near_quantile(
    x = min,
    probs = 0.5,
    tol = 0.01
  ))

wages_q2 <- wages %>%
  features(ln_wages, list(min = min)) %>%
  filter(near_quantile(
    x = min,
    probs = c(0.25, 0.5, 0.75),
    tol = 0.01
  ))

test_that("`near_quantile()` works on a data.frame", {
  expect_s3_class(wages_q1, "data.frame")
  expect_s3_class(wages_q2, "data.frame")
  expect_lte(nrow(wages_q1), nrow(wages_feat))
  expect_lte(nrow(wages_q2), nrow(wages_feat))
  expect_equal(ncol(wages_q1), ncol(wages_feat))
  expect_equal(ncol(wages_q2), ncol(wages_feat))
})

# Companion vectors for the nearest_* tests; x is still 1:10.
y <- 5:14
z <- 16:25
a <- -1:-5
b <- -1

test_that("`nearest_lgl()` works for vectors", {
  expect_equal(nearest_lgl(x, y), c(rep(TRUE, 6), rep(FALSE, 4)))
  expect_equal(nearest_lgl(y, x), c(rep(FALSE, 4), rep(TRUE, 6)))
  expect_equal(nearest_lgl(x, z), c(TRUE, rep(FALSE, 9)))
  expect_equal(nearest_lgl(z, x), c(rep(FALSE, 9), TRUE))
  expect_equal(nearest_lgl(x, a), c(TRUE, rep(FALSE, 4)))
  expect_equal(nearest_lgl(a, x), c(TRUE, rep(FALSE, 9)))
  expect_equal(nearest_lgl(x, b), TRUE)
  expect_equal(nearest_lgl(b, x), c(TRUE, rep(FALSE, 9)))
})

wages_near_min <- wages %>%
  filter(nearest_lgl(min(ln_wages), ln_wages))

wages_near_fivenum <- wages %>%
  filter(nearest_lgl(fivenum(ln_wages), ln_wages))

test_that("`nearest_lgl()` works for data.frames", {
  expect_s3_class(wages_near_min, "data.frame")
  expect_s3_class(wages_near_fivenum, "data.frame")
  expect_lte(nrow(wages_near_min), nrow(wages))
  expect_lte(nrow(wages_near_fivenum), nrow(wages))
  expect_equal(ncol(wages_near_min), ncol(wages))
  expect_equal(ncol(wages_near_fivenum), ncol(wages))
  expect_equal(nrow(wages_near_min), 1)
  expect_equal(nrow(wages_near_fivenum), 5)
})


test_that("`nearest_qt_lgl()` works for vectors", {
  expect_equal(nearest_qt_lgl(x, c(0.5)), c(rep(FALSE, 4), TRUE, rep(FALSE, 5)))
  expect_equal(sum(nearest_qt_lgl(x, c(0.5))), 1)
  expect_equal(
    nearest_qt_lgl(x, c(0.25, 0.5, 0.75)),
    c(FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE)
  )
  expect_equal(sum(nearest_qt_lgl(x, c(0.25, 0.5, 0.75))), 3)
  expect_equal(
    nearest_qt_lgl(x, c(0.1, 0.5, 0.9)),
    c(FALSE, TRUE, FALSE, FALSE, TRUE, rep(FALSE, 3), TRUE, FALSE)
  )
  expect_equal(sum(nearest_qt_lgl(x, c(0.1, 0.5, 0.9))), 3)
})

wages_near_qt_1 <- wages %>%
  filter(nearest_qt_lgl(ln_wages, c(0.5)))

wages_near_qt_3 <- wages %>%
  filter(nearest_qt_lgl(ln_wages, c(0.1, 0.5, 0.9)))

test_that("`nearest_qt_lgl()` works for data.frames", {
  expect_s3_class(wages_near_qt_1, "data.frame")
  expect_s3_class(wages_near_qt_3, "data.frame")
  expect_lte(nrow(wages_near_qt_1), nrow(wages))
  expect_lte(nrow(wages_near_qt_3), nrow(wages))
  expect_equal(ncol(wages_near_qt_1), ncol(wages))
  expect_equal(ncol(wages_near_qt_3), ncol(wages))
  expect_equal(nrow(wages_near_qt_1), 1)
  expect_equal(nrow(wages_near_qt_3), 3)
})
162 |
--------------------------------------------------------------------------------
/tests/testthat/test-sample-n-frac-keys.R:
--------------------------------------------------------------------------------
# Fixtures: a fixed-count sample and a fractional sample of the wage keys.
l_sample_n_keys <- sample_n_keys(.data = wages, size = 10)

l_sample_frac_keys <- sample_frac_keys(.data = wages, size = 0.11)

# The fraction passed to sample_frac_keys() must lie in (0, 1]:
# size = 1 returns every key, anything above 1 is an error.
# (The old name said "size > 0", which did not describe the assertions.)
test_that("fails when size > 1 given", {
  expect_error(sample_frac_keys(.data = wages, size = 2))
  # Sampling all keys returns the full data; compare against dim(wages)
  # rather than the hard-coded c(6402, 9) so the test survives changes
  # to the bundled dataset (matching the aus_retail test file).
  expect_equal(dim(sample_frac_keys(.data = wages, size = 1)), dim(wages))
})
9 |
# Both sampling flavours should behave the same structurally, so the
# structural checks loop over the pair of fixtures.
sampled_fixtures <- list(l_sample_n_keys, l_sample_frac_keys)

test_that("correct number of columns returned", {
  for (keys_sample in sampled_fixtures) {
    expect_equal(ncol(keys_sample), ncol(wages))
  }
})

test_that("correct number of ids returned", {
  # 10 keys were requested directly; 0.11 of the 888 wage ids rounds to 98.
  expect_equal(dplyr::n_distinct(l_sample_n_keys$id), 10)
  expect_equal(dplyr::n_distinct(l_sample_frac_keys$id), 98)
})

test_that("correct names returned", {
  for (keys_sample in sampled_fixtures) {
    expect_equal(names(keys_sample), names(wages))
  }
})

test_that("Returns a tibble", {
  for (keys_sample in sampled_fixtures) {
    expect_s3_class(keys_sample, "tbl")
  }
})

# Column classes, named by column, for comparing data frames.
classes <- function(x) purrr::map_chr(x, class)

test_that("Returns correct classes", {
  for (keys_sample in sampled_fixtures) {
    expect_equal(classes(keys_sample), classes(wages))
  }
})
36 |
--------------------------------------------------------------------------------
/tests/testthat/test-sample-n-frac-multiple-keys.R:
--------------------------------------------------------------------------------
library(tsibble)
library(brolgar)
library(dplyr)
# tsibbledata is only suggested, so skip (rather than error) when absent.
skip_if_not_installed("tsibbledata")
library(tsibbledata)

data("aus_retail")

# aus_retail has two key variables (State, Industry), so these checks
# guard against regressions in multi-key sampling. (The stale
# "# will fail" note has been removed: the behaviour is asserted below.)
test_that("multiple keys doesn't fail", {
  expect_equal(ncol(sample_n_keys(aus_retail, size = 10)), 5)
  expect_gt(nrow(sample_n_keys(aus_retail, size = 10)), 1)

  expect_equal(ncol(sample_frac_keys(aus_retail, size = 0.11)), 5)
  expect_gt(nrow(sample_frac_keys(aus_retail, size = 0.11)), 1)
})

# The sampling fraction must lie in (0, 1]: size = 1 keeps everything,
# anything larger is an error. (Renamed from "size > 0", which did not
# describe the assertions.)
test_that("fails when size > 1 given", {
  expect_error(sample_frac_keys(aus_retail, size = 2))
  expect_equal(dim(sample_frac_keys(aus_retail, size = 1)), dim(aus_retail))
})
22 |
# Fixtures shared by the remaining tests.
#
# The previous exploratory statements here (unassigned n_distinct() of
# State/Industry on a throwaway sample, a bare sample_n_keys(wages, ...)
# call, and a bare n_keys() call) made no assertions and only printed
# noise into the test log, so they have been removed.
sample_n_keys_retail_nkeys <- sample_n_keys(aus_retail, size = 10)
sample_frac_keys_retail_nkeys <- sample_frac_keys(aus_retail, size = 0.11)
35 |
test_that("correct number of columns returned", {
  expect_equal(ncol(sample_n_keys_retail_nkeys), ncol(aus_retail))
  expect_equal(ncol(sample_frac_keys_retail_nkeys), ncol(aus_retail))
})

# NOTE(review): a previously-defined
# `aus_retail_keys <- tsibble::key_vars(aus_retail)` was never used by
# any test, so it has been dropped.

test_that("correct number of keys returned", {
  # 10 keys were requested directly; 17 = round(0.11 * n_keys(aus_retail)).
  expect_equal(n_keys(sample_n_keys_retail_nkeys), 10)
  expect_equal(n_keys(sample_frac_keys_retail_nkeys), 17)
})

test_that("correct names returned", {
  expect_equal(names(sample_n_keys_retail_nkeys), names(aus_retail))
  expect_equal(names(sample_frac_keys_retail_nkeys), names(aus_retail))
})

test_that("Returns a tibble", {
  expect_s3_class(sample_n_keys_retail_nkeys, "tbl")
  expect_s3_class(sample_frac_keys_retail_nkeys, "tbl")
})

# Column classes as a named list; map() rather than map_chr() because a
# column may carry more than one class (e.g. a tsibble index column).
classes <- function(x) purrr::map(x, class)

test_that("Returns correct classes", {
  expect_equal(classes(sample_n_keys_retail_nkeys), classes(aus_retail))
  expect_equal(classes(sample_frac_keys_retail_nkeys), classes(aus_retail))
})
64 |
--------------------------------------------------------------------------------
/tests/testthat/test-stratify-key.R:
--------------------------------------------------------------------------------
library(tsibble)
library(dplyr)

# A 10% sample keeps the stratification tests fast.
wages_test <- sample_frac_keys(wages, size = 0.10)

wages_strat <- stratify_keys(wages_test, n_strata = 10)

wages_strat_along <- wages_test %>%
  stratify_keys(n_strata = 10, along = unemploy_rate, fun = median)

# Both stratified fixtures share the same structural contract, so each
# check loops over the pair.
test_that("correct number of observations are returned", {
  for (strat in list(wages_strat, wages_strat_along)) {
    expect_equal(nrow(strat), nrow(wages_test))
  }
})

test_that("correct number of columns are returned", {
  # stratify_keys() appends exactly one column: .strata.
  for (strat in list(wages_strat, wages_strat_along)) {
    expect_equal(ncol(strat), ncol(wages_test) + 1)
  }
})

test_that(".strata is added to the dataframe", {
  for (strat in list(wages_strat, wages_strat_along)) {
    expect_equal(names(strat), c(names(wages_test), ".strata"))
  }
})

test_that("is a tsibble", {
  for (strat in list(wages_strat, wages_strat_along)) {
    expect_s3_class(strat, class = "tbl_ts")
  }
})
29 |
# Mean unemployment rate per stratum, in stratum order. summarise_at()
# is superseded; a single column/function pair is clearer as a plain
# summarise() call, and the output column is still named `mean`.
wages_strat_along_sum <- wages_strat_along %>%
  as_tibble() %>%
  group_by(.strata) %>%
  summarise(mean = mean(unemploy_rate, na.rm = TRUE))

test_that("stratify_keys with along returns strata that decrease on average", {
  # `along = unemploy_rate` orders strata so their means trend downwards,
  # hence the successive differences should be negative on average.
  expect_true(mean(diff(wages_strat_along_sum$mean)) < 0)
})
38 |
test_that("The strata are unique within each id", {
  # If each id maps to exactly one stratum, the distinct (id, .strata)
  # pairs number the same as the keys.
  distinct_id_strata <- wages_test %>%
    stratify_keys(n_strata = 4) %>%
    select(id, ln_wages, .strata) %>%
    as_tibble() %>%
    distinct(id, .strata) %>%
    nrow()

  expect_equal(distinct_id_strata, tsibble::n_keys(wages_test))
})

test_that("possible_strata returns the same length as the number of keys", {
  expect_length(possible_strata(wages_test, 2), n_keys(wages_test))
})


test_that("The number of groups in each strata equals the number of keys", {
  # Summing distinct ids over strata must account for every sampled key.
  ids_per_stratum <- wages_test %>%
    sample_n_keys(12) %>%
    select(id) %>%
    stratify_keys(n_strata = 4) %>%
    as_tibble() %>%
    group_by(.strata) %>%
    summarise(n = n_distinct(id)) %>%
    pull(n)
  expect_equal(tsibble::n_keys(sample_n_keys(wages_test, 12)), sum(ids_per_stratum))
})
68 |
69 |
# Count distinct ids per stratum after sampling `n_ids` keys and
# splitting them into `n_strata` strata. Extracted from three verbatim
# copies of the same pipeline.
ids_per_strata <- function(data, n_ids, n_strata = 4) {
  data %>%
    sample_n_keys(n_ids) %>%
    stratify_keys(n_strata = n_strata) %>%
    as_tibble() %>%
    group_by(.strata) %>%
    summarise(n = n_distinct(id))
}

strata_equal_1 <- ids_per_strata(wages_test, 12)
strata_equal_2 <- ids_per_strata(wages_test, 24)
strata_equal_3 <- ids_per_strata(wages_test, 25)

test_that("stratify_keys returns the same number of keys per strata", {
  # 12 and 24 keys divide evenly across 4 strata (3 and 6 per stratum);
  # 25 cannot, so one stratum gets 7 and the rest get 6. The original
  # wrote c(6, 7, 6, 6); %in% only needs the unique values.
  expect_true(all(strata_equal_1$n == 3))
  expect_true(all(strata_equal_2$n == 6))
  expect_true(all(strata_equal_3$n %in% c(6, 7)))
})

# need to add tests for each

# ggplot(wages_strat_sum,
#        aes(x = .strata,
#            y = .slope_xp_mean)) +
#   geom_point()
103 |
--------------------------------------------------------------------------------
/tests/testthat/test-utils.R:
--------------------------------------------------------------------------------
test_that("pct works", {
  # pct() turns a proportion into a "q_XX" percentile label.
  expect_equal(pct(0.1), "q_10")
  expect_equal(pct(0.2), "q_20")
  expect_equal(pct(0.3), "q_30")
  expect_equal(pct(0.99), "q_99")
})

q_vec <- 1:100

test_that("qtl works", {
  # Values must agree with stats::quantile() for scalar and vector probs.
  probs_list <- list(
    0.25,
    c(0.25, 0.50),
    seq(0.25, 0.5, by = 0.05)
  )
  for (probs in probs_list) {
    expect_equal(
      as.numeric(qtl(q_vec, probs)),
      quantile(q_vec, probs, names = FALSE)
    )
  }

  # qtl() attaches "q_XX" names to the result.
  expect_equal(names(qtl(q_vec, 0.25)), "q_25")
  expect_equal(names(qtl(q_vec, c(0.25, 0.50))), c("q_25", "q_50"))
  expect_equal(
    names(qtl(q_vec, seq(0.25, 0.5, by = 0.05))),
    c("q_25", "q_30", "q_35", "q_40", "q_45", "q_50")
  )
})

test_that("test_if_null returns different error messages", {
  # Default message mentions NULL; a custom message overrides it.
  expect_error(test_if_null(NULL))
  expect_error(test_if_null(NULL), regexp = "must not be NULL")
  expect_error(
    test_if_null(NULL, message = "wat even is this"),
    regexp = "even is this"
  )
})
44 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/vignettes/exploratory-modelling.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Exploratory Modelling"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{Exploratory Modelling}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | ```{r, include = FALSE}
11 | options(rmarkdown.html_vignette.check_title = FALSE)
12 | knitr::opts_chunk$set(
13 | collapse = TRUE,
14 | comment = "#>",
15 | warning = FALSE,
16 | message = FALSE
17 | )
18 | ```
19 |
20 | ```{r setup}
21 | library(brolgar)
22 | ```
23 |
24 | It can be useful to fit a model to explore your data. One technique is to fit a linear model for each group in a dataset. For example, you could fit a linear model for each key in the data.
25 |
26 | `brolgar` provides a helper function to help with this, called `key_slope()`.
27 |
28 | `key_slope()` returns the intercept and slope estimate for each key, given a linear model formula. We can get the number of observations, and slope information for each individual to identify those that are decreasing over time.
29 |
30 | ```{r use-gghighlight}
31 | key_slope(wages,ln_wages ~ xp)
32 | ```
33 |
34 | We can then join these summaries back to the data:
35 |
36 | ```{r show-wages-lg}
37 | library(dplyr)
38 | wages_slope <- key_slope(wages,ln_wages ~ xp) %>%
39 | left_join(wages, by = "id")
40 |
41 | wages_slope
42 | ```
43 |
44 | And highlight those individuals with a negative slope using `gghighlight`:
45 |
46 | ```{r use-gg-highlight}
47 | library(gghighlight)
48 |
49 | wages_slope %>%
50 | as_tibble() %>% # workaround for gghighlight + tsibble
51 | ggplot(aes(x = xp,
52 | y = ln_wages,
53 | group = id)) +
54 | geom_line() +
55 | gghighlight(.slope_xp < 0)
56 | ```
57 |
58 | # Find keys near other summaries with `keys_near()`
59 |
60 | We might want to further summarise our exploratory modelling by finding those slopes that are near the five number summary values:
61 |
62 | ```{r summary-slope}
63 | summary(wages_slope$.slope_xp)
64 | ```
65 |
66 | Finding those groups that are near these values can be surprisingly challenging!
67 |
68 | `brolgar` makes it easier by providing the `keys_near()` function. You tell it what the key is, what variable you want to summarise by, and then by default it returns those keys near the five number summary. Let's return the keys near the `.slope_xp`:
69 |
70 | ```{r keys-near}
71 | wages_slope %>%
72 | keys_near(key = id,
73 | var = .slope_xp)
74 | ```
75 |
76 | Here it returns the `id`, the `.slope_xp`, the statistic that each key was closest to, and the difference between `.slope_xp` and that statistic.
77 |
78 | You can visualise these summary keys by joining them back to the data:
79 |
80 | ```{r keys-near-plot}
81 | wages_slope %>%
82 | keys_near(key = id,
83 | var = .slope_xp) %>%
84 | left_join(wages, by = "id") %>%
85 | ggplot(aes(x = xp,
86 | y = ln_wages,
87 | group = id,
88 | colour = stat)) +
89 | geom_line()
90 | ```
91 |
92 | You can read more about `keys_near()` in the [Identifying interesting observations](https://brolgar.njtierney.com/articles/id-interesting-obs.html) vignette.
93 |
--------------------------------------------------------------------------------
/vignettes/finding-features.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Finding Features in Data"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{Finding Features in Data}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | ```{r setup, include = FALSE}
11 | options(rmarkdown.html_vignette.check_title = FALSE)
12 | knitr::opts_chunk$set(
13 | collapse = TRUE,
14 | comment = "#>",
15 | warning = FALSE,
16 | message = FALSE
17 | )
18 |
19 | ```
20 |
21 | When you are presented with longitudinal data, it is useful to summarise the data into a format where you have one row per key. That means one row per unique identifier of the data - if you aren't sure what this means, see the vignette, ["Longitudinal Data Structures"](https://brolgar.njtierney.com/articles/longitudinal-data-structures.html).
22 |
23 | So, say for example you wanted to find features in the wages data, which looks like this:
24 |
25 | ```{r print-wages}
26 | library(brolgar)
27 | wages
28 | ```
29 |
30 | You can return a dataset that has one row per key, with say the minimum value for `ln_wages`, for each key:
31 |
32 | ```{r wages-summary, echo = FALSE}
33 | wages_min <- wages %>%
34 | features(ln_wages,
35 | list(min = min))
36 |
37 | wages_min
38 | ```
39 |
40 | This then allows us to summarise these kinds of data, to say for example find the distribution of minimum values:
41 |
42 | ```{r gg-min-wages}
43 | library(ggplot2)
44 | ggplot(wages_min,
45 | aes(x = min)) +
46 | geom_density()
47 | ```
48 |
49 | We call these summaries `features` of the data.
50 |
51 | This vignette discusses how to calculate these features of the data.
52 |
53 | # Calculating features
54 |
55 | We can calculate `features` of longitudinal data using the `features` function (from [`fabletools`](https://fabletools.tidyverts.org/), made available in `brolgar`).
56 |
57 | `features` works by specifying the data, the variable to summarise, and the feature to calculate:
58 |
59 | ```r
60 | features(, , )
61 | ```
62 |
63 | or with the pipe:
64 |
65 | ```r
66 | %>% features(, )
67 | ```
68 |
69 | As an example, we can calculate a five number summary (minimum, 25th quantile, median, 75th quantile, and maximum) of the data using `feat_five_num`, like so:
70 |
71 | ```{r features-fivenum}
72 | wages_five <- wages %>%
73 | features(ln_wages, feat_five_num)
74 |
75 | wages_five
76 | ```
77 |
78 | Here we are taking the `wages` data, piping it to `features`, and then telling it to summarise the `ln_wages` variable, using `feat_five_num`.
79 |
80 | There are several handy functions for calculating features of the data that
81 | `brolgar` provides. These all start with `feat_`.
82 |
83 | You can, for example, find those whose values only increase or decrease with `feat_monotonic`:
84 |
85 | ```{r features-monotonic}
86 | wages_mono <- wages %>%
87 | features(ln_wages, feat_monotonic)
88 |
89 | wages_mono
90 | ```
91 |
92 | These could then be used to identify individuals who only increase like so:
93 |
94 | ```{r wages-mono-filter}
95 | library(dplyr)
96 | wages_mono %>%
97 | filter(increase)
98 | ```
99 |
100 | They could then be joined back to the data
101 |
102 | ```{r wages-mono-join}
103 | wages_mono_join <- wages_mono %>%
104 | filter(increase) %>%
105 | left_join(wages, by = "id")
106 |
107 | wages_mono_join
108 | ```
109 |
110 | And these could be plotted:
111 |
112 | ```{r gg-wages-mono}
113 | ggplot(wages_mono_join,
114 | aes(x = xp,
115 | y = ln_wages,
116 | group = id)) +
117 | geom_line()
118 | ```
119 |
120 | To get a sense of the data and where it came from, we could create a plot with `gghighlight` to highlight those that only increase, by using `gghighlight(increase)` - since `increase` is a logical, this tells `gghighlight` to highlight those that are TRUE.
121 |
122 | ```{r gg-high-mono}
123 | library(gghighlight)
124 | wages_mono %>%
125 | left_join(wages, by = "id") %>%
126 | ggplot(aes(x = xp,
127 | y = ln_wages,
128 | group = id)) +
129 | geom_line() +
130 | gghighlight(increase)
131 | ```
132 |
133 | You can explore the available features, see the function [References](https://brolgar.njtierney.com/reference/index.html)
134 |
135 | # Creating your own Features
136 |
137 | To create your own features or summaries to pass to `features`, you provide a named list of functions. For example:
138 |
139 | ```{r create-three}
140 | library(brolgar)
141 | feat_three <- list(min = min,
142 | med = median,
143 | max = max)
144 |
145 | feat_three
146 |
147 | ```
148 |
149 | These are then passed to `features` like so:
150 |
151 | ```{r demo-feat-three}
152 | wages %>%
153 | features(ln_wages, feat_three)
154 |
155 | heights %>%
156 | features(height_cm, feat_three)
157 | ```
158 |
159 | Inside `brolgar`, the features are created with the following syntax:
160 |
161 | ```{r demo-feat-five-num, eval = FALSE}
162 | feat_five_num <- function(x, ...) {
163 | list(
164 | min = b_min(x, ...),
165 | q25 = b_q25(x, ...),
166 | med = b_median(x, ...),
167 | q75 = b_q75(x, ...),
168 | max = b_max(x, ...)
169 | )
170 | }
171 | ```
172 |
173 | Here the functions `b_` are functions with a default of `na.rm = TRUE`, and in
174 | the cases of quantiles, they use `type = 8`, and `names = FALSE`.
175 |
176 | # Accessing sets of features
177 |
178 | If you want to run many or all features from a package on your data you can collect them all with `feature_set`. For example:
179 |
180 | ```{r show-features-set}
181 | library(fabletools)
182 | feat_brolgar <- feature_set(pkgs = "brolgar")
183 | length(feat_brolgar)
184 | ```
185 |
186 | You could then run these like so:
187 |
188 | ```{r run-features-set}
189 | wages %>%
190 | features(ln_wages, feat_brolgar)
191 | ```
192 |
193 | For more information see `?fabletools::feature_set`
194 |
195 | # Registering a feature in a package
196 |
197 | If you create features in your own package and want to make them accessible with `feature_set`, do the following.
198 |
199 | Functions can be registered via `fabletools::register_feature()`.
200 | To register features in a package, I create a file called `zzz.R`, and use the
201 | `.onLoad(...)` function to set this up on loading the package:
202 |
203 | ```{r show-register-feature, eval = FALSE}
204 | .onLoad <- function(...) {
205 | fabletools::register_feature(feat_three_num, c("summary"))
206 | # ... and as many as you want here!
207 | }
208 | ```
209 |
210 |
--------------------------------------------------------------------------------
/vignettes/id-interesting-obs.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Identify Interesting Observations"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{Identify Interesting Observations}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | ```{r setup, include = FALSE}
11 | options(rmarkdown.html_vignette.check_title = FALSE)
12 | knitr::opts_chunk$set(
13 | collapse = TRUE,
14 | comment = "#>",
15 | warning = FALSE,
16 | message = FALSE
17 | )
18 |
19 | library(brolgar)
20 | library(ggplot2)
21 | library(dplyr)
22 | library(tidyr)
23 | ```
24 |
25 | This vignette unpacks how to find interesting individuals.
26 |
27 | Let's say we calculate a slope for each individual `key`, using `key_slope()`:
28 |
29 | ```{r wages-slope}
30 | wages_slope <- key_slope(wages, ln_wages ~ xp)
31 | wages_slope
32 | ```
33 |
34 | This is neat! But now we want to know those `keys` that are **nearest** to some set of summary statistics of the slope. By **nearest** here we mean which values have the smallest numerical difference.
35 |
36 | Let's say the _five number summary_:
37 |
38 | ```{r summary-slope}
39 | summary(wages_slope$.slope_xp)
40 | ```
41 |
42 | We want to find those individuals that have slopes **near** these values.
43 |
44 | We can do this using `keys_near()`, which returns those nearest to some summary
45 | statistics. In this case, it is the five number summary. In the next section we describe how you can provide your own named list of functions to use.
46 |
47 | ```{r use-summarise-fivenum}
48 |
49 | wages_slope_near <- wages_slope %>%
50 | keys_near(key = id,
51 | var = .slope_xp)
52 |
53 | wages_slope_near
54 |
55 | ```
56 |
57 | We can then join this information back against the data and plot those
58 | interesting individuals:
59 |
60 | ```{r plot-keys-near}
61 |
62 | wages_slope_near %>%
63 | left_join(wages, by = "id") %>%
64 | ggplot(aes(x = xp,
65 | y = ln_wages,
66 | group = id,
67 | colour = stat)) +
68 | geom_line()
69 |
70 | ```
71 |
72 | You could also, with a bit of work, show these lines against the background using gghighlight
73 |
74 | ```{r gghighlight-near}
75 | library(gghighlight)
76 | wages %>%
77 | left_join(wages_slope_near, by = "id") %>%
78 | as_tibble() %>%
79 | ggplot(aes(x = xp,
80 | y = ln_wages,
81 | group = id,
82 | colour = stat)) +
83 | geom_line() +
84 | gghighlight(!is.na(stat))
85 |
86 | ```
87 |
88 | # Specify your own summaries for `keys_near`
89 |
90 | You can specify your own list of summaries to pass to `keys_near`. For example, you could create your own summaries to give a sense of range. Note that the functions here start with `b_`, and are `b_summaries` provided by `brolgar` that have sensible defaults. You can [read about them here](https://brolgar.njtierney.com/reference/b_summaries.html), or with `?b_summaries`
91 |
92 | ```{r create-your-own}
93 | l_ranges <- list(min = b_min,
94 | range_diff = b_range_diff,
95 | max = b_max,
96 | iqr = b_iqr)
97 |
98 | wages %>%
99 | key_slope(formula = ln_wages ~ xp) %>%
100 | keys_near(key = id,
101 | var = .slope_xp,
102 | funs = l_ranges)
103 |
104 | ```
105 |
106 | # Implementation of `keys_near`
107 |
108 | If you are interested in the specifics of how `keys_near()` works, this section describes how it is implemented in `brolgar`.
109 |
110 | To get the data into the right format, there are a few steps.
111 |
112 | First, we need to get the data into a format where we have all the statistics that we are interested in, along with the id, and the statistic of interest.
113 |
114 | We can fit a linear model for each `key` in the dataset using `key_slope()`.
115 |
116 | ```{r key-slope}
117 | wages_slope <- key_slope(wages, ln_wages ~ xp)
118 |
119 | wages_slope
120 | ```
121 |
122 | We can then perform a summary of the statistic of interest, in this case the slope.
123 |
124 | ```{r mutate-all-wages}
125 | wages_slope_all_stats <- wages_slope %>%
126 | mutate_at(.vars = vars(.slope_xp),
127 | .funs = list(.slope_min = b_min,
128 | .slope_max = b_max,
129 | .slope_median = b_median,
130 | .slope_q1 = b_q25,
131 | .slope_q3 = b_q75)) %>%
132 | select(id,
133 | starts_with(".slope"))
134 |
135 | wages_slope_all_stats
136 | ```
137 |
138 | We then need to convert this into long format
139 |
140 | ```{r gather-wages}
141 | wages_slope_all_stats_long <-
142 | wages_slope_all_stats %>%
143 | gather(key = "stat",
144 | value = "stat_value",
145 | -id,
146 | -.slope_xp)
147 |
148 | wages_slope_all_stats_long
149 | ```
150 |
151 | We can then calculate the difference between each stat and the slope, `.slope_xp`:
152 | ```{r stats-diff}
153 | stats_diff <-
154 | wages_slope_all_stats_long %>%
155 | mutate(stat_diff = abs(.slope_xp - stat_value))
156 |
157 | stats_diff
158 | ```
159 |
160 | With stats diff, we can then group by the `stat`, and return those rows with the smallest difference between the statistic and the value:
161 |
162 | ```{r choose-top-diff}
163 | top_stats_diff <-
164 | stats_diff %>%
165 | group_by(stat) %>%
166 | top_n(-1,
167 | wt = stat_diff)
168 |
169 | top_stats_diff
170 | ```
171 |
172 | ```{r join-top-stats-diff}
173 | top_stats_diff %>%
174 | left_join(wages, by = "id") %>%
175 | ggplot(aes(x = xp,
176 | y = ln_wages,
177 | group = id,
178 | colour = stat)) +
179 | geom_line()
180 | ```
181 |
182 | We can see that we get the same output using `keys_near()`:
183 |
184 | ```{r show-same}
185 | wages %>%
186 | key_slope(ln_wages ~ xp) %>%
187 | keys_near(key = id,
188 | var = .slope_xp) %>%
189 | left_join(wages, by = "id") %>%
190 | ggplot(aes(x = xp,
191 | y = ln_wages,
192 | group = id,
193 | colour = stat)) +
194 | geom_line()
195 | ```
196 |
197 |
--------------------------------------------------------------------------------
/vignettes/longitudinal-data-structures.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Longitudinal Data Structures"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{Longitudinal Data Structures}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | ```{r setup, include = FALSE}
11 | options(rmarkdown.html_vignette.check_title = FALSE)
12 | knitr::opts_chunk$set(
13 | collapse = TRUE,
14 | comment = "#>",
15 | warning = FALSE,
16 | message = FALSE
17 | )
18 | ```
19 |
20 | There are many ways to describe longitudinal data - from panel data,
21 | cross-sectional data, and time series. We define longitudinal data as:
22 |
23 | > Information from the same individuals, recorded at multiple points in time.
24 |
25 | To explore and model longitudinal data, it is important to understand what
26 | variables represent the individual components, and the time components, and how
27 | these identify an individual moving through time. Identifying the individual and
28 | time components can sometimes be a challenge, so this vignette walks through how
29 | to do this.
30 |
31 |
32 | # Defining longitudinal data as a `tsibble`
33 |
34 | The tools and workflows in `brolgar` are designed to work with a special tidy
35 | time series data frame called a `tsibble`. We can define our longitudinal data
36 | in terms of a time series to gain access to some really useful tools. To do so,
37 | we need to identify three components:
38 |
39 | 1. The **key** variable in your data is the **identifier** of your individual.
40 | 2. The **index** variable is the **time** component of your data.
41 | 3. The **regularity** of the time interval (index). Longitudinal data typically
42 | has irregular time periods between measurements, but can have regular
43 | measurements.
44 |
45 | Together, time **index** and **key** uniquely identify an observation with repeated measurements
46 |
47 | The term `key` is used a lot in brolgar, so it is an important idea to
48 | internalise:
49 |
50 | > **The key is the identifier of your individuals or series**
51 |
52 | Why care about defining longitudinal data as a time series? Once we account for this time series
53 | structure inherent in longitudinal data, we gain access to a suite of nice tools
54 | that simplify and accelerate how we work with time series data.
55 |
56 | `brolgar` is
57 | built on top of the powerful [`tsibble`](https://tsibble.tidyverts.org/) package
58 | by [Earo Wang](https://earo.me/), if you would like to learn more, see the
59 | [official package documentation](https://tsibble.tidyverts.org/) or read [the
60 | paper](https://arxiv.org/abs/1901.10257).
61 |
62 |
63 | ## Converting your longitudinal data to a time series
64 |
65 | To convert longitudinal data into a "**t**ime **s**eries tibble", a
66 | [`tsibble`](https://tsibble.tidyverts.org/), we need to consider which variables
67 | identify:
68 |
69 | 1. The individual, who would have repeated measurements. This is the **key**
70 | 2. The time component, this is the **index** .
71 | 3. The **regularity** of the time interval (index).
72 |
73 | Together, time **index** and **key** uniquely identify an observation with repeated measurements
74 |
75 | The vignette now walks through some examples of converting longitudinal data into a `tsibble`.
76 |
77 | # example data: wages
78 |
79 | Let's look at the **wages**
80 | data analysed in Singer & Willett (2003). This data contains measurements on
81 | hourly wages by years in the workforce, with education and race as covariates.
82 | The population measured was male high-school dropouts, aged between 14 and 17
83 | years when first measured. Below is the first 10 rows of the data.
84 |
85 |
86 | ``` {r slice-wages}
87 | library(brolgar)
88 | suppressPackageStartupMessages(library(dplyr))
89 | slice(wages, 1:10) %>% knitr::kable()
90 | ```
91 |
92 | To create a `tsibble` of the data we ask, "which variables identify...":
93 |
94 | 1. The **key**, the individual, who would have repeated measurements.
95 | 2. The **index**, the time component.
96 | 3. The **regularity** of the time interval (index).
97 |
98 |
99 | Together, time **index** and **key** uniquely identify an observation with repeated measurements
100 |
101 | From this, we can say that:
102 |
103 | 1. The **key** is the variable `id` - the subject id, from 1-888.
104 | 2. The **index** is the variable `xp` the experience in years an individual has.
105 | 3. The data is **irregular** since the experience is measured as a fraction of a year, not at fixed intervals.
106 |
107 | We can use this information to create a `tsibble` of this data using `as_tsibble`
108 |
109 | ``` {r create-tsibble, eval = FALSE}
110 | library(tsibble)
111 | as_tsibble(x = wages,
112 | key = id,
113 | index = xp,
114 | regular = FALSE)
115 | ```
116 |
117 | ``` {r print-wages-tsibble, echo = FALSE}
118 | wages
119 | ```
120 |
121 | Note that `regular = FALSE`, since we have an `irregular` time
122 | series
123 |
124 | Note the following information printed at the top of `wages`
125 |
126 | # A tsibble: 6,402 x 9 [!]
127 | # Key: id [888]
128 | ...
129 |
130 | This says:
131 |
132 | - We have `r nrow(wages)` rows,
133 | - with `r ncol(wages)` columns.
134 |
135 | The `!` at the top means that there is no regular spacing between series
136 |
137 | The "key" variable is then listed - `id`, of which there `r n_keys(wages)`.
138 |
139 | # example: heights data
140 |
141 | The heights data is a little simpler than the wages data, and contains the
142 | average male heights in 144 countries from 1810-1989, with a smaller number of
143 | countries from 1500-1800.
144 |
145 | It contains four variables:
146 |
147 | - country
148 | - continent
149 | - year
150 | - height_cm
151 |
152 | To create a `tsibble` of the data we ask, "which variables identify...":
153 |
154 | 1. The **key**, the individual, who would have repeated measurements.
155 | 2. The **index**, the time component.
156 | 3. The **regularity** of the time interval (index).
157 |
158 |
159 | In this case:
160 |
161 | - The individual is not a person, but a country
162 | - The time is year
163 | - The year is not regular because there are not measurements at a fixed year point.
164 |
165 | This data is already a `tsibble` object; it was created as a `tsibble` with the following code:
166 |
167 | ```{r heights-tsibble}
168 | as_tsibble(x = heights,
169 | key = country,
170 | index = year,
171 | regular = FALSE)
172 | ```
173 |
174 |
175 | # example: gapminder
176 |
177 | The gapminder R package contains a subset of the data from the gapminder study. This contains data on life expectancy, GDP per capita, and population by country.
178 |
179 | ```{r show-gapminder}
180 | library(gapminder)
181 | gapminder
182 | ```
183 |
184 | Let's identify
185 |
186 | 1. The **key**, the individual, who would have repeated measurements.
187 | 2. The **index**, the time component.
188 | 3. The **regularity** of the time interval (index).
189 |
190 | This is in fact very similar to the `heights` dataset:
191 |
192 | 1. The **key** is the country
193 | 2. The **index** is the year
194 |
195 | To identify if the year is regular, we can do a bit of data exploration using `index_summary()`:
196 |
197 | ```{r gap-summarise-index}
198 | gapminder %>%
199 | group_by(country) %>%
200 | index_summary(year)
201 | ```
202 |
203 | This shows us that the year is measured every five years - so now we know that this is a regular longitudinal dataset, and it can be encoded like so:
204 |
205 | ```{r tsibble-gapminder}
206 | as_tsibble(gapminder,
207 | key = country,
208 | index = year,
209 | regular = TRUE)
210 | ```
211 |
212 | # example: PISA data
213 |
214 | The PISA study measures school students around the world on a series of math, reading, and science scores. A subset of the data looks like so:
215 |
216 | ```{r pisa-show}
217 | pisa
218 | ```
219 |
220 | Let's identify
221 |
222 | 1. The **key**, the individual, who would have repeated measurements.
223 | 2. The **index**, the time component.
224 | 3. The **regularity** of the time interval (index).
225 |
226 | Here it looks like the key is the student_id, which is nested within school_id and country,
227 |
228 | and the index is year, so we would write the following:
229 |
230 | ```r
231 | as_tsibble(pisa,
232 | key = c(country),
233 | index = year)
234 | ```
235 |
236 | We can assess the regularity of the year like so:
237 |
238 | ```{r index-check}
239 | index_regular(pisa, year)
240 | index_summary(pisa, year)
241 | ```
242 |
243 | We can now convert this into a `tsibble`:
244 |
245 | ```{r pisa-as-tsibble}
246 | pisa_ts <- as_tsibble(pisa,
247 | key = country,
248 | index = year,
249 | regular = TRUE)
250 |
251 | pisa_ts
252 | ```
253 |
254 | # Conclusion
255 |
256 | This idea of longitudinal data is core to brolgar. Understanding what longitudinal data is, and how this can be linked to a time series representation of data helps us understand our data structure, and gives us access to more flexible tools. Other vignettes in the
257 | package will further show why the time series `tsibble` is useful.
258 |
--------------------------------------------------------------------------------
/vignettes/mixed-effects-models.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Using brolgar to understand Mixed Effects Models"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{Using brolgar to understand Mixed Effects Models}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | ```{r knitr-setup, include = FALSE}
11 | knitr::opts_chunk$set(
12 | collapse = TRUE,
13 | comment = "#>",
14 | warning = FALSE,
15 | message = FALSE
16 | )
17 | ```
18 |
19 | ```{r setup}
20 | library(brolgar)
21 | library(lme4)
22 | library(modelr)
23 | library(ggplot2)
24 |
25 | ```
26 |
27 | Just as it is important to explore your data before modelling, it is important to explore your data after you fit a model, and during the modelling process.
28 |
29 | Let's take our wages data
30 |
31 | ```{r print-wages}
32 | wages
33 | ```
34 |
35 | We might explore this by looking at experience against wages, for each individual:
36 |
37 | ```{r wages-ggplot}
38 |
39 | gg_wages_all <-
40 | ggplot(wages,
41 | aes(x = xp,
42 | y = ln_wages,
43 | group = id)) +
44 | geom_line(alpha = 0.25)
45 |
46 | gg_wages_all
47 | ```
48 | But - Ugh. Spaghetti plot.
49 |
50 | Let's look at a random sample of people using `facet_sample()`
51 |
52 | ```{r wages-ggplot-sample}
53 | gg_wages_all +
54 | facet_sample()
55 | ```
56 |
57 | Now let's look at all of the data, arranging by `unemploy_rate`:
58 |
59 |
60 | ```{r}
61 | gg_wages_all + facet_strata()
62 | gg_wages_all +
63 | facet_strata(along = unemploy_rate)
64 |
65 | gg_wages_all +
66 | facet_strata(along = xp_since_ged)
67 |
68 | gg_wages_all + facet_wrap(~high_grade)
69 | ```
70 |
71 | So let's fit a model where we look at the impact of xp, ged, and unemployment rate, fitting a random intercept and slope of xp for each individual.
72 |
73 | ```{r fit-int}
74 | library(lme4)
75 | wages_fit_int <- lmer(ln_wages ~ xp + ged + unemploy_rate + (xp |id),
76 | data = wages)
77 | ```
78 |
79 | We can use the tools from `modelr` to add predictions and residuals to the data
80 |
81 | ```{r wages-aug}
82 | library(modelr)
83 | wages_aug <- wages %>%
84 | add_predictions(wages_fit_int, var = "pred_int") %>%
85 | add_residuals(wages_fit_int, var = "res_int")
86 | ```
87 |
88 | Now let's look at the predictions over xp
89 |
90 | ```{r gg-wages-aug}
91 | ggplot(wages_aug,
92 | aes(x = xp,
93 | y = pred_int,
94 | group = id)) +
95 | geom_line(alpha = 0.4)
96 |
97 | ```
98 |
99 | Ugh. Straight spaghetti. Let's sample that.
100 |
101 | ```{r gg-wages-sample}
102 | ggplot(wages_aug,
103 | aes(x = xp,
104 | y = pred_int,
105 | group = id)) +
106 | geom_line() +
107 | facet_sample()
108 | ```
109 |
110 | And let's explore these according to residuals.
111 |
112 | ```{r}
113 | ggplot(wages_aug,
114 | aes(x = xp,
115 | y = pred_int,
116 | group = id)) +
117 | geom_line() +
118 | facet_strata(along = res_int)
119 | ```
120 |
121 |
122 | Now let's add in the data to the predictions.
123 |
124 | ```{r gg-wages-predictions}
125 | wages_aug %>%
126 | sample_n_keys(size = 9) %>%
127 | ggplot(aes(x = xp,
128 | y = pred_int,
129 | group = id,
130 | colour = factor(id))) +
131 | geom_line() +
132 | geom_point(aes(x = xp,
133 | y = ln_wages,
134 | colour = factor(id))) +
135 | facet_wrap(~id) +
136 | theme(legend.position = "none")
137 | ```
138 |
139 | What if we grabbed a sample of those who have the best, middle, and worst residuals? Those who are closest to these values:
140 |
141 | ```{r summary-residuals}
142 | summary(wages_aug$res_int)
143 | ```
144 |
145 | We can use `keys_near()` to return those specified keys that are close to these values. Because this is a `tsibble` object, we don't need to specify the `key` variable here.
146 |
147 | ```{r wages-keys-near}
148 | wages_aug_near <- wages_aug %>%
149 | keys_near(var = res_int)
150 |
151 | wages_aug_near
152 | ```
153 |
154 | This shows us the keys where the residuals are closest to the five number summary.
155 |
156 | We can plot this data by joining it back to the wages data with predictions, to see what the spread of predictions is like.
157 |
158 | ```{r join-wages-near}
159 | library(dplyr)
160 |
161 | wages_aug_near_full <- left_join(wages_aug_near,
162 | wages_aug,
163 | by = "id")
164 |
165 | gg_wages_near <-
166 | ggplot(wages_aug_near_full,
167 | aes(x = xp,
168 | y = pred_int,
169 | group = id,
170 | colour = stat)) +
171 | geom_line() +
172 | geom_point(aes(y = ln_wages))
173 |
174 | gg_wages_near
175 |
176 | gg_wages_near +
177 | facet_wrap(~stat) +
178 | theme(legend.position = "none")
179 | ```
180 |
181 | We can also use `stratify_keys()` with the `along` argument to group by the worst fits:
182 |
183 | ```{r gg-stratify-residuals}
184 | wages_aug %>%
185 | stratify_keys(n_strata = 12,
186 | along = res_int) %>%
187 | sample_n_keys(size = 9) %>%
188 | ggplot(aes(x = xp,
189 | y = pred_int,
190 | group = id,
191 | colour = factor(id))) +
192 | geom_line() +
193 | geom_point(aes(x = xp,
194 | y = ln_wages,
195 | colour = factor(id))) +
196 | facet_wrap(~.strata) +
197 | theme(legend.position = "none")
198 |
199 | ```
200 |
--------------------------------------------------------------------------------