├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   ├── R-CMD-check.yaml
    │   ├── pkgdown.yaml
    │   └── test-coverage.yaml
├── .gitignore
├── .hooks
    └── description
├── .pre-commit-config.yaml
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── RcppExports.R
    ├── edges.R
    ├── plot-fns.R
    ├── scl-full.R
    ├── scl-redcap.R
    ├── spatialcluster-package.R
    ├── statistics.R
    ├── tree.R
    ├── utils.R
    └── zzz.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── codemeta.json
├── makefile
├── man
    ├── figures
    │   ├── README-cautionary-1.png
    │   └── README-full-single-1.png
    ├── plot.scl.Rd
    ├── plot_merges.Rd
    ├── scl_full.Rd
    ├── scl_recluster.Rd
    ├── scl_redcap.Rd
    └── spatialcluster.Rd
├── src
    ├── RcppExports.cpp
    ├── alk.cpp
    ├── alk.h
    ├── bst.h
    ├── clk.cpp
    ├── clk.h
    ├── common.h
    ├── cuttree.cpp
    ├── cuttree.h
    ├── full-init.cpp
    ├── full-init.h
    ├── full-merge.cpp
    ├── full-merge.h
    ├── mst.cpp
    ├── mst.h
    ├── slk.cpp
    ├── slk.h
    ├── spatialcluster_init.c
    ├── utils.cpp
    └── utils.h
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-full.R
    │   ├── test-plots.R
    │   └── test-redcap.R
└── vignettes
    ├── makefile
    ├── spatialcluster.Rmd
    └── spatialcluster.bib


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^.*\.pdf$
 3 | ^README-.*\.png$
 4 | ^README\.Rmd$
 5 | ^README\.html$
 6 | ^\.Rproj\.user$
 7 | ^\.git$
 8 | ^\.github$
 9 | ^\.gitignore$
10 | ^\.hooks$
11 | ^\.pre-commit-config\.yaml$
12 | ^_pkgdown.yml$
13 | ^aaa*
14 | ^codemeta\.json$
15 | ^inst/WORDLIST$
16 | ^makefile$
17 | ^vignettes/makefile$
18 | docs/
19 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 | 
 8 | name: R-CMD-check.yaml
 9 | 
10 | permissions: read-all
11 | 
12 | jobs:
13 |   R-CMD-check:
14 |     runs-on: ${{ matrix.config.os }}
15 | 
16 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
17 | 
18 |     strategy:
19 |       fail-fast: false
20 |       matrix:
21 |         config:
22 |           - {os: macos-latest,   r: 'release'}
23 |           - {os: windows-latest, r: 'release'}
24 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
25 |           - {os: ubuntu-latest,   r: 'release'}
26 |           - {os: ubuntu-latest,   r: 'oldrel-1'}
27 | 
28 |     env:
29 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
30 |       R_KEEP_PKG_SOURCE: yes
31 | 
32 |     steps:
33 |       - uses: actions/checkout@v4
34 | 
35 |       - uses: r-lib/actions/setup-pandoc@v2
36 | 
37 |       - uses: r-lib/actions/setup-r@v2
38 |         with:
39 |           r-version: ${{ matrix.config.r }}
40 |           http-user-agent: ${{ matrix.config.http-user-agent }}
41 |           use-public-rspm: true
42 | 
43 |       - uses: r-lib/actions/setup-r-dependencies@v2
44 |         with:
45 |           extra-packages: any::rcmdcheck
46 |           needs: check
47 | 
48 |       - uses: r-lib/actions/check-r-package@v2
49 |         with:
50 |           upload-snapshots: true
51 |           build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
52 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |   release:
 8 |     types: [published]
 9 |   workflow_dispatch:
10 | 
11 | name: pkgdown.yaml
12 | 
13 | permissions: read-all
14 | 
15 | jobs:
16 |   pkgdown:
17 |     runs-on: ubuntu-latest
18 |     # Only restrict concurrency for non-PR jobs
19 |     concurrency:
20 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
21 |     env:
22 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
23 |     permissions:
24 |       contents: write
25 |     steps:
26 |       - uses: actions/checkout@v4
27 | 
28 |       - uses: r-lib/actions/setup-pandoc@v2
29 | 
30 |       - uses: r-lib/actions/setup-r@v2
31 |         with:
32 |           use-public-rspm: true
33 | 
34 |       - uses: r-lib/actions/setup-r-dependencies@v2
35 |         with:
36 |           extra-packages: any::pkgdown, local::.
37 |           needs: website
38 | 
39 |       - name: Build site
40 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
41 |         shell: Rscript {0}
42 | 
43 |       - name: Deploy to GitHub pages 🚀
44 |         if: github.event_name != 'pull_request'
45 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
46 |         with:
47 |           clean: false
48 |           branch: gh-pages
49 |           folder: docs
50 | 


--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 | 
 8 | name: test-coverage.yaml
 9 | 
10 | permissions: read-all
11 | 
12 | jobs:
13 |   test-coverage:
14 |     runs-on: ubuntu-latest
15 |     env:
16 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
17 | 
18 |     steps:
19 |       - uses: actions/checkout@v4
20 | 
21 |       - uses: r-lib/actions/setup-r@v2
22 |         with:
23 |           use-public-rspm: true
24 | 
25 |       - uses: r-lib/actions/setup-r-dependencies@v2
26 |         with:
27 |           extra-packages: any::covr, any::xml2
28 |           needs: coverage
29 | 
30 |       - name: Test coverage
31 |         run: |
32 |           cov <- covr::package_coverage(
33 |             quiet = FALSE,
34 |             clean = FALSE,
35 |             install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
36 |           )
37 |           covr::to_cobertura(cov)
38 |         shell: Rscript {0}
39 | 
40 |       - uses: codecov/codecov-action@v4
41 |         with:
42 |           # Fail if error if not on PR, or if on PR and token is given
43 |           fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
44 |           file: ./cobertura.xml
45 |           plugin: noop
46 |           disable_search: true
47 |           token: ${{ secrets.CODECOV_TOKEN }}
48 | 
49 |       - name: Show testthat output
50 |         if: always()
51 |         run: |
52 |           ## --------------------------------------------------------------------
53 |           find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
54 |         shell: bash
55 | 
56 |       - name: Upload test results
57 |         if: failure()
58 |         uses: actions/upload-artifact@v4
59 |         with:
60 |           name: coverage-test-failures
61 |           path: ${{ runner.temp }}/package
62 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | inst/doc
 5 | aaa\.Rmd*
 6 | # History files
 7 | .Rhistory
 8 | .Rapp.history
 9 | # Session Data files
10 | .RData
11 | # Output files from R CMD build
12 | /*.tar.gz
13 | # vim files
14 | .*.un~
15 | .*.swp
16 | # compiled object files
17 | *.o
18 | *.so
19 | 


--------------------------------------------------------------------------------
/.hooks/description:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # only stop on main branch
 4 | on_main <- identical (gert::git_branch (), "main")
 5 | 
 6 | s <- gert::git_status()
 7 | chk <- ("DESCRIPTION" %in% s$file &&
 8 |     (s$status [s$file == "DESCRIPTION"] == "modified" |
 9 |      s$status [s$file == "DESCRIPTION"]  == "new"))
10 | if (!chk & on_main)
11 |     stop ("DESCRIPTION has not been updated")
12 | 
13 | f <- file.path (rprojroot::find_root("DESCRIPTION"), "DESCRIPTION")
14 | x <- system2 ("git", args = c ("diff", "--cached", "-U0", f), stdout = TRUE)
15 | if (!any (grepl ("^\\+Version", x)) & on_main)
16 |     stop ("Version number in DESCRIPTION has not been incremented")
17 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # All available hooks: https://pre-commit.com/hooks.html
 2 | # R specific hooks: https://github.com/lorenzwalthert/precommit
 3 | repos:
 4 | -   repo: https://github.com/lorenzwalthert/precommit
 5 |     rev: v0.4.3.9003
 6 |     hooks:
 7 |     -   id: style-files
 8 |         args: [--style_pkg=spaceout, --style_fun=spaceout_style]    
 9 |         additional_dependencies:
10 |         - ropensci-review-tools/spaceout
11 |       # -   id: roxygenize
12 |     # codemeta must be above use-tidy-description when both are used
13 |     # -   id: codemeta-description-updated
14 |     -   id: use-tidy-description
15 |     -   id: spell-check
16 |         exclude: >
17 |           (?x)^(
18 |           .*\.[rR]|
19 |           .*\.feather|
20 |           .*\.jpeg|
21 |           .*\.pdf|
22 |           .*\.png|
23 |           .*\.py|
24 |           .*\.RData|
25 |           .*\.rds|
26 |           .*\.Rds|
27 |           .*\.Rproj|
28 |           .*\.sh|
29 |           (.*/|)\.gitignore|
30 |           (.*/|)\.gitlab-ci\.yml|
31 |           (.*/|)\.lintr|
32 |           (.*/|)\.pre-commit-.*|
33 |           (.*/|)\.Rbuildignore|
34 |           (.*/|)\.Renviron|
35 |           (.*/|)\.Rprofile|
36 |           (.*/|)\.travis\.yml|
37 |           (.*/|)appveyor\.yml|
38 |           (.*/|)NAMESPACE|
39 |           (.*/|)renv/settings\.dcf|
40 |           (.*/|)renv\.lock|
41 |           (.*/|)WORDLIST|
42 |           \.github/workflows/.*|
43 |           data/.*|
44 |           )$
45 |       # -   id: lintr
46 |     -   id: readme-rmd-rendered
47 |     -   id: parsable-R
48 |     -   id: no-browser-statement
49 |     -   id: no-print-statement
50 |     -   id: no-debug-statement
51 |     -   id: deps-in-desc
52 |       # -   id: pkgdown
53 | -   repo: https://github.com/pre-commit/pre-commit-hooks
54 |     rev: v5.0.0
55 |     hooks: 
56 |     -   id: check-added-large-files
57 |         args: ['--maxkb=200']
58 |     -   id: file-contents-sorter
59 |         files: '^\.Rbuildignore$'
60 |     -   id: end-of-file-fixer
61 |         exclude: '\.Rd'
62 | -   repo: https://github.com/pre-commit-ci/pre-commit-ci-config
63 |     rev: v1.6.1
64 |     hooks:
65 |     # Only required when https://pre-commit.ci is used for config validation
66 |     -   id: check-pre-commit-ci-config
67 | -   repo: local
68 |     hooks:
69 |     -   id: forbid-to-commit
70 |         name: Don't commit common R artifacts
71 |         entry: Cannot commit .Rhistory, .RData, .Rds or .rds.
72 |         language: fail
73 |         files: '\.(Rhistory|RData|Rds|rds)$'
74 |         # `exclude: <regex>` to allow committing specific files
75 |     -   id: description version
76 |         name: Version has been incremeneted in DESCRIPTION
77 |         entry: .hooks/description
78 |         language: script
79 | 
80 | ci:
81 |     autoupdate_schedule: monthly
82 |   # skip: [pkgdown]
83 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: spatialcluster
 2 | Title: R port of redcap
 3 | Version: 0.2.0.017
 4 | Authors@R: 
 5 |     person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"))
 6 | Description: R port of redcap (Regionalization with dynamically
 7 |     constrained agglomerative clustering and partitioning).
 8 | License: GPL-3
 9 | URL: https://github.com/mpadge/spatialcluster
10 | BugReports: https://github.com/mpadge/spatialcluster/issues
11 | Depends: 
12 |     R (>= 4.1.0)
13 | Imports:
14 |     alphahull,
15 |     dplyr,
16 |     ggplot2,
17 |     ggthemes,
18 |     methods,
19 |     Rcpp (>= 0.12.6),
20 |     tibble,
21 |     tripack
22 | Suggests:
23 |     dbscan,
24 |     knitr,
25 |     rmarkdown,
26 |     roxygen2,
27 |     testthat
28 | LinkingTo:
29 |     Rcpp,
30 |     RcppArmadillo
31 | VignetteBuilder: 
32 |     knitr
33 | Encoding: UTF-8
34 | LazyData: true
35 | NeedsCompilation: yes
36 | RoxygenNote: 7.3.2
37 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(plot,scl)
 4 | export(plot_merges)
 5 | export(scl_full)
 6 | export(scl_recluster)
 7 | export(scl_redcap)
 8 | importFrom(Rcpp,evalCpp)
 9 | useDynLib(spatialcluster, .registration = TRUE)
10 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #' rcpp_alk
 5 | #'
 6 | #' Full-order average linkage cluster redcap algorithm
 7 | #'
 8 | #' @noRd
 9 | rcpp_alk <- function(gr, shortest, quiet) {
10 |     .Call(`_spatialcluster_rcpp_alk`, gr, shortest, quiet)
11 | }
12 | 
13 | #' clk_step
14 | #'
15 | #' @param ei The i'th edge of the full sorted list of edge weights
16 | #' @noRd
17 | NULL
18 | 
19 | #' rcpp_clk
20 | #'
21 | #' Full-order complete linkage cluster redcap algorithm
22 | #'
23 | #' @noRd
24 | rcpp_clk <- function(gr_full, gr, shortest, quiet) {
25 |     .Call(`_spatialcluster_rcpp_clk`, gr_full, gr, shortest, quiet)
26 | }
27 | 
28 | #' rcpp_cut_tree
29 | #'
30 | #' Cut tree into specified number of clusters by minimising internal cluster
31 | #' variance.
32 | #'
33 | #' @param tree tree to be processed
34 | #'
35 | #' @return Vector of cluster IDs for each tree edge
36 | #' @noRd
37 | rcpp_cut_tree <- function(tree, ncl, shortest, quiet) {
38 |     .Call(`_spatialcluster_rcpp_cut_tree`, tree, ncl, shortest, quiet)
39 | }
40 | 
41 | #' step
42 | #'
43 | #' All edges are initially in their own clusters. This merges edge#i with the
44 | #' next closest edge
45 | #'
46 | #' @param ei The i'th edge of the sorted list of NN edge weights
47 | #' @noRd
48 | NULL
49 | 
50 | #' fill_cl_edges
51 | #'
52 | #' Fill (arma) matrix of strongest/shortest connections between all clusters
53 | #' used to construct the hierarchical relationships
54 | #' @noRd
55 | NULL
56 | 
57 | #' rcpp_full_initial
58 | #'
59 | #' Initial allocation for full clustering
60 | #'
61 | #' @noRd
62 | rcpp_full_initial <- function(gr, shortest) {
63 |     .Call(`_spatialcluster_rcpp_full_initial`, gr, shortest)
64 | }
65 | 
66 | #' rcpp_full_merge
67 | #'
68 | #' Merge clusters generated by rcpp_full_initial to full hierarchy of all
69 | #' possible merges.
70 | #'
71 | #' @noRd
72 | rcpp_full_merge <- function(gr, linkage, shortest) {
73 |     .Call(`_spatialcluster_rcpp_full_merge`, gr, linkage, shortest)
74 | }
75 | 
76 | #' rcpp_mst
77 | #'
78 | #' Minimum spanning tree
79 | #'
80 | #' @noRd
81 | rcpp_mst <- function(input) {
82 |     .Call(`_spatialcluster_rcpp_mst`, input)
83 | }
84 | 
85 | #' rcpp_slk
86 | #'
87 | #' Full-order single linkage cluster redcap algorithm
88 | #'
89 | #' @noRd
90 | rcpp_slk <- function(gr_full, gr, shortest, quiet) {
91 |     .Call(`_spatialcluster_rcpp_slk`, gr_full, gr, shortest, quiet)
92 | }
93 | 
94 | 


--------------------------------------------------------------------------------
/R/edges.R:
--------------------------------------------------------------------------------
  1 | #' scl_edges_tri
  2 | #'
  3 | #' Generate triangulated nearest-neighbour edges between a set of input points
  4 | #'
  5 | #' @inheritParams scl_redcap
  6 | #' @examples
  7 | #' set.seed (1)
  8 | #' n <- 100
  9 | #' xy <- matrix (runif (2 * n), ncol = 2)
 10 | #' edges <- scl_edges_tri (xy)
 11 | #' @noRd
 12 | scl_edges_tri <- function (xy, shortest = TRUE) {
 13 | 
 14 |     if (!inherits (xy, "data.frame")) {
 15 |         xy <- as.data.frame (xy)
 16 |     }
 17 |     if (ncol (xy) > 2) {
 18 |         xy <- dplyr::select (xy, c (x, y))
 19 |     }
 20 |     names (xy) <- c ("x", "y")
 21 |     nbs <- tripack::tri.mesh (xy) |>
 22 |         tripack::neighbours ()
 23 | 
 24 |     n <- length (nbs)
 25 |     edges <- lapply (
 26 |         seq (n),
 27 |         function (i) cbind (i, nbs [[i]])
 28 |     )
 29 |     edges <- do.call (rbind, edges)
 30 | 
 31 |     edges <- tibble::tibble (
 32 |         from = edges [, 1],
 33 |         to = edges [, 2]
 34 |     )
 35 | 
 36 |     dxy <- as.matrix (stats::dist (xy))
 37 | 
 38 |     append_dist_to_edges (edges, dxy, shortest)
 39 | }
 40 | 
 41 | #' scl_edges_nn
 42 | #'
 43 | #' Generate distance-based nearest-neighbour edges between a set of input
 44 | #' points, ensuring that all edges connect to a single component. The minimal
 45 | #' spanning tree is constructed from **spatial** distances, not from distances
 46 | #' given in `dmat`.
 47 | #' @param nnbs Number of nearest neighbours
 48 | #' @inheritParams scl_redcap
 49 | #'
 50 | #' @return A `tibble` of `from` and `to` vertex indices for the minimal spanning
 51 | #' tree edges, along with corresponding spatial distances calculated from 'xy'.
 52 | #'
 53 | #' @examples
 54 | #' set.seed (1)
 55 | #' n <- 100
 56 | #' xy <- matrix (runif (2 * n), ncol = 2)
 57 | #' scl_edges_nn (xy, nnbs = 3L)
 58 | #' @noRd
 59 | scl_edges_nn <- function (xy, nnbs, shortest = TRUE) {
 60 | 
 61 |     # Initially contruct with nnbs + 1, because the `d` matrix includes
 62 |     # self-distances of zero, which are subsequently removed.
 63 |     nnbs <- nnbs + 1
 64 |     d <- apply (
 65 |         as.matrix (stats::dist (xy)),
 66 |         2,
 67 |         function (i) order (i, decreasing = !shortest) [seq_len (nnbs)]
 68 |     )
 69 | 
 70 |     edges <- tibble::tibble (
 71 |         from = rep (as.integer (colnames (d)),
 72 |             each = nnbs
 73 |         ),
 74 |         to = as.vector (d)
 75 |     )
 76 |     # rm self-edges:
 77 |     edges <- edges [which (edges$from != edges$to), ]
 78 | 
 79 |     # then ensure that the minimal spanning tree is included, to ensure all
 80 |     # nearest neighbour edges are connected in a single component. The distances
 81 |     # used for this MST are spatial distances, not from `dmat`.
 82 |     dxy <- as.matrix (stats::dist (xy))
 83 | 
 84 |     n <- nrow (xy)
 85 |     edges_all <- tibble::tibble (
 86 |         from = rep (seq_len (n), n),
 87 |         to = rep (seq_len (n), each = n),
 88 |         d = as.vector (dxy)
 89 |     ) |>
 90 |         dplyr::arrange (d, from, to) |>
 91 |         dplyr::filter (from != to)
 92 | 
 93 |     mst <- scl_spantree_ord1 (edges_all)
 94 |     # duplicate all of those:
 95 |     mst <- rbind (
 96 |         mst [, c ("from", "to")],
 97 |         tibble::tibble (from = mst$to, to = mst$from)
 98 |     )
 99 | 
100 |     edges <- rbind (edges, mst)
101 |     edges <- edges [which (!duplicated (edges)), ]
102 | 
103 |     # Then append final distances from `dxy` to the return value:
104 |     edges <- append_dist_to_edges (edges, dxy, shortest)
105 | 
106 |     return (edges)
107 | }
108 | 
109 | append_dist_to_edges <- function (edges, dmat, shortest) {
110 |     index <- (edges$to - 1) * nrow (dmat) + edges$from
111 |     edges$d <- dmat [index]
112 | 
113 |     if (shortest) {
114 |         edges <- dplyr::arrange (edges, d)
115 |     } else {
116 |         edges <- dplyr::arrange (edges, dplyr::desc (d))
117 |     }
118 | 
119 |     return (edges)
120 | }
121 | 
122 | #' scl_edges_all
123 | #'
124 | #' Generate full set of edges between a set of input points
125 | #'
126 | #' @param dmat Either a spatial distance matrix generated from 'xy', or a
127 | #' separate distance matrix passed as the 'dmat' parameter to the main
128 | #' \link{scl_redcap} function.
129 | #' @inheritParams scl_redcap
130 | #' @noRd
131 | scl_edges_all <- function (xy, dmat, shortest = TRUE) {
132 | 
133 |     n <- nrow (dmat)
134 |     edges <- tibble::tibble (
135 |         from = rep (seq (n), times = n),
136 |         to = rep (seq (n), each = n),
137 |         d = as.vector (dmat)
138 |     )
139 |     edges <- na.omit (edges)
140 | 
141 |     if (shortest) {
142 |         edges <- dplyr::arrange (edges, d)
143 |     } else {
144 |         edges <- dplyr::arrange (edges, dplyr::desc (d))
145 |     }
146 | 
147 |     return (edges)
148 | }
149 | 


--------------------------------------------------------------------------------
/R/plot-fns.R:
--------------------------------------------------------------------------------
  1 | #' scl_ahulls
  2 | #'
  3 | #' Calculate alpha hulls around clusters via the \pkg{alphahull} package
  4 | #'
  5 | #' @param tree Spanning tree obtained from \link{scl_redcap}
  6 | #' @param xy Matrix of spatial coordinates of points indexed by \code{tree}.
  7 | #' @param alpha Parameter used to create alpha hulls
  8 | #' @return tibble of (id, x, y), where the coordinates trace the convex hulls
  9 | #' for each cluster id
 10 | #' @noRd
 11 | scl_ahulls <- function (nodes, alpha = 0.1) {
 12 | 
 13 |     clnums <- unique (nodes$cluster [!is.na (nodes$cluster)])
 14 |     bdry <- list ()
 15 |     for (i in clnums) {
 16 |         if (length (which (nodes$cluster == i)) > 2) {
 17 |             xyi <- nodes |>
 18 |                 dplyr::filter (cluster == i) |>
 19 |                 dplyr::select (x, y)
 20 | 
 21 |             a <- alphahull::ashape (xyi, alpha = alpha)$edges |>
 22 |                 data.frame ()
 23 | 
 24 |             xy <- rbind (
 25 |                 data.frame (ind = a$ind1, x = a$x1, y = a$y1),
 26 |                 data.frame (ind = a$ind2, x = a$x2, y = a$y2)
 27 |             ) |>
 28 |                 unique () |>
 29 |                 dplyr::arrange (ind)
 30 |             inds <- data.frame (ind1 = a$ind1, ind2 = a$ind2)
 31 |             # Then just have to wrap those around xy:
 32 |             # TODO: Find a better way to do this!
 33 |             ind_seq <- as.numeric (inds [1, ])
 34 |             inds <- inds [-1, ]
 35 |             while (nrow (inds) > 0) {
 36 |                 j <- which (inds$ind1 == utils::tail (ind_seq, n = 1))
 37 |                 if (length (j) > 0) {
 38 |                     ind_seq <- c (ind_seq, inds [j, 2])
 39 |                 } else {
 40 |                     j <- which (inds$ind2 == utils::tail (ind_seq, n = 1))
 41 |                     ind_seq <- c (ind_seq, inds [j, 1])
 42 |                 }
 43 |                 inds <- inds [-j, , drop = FALSE] # nolint
 44 |             }
 45 |             xy <- xy [match (ind_seq, xy$ind), ]
 46 |             bdry [[length (bdry) + 1]] <- cbind (i, xy$x, xy$y)
 47 |         }
 48 |     }
 49 |     bdry <- data.frame (do.call (rbind, bdry))
 50 |     names (bdry) <- c ("id", "x", "y")
 51 |     return (bdry)
 52 | }
 53 | 
 54 | #' plot.scl
 55 | #' @method plot scl
 56 | #' @param x object to be plotted
 57 | #' @param hull_alpha alpha value of (non-)convex hulls, with default generating
 58 | #' a convex hull, and smaller values generating concave hulls. (See
 59 | #' ?alphashape::ashape for details).
 60 | #' @param ... ignored here
 61 | #' @family plot_fns
 62 | #' @export
 63 | #' @examples
 64 | #' set.seed (1)
 65 | #' n <- 100
 66 | #' xy <- matrix (runif (2 * n), ncol = 2)
 67 | #' dmat <- matrix (runif (n^2), ncol = n)
 68 | #' scl <- scl_redcap (xy, dmat, ncl = 4)
 69 | #' plot (scl)
 70 | #' # Connect clusters according to highest (\code{shortest = FALSE}) values of
 71 | #' # \code{dmat}:
 72 | #' scl <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE, full_order = FALSE)
 73 | #' plot (scl)
 74 | plot.scl <- function (x, ..., hull_alpha = 1) {
 75 | 
 76 |     # Clusters are defined has having > 2 edges, so any with < 3 edges need to
 77 |     # be removed here:
 78 |     etab <- table (x$tree$cluster)
 79 |     clusters_to_rm <- as.integer (names (etab) [which (etab < 3)])
 80 |     if (length (clusters_to_rm) > 0L) {
 81 |         x$nodes$cluster [x$nodes$cluster %in% clusters_to_rm] <- NA_integer_
 82 |     }
 83 | 
 84 |     # Reset cluster numbers to sequence starting at 1:
 85 |     x$nodes$cluster <- match (x$nodes$cluster, sort (unique (x$nodes$cluster)))
 86 | 
 87 |     hull_alpha <- check_hull_alpha (hull_alpha)
 88 | 
 89 |     hulls <- scl_ahulls (x$nodes, alpha = hull_alpha)
 90 | 
 91 |     nc <- length (unique (x$nodes$cluster [!is.na (x$nodes$cluster)]))
 92 | 
 93 |     # clnum in cl_cols is + 1 because xy below increases cluster numbers by 1 to
 94 |     # allocate cl_num == 1 to unassigned points
 95 |     cl_cols <- grDevices::rainbow (nc) |>
 96 |         tibble::as_tibble () |>
 97 |         dplyr::mutate (cluster = seq (nc) + 1) |>
 98 |         dplyr::rename (col = value)
 99 | 
100 |     xy <- x$nodes |>
101 |         dplyr::mutate (cluster = ifelse (is.na (cluster), 1, cluster + 1)) |>
102 |         dplyr::left_join (cl_cols, by = "cluster") |>
103 |         dplyr::mutate (col = ifelse (is.na (col), "#333333FF", col))
104 | 
105 |     y <- id <- NULL # suppress no visible binding warnings
106 |     hull_aes <- ggplot2::aes (x = x, y = y, group = id)
107 |     hull_width <- 0.5
108 |     g <- ggplot2::ggplot (xy, ggplot2::aes (x = x, y = y)) +
109 |         ggplot2::geom_point (
110 |             size = 5, color = xy$col,
111 |             show.legend = FALSE
112 |         ) +
113 |         ggplot2::geom_polygon (
114 |             data = hulls,
115 |             mapping = hull_aes,
116 |             colour = cl_cols$col [hulls$id],
117 |             fill = cl_cols$col [hulls$id],
118 |             alpha = 0.1,
119 |             linewidth = hull_width
120 |         ) +
121 |         ggthemes::theme_solarized ()
122 | 
123 |     g
124 | }
125 | 
126 | check_hull_alpha <- function (a) {
127 | 
128 |     if (length (a) > 1) {
129 |         stop ("hull_alpha must be a single value")
130 |     }
131 |     if (!is.numeric (a)) {
132 |         stop ("hull_alpha must be numeric")
133 |     }
134 | 
135 |     if (a <= 0 || a > 1) {
136 |         stop ("hull_alpha must be between 0 and 1")
137 |     }
138 | 
139 |     return (a)
140 | }
141 | 
142 | #' plot_merges
143 | #'
144 | #' Plot dendrogram of merges for \code{scl} object with \code{method = "full"}.
145 | #' @param x Object of class \code{scl} obtained with \code{method = "full"}.
146 | #' @param root_tree If \code{TRUE}, tree leaves are connected to bottom of plot,
147 | #' otherwise floating as determined by \link{plot.hclust}.
148 | #' @return Nothing (generates plot)
149 | #' @family plot_fns
150 | #' @export
151 | plot_merges <- function (x, root_tree = FALSE) {
152 |     if (!(methods::is (x, "scl") && x$pars$method == "full")) {
153 |         stop (
154 |             "plot_merges can only be applied to scl objects ",
155 |             "generated with method = full"
156 |         )
157 |     }
158 | 
159 |     hc <- structure (class = "hclust", .Data = list ())
160 |     merges <- convert_merges_to_hclust (x)
161 |     hc$merge <- merges [, 1:2]
162 |     hc$height <- merges [, 3]
163 |     hc$order <- x$ord + 1 # it's 0-indexed
164 |     hc$labels <- x$ord
165 |     if (root_tree) {
166 |         plot (stats::as.dendrogram (hc))
167 |     } else {
168 |         plot (hc)
169 |     }
170 | }
171 | 
172 | convert_merges_to_hclust <- function (x) {
173 |     mt <- as.matrix (x$merges [, c ("from", "to")]) + 1
174 |     dists <- as.vector (x$merges$dist)
175 |     indx <- sort (unique (as.vector (mt)))
176 |     mt <- apply (mt, 2, function (i) match (i, indx))
177 |     merged <- d <- NULL
178 |     map <- rep (NA, max (mt))
179 |     for (i in seq_len (nrow (mt))) {
180 |         m1 <- mt [i, 1]
181 |         m2 <- mt [i, 2]
182 |         if (!m1 %in% merged) {
183 |             merged <- c (merged, m1)
184 |             mt [i, 1] <- -m1
185 |         } else {
186 |             mt [i, 1] <- map [m1]
187 |             dists [i] <- dists [i] + dists [map [m1]]
188 |         }
189 |         map [m1] <- i
190 | 
191 |         if (!m2 %in% merged) {
192 |             merged <- c (merged, m2)
193 |             mt [i, 2] <- -m2
194 |         } else {
195 |             mt [i, 2] <- map [m2]
196 |             dists [i] <- dists [i] + dists [map [m2]]
197 |         }
198 |         map [m2] <- i
199 |     }
200 |     cbind (mt, dists)
201 | }
202 | 


--------------------------------------------------------------------------------
/R/scl-full.R:
--------------------------------------------------------------------------------
  1 | #' scl_full
  2 | #'
  3 | #' Full spatially-constrained clustering.
  4 | #'
  5 | #' @param linkage Either \code{"single"} or \code{"average"}. For covariance
  6 | #' clustering, use \code{"single"} with `shortest = FALSE`.
  7 | #' @inheritParams scl_redcap
  8 | #'
  9 | #' @family clustering_fns
 10 | #' @export
 11 | #' @examples
 12 | #' n <- 100
 13 | #' xy <- matrix (runif (2 * n), ncol = 2)
 14 | #' dmat <- matrix (runif (n^2), ncol = n)
 15 | #' scl <- scl_full (xy, dmat, ncl = 4)
 16 | scl_full <- function (xy,
 17 |                       dmat,
 18 |                       ncl,
 19 |                       linkage = "single",
 20 |                       shortest = TRUE,
 21 |                       nnbs = 6L) {
 22 | 
 23 |     linkage <- match.arg (tolower (linkage), c ("single", "average"))
 24 | 
 25 |     if (methods::is (xy, "scl")) {
 26 |         message (
 27 |             "scl_full is for initial cluster construction; ",
 28 |             "passing to scl_recluster"
 29 |         )
 30 |         scl_recluster_full (xy, ncl = ncl)
 31 |     } else {
 32 |         xy <- scl_tbl (xy)
 33 | 
 34 |         if (nnbs <= 0) {
 35 |             edges <- scl_edges_tri (xy, shortest = shortest)
 36 |         } else {
 37 |             edges <- scl_edges_nn (xy, nnbs = nnbs, shortest = shortest)
 38 |         }
 39 | 
 40 |         # cluster numbers can be joined with edges through either from or to:
 41 |         cl <- as.integer (rcpp_full_initial (edges, shortest) + 1)
 42 | 
 43 |         # make 3 vectors of cluster numbers:
 44 |         #   1. cl = cluster number for intra-cluster edges only;
 45 |         #   2. cl_from = Num of origin cluster for inter-cluster edges only; and
 46 |         #   3. cl_to = Num of destination cluster for inter-cluster edges only.
 47 |         from_cl <- cl [edges$from]
 48 |         to_cl <- cl [edges$to]
 49 |         indx <- which (from_cl == to_cl)
 50 |         cl_in <- cl_join_from <- cl_join_to <- rep (NA, nrow (edges))
 51 |         cl_in [indx] <- from_cl [indx]
 52 |         indx <- which (from_cl != to_cl)
 53 |         cl_join_from [indx] <- from_cl [indx]
 54 |         cl_join_to [indx] <- to_cl [indx]
 55 | 
 56 |         edges$cluster <- cl_in - 1L # convert back to C++ 0-indexed values
 57 |         edges$cl_from <- cl_join_from - 1L
 58 |         edges$cl_to <- cl_join_to - 1L
 59 |         edges$cluster [is.na (edges$cluster)] <- -1L
 60 |         edges$cl_from [is.na (edges$cl_from)] <- -1L
 61 |         edges$cl_to [is.na (edges$cl_to)] <- -1L
 62 | 
 63 |         # Then replace the spatial distance in the edges table with the distance
 64 |         # from the data to use that as the basis for merging:
 65 |         edges <- append_dist_to_edges (edges, dmat, shortest = shortest)
 66 | 
 67 |         merges <- rcpp_full_merge (
 68 |             edges,
 69 |             linkage = linkage,
 70 |             shortest = shortest
 71 |         ) |> data.frame ()
 72 | 
 73 |         merges <- tibble::tibble (
 74 |             from = as.integer (merges$from),
 75 |             to = as.integer (merges$to),
 76 |             dist = merges$dist
 77 |         )
 78 | 
 79 |         # full_cluster_nodes just auto-merges the tree to the specified number,
 80 |         # but some of these may be clusters with only 2 members. These are
 81 |         # excluded here by iterating until the desired number is achieved in
 82 |         # which each cluster has >= 3 members:
 83 |         num_clusters <- 0
 84 |         ncl_trial <- ncl
 85 |         while (num_clusters < ncl) {
 86 | 
 87 |             nodes <- full_cluster_nodes (edges, merges, ncl_trial)
 88 |             num_clusters <- length (which (table (nodes$cluster) > 2))
 89 |             ncl_trial <- ncl_trial + 1
 90 |             if (ncl_trial >= nrow (nodes)) {
 91 |                 break
 92 |             }
 93 |         }
 94 |         nt <- sort (table (nodes$cluster), decreasing = TRUE)
 95 |         n <- as.integer (names (nt) [which (nt <= 2)])
 96 |         nodes$cluster [nodes$cluster %in% n] <- NA
 97 | 
 98 |         # tree at that point has initial cluster numbers which must be
 99 |         # re-aligned with clusters from the nodal merges:
100 |         tree <- edges |> dplyr::select (from, to, d, cluster)
101 |         tree$cluster <- tree$cl_fr <-
102 |             nodes$cluster [match (tree$from, nodes$node)]
103 |         tree$cl_to <- nodes$cluster [match (tree$to, nodes$node)]
104 |         tree$cluster [tree$cl_fr != tree$cl_to] <- NA
105 | 
106 |         pars <- list (
107 |             method = "full",
108 |             ncl = ncl,
109 |             linkage = linkage
110 |         )
111 | 
112 |         res <- structure (
113 |             list (
114 |                 tree = dplyr::select (
115 |                     tree,
116 |                     c (from, to, d, cluster)
117 |                 ),
118 |                 merges = merges,
119 |                 ord = order_merges (merges),
120 |                 nodes = dplyr::bind_cols (nodes, xy),
121 |                 pars = pars
122 |             ),
123 |             class = "scl"
124 |         )
125 | 
126 |         res <- scl_statistics (res)
127 | 
128 |         return (res)
129 |     }
130 | }
131 | 
132 | #' order_merges
133 | #'
134 | #' Order merges so they can be plotted as dendrogram
135 | #' @param merges output from rccp_full_merge
136 | #' @noRd
137 | order_merges <- function (merges) {
138 | 
139 |     merges <- as.matrix (merges)
140 |     nodes <- merges [nrow (merges), c ("from", "to")]
141 |     for (i in rev (seq_len (nrow (merges))) [-1]) {
142 |         ii <- which (nodes == merges [i, 2])
143 |         n1 <- n2 <- NULL
144 |         if (ii > 1) {
145 |             n1 <- nodes [1:(ii - 1)]
146 |         }
147 |         if (ii <= length (nodes)) {
148 |             n2 <- nodes [ii:length (nodes)]
149 |         }
150 |         nodes <- c (n1, merges [i, 1], n2)
151 |     }
152 |     return (as.numeric (nodes))
153 | }
154 | 
155 | #' full_cluster_nodes
156 | #'
157 | #' Transform edge and merge data into rectangle of nodes and cluster IDs
158 | #' @noRd
159 | full_cluster_nodes <- function (edges, merges, ncl) {
160 | 
161 |     edges$cluster [edges$cluster < 0] <- NA
162 |     ncl_full <- length (unique (edges$cluster))
163 |     merge_tree <- merges [1:(ncl_full - ncl - 1), ]
164 |     for (i in seq_len (nrow (merge_tree))) {
165 |         edges$cluster [edges$cluster == merge_tree$from [i]] <-
166 |             merge_tree$to [i]
167 |     }
168 | 
169 |     node <- cluster <- NULL # rm undefined variable note
170 |     all_nodes <- unique (c (edges$from, edges$to))
171 |     nodes <- tibble::tibble (
172 |         node = c (edges$from, edges$to),
173 |         cluster = rep (edges$cluster, 2)
174 |     ) |>
175 |         dplyr::distinct () |>
176 |         dplyr::arrange (node) |>
177 |         dplyr::filter (!is.na (cluster))
178 | 
179 |     # nodes can still be in multiple clusters, so these are set to NA
180 |     dup_nodes <- unique (nodes$node [which (duplicated (nodes$node))])
181 |     nodes$cluster [nodes$node %in% dup_nodes] <- NA_integer_
182 |     nodes <- nodes [which (!duplicated (nodes)), ]
183 | 
184 |     # Plus nodes entirely in NA clusters can then be removed, and need to be
185 |     # re-inserted:
186 |     na_nodes <- all_nodes [which (!all_nodes %in% nodes$node)]
187 |     if (length (na_nodes) > 0L) {
188 |         na_nodes <- tibble::tibble (
189 |             node = na_nodes,
190 |             cluster = rep (NA_integer_, length (na_nodes))
191 |         )
192 |         nodes <- rbind (nodes, na_nodes)
193 |     }
194 | 
195 |     # re-order cluster numbers by frequencies
196 |     nt <- sort (table (nodes$cluster), decreasing = TRUE)
197 |     nodes$cluster <- as.integer (names (nt) [match (nodes$cluster, names (nt))])
198 | 
199 |     return (nodes)
200 | }
201 | 
202 | #' scl_recluster_full
203 | #'
204 | #' @noRd
205 | scl_recluster_full <- function (scl, ncl = ncl) {
206 | 
207 |     xy <- scl$nodes |> dplyr::select (x, y)
208 |     num_clusters <- 0
209 |     ncl_trial <- ncl
210 | 
211 |     while (num_clusters < ncl) {
212 | 
213 |         scl$nodes <- full_cluster_nodes (scl$tree, scl$merges, ncl_trial)
214 |         num_clusters <- length (which (table (scl$nodes$cluster) > 2))
215 |         ncl_trial <- ncl_trial + 1
216 |         if (ncl_trial >= nrow (scl$nodes)) {
217 |             break
218 |         }
219 |     }
220 |     nt <- sort (table (scl$nodes$cluster), decreasing = TRUE)
221 |     n <- as.integer (names (nt) [which (nt <= 2)])
222 |     scl$nodes$cluster [scl$nodes$cluster %in% n] <- NA
223 | 
224 |     scl$nodes <- dplyr::bind_cols (scl$nodes, xy)
225 | 
226 |     return (scl)
227 | }
228 | 


--------------------------------------------------------------------------------
/R/scl-redcap.R:
--------------------------------------------------------------------------------
  1 | #' scl_redcap
  2 | #'
  3 | #' Cluster spatial data with REDCAP (REgionalization with Dynamically
  4 | #' Constrained Agglomerative clustering and Partitioning) routines.
  5 | #'
  6 | #' @param xy Rectangular structure (matrix, data.frame, tibble), containing
  7 | #' coordinates of points to be clustered.
  8 | #' @param dmat Square structure (matrix, data.frame, tibble) containing
  9 | #' distances or equivalent metrics between all points in \code{xy}. If \code{xy}
 10 | #' has \code{n} rows, then \code{dat} must have \code{n} rows and \code{n}
 11 | #' columns.
 12 | #' @param ncl Desired number of clusters. See description of `ncl_iterate`
 13 | #' parameter for conditions under which actual number may be less than this
 14 | #' value.
 15 | #' @param full_order If \code{FALSE}, build spanning trees from first-order
 16 | #' relationships only, otherwise build from full-order relationships (see Note).
 17 | #' @param linkage One of \code{"single"}, \code{"average"}, or
 18 | #' \code{"complete"}; see Note.
 19 | #' @param shortest If \code{TRUE}, the \code{dmat} is interpreted as distances
 20 | #' such that lower values are preferentially selected; if \code{FALSE}, then
 21 | #' higher values of \code{dmat} are interpreted to indicate stronger
 22 | #' relationships, as is the case for example with covariances.
 23 | #' @param nnbs Number of nearest neighbours to be used in calculating clustering
 24 | #' trees. Triangulation will be used if \code{nnbs <= 0}.
 25 | #' @param iterate_ncl Actual numbers of clusters found may be less than the
 26 | #' specified value of `ncl`, because clusters formed from < 3 edges are removed.
 27 | #' If `iterate_ncl = FALSE` (the default), the value is returned with whatever
 28 | #' number of actual clusters is found. Setting this parameter to `TRUE` forces
 29 | #' the algorithm to iterate until the exact number of clusters has been found.
 30 | #' For large data sets, this may result in considerable longer calculation
 31 | #' times.
 32 | #' @param quiet If `FALSE` (default), display progress information on screen.
 33 | #'
 34 | #' @return A object of class \code{scl} with \code{tree} containing the
 35 | #' clustering scheme, and \code{xy} the original coordinate data of the
 36 | #' clustered points. An additional component, \code{tree_rest}, enables the tree
 37 | #' to be re-cut to a different number of clusters via \link{scl_recluster},
 38 | #' rather than calculating clusters anew.
 39 | #'
 40 | #' @note Please refer to the original REDCAP paper ('Regionalization with
 41 | #' dynamically constrained agglomerative clustering and partitioning (REDCAP)',
 42 | #' by D. Guo (2008), Int.J.Geo.Inf.Sci 22:801-823) for details of the
 43 | #' \code{full_order} and \code{linkage} parameters. This paper clearly
 44 | #' demonstrates the general inferiority of spanning trees constructed from
 45 | #' first-order relationships. It is therefore strongly recommended that the
 46 | #' default \code{full_order = TRUE} be used at all times.
 47 | #'
 48 | #' @family clustering_fns
 49 | #'
 50 | #' @examples
 51 | #' n <- 100
 52 | #' xy <- matrix (runif (2 * n), ncol = 2)
 53 | #' dmat <- matrix (runif (n^2), ncol = n)
 54 | #' scl <- scl_redcap (xy, dmat, ncl = 4)
 55 | #' # Those clusters will by default be constructed by connecting edges with the
 56 | #' # lowest (\code{shortest}) values of \code{dmat}, and will differ from
 57 | #' scl <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE)
 58 | #' # using 'full_order = FALSE' constructs clusters from first-order
 59 | #' # relationships only; not recommended, but possible nevertheless:
 60 | #' scl <- scl_redcap (xy, dmat, ncl = 4, full_order = FALSE)
 61 | #'
 62 | #' @export
 63 | scl_redcap <- function (xy,
 64 |                         dmat,
 65 |                         ncl,
 66 |                         full_order = TRUE,
 67 |                         linkage = "single",
 68 |                         shortest = TRUE,
 69 |                         nnbs = 6L,
 70 |                         iterate_ncl = FALSE,
 71 |                         quiet = FALSE) {
 72 | 
 73 |     linkage <- scl_linkage_type (linkage)
 74 | 
 75 |     if (methods::is (xy, "scl")) {
 76 | 
 77 |         if (!identical (xy$pars$method, "redcap")) {
 78 |             stop (
 79 |                 "scl_redcap can pass to scl_recluster only for scl objects",
 80 |                 " previously generated with scl_redcap"
 81 |             )
 82 |         }
 83 | 
 84 |         message (
 85 |             "scl_redcap is for initial cluster construction; ",
 86 |             "passing to scl_recluster"
 87 |         )
 88 | 
 89 |         scl_recluster_redcap (xy, ncl = ncl, shortest = shortest)
 90 | 
 91 |     } else {
 92 | 
 93 |         xy <- scl_tbl (xy)
 94 | 
 95 |         if (nnbs <= 0) {
 96 |             edges_nn <- scl_edges_tri (xy, shortest = shortest)
 97 |         } else {
 98 |             edges_nn <- scl_edges_nn (xy, nnbs = nnbs, shortest = shortest)
 99 |         }
100 | 
101 |         if (!full_order) {
102 | 
103 |             tree_full <- scl_spantree_ord1 (edges_nn) [, c ("from", "to")]
104 | 
105 |         } else {
106 | 
107 |             if (linkage == "average") {
108 | 
109 |                 tree_full <- scl_spantree_alk (edges_nn, shortest)
110 | 
111 |             } else {
112 | 
113 |                 d_xy <- as.matrix (stats::dist (xy))
114 |                 edges_all <- scl_edges_all (xy, d_xy, shortest)
115 | 
116 |                 if (linkage == "single") {
117 | 
118 |                     tree_full <- scl_spantree_slk (
119 |                         edges_all,
120 |                         edges_nn,
121 |                         shortest = shortest,
122 |                         quiet = quiet
123 |                     )
124 | 
125 |                 } else if (linkage == "complete") {
126 | 
127 |                     tree_full <- scl_spantree_clk (
128 |                         edges_all,
129 |                         edges_nn,
130 |                         shortest = shortest,
131 |                         quiet = quiet
132 |                     )
133 | 
134 |                 } else {
135 | 
136 |                     stop (
137 |                         "linkage must be one of ",
138 |                         "(single, average, complete)"
139 |                     )
140 |                 }
141 |             }
142 | 
143 |         }
144 | 
145 |         # Then the critical stage of changing the distance metric on 'edges_nn'
146 |         # from the spatial distances of 'd_xy' to the data-based distances in
147 |         # 'dmat':
148 |         edges_nn <- append_dist_to_edges (edges_nn, dmat, shortest = shortest)
149 | 
150 |         tree <- scl_cuttree (
151 |             tree_full,
152 |             edges_nn,
153 |             ncl,
154 |             shortest = shortest,
155 |             iterate_ncl = iterate_ncl,
156 |             quiet = quiet
157 |         )
158 | 
159 |         # meta-data:
160 |         clo <- c ("single", "full") [match (full_order, c (FALSE, TRUE))]
161 |         pars <- list (
162 |             method = "redcap",
163 |             ncl = ncl,
164 |             cl_order = clo,
165 |             linkage = linkage
166 |         )
167 | 
168 |         res <- structure (
169 |             list (
170 |                 tree = tree,
171 |                 nodes = dplyr::bind_cols (tree_nodes (tree), xy),
172 |                 pars = pars
173 |             ),
174 |             class = "scl"
175 |         )
176 | 
177 |         res <- scl_statistics (res)
178 | 
179 |         return (res)
180 |     }
181 | }
182 | 
183 | # Match cluster numbers in edge tree to actual nodes
184 | tree_nodes <- function (tree) {
185 | 
186 |     node <- NULL # suppress no visible binding note
187 | 
188 |     res <- tibble::tibble (
189 |         node = c (tree$from, tree$to),
190 |         cluster = rep (tree$cluster, 2)
191 |     ) |>
192 |         dplyr::distinct () |>
193 |         dplyr::arrange (node) |>
194 |         dplyr::filter (!is.na (cluster))
195 | 
196 |     # remove clusters with < 3 members:
197 |     res$cluster [res$cluster %in% which (table (res$cluster) < 3)] <- NA
198 | 
199 |     return (res)
200 | }
201 | 
202 | #' scl_reccluster
203 | #'
204 | #' Re-cut a spatial cluster tree (\code{scl}) at a different number of clusters.
205 | #'
206 | #' @param scl An \code{scl} object returned from \link{scl_redcap}.
207 | #' @inheritParams scl_redcap
208 | #'
209 | #' @return Modified \code{scl} object in which \code{tree} is re-cut into
210 | #' \code{ncl} clusters.
211 | #' @family clustering_fns
212 | #'
213 | #' @examples
214 | #' n <- 100
215 | #' xy <- matrix (runif (2 * n), ncol = 2)
216 | #' dmat <- matrix (runif (n^2), ncol = n)
217 | #' scl <- scl_redcap (xy, dmat, ncl = 4)
218 | #' plot (scl)
219 | #' scl <- scl_recluster (scl, ncl = 5)
220 | #' plot (scl)
221 | #'
222 | #' @export
223 | scl_recluster <- function (scl, ncl, shortest = TRUE, quiet = FALSE) {
224 | 
225 |     if (!methods::is (scl, "scl")) {
226 |         stop (
227 |             "scl_recluster can only be applied to 'scl' objects ",
228 |             "returned from scl_redcap"
229 |         )
230 |     } else if (identical (scl$pars$method, "redcap")) {
231 |         scl_recluster_redcap (scl = scl, ncl = ncl, shortest = shortest)
232 |     } else if (identical (scl$pars$method, "full")) {
233 |         scl_recluster_full (scl = scl, ncl = ncl)
234 |     }
235 | }
236 | 
237 | scl_recluster_redcap <- function (scl, ncl, shortest = TRUE, quiet = FALSE) {
238 | 
239 |     from <- to <- d <- NULL # no visible binding messages
240 | 
241 |     tree_full <- scl$tree |> dplyr::select (from, to, d)
242 | 
243 |     if (shortest) {
244 |         tree_full <- dplyr::arrange (tree_full, d)
245 |     } else {
246 |         tree_full <- dplyr::arrange (tree_full, dplyr::desc (d))
247 |     }
248 | 
249 |     tree_full$cluster <- rcpp_cut_tree (tree_full, ncl,
250 |         shortest = shortest,
251 |         quiet = quiet
252 |     ) + 1
253 | 
254 |     pars <- scl$pars
255 |     pars$ncl <- ncl
256 | 
257 |     structure (
258 |         list (
259 |             tree = tree_full,
260 |             nodes = dplyr::bind_cols (
261 |                 tree_nodes (tree_full),
262 |                 scl$nodes [, c ("x", "y")]
263 |             ),
264 |             pars = pars
265 |         ),
266 |         class = "scl"
267 |     )
268 | }
269 | 


--------------------------------------------------------------------------------
/R/spatialcluster-package.R:
--------------------------------------------------------------------------------
 1 | #' spatialcluster.
 2 | #'
 3 | #' R port of redcap (Regionalization with dynamically constrained agglomerative
 4 | #' clustering and partitioning).
 5 | #'
 6 | #' @name spatialcluster
 7 | #' @family package
 8 | #' @importFrom Rcpp evalCpp
 9 | #' @useDynLib spatialcluster, .registration = TRUE
10 | "_PACKAGE"
11 | 


--------------------------------------------------------------------------------
/R/statistics.R:
--------------------------------------------------------------------------------
 1 | #' scl_statistics
 2 | #'
 3 | #' @param scl Output of either \link{scl_redcap} or \link{scl_full}.
 4 | #'
 5 | #' @return A modified version of the input object with statistics appended.
 6 | #'
 7 | #' @noRd
 8 | scl_statistics <- function (scl) {
 9 | 
10 |     tree <- scl$tree |>
11 |         dplyr::mutate (tf = paste0 (to, "-", from))
12 |     edges_in <- scl$tree [which (scl$tree$cluster >= 0), ] |>
13 |         dplyr::mutate (tf = paste0 (to, "-", from))
14 |     tree <- tree [which (!tree$tf %in% edges_in$tf), ]
15 | 
16 |     # t.test (edges_in$d, tree$d, alternative = "greater", var.equal = TRUE)
17 |     tt_global <- stats::t.test (
18 |         edges_in$d,
19 |         tree$d,
20 |         alternative = "less",
21 |         var.equal = TRUE
22 |     )
23 |     tt_global <- c (tt_global$statistic, tt_global$parameter, tt_global$p.value)
24 |     names (tt_global) <- c ("statistic", "parameter", "p.value")
25 | 
26 |     tt_cl <- vapply (
27 |         sort (unique (scl$tree$cluster)),
28 |         function (i) {
29 |             index <- which (edges_in$cluster == i)
30 |             if (length (index) <= 3) {
31 |                 res <- rep (NA, 3L)
32 |             } else {
33 |                 tt <- stats::t.test (edges_in$d [index], tree$d,
34 |                     alternative = "less",
35 |                     var.equal = TRUE
36 |                 )
37 |                 res <- c (tt$statistic, tt$parameter, tt$p.value)
38 |             }
39 |             names (res) <- c ("statistic", "parameter", "p.value")
40 |             return (res)
41 |         },
42 |         numeric (3)
43 |     ) |>
44 |         t ()
45 | 
46 |     scl$statistics <- list (tt_global = tt_global, tt_clusters = tt_cl)
47 | 
48 |     return (scl)
49 | }
50 | 


--------------------------------------------------------------------------------
/R/tree.R:
--------------------------------------------------------------------------------
  1 | #' scl_spantree_ord1
  2 | #'
  3 | #' Generate a spanning tree from first-order relationships expressed via a set
  4 | #' of edges
  5 | #'
  6 | #' @param edges A set of edges resulting from \link{scl_edges}, which are sorted
  7 | #' in ascending order according to user-specified data. The only aspect of that
  8 | #' data which affect tree construction is this order, so only the set of
  9 | #' \code{edges} are needed here
 10 | #'
 11 | #' @return A tree
 12 | #' @noRd
 13 | scl_spantree_ord1 <- function (edges) {
 14 | 
 15 |     tree <- rcpp_mst (edges) |>
 16 |         dplyr::arrange (from, to) |>
 17 |         tibble::tibble ()
 18 | 
 19 |     return (tree)
 20 | }
 21 | 
 22 | #' scl_spantree_slk
 23 | #'
 24 | #' Generate a spanning tree from full-order, single linkage clustering (SLK)
 25 | #' relationships expressed via a set of edges
 26 | #'
 27 | #' @param edges_all A set of ALL edges resulting from \link{scl_edges_all},
 28 | #' which are sorted in ascending order according to user-specified data.
 29 | #' @param edges_nn A equivalent set of nearest neighbour edges only, resulting
 30 | #' from \link{scl_edges_tri} or \link{scl_edges_nn}.
 31 | #'
 32 | #' @return A tree
 33 | #' @noRd
 34 | scl_spantree_slk <- function (edges_all, edges_nn, shortest, quiet = FALSE) {
 35 | 
 36 |     clusters <- rcpp_slk (edges_all, edges_nn,
 37 |         shortest = shortest, quiet = quiet
 38 |     ) + 1
 39 | 
 40 |     tibble::tibble (
 41 |         from = edges_nn$from [clusters],
 42 |         to = edges_nn$to [clusters]
 43 |     )
 44 | }
 45 | 
 46 | #' scl_spantree_alk
 47 | #'
 48 | #' Generate a spanning tree from full-order, average linkage clustering (ALK)
 49 | #' relationships expressed via a set of edges
 50 | #'
 51 | #' @inheritParams scl_spantree_slk
 52 | #' @noRd
 53 | scl_spantree_alk <- function (edges, shortest, quiet = FALSE) {
 54 | 
 55 |     clusters <- rcpp_alk (edges, shortest = shortest, quiet = quiet) + 1
 56 |     tibble::tibble (
 57 |         from = edges$from [clusters],
 58 |         to = edges$to [clusters]
 59 |     )
 60 | }
 61 | 
 62 | #' scl_spantree_clk
 63 | #'
 64 | #' Generate a spanning tree from full-order, complete linkage clustering (CLK)
 65 | #' relationships expressed via a set of edges
 66 | #'
 67 | #' @inheritParams scl_spantree_slk
 68 | #' @noRd
 69 | scl_spantree_clk <- function (edges_all, edges_nn, shortest, quiet = FALSE) {
 70 | 
 71 |     clusters <- rcpp_clk (edges_all, edges_nn,
 72 |         shortest = shortest, quiet = quiet
 73 |     ) + 1
 74 | 
 75 |     tibble::tibble (
 76 |         from = edges_nn$from [clusters],
 77 |         to = edges_nn$to [clusters]
 78 |     )
 79 | }
 80 | 
 81 | #' scl_cuttree
 82 | #'
 83 | #' Cut a tree generated with \link{scl_spantree} into a specified number of
 84 | #' clusters or components
 85 | #'
 86 | #' @param tree result of \link{scl_spantree}
 87 | #' @param edges A set of edges resulting from \link{scl_edges}, but with
 88 | #' additional data specifying edge weights, distances, or desired properties
 89 | #' from which to construct the tree
 90 | #' @inheritParams scl_redcap
 91 | #'
 92 | #' @return Modified version of \code{tree}, including an additional column
 93 | #' specifying the cluster number of each edge, with NA for edges that lie
 94 | #' between clusters.
 95 | #'
 96 | #' @note The \code{rcpp_cut_tree} routine in \code{src/cuttree} includes
 97 | #' \code{constexpr MIN_CLUSTER_SIZE = 3}.
 98 | #'
 99 | #' @noRd
100 | scl_cuttree <- function (tree, edges, ncl, shortest,
101 |                          iterate_ncl = FALSE, quiet = FALSE) {
102 | 
103 |     num_clusters <- 0
104 |     ncl_trial <- ncl
105 | 
106 |     quiet <- !(!quiet & nrow (tree) > 100)
107 | 
108 |     while (num_clusters < ncl) {
109 | 
110 |         if (num_clusters > 0 && !quiet) {
111 |             message ("Not enough clusters found; re-starting search.")
112 |         }
113 | 
114 |         tree_temp <- dplyr::left_join (tree, edges, by = c ("from", "to"))
115 |         tree_temp$cluster <- rcpp_cut_tree (
116 |             tree_temp,
117 |             ncl = ncl_trial,
118 |             shortest = shortest,
119 |             quiet = quiet
120 |         ) + 1L
121 |         num_clusters <- length (which (table (tree_temp$cluster) > 2))
122 |         if (!quiet) {
123 |             message ("Total clusters found with > 2 members: ", num_clusters)
124 |         }
125 |         ncl_trial <- ncl_trial + 1
126 |         if (ncl_trial >= nrow (tree) || iterate_ncl) {
127 |             break
128 |         }
129 |     }
130 | 
131 |     return (tree_temp)
132 | }
133 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
 1 | #' scl_tbl
 2 | #'
 3 | #' Convert anything to a tibble
 4 | #'
 5 | #' @param xy A rectangular object containing the coordinates
 6 | #' @return A tibble-ified version of the input, with coordinate columns
 7 | #' identified and re-labelled "x" and "y"
 8 | #' @noRd
 9 | scl_tbl <- function (xy) {
10 |     if (!inherits (xy, "data.frame")) {
11 |         if (!is.numeric (xy)) {
12 |             stop ("coordinates must be numeric")
13 |         }
14 |         if (is.vector (xy)) {
15 |             stop ("coordinates require at least 2 columns")
16 |         }
17 |         xy <- data.frame (xy)
18 |     }
19 |     xi <- grep ("^x|^lon", names (xy), ignore.case = TRUE)
20 |     yi <- grep ("^y|^lat", names (xy), ignore.case = TRUE)
21 |     if (length (xi) == 1L && length (yi) == 1L) {
22 |         names (xy) [xi] <- "x"
23 |         names (xy) [yi] <- "y"
24 |     } else if (ncol (xy) == 2) {
25 |         colnames (xy) <- c ("x", "y")
26 |     } else {
27 |         stop ("Cannot determine unambiguous coordinate columns")
28 |     }
29 |     tibble::as_tibble (xy)
30 | }
31 | 
32 | #' scl_linkage_type
33 | #'
34 | #' Convert \code{linkage} string arg to matching type
35 | #' @param linkage Type of linkage
36 | #' @return Strict match to one of three options
37 | #' @noRd
38 | scl_linkage_type <- function (linkage) {
39 |     linkages <- c ("single", "average", "complete", "full")
40 |     i <- grep (linkage, linkages, ignore.case = TRUE)
41 |     if (length (i) == 0L) {
42 |         stop ("linkage must be one of (single, average, complete, full)")
43 |     }
44 | 
45 |     return (linkages [i])
46 | }
47 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
 1 | .onLoad <- function (libname, pkgname) { # nolint
 2 | 
 3 |     # make data set names global to avoid CHECK notes
 4 |     utils::globalVariables (".")
 5 |     utils::globalVariables ("i")
 6 |     utils::globalVariables ("x")
 7 |     utils::globalVariables ("y")
 8 |     utils::globalVariables ("d")
 9 |     utils::globalVariables ("id")
10 |     utils::globalVariables ("v")
11 |     utils::globalVariables ("V1")
12 |     utils::globalVariables ("V2")
13 |     utils::globalVariables ("V3")
14 |     utils::globalVariables ("from")
15 |     utils::globalVariables ("to")
16 |     utils::globalVariables ("clnum")
17 |     utils::globalVariables ("cluster")
18 |     utils::globalVariables ("merged")
19 |     utils::globalVariables ("comp")
20 |     utils::globalVariables ("value")
21 |     utils::globalVariables ("na.omit")
22 |     utils::globalVariables ("xfr")
23 |     utils::globalVariables ("xto")
24 |     utils::globalVariables ("yfr")
25 |     utils::globalVariables ("yto")
26 |     utils::globalVariables ("ind")
27 | 
28 |     invisible ()
29 | }
30 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  6 | 
  7 | ```{r, echo = FALSE}
  8 | knitr::opts_chunk$set (
  9 |     collapse = TRUE,
 10 |     comment = "#>",
 11 |     fig.path = "man/figures/README-"
 12 | )
 13 | ```
 14 | 
 15 | [![R build status](https://github.com/mpadge/spatialcluster/workflows/R-CMD-check/badge.svg)](https://github.com/mpadge/spatialcluster/actions?query=workflow%3AR-CMD-check)
 16 | [![Project Status: WIP](http://www.repostatus.org/badges/latest/wip.svg)](http://www.repostatus.org/#wip)
 17 | [![codecov](https://codecov.io/gh/mpadge/spatialcluster/branch/master/graph/badge.svg)](https://codecov.io/gh/mpadge/spatialcluster)
 18 | 
 19 | # spatialcluster
 20 | 
 21 | An **R** package for spatially-constrained clustering using either distance or
 22 | covariance matrices. "*Spatially-constrained*" means that the data from which
 23 | clusters are to be formed also map on to spatial coordinates, and the
 24 | constraint is that clusters must be spatially contiguous.
 25 | 
 26 | The package includes both an implementation of the
 27 | REDCAP collection of efficient yet approximate algorithms described in [D. Guo's
 28 | 2008 paper, "Regionalization with dynamically constrained agglomerative
 29 | clustering and
 30 | partitioning."](https://www.tandfonline.com/doi/abs/10.1080/13658810701674970)
 31 | (pdf available
 32 | [here](https://pdfs.semanticscholar.org/ead1/7df8aaa1aed0e433b3ae1ec1ec5c7e785b2b.pdf)),
 33 | with extension to covariance matrices, and a new technique for computing
 34 | clusters using complete data sets. The package is also designed to analyse
 35 | matrices of spatial interactions (counts, densities) between sets of origin and
 36 | destination points. The spatial structure of interaction matrices is able to be
 37 | statistically analysed to yield both global statistics for the overall spatial
 38 | structure, and local statistics for individual clusters.
 39 | 
 40 | 
 41 | ## Installation
 42 | 
 43 | The easiest way to install `spatialcluster` is be enabling the [corresponding
 44 | `r-universe`](https://mpadge.r-universe.dev/):
 45 | 
 46 | ```{r r-univ, eval = FALSE}
 47 | options (repos = c (
 48 |     mpadge = "https://mpadge.r-universe.dev",
 49 |     CRAN = "https://cloud.r-project.org"
 50 | ))
 51 | ```
 52 | 
 53 | The package can then be installed as usual with,
 54 | 
 55 | ```{r install, eval = FALSE}
 56 | install.packges ("spatialcluster")
 57 | ```
 58 | 
 59 | Alternatively, the package can also be installed using any of the following
 60 | options:
 61 | 
 62 | ```{r gh-installation, eval = FALSE}
 63 | # install.packages("remotes")
 64 | remotes::install_git ("https://codeberg.org/mpadge/spatialcluster")
 65 | remotes::install_git ("https://git.sr.ht/~mpadge/spatialcluster")
 66 | remotes::install_bitbucket ("mpadge/spatialcluster")
 67 | remotes::install_gitlab ("mpadge/spatialcluster")
 68 | remotes::install_github ("mpadge/spatialcluster")
 69 | ```
 70 | 
 71 | ## Usage
 72 | 
 73 | The two main functions, `scl_redcap()` and `scl_full()`, implement different
 74 | algorithms for spatial clustering. The former implements the REDCAP collection
 75 | of efficient yet approximate algorithms described in [D. Guo's 2008 paper,
 76 | "Regionalization with dynamically constrained agglomerative clustering and
 77 | partitioning."](https://www.tandfonline.com/doi/abs/10.1080/13658810701674970)
 78 | (pdf available
 79 | [here](https://pdfs.semanticscholar.org/ead1/7df8aaa1aed0e433b3ae1ec1ec5c7e785b2b.pdf)),
 80 | with extension here to apply clustering to covariance matrices. These
 81 | algorithms are computationally efficient yet generate only *approximate*
 82 | estimates of underlying clusters. The second function, `scl_full()`, trades
 83 | computational efficiency for accuracy, through generating clustering schemes
 84 | using all available data.
 85 | 
 86 | In short:
 87 | 
 88 | - `scl_full()` should always be preferred as long as it returns results within
 89 |   a reasonable amount of time
 90 | - `scl_redcap()` should be used only where data are too large for `scl_full()`
 91 |   to be run in a reasonable time.
 92 | 
 93 | For clustering a group of `n` points, both of these functions require three
 94 | main arguments:
 95 | 
 96 | 1. A rectangular matrix of spatial coordinates of points to be clustered (`n`
 97 |     rows; at least 2 columns);
 98 | 2. An `n`-by-`n` square matrix quantifying relationships between those points;
 99 | 3. A single value (`ncl`) specifying the desired number of clusters.
100 | 
101 | The following code demonstrates usage with randomly-generated data:
102 | ```{r}
103 | set.seed (1)
104 | n <- 100
105 | xy <- matrix (runif (2 * n), ncol = 2)
106 | dmat <- matrix (runif (n^2), ncol = n)
107 | ```
108 | 
109 | The load the package and call the function:
110 | 
111 | ```{r full-single, echo = TRUE, eval = TRUE}
112 | library (spatialcluster)
113 | scl <- scl_full (xy, dmat, ncl = 8)
114 | plot (scl)
115 | ```
116 | 
117 | Both functions return a `list` with the following components:
118 | 
119 | ```{r list-components}
120 | names (scl)
121 | ```
122 | 
123 | - `tree` details distances and cluster numbers for all pairwise comparisons
124 |   between objects.
125 | - `merges` details increasing distances at which each pair of objects was
126 |   merged into a single cluster.
127 | - `ord` provides the order of the merges (for `scl_full()` only).
128 | - `nodes` records the spatial coordinates of each point (node) of the input
129 |   data.
130 | - `pars` retains the parameters used to call the clustering function.
131 | - `statsitics` returns the clustering statistics, both for individual clusters
132 |   and an overall global statistic for the clustering scheme as a whole.
133 | 
134 | See the "_Get Started_" vignette for more details.
135 | 
136 | ## A Cautionary Note
137 | 
138 | The following plot compares the results of applying four different clustering
139 | algorithms to the same data.
140 | 
141 | ```{r cautionary, eval = TRUE, fig.width = 7, fig.height = 7}
142 | library (ggplot2)
143 | library (gridExtra)
144 | scl <- scl_full (xy, dmat, ncl = 8, linkage = "single")
145 | p1 <- plot (scl) + ggtitle ("full-single")
146 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "single")
147 | p2 <- plot (scl) + ggtitle ("redcap-single")
148 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "average")
149 | p3 <- plot (scl) + ggtitle ("redcap-average")
150 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "complete")
151 | p4 <- plot (scl) + ggtitle ("redcap-complete")
152 | 
153 | grid.arrange (p1, p2, p3, p4, ncol = 2)
154 | ```
155 | 
156 | 
157 | This example illustrates the universal danger in all clustering algorithms: they
158 | can not fail to produce results, even when the data fed to them are definitely
159 | devoid of any information as in this example. Clustering algorithms should only
160 | be applied to reflect a very specific hypothesis for why data should be
161 | clustered in the first place; spatial clustering algorithms should only be
162 | applied to reflect two very specific hypothesis for (i) why data should be
163 | clustered at all, and (ii) why those clusters should manifest a spatial
164 | pattern.
165 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | 
  4 | [![R build
  5 | status](https://github.com/mpadge/spatialcluster/workflows/R-CMD-check/badge.svg)](https://github.com/mpadge/spatialcluster/actions?query=workflow%3AR-CMD-check)
  6 | [![Project Status:
  7 | WIP](http://www.repostatus.org/badges/latest/wip.svg)](http://www.repostatus.org/#wip)
  8 | [![codecov](https://codecov.io/gh/mpadge/spatialcluster/branch/master/graph/badge.svg)](https://codecov.io/gh/mpadge/spatialcluster)
  9 | 
 10 | # spatialcluster
 11 | 
 12 | An **R** package for spatially-constrained clustering using either
 13 | distance or covariance matrices. “*Spatially-constrained*” means that
 14 | the data from which clusters are to be formed also map on to spatial
 15 | coordinates, and the constraint is that clusters must be spatially contiguous.
 16 | 
 17 | The package includes both an implementation of the REDCAP collection of
 18 | efficient yet approximate algorithms described in [D. Guo’s 2008 paper,
 19 | “Regionalization with dynamically constrained agglomerative clustering
 20 | and
 21 | partitioning.”](https://www.tandfonline.com/doi/abs/10.1080/13658810701674970)
 22 | (pdf available
 23 | [here](https://pdfs.semanticscholar.org/ead1/7df8aaa1aed0e433b3ae1ec1ec5c7e785b2b.pdf)),
 24 | with extension to covariance matrices, and a new technique for computing
 25 | clusters using complete data sets. The package is also designed to
 26 | analyse matrices of spatial interactions (counts, densities) between
 27 | sets of origin and destination points. The spatial structure of
 28 | interaction matrices is able to be statistically analysed to yield both
 29 | global statistics for the overall spatial structure, and local
 30 | statistics for individual clusters.
 31 | 
 32 | ## Installation
 33 | 
 34 | The easiest way to install `spatialcluster` is be enabling the
 35 | [corresponding `r-universe`](https://mpadge.r-universe.dev/):
 36 | 
 37 | ``` r
 38 | options (repos = c (
 39 |     mpadge = "https://mpadge.r-universe.dev",
 40 |     CRAN = "https://cloud.r-project.org"
 41 | ))
 42 | ```
 43 | 
 44 | The package can then be installed as usual with,
 45 | 
 46 | ``` r
 47 | install.packges ("spatialcluster")
 48 | ```
 49 | 
 50 | Alternatively, the package can also be installed using any of the
 51 | following options:
 52 | 
 53 | ``` r
 54 | # install.packages("remotes")
 55 | remotes::install_git ("https://codeberg.org/mpadge/spatialcluster")
 56 | remotes::install_git ("https://git.sr.ht/~mpadge/spatialcluster")
 57 | remotes::install_bitbucket ("mpadge/spatialcluster")
 58 | remotes::install_gitlab ("mpadge/spatialcluster")
 59 | remotes::install_github ("mpadge/spatialcluster")
 60 | ```
 61 | 
 62 | ## Usage
 63 | 
 64 | The two main functions, `scl_redcap()` and `scl_full()`, implement
 65 | different algorithms for spatial clustering. The former implements the
 66 | REDCAP collection of efficient yet approximate algorithms described in
 67 | [D. Guo’s 2008 paper, “Regionalization with dynamically constrained
 68 | agglomerative clustering and
 69 | partitioning.”](https://www.tandfonline.com/doi/abs/10.1080/13658810701674970)
 70 | (pdf available
 71 | [here](https://pdfs.semanticscholar.org/ead1/7df8aaa1aed0e433b3ae1ec1ec5c7e785b2b.pdf)),
 72 | with extension here to apply clustering to covariance matrices. These
 73 | algorithms are computationally efficient yet generate only *approximate*
 74 | estimates of underlying clusters. The second function, `scl_full()`,
 75 | trades computational efficiency for accuracy, through generating
 76 | clustering schemes using all available data.
 77 | 
 78 | In short:
 79 | 
 80 | - `scl_full()` should always be preferred as long as it returns results
 81 |   within a reasonable amount of time
 82 | - `scl_redcap()` should be used only where data are too large for
 83 |   `scl_full()` to be run in a reasonable time.
 84 | 
 85 | For clustering a group of `n` points, both of these functions require three
 86 | main arguments:
 87 | 
 88 | 1.  A rectangular matrix of spatial coordinates of points to be clustered 
 89 |     (`n` rows; at least 2 columns);
 90 | 2.  An `n`-by-`n` square matrix quantifying relationships between those
 91 |     points;
 92 | 3.  A single value (`ncl`) specifying the desired number of clusters.
 93 | 
 94 | The following code demonstrates usage with randomly-generated data:
 95 | 
 96 | ``` r
 97 | set.seed (1)
 98 | n <- 100
 99 | xy <- matrix (runif (2 * n), ncol = 2)
100 | dmat <- matrix (runif (n^2), ncol = n)
101 | ```
102 | 
103 | The load the package and call the function:
104 | 
105 | ``` r
106 | library (spatialcluster)
107 | scl <- scl_full (xy, dmat, ncl = 8)
108 | plot (scl)
109 | ```
110 | 
111 | ![](man/figures/README-full-single-1.png)<!-- -->
112 | 
113 | Both functions return a `list` with the following components:
114 | 
115 | ``` r
116 | names (scl)
117 | #> [1] "tree"       "merges"     "ord"        "nodes"      "pars"      
118 | #> [6] "statistics"
119 | ```
120 | 
121 | - `tree` details distances and cluster numbers for all pairwise
122 |   comparisons between objects.
123 | - `merges` details increasing distances at which each pair of objects
124 |   was merged into a single cluster.
125 | - `ord` provides the order of the merges (for `scl_full()` only).
126 | - `nodes` records the spatial coordinates of each point (node) of the
127 |   input data.
128 | - `pars` retains the parameters used to call the clustering function.
129 | - `statsitics` returns the clustering statistics, both for individual
130 |   clusters and an overall global statistic for the clustering scheme as
131 |   a whole.
132 | 
133 | See the "_Get Started_" vignette for more details.
134 | 
135 | ## A Cautionary Note
136 | 
137 | The following plot compares the results of applying four different
138 | clustering algorithms to the same data.
139 | 
140 | ``` r
141 | library (ggplot2)
142 | library (gridExtra)
143 | scl <- scl_full (xy, dmat, ncl = 8, linkage = "single")
144 | p1 <- plot (scl) + ggtitle ("full-single")
145 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "single")
146 | p2 <- plot (scl) + ggtitle ("redcap-single")
147 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "average")
148 | p3 <- plot (scl) + ggtitle ("redcap-average")
149 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "complete")
150 | p4 <- plot (scl) + ggtitle ("redcap-complete")
151 | 
152 | grid.arrange (p1, p2, p3, p4, ncol = 2)
153 | ```
154 | 
155 | ![](man/figures/README-cautionary-1.png)<!-- -->
156 | 
157 | This example illustrates the universal danger in all clustering
158 | algorithms: they can not fail to produce results, even when the data fed
159 | to them are definitely devoid of any information as in this example.
160 | Clustering algorithms should only be applied to reflect a very specific
161 | hypothesis for why data should be clustered in the first place; spatial
162 | clustering algorithms should only be applied to reflect two very
163 | specific hypothesis for (i) why data should be clustered at all, and
164 | (ii) why those clusters should manifest a spatial pattern.
165 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | title: spatialcluster
 2 | url: https://mpadge.github.io/spatialcluster
 3 | 
 4 | template:
 5 |   params:
 6 |     bootswatch: cerulean
 7 | 
 8 | reference:
 9 | - title: Package
10 |   contents:
11 |     - has_concept("package")
12 | - title: Clustering Functions
13 |   contents:
14 |     - has_concept("clustering_fns")
15 | - title: Plotting Functions
16 |   contents:
17 |     - has_concept("plot_fns")
18 | 


--------------------------------------------------------------------------------
/codemeta.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
  3 |   "@type": "SoftwareSourceCode",
  4 |   "identifier": "spatialcluster",
  5 |   "description": "R port of redcap (Regionalization with dynamically constrained agglomerative clustering and partitioning).",
  6 |   "name": "spatialcluster: R port of redcap",
  7 |   "codeRepository": "https://github.com/mpadge/spatialcluster",
  8 |   "issueTracker": "https://github.com/mpadge/spatialcluster/issues",
  9 |   "license": "https://spdx.org/licenses/GPL-3.0",
 10 |   "version": "0.2.0.017",
 11 |   "programmingLanguage": {
 12 |     "@type": "ComputerLanguage",
 13 |     "name": "R",
 14 |     "url": "https://r-project.org"
 15 |   },
 16 |   "runtimePlatform": "R version 4.4.2 (2024-10-31)",
 17 |   "author": [
 18 |     {
 19 |       "@type": "Person",
 20 |       "givenName": "Mark",
 21 |       "familyName": "Padgham",
 22 |       "email": "mark.padgham@email.com"
 23 |     }
 24 |   ],
 25 |   "maintainer": [
 26 |     {
 27 |       "@type": "Person",
 28 |       "givenName": "Mark",
 29 |       "familyName": "Padgham",
 30 |       "email": "mark.padgham@email.com"
 31 |     }
 32 |   ],
 33 |   "softwareSuggestions": [
 34 |     {
 35 |       "@type": "SoftwareApplication",
 36 |       "identifier": "dbscan",
 37 |       "name": "dbscan",
 38 |       "provider": {
 39 |         "@id": "https://cran.r-project.org",
 40 |         "@type": "Organization",
 41 |         "name": "Comprehensive R Archive Network (CRAN)",
 42 |         "url": "https://cran.r-project.org"
 43 |       },
 44 |       "sameAs": "https://CRAN.R-project.org/package=dbscan"
 45 |     },
 46 |     {
 47 |       "@type": "SoftwareApplication",
 48 |       "identifier": "knitr",
 49 |       "name": "knitr",
 50 |       "provider": {
 51 |         "@id": "https://cran.r-project.org",
 52 |         "@type": "Organization",
 53 |         "name": "Comprehensive R Archive Network (CRAN)",
 54 |         "url": "https://cran.r-project.org"
 55 |       },
 56 |       "sameAs": "https://CRAN.R-project.org/package=knitr"
 57 |     },
 58 |     {
 59 |       "@type": "SoftwareApplication",
 60 |       "identifier": "rmarkdown",
 61 |       "name": "rmarkdown",
 62 |       "provider": {
 63 |         "@id": "https://cran.r-project.org",
 64 |         "@type": "Organization",
 65 |         "name": "Comprehensive R Archive Network (CRAN)",
 66 |         "url": "https://cran.r-project.org"
 67 |       },
 68 |       "sameAs": "https://CRAN.R-project.org/package=rmarkdown"
 69 |     },
 70 |     {
 71 |       "@type": "SoftwareApplication",
 72 |       "identifier": "roxygen2",
 73 |       "name": "roxygen2",
 74 |       "provider": {
 75 |         "@id": "https://cran.r-project.org",
 76 |         "@type": "Organization",
 77 |         "name": "Comprehensive R Archive Network (CRAN)",
 78 |         "url": "https://cran.r-project.org"
 79 |       },
 80 |       "sameAs": "https://CRAN.R-project.org/package=roxygen2"
 81 |     },
 82 |     {
 83 |       "@type": "SoftwareApplication",
 84 |       "identifier": "testthat",
 85 |       "name": "testthat",
 86 |       "provider": {
 87 |         "@id": "https://cran.r-project.org",
 88 |         "@type": "Organization",
 89 |         "name": "Comprehensive R Archive Network (CRAN)",
 90 |         "url": "https://cran.r-project.org"
 91 |       },
 92 |       "sameAs": "https://CRAN.R-project.org/package=testthat"
 93 |     }
 94 |   ],
 95 |   "softwareRequirements": {
 96 |     "1": {
 97 |       "@type": "SoftwareApplication",
 98 |       "identifier": "R",
 99 |       "name": "R",
100 |       "version": ">= 4.1.0"
101 |     },
102 |     "2": {
103 |       "@type": "SoftwareApplication",
104 |       "identifier": "alphahull",
105 |       "name": "alphahull",
106 |       "provider": {
107 |         "@id": "https://cran.r-project.org",
108 |         "@type": "Organization",
109 |         "name": "Comprehensive R Archive Network (CRAN)",
110 |         "url": "https://cran.r-project.org"
111 |       },
112 |       "sameAs": "https://CRAN.R-project.org/package=alphahull"
113 |     },
114 |     "3": {
115 |       "@type": "SoftwareApplication",
116 |       "identifier": "dplyr",
117 |       "name": "dplyr",
118 |       "provider": {
119 |         "@id": "https://cran.r-project.org",
120 |         "@type": "Organization",
121 |         "name": "Comprehensive R Archive Network (CRAN)",
122 |         "url": "https://cran.r-project.org"
123 |       },
124 |       "sameAs": "https://CRAN.R-project.org/package=dplyr"
125 |     },
126 |     "4": {
127 |       "@type": "SoftwareApplication",
128 |       "identifier": "ggplot2",
129 |       "name": "ggplot2",
130 |       "provider": {
131 |         "@id": "https://cran.r-project.org",
132 |         "@type": "Organization",
133 |         "name": "Comprehensive R Archive Network (CRAN)",
134 |         "url": "https://cran.r-project.org"
135 |       },
136 |       "sameAs": "https://CRAN.R-project.org/package=ggplot2"
137 |     },
138 |     "5": {
139 |       "@type": "SoftwareApplication",
140 |       "identifier": "ggthemes",
141 |       "name": "ggthemes",
142 |       "provider": {
143 |         "@id": "https://cran.r-project.org",
144 |         "@type": "Organization",
145 |         "name": "Comprehensive R Archive Network (CRAN)",
146 |         "url": "https://cran.r-project.org"
147 |       },
148 |       "sameAs": "https://CRAN.R-project.org/package=ggthemes"
149 |     },
150 |     "6": {
151 |       "@type": "SoftwareApplication",
152 |       "identifier": "methods",
153 |       "name": "methods"
154 |     },
155 |     "7": {
156 |       "@type": "SoftwareApplication",
157 |       "identifier": "Rcpp",
158 |       "name": "Rcpp",
159 |       "version": ">= 0.12.6",
160 |       "provider": {
161 |         "@id": "https://cran.r-project.org",
162 |         "@type": "Organization",
163 |         "name": "Comprehensive R Archive Network (CRAN)",
164 |         "url": "https://cran.r-project.org"
165 |       },
166 |       "sameAs": "https://CRAN.R-project.org/package=Rcpp"
167 |     },
168 |     "8": {
169 |       "@type": "SoftwareApplication",
170 |       "identifier": "tibble",
171 |       "name": "tibble",
172 |       "provider": {
173 |         "@id": "https://cran.r-project.org",
174 |         "@type": "Organization",
175 |         "name": "Comprehensive R Archive Network (CRAN)",
176 |         "url": "https://cran.r-project.org"
177 |       },
178 |       "sameAs": "https://CRAN.R-project.org/package=tibble"
179 |     },
180 |     "9": {
181 |       "@type": "SoftwareApplication",
182 |       "identifier": "tripack",
183 |       "name": "tripack",
184 |       "provider": {
185 |         "@id": "https://cran.r-project.org",
186 |         "@type": "Organization",
187 |         "name": "Comprehensive R Archive Network (CRAN)",
188 |         "url": "https://cran.r-project.org"
189 |       },
190 |       "sameAs": "https://CRAN.R-project.org/package=tripack"
191 |     },
192 |     "SystemRequirements": {}
193 |   },
194 |   "fileSize": "17667.689KB",
195 |   "readme": "https://github.com/mpadge/spatialcluster/blob/main/README.md",
196 |   "contIntegration": [
197 |     "https://github.com/mpadge/spatialcluster/actions?query=workflow%3AR-CMD-check",
198 |     "https://codecov.io/gh/mpadge/spatialcluster"
199 |   ],
200 |   "developmentStatus": "http://www.repostatus.org/#wip",
201 |   "keywords": [
202 |     "clustering-algorithm",
203 |     "cluster",
204 |     "r",
205 |     "spatial"
206 |   ]
207 | }
208 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all build check doc readme test
 2 | 
 3 | all: doc readme build check
 4 | 
 5 | build: doc
 6 | 	R CMD build .
 7 | 
 8 | #check: build
 9 | #	R CMD check spatialcluster*tar.gz
10 | 
11 | clean:
12 | 	-rm -f spatialcluster*tar.gz
13 | 	-rm -fr spatialcluster.Rcheck
14 | 	-rm -fr src/*.{o,so}
15 | 
16 | doc: clean
17 | 	Rscript -e 'devtools::document()'
18 | 
19 | readme:
20 | 	Rscript -e 'rmarkdown::render("README.Rmd")'
21 | 
22 | test:
23 | 	Rscript -e 'devtools::test()'
24 | 
25 | check:
26 | 	Rscript -e 'library(pkgcheck); checks <- pkgcheck(); print(checks); summary (checks)'
27 | 
28 | install: clean
29 | 	R CMD INSTALL .
30 | 


--------------------------------------------------------------------------------
/man/figures/README-cautionary-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpadge/spatialcluster/ae90bfc66f4b523c0c9988175810da5d3fe60986/man/figures/README-cautionary-1.png


--------------------------------------------------------------------------------
/man/figures/README-full-single-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpadge/spatialcluster/ae90bfc66f4b523c0c9988175810da5d3fe60986/man/figures/README-full-single-1.png


--------------------------------------------------------------------------------
/man/plot.scl.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot-fns.R
 3 | \name{plot.scl}
 4 | \alias{plot.scl}
 5 | \title{plot.scl}
 6 | \usage{
 7 | \method{plot}{scl}(x, ..., hull_alpha = 1)
 8 | }
 9 | \arguments{
10 | \item{x}{object to be plotted}
11 | 
12 | \item{...}{ignored here}
13 | 
14 | \item{hull_alpha}{alpha value of (non-)convex hulls, with default generating
15 | a convex hull, and smaller values generating concave hulls. (See
16 | ?alphashape::ashape for details).}
17 | }
18 | \description{
19 | plot.scl
20 | }
21 | \examples{
22 | set.seed (1)
23 | n <- 100
24 | xy <- matrix (runif (2 * n), ncol = 2)
25 | dmat <- matrix (runif (n^2), ncol = n)
26 | scl <- scl_redcap (xy, dmat, ncl = 4)
27 | plot (scl)
28 | # Connect clusters according to highest (\code{shortest = FALSE}) values of
29 | # \code{dmat}:
30 | scl <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE, full_order = FALSE)
31 | plot (scl)
32 | }
33 | \seealso{
34 | Other plot_fns: 
35 | \code{\link{plot_merges}()}
36 | }
37 | \concept{plot_fns}
38 | 


--------------------------------------------------------------------------------
/man/plot_merges.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot-fns.R
 3 | \name{plot_merges}
 4 | \alias{plot_merges}
 5 | \title{plot_merges}
 6 | \usage{
 7 | plot_merges(x, root_tree = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{Object of class \code{scl} obtained with \code{method = "full"}.}
11 | 
12 | \item{root_tree}{If \code{TRUE}, tree leaves are connected to bottom of plot,
13 | otherwise floating as determined by \link{plot.hclust}.}
14 | }
15 | \value{
16 | Nothing (generates plot)
17 | }
18 | \description{
19 | Plot dendrogram of merges for \code{scl} object with \code{method = "full"}.
20 | }
21 | \seealso{
22 | Other plot_fns: 
23 | \code{\link{plot.scl}()}
24 | }
25 | \concept{plot_fns}
26 | 


--------------------------------------------------------------------------------
/man/scl_full.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scl-full.R
 3 | \name{scl_full}
 4 | \alias{scl_full}
 5 | \title{scl_full}
 6 | \usage{
 7 | scl_full(xy, dmat, ncl, linkage = "single", shortest = TRUE, nnbs = 6L)
 8 | }
 9 | \arguments{
10 | \item{xy}{Rectangular structure (matrix, data.frame, tibble), containing
11 | coordinates of points to be clustered.}
12 | 
13 | \item{dmat}{Square structure (matrix, data.frame, tibble) containing
14 | distances or equivalent metrics between all points in \code{xy}. If \code{xy}
15 | has \code{n} rows, then \code{dat} must have \code{n} rows and \code{n}
16 | columns.}
17 | 
18 | \item{ncl}{Desired number of clusters. See description of `ncl_iterate`
19 | parameter for conditions under which actual number may be less than this
20 | value.}
21 | 
22 | \item{linkage}{Either \code{"single"} or \code{"average"}. For covariance
23 | clustering, use \code{"single"} with `shortest = FALSE`.}
24 | 
25 | \item{shortest}{If \code{TRUE}, the \code{dmat} is interpreted as distances
26 | such that lower values are preferentially selected; if \code{FALSE}, then
27 | higher values of \code{dmat} are interpreted to indicate stronger
28 | relationships, as is the case for example with covariances.}
29 | 
30 | \item{nnbs}{Number of nearest neighbours to be used in calculating clustering
31 | trees. Triangulation will be used if \code{nnbs <= 0}.}
32 | }
33 | \description{
34 | Full spatially-constrained clustering.
35 | }
36 | \examples{
37 | n <- 100
38 | xy <- matrix (runif (2 * n), ncol = 2)
39 | dmat <- matrix (runif (n^2), ncol = n)
40 | scl <- scl_full (xy, dmat, ncl = 4)
41 | }
42 | \seealso{
43 | Other clustering_fns: 
44 | \code{\link{scl_recluster}()},
45 | \code{\link{scl_redcap}()}
46 | }
47 | \concept{clustering_fns}
48 | 


--------------------------------------------------------------------------------
/man/scl_recluster.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scl-redcap.R
 3 | \name{scl_recluster}
 4 | \alias{scl_recluster}
 5 | \title{scl_reccluster}
 6 | \usage{
 7 | scl_recluster(scl, ncl, shortest = TRUE, quiet = FALSE)
 8 | }
 9 | \arguments{
10 | \item{scl}{An \code{scl} object returned from \link{scl_redcap}.}
11 | 
12 | \item{ncl}{Desired number of clusters. See description of `ncl_iterate`
13 | parameter for conditions under which actual number may be less than this
14 | value.}
15 | 
16 | \item{shortest}{If \code{TRUE}, the \code{dmat} is interpreted as distances
17 | such that lower values are preferentially selected; if \code{FALSE}, then
18 | higher values of \code{dmat} are interpreted to indicate stronger
19 | relationships, as is the case for example with covariances.}
20 | 
21 | \item{quiet}{If `FALSE` (default), display progress information on screen.}
22 | }
23 | \value{
24 | Modified \code{scl} object in which \code{tree} is re-cut into
25 | \code{ncl} clusters.
26 | }
27 | \description{
28 | Re-cut a spatial cluster tree (\code{scl}) at a different number of clusters.
29 | }
30 | \examples{
31 | n <- 100
32 | xy <- matrix (runif (2 * n), ncol = 2)
33 | dmat <- matrix (runif (n^2), ncol = n)
34 | scl <- scl_redcap (xy, dmat, ncl = 4)
35 | plot (scl)
36 | scl <- scl_recluster (scl, ncl = 5)
37 | plot (scl)
38 | 
39 | }
40 | \seealso{
41 | Other clustering_fns: 
42 | \code{\link{scl_full}()},
43 | \code{\link{scl_redcap}()}
44 | }
45 | \concept{clustering_fns}
46 | 


--------------------------------------------------------------------------------
/man/scl_redcap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scl-redcap.R
 3 | \name{scl_redcap}
 4 | \alias{scl_redcap}
 5 | \title{scl_redcap}
 6 | \usage{
 7 | scl_redcap(
 8 |   xy,
 9 |   dmat,
10 |   ncl,
11 |   full_order = TRUE,
12 |   linkage = "single",
13 |   shortest = TRUE,
14 |   nnbs = 6L,
15 |   iterate_ncl = FALSE,
16 |   quiet = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{xy}{Rectangular structure (matrix, data.frame, tibble), containing
21 | coordinates of points to be clustered.}
22 | 
23 | \item{dmat}{Square structure (matrix, data.frame, tibble) containing
24 | distances or equivalent metrics between all points in \code{xy}. If \code{xy}
25 | has \code{n} rows, then \code{dat} must have \code{n} rows and \code{n}
26 | columns.}
27 | 
28 | \item{ncl}{Desired number of clusters. See description of `ncl_iterate`
29 | parameter for conditions under which actual number may be less than this
30 | value.}
31 | 
32 | \item{full_order}{If \code{FALSE}, build spanning trees from first-order
33 | relationships only, otherwise build from full-order relationships (see Note).}
34 | 
35 | \item{linkage}{One of \code{"single"}, \code{"average"}, or
36 | \code{"complete"}; see Note.}
37 | 
38 | \item{shortest}{If \code{TRUE}, the \code{dmat} is interpreted as distances
39 | such that lower values are preferentially selected; if \code{FALSE}, then
40 | higher values of \code{dmat} are interpreted to indicate stronger
41 | relationships, as is the case for example with covariances.}
42 | 
43 | \item{nnbs}{Number of nearest neighbours to be used in calculating clustering
44 | trees. Triangulation will be used if \code{nnbs <= 0}.}
45 | 
46 | \item{iterate_ncl}{Actual numbers of clusters found may be less than the
47 | specified value of `ncl`, because clusters formed from < 3 edges are removed.
48 | If `iterate_ncl = FALSE` (the default), the value is returned with whatever
49 | number of actual clusters is found. Setting this parameter to `TRUE` forces
50 | the algorithm to iterate until the exact number of clusters has been found.
51 | For large data sets, this may result in considerable longer calculation
52 | times.}
53 | 
54 | \item{quiet}{If `FALSE` (default), display progress information on screen.}
55 | }
56 | \value{
57 | A object of class \code{scl} with \code{tree} containing the
58 | clustering scheme, and \code{xy} the original coordinate data of the
59 | clustered points. An additional component, \code{tree_rest}, enables the tree
60 | to be re-cut to a different number of clusters via \link{scl_recluster},
61 | rather than calculating clusters anew.
62 | }
63 | \description{
64 | Cluster spatial data with REDCAP (REgionalization with Dynamically
65 | Constrained Agglomerative clustering and Partitioning) routines.
66 | }
67 | \note{
68 | Please refer to the original REDCAP paper ('Regionalization with
69 | dynamically constrained agglomerative clustering and partitioning (REDCAP)',
70 | by D. Guo (2008), Int.J.Geo.Inf.Sci 22:801-823) for details of the
71 | \code{full_order} and \code{linkage} parameters. This paper clearly
72 | demonstrates the general inferiority of spanning trees constructed from
73 | first-order relationships. It is therefore strongly recommended that the
74 | default \code{full_order = TRUE} be used at all times.
75 | }
76 | \examples{
77 | n <- 100
78 | xy <- matrix (runif (2 * n), ncol = 2)
79 | dmat <- matrix (runif (n^2), ncol = n)
80 | scl <- scl_redcap (xy, dmat, ncl = 4)
81 | # Those clusters will by default be constructed by connecting edges with the
82 | # lowest (\code{shortest}) values of \code{dmat}, and will differ from
83 | scl <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE)
84 | # using 'full_order = FALSE' constructs clusters from first-order
85 | # relationships only; not recommended, but possible nevertheless:
86 | scl <- scl_redcap (xy, dmat, ncl = 4, full_order = FALSE)
87 | 
88 | }
89 | \seealso{
90 | Other clustering_fns: 
91 | \code{\link{scl_full}()},
92 | \code{\link{scl_recluster}()}
93 | }
94 | \concept{clustering_fns}
95 | 


--------------------------------------------------------------------------------
/man/spatialcluster.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/spatialcluster-package.R
 3 | \docType{package}
 4 | \name{spatialcluster}
 5 | \alias{spatialcluster-package}
 6 | \alias{spatialcluster}
 7 | \title{spatialcluster.}
 8 | \description{
 9 | R port of redcap (Regionalization with dynamically constrained agglomerative
10 | clustering and partitioning).
11 | }
12 | \seealso{
13 | Useful links:
14 | \itemize{
15 |   \item \url{https://github.com/mpadge/spatialcluster}
16 |   \item Report bugs at \url{https://github.com/mpadge/spatialcluster/issues}
17 | }
18 | 
19 | }
20 | \author{
21 | \strong{Maintainer}: Mark Padgham \email{mark.padgham@email.com}
22 | 
23 | }
24 | \concept{package}
25 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
  1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | #include <RcppArmadillo.h>
  5 | #include <Rcpp.h>
  6 | 
  7 | using namespace Rcpp;
  8 | 
  9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
 10 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
 11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
 12 | #endif
 13 | 
 14 | // rcpp_alk
 15 | Rcpp::IntegerVector rcpp_alk(const Rcpp::DataFrame gr, const bool shortest, const bool quiet);
 16 | RcppExport SEXP _spatialcluster_rcpp_alk(SEXP grSEXP, SEXP shortestSEXP, SEXP quietSEXP) {
 17 | BEGIN_RCPP
 18 |     Rcpp::RObject rcpp_result_gen;
 19 |     Rcpp::RNGScope rcpp_rngScope_gen;
 20 |     Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP);
 21 |     Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP);
 22 |     Rcpp::traits::input_parameter< const bool >::type quiet(quietSEXP);
 23 |     rcpp_result_gen = Rcpp::wrap(rcpp_alk(gr, shortest, quiet));
 24 |     return rcpp_result_gen;
 25 | END_RCPP
 26 | }
 27 | // rcpp_clk
 28 | Rcpp::IntegerVector rcpp_clk(const Rcpp::DataFrame gr_full, const Rcpp::DataFrame gr, const bool shortest, const bool quiet);
 29 | RcppExport SEXP _spatialcluster_rcpp_clk(SEXP gr_fullSEXP, SEXP grSEXP, SEXP shortestSEXP, SEXP quietSEXP) {
 30 | BEGIN_RCPP
 31 |     Rcpp::RObject rcpp_result_gen;
 32 |     Rcpp::RNGScope rcpp_rngScope_gen;
 33 |     Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr_full(gr_fullSEXP);
 34 |     Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP);
 35 |     Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP);
 36 |     Rcpp::traits::input_parameter< const bool >::type quiet(quietSEXP);
 37 |     rcpp_result_gen = Rcpp::wrap(rcpp_clk(gr_full, gr, shortest, quiet));
 38 |     return rcpp_result_gen;
 39 | END_RCPP
 40 | }
 41 | // rcpp_cut_tree
 42 | Rcpp::IntegerVector rcpp_cut_tree(const Rcpp::DataFrame tree, const int ncl, const bool shortest, const bool quiet);
 43 | RcppExport SEXP _spatialcluster_rcpp_cut_tree(SEXP treeSEXP, SEXP nclSEXP, SEXP shortestSEXP, SEXP quietSEXP) {
 44 | BEGIN_RCPP
 45 |     Rcpp::RObject rcpp_result_gen;
 46 |     Rcpp::RNGScope rcpp_rngScope_gen;
 47 |     Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type tree(treeSEXP);
 48 |     Rcpp::traits::input_parameter< const int >::type ncl(nclSEXP);
 49 |     Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP);
 50 |     Rcpp::traits::input_parameter< const bool >::type quiet(quietSEXP);
 51 |     rcpp_result_gen = Rcpp::wrap(rcpp_cut_tree(tree, ncl, shortest, quiet));
 52 |     return rcpp_result_gen;
 53 | END_RCPP
 54 | }
 55 | // rcpp_full_initial
 56 | Rcpp::IntegerVector rcpp_full_initial(const Rcpp::DataFrame gr, bool shortest);
 57 | RcppExport SEXP _spatialcluster_rcpp_full_initial(SEXP grSEXP, SEXP shortestSEXP) {
 58 | BEGIN_RCPP
 59 |     Rcpp::RObject rcpp_result_gen;
 60 |     Rcpp::RNGScope rcpp_rngScope_gen;
 61 |     Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP);
 62 |     Rcpp::traits::input_parameter< bool >::type shortest(shortestSEXP);
 63 |     rcpp_result_gen = Rcpp::wrap(rcpp_full_initial(gr, shortest));
 64 |     return rcpp_result_gen;
 65 | END_RCPP
 66 | }
 67 | // rcpp_full_merge
 68 | Rcpp::NumericMatrix rcpp_full_merge(const Rcpp::DataFrame gr, const std::string linkage, const bool shortest);
 69 | RcppExport SEXP _spatialcluster_rcpp_full_merge(SEXP grSEXP, SEXP linkageSEXP, SEXP shortestSEXP) {
 70 | BEGIN_RCPP
 71 |     Rcpp::RObject rcpp_result_gen;
 72 |     Rcpp::RNGScope rcpp_rngScope_gen;
 73 |     Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP);
 74 |     Rcpp::traits::input_parameter< const std::string >::type linkage(linkageSEXP);
 75 |     Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP);
 76 |     rcpp_result_gen = Rcpp::wrap(rcpp_full_merge(gr, linkage, shortest));
 77 |     return rcpp_result_gen;
 78 | END_RCPP
 79 | }
 80 | // rcpp_mst
 81 | Rcpp::DataFrame rcpp_mst(Rcpp::DataFrame input);
 82 | RcppExport SEXP _spatialcluster_rcpp_mst(SEXP inputSEXP) {
 83 | BEGIN_RCPP
 84 |     Rcpp::RObject rcpp_result_gen;
 85 |     Rcpp::RNGScope rcpp_rngScope_gen;
 86 |     Rcpp::traits::input_parameter< Rcpp::DataFrame >::type input(inputSEXP);
 87 |     rcpp_result_gen = Rcpp::wrap(rcpp_mst(input));
 88 |     return rcpp_result_gen;
 89 | END_RCPP
 90 | }
 91 | // rcpp_slk
 92 | Rcpp::IntegerVector rcpp_slk(const Rcpp::DataFrame gr_full, const Rcpp::DataFrame gr, const bool shortest, const bool quiet);
 93 | RcppExport SEXP _spatialcluster_rcpp_slk(SEXP gr_fullSEXP, SEXP grSEXP, SEXP shortestSEXP, SEXP quietSEXP) {
 94 | BEGIN_RCPP
 95 |     Rcpp::RObject rcpp_result_gen;
 96 |     Rcpp::RNGScope rcpp_rngScope_gen;
 97 |     Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr_full(gr_fullSEXP);
 98 |     Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP);
 99 |     Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP);
100 |     Rcpp::traits::input_parameter< const bool >::type quiet(quietSEXP);
101 |     rcpp_result_gen = Rcpp::wrap(rcpp_slk(gr_full, gr, shortest, quiet));
102 |     return rcpp_result_gen;
103 | END_RCPP
104 | }
105 | 


--------------------------------------------------------------------------------
/src/alk.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "utils.h"
  3 | #include "alk.h"
  4 | 
  5 | // --------- AVERAGE LINKAGE CLUSTER ----------------
  6 | 
  7 | void alk::alk_init (alk::ALKDat &alk_dat,
  8 |         BinarySearchTree &tree,
  9 |         Rcpp::IntegerVector from,
 10 |         Rcpp::IntegerVector to,
 11 |         Rcpp::NumericVector d) {
 12 | 
 13 |     size_t n = utils::sets_init (from, to, alk_dat.vert2index_map,
 14 |             alk_dat.index2vert_map, alk_dat.index2cl_map,
 15 |             alk_dat.cl2index_map);
 16 |     alk_dat.n = n;
 17 | 
 18 |     intset_t vert_set;
 19 | 
 20 |     // Get set of unique vertices, and store binary tree of edge distances
 21 |     for (int i = 0; i < from.size (); i++) {
 22 |         vert_set.emplace (from [i]);
 23 |         vert_set.emplace (to [i]);
 24 |         tree.insert (d [i]);
 25 |     }
 26 |     // Construct vert2index_map to map each unique vertex to an index
 27 |     index_t idx = 0;
 28 |     for (auto v: vert_set) {
 29 |         alk_dat.vert2index_map.emplace (v, idx++);
 30 |     }
 31 | 
 32 |     // Construct idx2edgewt_map and edgewt2idx_pair_map
 33 |     for (int i = 0; i < from.size (); i++) {
 34 |         index_t fi = alk_dat.vert2index_map.at (from [i]),
 35 |                 ti = alk_dat.vert2index_map.at (to [i]);
 36 |         alk_dat.edgewt2idx_pair_map.emplace (d [i], std::make_pair (fi, ti));
 37 | 
 38 |         // idx2edgewt_map.second is an unordered set that needs to be expanded
 39 |         std::unordered_set <double> wtset;
 40 |         if (alk_dat.idx2edgewt_map.find (fi) ==
 41 |                 alk_dat.idx2edgewt_map.end ()) {
 42 |             wtset.emplace (d [i]);
 43 |             alk_dat.idx2edgewt_map.emplace (fi, wtset);
 44 |         } else {
 45 |             wtset = alk_dat.idx2edgewt_map.at (fi);
 46 |             wtset.emplace (d [i]);
 47 |             alk_dat.idx2edgewt_map [fi] = wtset;
 48 |         }
 49 | 
 50 |         // repeat same for "to" vertex
 51 |         if (alk_dat.idx2edgewt_map.find (ti) ==
 52 |                 alk_dat.idx2edgewt_map.end ()) {
 53 |             wtset.emplace (d [i]);
 54 |             alk_dat.idx2edgewt_map.emplace (ti, wtset);
 55 |         } else {
 56 |             wtset = alk_dat.idx2edgewt_map.at (ti);
 57 |             wtset.emplace (d [i]);
 58 |             alk_dat.idx2edgewt_map [ti] = wtset;
 59 |         }
 60 |     }
 61 | 
 62 |     arma::uword nu = static_cast <arma::uword> (n);
 63 |     alk_dat.contig_mat = arma::zeros <arma::Mat <int> > (nu, nu);
 64 |     alk_dat.num_edges = arma::ones <arma::Mat <int> > (nu, nu);
 65 |     alk_dat.avg_dist.set_size (nu, nu);
 66 |     //alk_dat.avg_dist.fill (INFINITE_DOUBLE);
 67 |     alk_dat.avg_dist.fill (0.0);
 68 |     alk_dat.dmat.set_size (nu, nu);
 69 |     if (alk_dat.shortest) {
 70 |         alk_dat.dmat.fill (INFINITE_DOUBLE);
 71 |     } else {
 72 |         alk_dat.dmat.fill (-INFINITE_DOUBLE);
 73 |     }
 74 |     for (int i = 0; i < from.length (); i++) {
 75 |         arma::uword vf = static_cast <arma::uword> (
 76 |                                     alk_dat.vert2index_map.at (from [i])),
 77 |                     vt = static_cast <arma::uword> (
 78 |                                     alk_dat.vert2index_map.at (to [i]));
 79 |         alk_dat.contig_mat (vf, vt) = 1;
 80 |         alk_dat.num_edges (vf, vt) = 1;
 81 |         //alk_dat.avg_dist (vf, vt) = 0.0;
 82 |         alk_dat.avg_dist (vf, vt) = d [i];
 83 |         alk_dat.dmat (vf, vt) = d [i];
 84 |     }
 85 | }
 86 | 
 87 | // update both idx2edgewt and edgewt2idx maps to reflect merging of cluster m
 88 | // into cluster l (using Guo's original notation there). The cl2index
 89 | // and index2cl maps are updated in `merge_clusters`
 90 | void alk::update_edgewt_maps (alk::ALKDat &alk_dat, index_t m, index_t l) {
 91 |     std::unordered_set <double> wtsl = alk_dat.idx2edgewt_map.at (l),
 92 |         wtsm = alk_dat.idx2edgewt_map.at (m);
 93 |     for (auto w: wtsm) {
 94 |         wtsl.insert (w);
 95 |     }
 96 |     alk_dat.idx2edgewt_map.erase (m);
 97 |     alk_dat.idx2edgewt_map.erase (l);
 98 |     alk_dat.idx2edgewt_map.emplace (l, wtsl);
 99 | 
100 |     for (auto w: wtsl) {
101 |         std::pair <index_t, index_t> pr = alk_dat.edgewt2idx_pair_map.at (w);
102 |         bool update = true;
103 |         if (pr.first == m) {
104 |             pr.first = l;
105 |         } else if (pr.second == m) {
106 |             pr.second = l;
107 |         } else {
108 |             update = false;
109 |         }
110 | 
111 |         if (update) {
112 |             alk_dat.edgewt2idx_pair_map [w] = pr;
113 |         }
114 |     }
115 | 
116 |     // Any edgewt2idx pairs with entries of m also have to be re-mapped to l
117 |     // TODO: Is there a better way to do this?
118 |     std::unordered_set <double> wts;
119 |     for (auto w: alk_dat.edgewt2idx_pair_map) {
120 |         if (w.second.first == m || w.second.second == m) {
121 |             wts.emplace (w.first);
122 |         }
123 |     }
124 |     if (wts.size () > 0) {
125 |         for (auto w: wts) {
126 |             std::pair <index_t, index_t> pr = alk_dat.edgewt2idx_pair_map.at (w);
127 |             if (pr.first == m) {
128 |                 pr.first = l;
129 |             }
130 |             if (pr.second == m) {
131 |                 pr.second = l;
132 |             }
133 |             alk_dat.edgewt2idx_pair_map [w] = pr;
134 |         }
135 |     }
136 | }
137 | 
138 | size_t alk::alk_step (alk::ALKDat &alk_dat,
139 |         BinarySearchTree &tree,
140 |         Rcpp::IntegerVector from,
141 |         Rcpp::IntegerVector to,
142 |         Rcpp::NumericVector d) {
143 |     // Step through to find the minimal-distance edge that (i) connects
144 |     // different clusters, (ii) represents contiguous clusters, and (iii) has
145 |     // distance greater than the average dist between those 2 clusters.
146 |     // T used to step through successive min values:
147 |     double edge_dist = tree.treeMin();
148 |     tree_node * node = tree.getRoot ();
149 |     node = tree.getNode (node, edge_dist);
150 |     std::pair <index_t, index_t> pr =
151 |         alk_dat.edgewt2idx_pair_map.at (edge_dist);
152 |     index_t l = pr.first, m = pr.second;
153 |     arma::uword lu = static_cast <arma::uword> (l),
154 |                 mu = static_cast <arma::uword> (m);
155 |     while (l == m || alk_dat.contig_mat (lu, mu) == 0 ||
156 |             edge_dist < alk_dat.avg_dist (lu, mu)) {
157 |         node = tree.nextHi (node);
158 |         if (node == nullptr) {
159 |             Rcpp::stop ("can not go past highest node");
160 |         }
161 |         edge_dist = node->data;
162 |         pr = alk_dat.edgewt2idx_pair_map.at (edge_dist);
163 |         l = pr.first;
164 |         m = pr.second;
165 |         lu = static_cast <arma::uword> (l);
166 |         mu = static_cast <arma::uword> (m);
167 |     }
168 |     int li = static_cast <int> (l), mi = static_cast <int> (m);
169 |     
170 |     size_t ishort = utils::find_shortest_connection (from, to,
171 |             alk_dat.vert2index_map, alk_dat.dmat,
172 |             alk_dat.cl2index_map, mi, li, alk_dat.shortest);
173 |     // ishort is return value; an index into (from, to)
174 |     utils::merge_clusters (alk_dat.contig_mat,
175 |             alk_dat.index2cl_map,
176 |             alk_dat.cl2index_map, mi, li);
177 |     update_edgewt_maps (alk_dat, m, l);
178 | 
179 |     /* Cluster numbers start off here the same as vertex numbers, and so are
180 |      * initially simple indices into the vert-by-vert matrices (contig_mat,
181 |      * dmat, avg_dist, num_edges). As cluster form, numbers merge to one of the
182 |      * pre-existing ones, so are still indexed into these same matrices which do
183 |      * not change size. Cluster merging simply means that previous rows and
184 |      * columns of these matrices will no longer be indexed, and all new indices
185 |      * are derived from constantly updated values of index2cl_map and
186 |      * cl2index_map.
187 |      */
188 | 
189 |     for (auto cl: alk_dat.cl2index_map) {
190 |         if (cl.first != static_cast <int> (l) ||
191 |                 cl.first != static_cast <int> (m)) {
192 |             arma::uword clu = static_cast <arma::uword> (cl.first);
193 |             const double tempd_l = alk_dat.avg_dist (clu, lu),
194 |                          tempd_m = alk_dat.avg_dist (clu, mu);
195 |             const int nedges_l = alk_dat.num_edges (clu, lu),
196 |                       nedges_m = alk_dat.num_edges (clu, mu);
197 | 
198 |             alk_dat.avg_dist (clu, lu) =
199 |                 (tempd_l * nedges_l + tempd_m * nedges_m) /
200 |                 static_cast <double> (nedges_l + nedges_m);
201 |             alk_dat.num_edges (clu, lu) = nedges_l + nedges_m;
202 | 
203 |             if (alk_dat.contig_mat (clu, lu) == 1 ||
204 |                     alk_dat.contig_mat (clu, mu) == 1) {
205 |                 alk_dat.contig_mat (clu, lu) = 1;
206 |                 if (tempd_l > 0.0) {
207 |                     tree.remove (tempd_l);
208 |                 }
209 |                 if (tempd_m > 0.0) {
210 |                     tree.remove (tempd_m);
211 |                 }
212 |                 
213 |                 double tempd = alk_dat.avg_dist (clu, lu);
214 |                 if (tempd > 0.0) {
215 |                     tree.insert (tempd);
216 |                     alk_dat.edgewt2idx_pair_map [tempd] =
217 |                         std::make_pair (cl.first, l);
218 | 
219 |                     std::unordered_set <double> wtset;
220 |                     index_t cli_idx = static_cast <index_t> (cl.first);
221 |                     if (alk_dat.idx2edgewt_map.find (cli_idx) ==
222 |                             alk_dat.idx2edgewt_map.end ()) {
223 |                         wtset.clear ();
224 |                     } else {
225 |                         wtset = alk_dat.idx2edgewt_map.at (cli_idx);
226 |                     }
227 |                     wtset.emplace (tempd);
228 |                     alk_dat.idx2edgewt_map.erase (cli_idx);
229 |                     alk_dat.idx2edgewt_map.emplace (cli_idx, wtset);
230 |                 }
231 |             } // end if C(c, l) = 1 or C(c, m) = 1 in Guo's terminology
232 |         } // end if cl.first != (cfrom, cto)
233 |     } // end for over cl
234 | 
235 |     return ishort;
236 | }
237 | 
238 | 
239 | //' rcpp_alk
240 | //'
241 | //' Full-order average linkage cluster redcap algorithm
242 | //'
243 | //' @noRd
244 | // [[Rcpp::export]]
245 | Rcpp::IntegerVector rcpp_alk (
246 |         const Rcpp::DataFrame gr,
247 |         const bool shortest,
248 |         const bool quiet)
249 | {
250 |     Rcpp::IntegerVector from_ref = gr ["from"];
251 |     Rcpp::IntegerVector to_ref = gr ["to"];
252 |     Rcpp::NumericVector d = gr ["d"];
253 |     // Rcpp classes are always passed by reference, so cloning is necessary to
254 |     // avoid modifying the original data.frames.
255 |     Rcpp::IntegerVector from = Rcpp::clone (from_ref);
256 |     Rcpp::IntegerVector to = Rcpp::clone (to_ref);
257 |     // Index vectors are 1-indexed, so
258 |     from = from - 1;
259 |     to = to - 1;
260 | 
261 |     alk::ALKDat alk_dat;
262 |     alk_dat.shortest = shortest;
263 |     BinarySearchTree tree;
264 |     alk::alk_init (alk_dat, tree, from, to, d);
265 |     const size_t n = alk_dat.n;
266 |     const bool really_quiet = !(!quiet && n > 100);
267 | 
268 |     std::unordered_set <size_t> the_tree;
269 |     while (the_tree.size () < (n - 1)) { // tree has n - 1 edges
270 |         Rcpp::checkUserInterrupt ();
271 | 
272 |         size_t ishort = alk::alk_step (alk_dat, tree, from, to, d);
273 |         the_tree.insert (ishort);
274 | 
275 |         if (!really_quiet && the_tree.size () % 100 == 0) {
276 |             Rcpp::Rcout << "\rBuilding tree: " << the_tree.size () << " / " <<
277 |                 n - 1;
278 |             Rcpp::Rcout.flush ();
279 |         }
280 |     }
281 |     if (!really_quiet) {
282 |         Rcpp::Rcout << "\rBuilding tree: " << the_tree.size () << " / " <<
283 |             n - 1 << " -> done" << std::endl;
284 |     }
285 | 
286 |     std::vector <int> treevec (the_tree.begin (), the_tree.end ());
287 | 
288 |     return Rcpp::wrap (treevec);
289 | }
290 | 


--------------------------------------------------------------------------------
/src/alk.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // --------- AVERAGE LINKAGE CLUSTER ----------------
 4 | 
 5 | #include "bst.h"
 6 | 
 7 | /* The main matrices (contig, num_edges, dmat, avg_dist) are all referenced by
 8 |  * direct indices throughout, not by vertex numbers. The latter are mapped to
 9 |  * the former by vert2index_map. Note that index2vert_map is not used for this
10 |  * routine, but exists as dummy to pass to `sets_init`
11 |  *
12 |  * The index2cl and cl2index then associate those indices with clusters which
13 |  * are themselves also direct indices into the matrices. Cluster merging simply
14 |  * re-directs multiple indices onto the same cluster (index) numbers.
15 |  *
16 |  * The binary tree only returns minimal distances which need to be associated
17 |  * with particular pairs of clusters. This is done with the final map,
18 |  * edgewt2idx_pair, where the pair of indices is into clusters, requiring this
19 |  * map to be constantly updated. This updating requires in turn a reverse map,
20 |  * idx2edgewt, so that the weight associated with any pre-merge cluster can
21 |  * be obtained, and the edgewt2idx clusters for that weight updated.
22 |  */
23 | 
24 | namespace alk {
25 | 
26 | struct ALKDat {
27 |     bool shortest;
28 |     size_t n;
29 | 
30 |     std::unordered_map <double,
31 |         std::pair <index_t, index_t> > edgewt2idx_pair_map;
32 |     std::unordered_map <index_t, std::unordered_set <double> >
33 |         idx2edgewt_map; // all wts associated with that cluster
34 | 
35 |     arma::Mat <int> contig_mat, num_edges;
36 |     arma::Mat <double> dmat, avg_dist;
37 | 
38 |     int2indxset_map_t cl2index_map;
39 |     indx2int_map_t index2cl_map, index2vert_map;
40 |     int2indx_map_t vert2index_map;
41 | };
42 | 
43 | void alk_init (ALKDat &alk_dat,
44 |         BinarySearchTree &tree,
45 |         Rcpp::IntegerVector from,
46 |         Rcpp::IntegerVector to,
47 |         Rcpp::NumericVector d);
48 | 
49 | void update_edgewt_maps (ALKDat &alk_dat, index_t l, index_t m);
50 | 
51 | size_t alk_step (ALKDat &alk_dat,
52 |         BinarySearchTree &tree,
53 |         Rcpp::IntegerVector from,
54 |         Rcpp::IntegerVector to,
55 |         Rcpp::NumericVector d);
56 | 
57 | } // end namespace alk
58 | 
59 | Rcpp::IntegerVector rcpp_alk (
60 |         const Rcpp::DataFrame gr,
61 |         const bool shortest,
62 |         const bool quiet);
63 | 


--------------------------------------------------------------------------------
/src/bst.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <iostream>
  4 | #include <cstdlib>
  5 | 
  6 | // Templating with recursive pointers is much harder than simply changing the
  7 | // typedef, and makes the code much less readable
  8 | typedef double data_type;
  9 | 
 10 | struct tree_node
 11 | {
 12 |     tree_node * lo;
 13 |     tree_node * hi;
 14 |     tree_node * parent;
 15 |     data_type data;
 16 | };
 17 | 
 18 | class BinarySearchTree
 19 | {
 20 |     private:
 21 |         tree_node * root;
 22 |         data_type tmin (tree_node * node);
 23 |         void clear_node (tree_node * node);
 24 |         tree_node * removeNode (tree_node * node, data_type value);
 25 | 
 26 |     public:
 27 |         BinarySearchTree ()
 28 |         {
 29 |             root = nullptr;
 30 |         }
 31 |         ~BinarySearchTree ()
 32 |         {
 33 |             treeClear ();
 34 |         }
 35 |         void insert (data_type);
 36 |         void remove (data_type value);
 37 |         data_type treeMin ();
 38 | 
 39 |         tree_node * getRoot ();
 40 |         tree_node * getNode (tree_node * node, data_type value);
 41 |         tree_node * treeMinTree ();
 42 |         tree_node * tminTree (tree_node * node);
 43 | 
 44 |         tree_node * nextHi (tree_node * node);
 45 | 
 46 |         void treeClear ();
 47 | };
 48 | 
 49 | void BinarySearchTree::insert (data_type d)
 50 | {
 51 |     tree_node * t = new tree_node;
 52 |     tree_node * parent;
 53 |     t->data = d;
 54 |     t->lo = nullptr;
 55 |     t->hi = nullptr;
 56 |     parent = nullptr;
 57 | 
 58 |     if (root == nullptr)
 59 |         root = t;
 60 |     else
 61 |     {
 62 |         tree_node * node;
 63 |         node = root;
 64 |         while (node != nullptr)
 65 |         {
 66 |             parent = node;
 67 |             if (t->data > node->data)
 68 |                 node = node->hi;
 69 |             else
 70 |                 node = node->lo;
 71 |         }
 72 | 
 73 |         t->parent = parent;
 74 |         if (t->data < parent->data)
 75 |             parent->lo = t;
 76 |         else
 77 |             parent->hi = t;
 78 |     }
 79 | }
 80 | 
 81 | void BinarySearchTree::remove (data_type value)
 82 | {
 83 |     root = removeNode (root, value);
 84 | }
 85 | 
 86 | // recursive private member function:
 87 | tree_node * BinarySearchTree::removeNode (tree_node * node, data_type value)
 88 | {
 89 |     if (node == nullptr)
 90 |         return node;
 91 | 
 92 |     if (value < node->data) {
 93 |         node->lo = removeNode (node->lo, value);
 94 |     } else if (value > node->data) {
 95 |         node->hi = removeNode (node->hi, value);
 96 |     } else {
 97 |         if (node->lo == nullptr && node->hi == nullptr) { // no children
 98 |             delete node;
 99 |             node = nullptr;
100 |         }
101 |         else if (node->lo == nullptr) { // 1 child: hi
102 |             tree_node * temp = node;
103 |             node->hi->parent = node->parent;
104 |             node = node->hi;
105 |             delete temp;
106 |         }
107 |         else if (node->hi == nullptr) { // 1 childe: lo
108 |             tree_node * temp = node;
109 |             node->lo->parent = node->parent;
110 |             node = node->lo;
111 |             delete temp;
112 |         }
113 |         else // 2 children
114 |         {
115 |             tree_node * temp = tminTree (node->hi);
116 |             node->data = temp->data;
117 |             node->hi = removeNode (node->hi, temp->data);
118 |         }
119 |     }
120 |     return node; // then the root node which needs to be updated
121 | }
122 | 
123 | data_type BinarySearchTree::treeMin ()
124 | {
125 |     return tmin (root);
126 | }
127 | 
128 | data_type BinarySearchTree::tmin (tree_node * node)
129 | {
130 |     while (node->lo != nullptr)
131 |         node = node->lo;
132 | 
133 |     return node->data;
134 | }
135 | 
136 | tree_node * BinarySearchTree::treeMinTree ()
137 | {
138 |     return tminTree (root);
139 | }
140 | 
141 | tree_node * BinarySearchTree::tminTree (tree_node * node)
142 | {
143 |     while (node->lo != nullptr)
144 |         node = node->lo;
145 | 
146 |     return node;
147 | }
148 | 
149 | void BinarySearchTree::treeClear ()
150 | {
151 |     clear_node (root);
152 | }
153 | 
154 | void BinarySearchTree::clear_node (tree_node * node)
155 | {
156 |     if (node != nullptr)
157 |     {
158 |         clear_node (node->lo);
159 |         clear_node (node->hi);
160 |         delete node;
161 |     }
162 | }
163 | 
164 | tree_node * BinarySearchTree::getRoot ()
165 | {
166 |     tree_node * node = root;
167 |     return node;
168 | }
169 | 
170 | tree_node * BinarySearchTree::getNode (tree_node * node, data_type value)
171 | {
172 |     if (node == nullptr)
173 |     {
174 |         //std::cout << "value = " << value <<
175 |         //    " does not exist in the tree" << std::endl;
176 |         return node;
177 |     }
178 | 
179 | 	if (value < node->data) 
180 | 		return getNode (node->lo, value);
181 | 	else if (value > node->data)
182 | 		return getNode (node->hi, value);
183 | 	else
184 | 		return node;
185 | }
186 | 
187 | tree_node * BinarySearchTree::nextHi (tree_node * node)
188 | {
189 |     if (node->hi != nullptr)
190 |         return tminTree (node->hi);
191 | 
192 |     tree_node * y = node->parent;
193 |     while (y != nullptr && node == y->hi)
194 |     {
195 |         node = y;
196 |         y = y->parent;
197 |     }
198 |     //if (y == nullptr)
199 |     //    std::cout << "already at max value" << std::endl;
200 |     return y;
201 | }
202 | 


--------------------------------------------------------------------------------
/src/clk.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "clk.h"
  3 | 
  4 | // --------- COMPLETE LINKAGE CLUSTER ----------------
  5 | 
  6 | void clk::clk_init (clk::CLKDat &clk_dat,
  7 |         Rcpp::IntegerVector from_full,
  8 |         Rcpp::IntegerVector to_full,
  9 |         Rcpp::NumericVector d_full,
 10 |         Rcpp::IntegerVector from,
 11 |         Rcpp::IntegerVector to,
 12 |         Rcpp::NumericVector d) {
 13 |     size_t n = utils::sets_init (from, to, clk_dat.vert2index_map,
 14 |             clk_dat.index2vert_map, clk_dat.index2cl_map,
 15 |             clk_dat.cl2index_map);
 16 |     clk_dat.n = n;
 17 | 
 18 |     clk_dat.edges_all.clear ();
 19 |     clk_dat.edges_all.resize (static_cast <size_t> (from_full.size ()));
 20 |     for (int i = 0; i < from_full.size (); i++) {
 21 |         utils::OneEdge here;
 22 |         here.from = from_full [i];
 23 |         here.to = to_full [i];
 24 |         here.dist = d_full [i];
 25 |         clk_dat.edges_all [static_cast <size_t> (i)] = here;
 26 |     }
 27 |     // These edges are already passed in sorted form, so no need to explicitly
 28 |     // sort here.
 29 | 
 30 |     clk_dat.edges_nn.clear ();
 31 |     clk_dat.edges_nn.resize (static_cast <size_t> (from.size ()));
 32 |     for (int i = 0; i < from.size (); i++) {
 33 |         utils::OneEdge here;
 34 |         here.from = from [i];
 35 |         here.to = to [i];
 36 |         here.dist = d [i];
 37 |         clk_dat.edges_nn [static_cast <size_t> (i)] = here;
 38 |     }
 39 | 
 40 |     // Get set of unique vertices, and store binary tree of edge distances
 41 |     intset_t vert_set;
 42 |     for (int i = 0; i < from.size (); i++) {
 43 |         vert_set.emplace (from [i]);
 44 |         vert_set.emplace (to [i]);
 45 |     }
 46 |     // Construct vert2index_map to map each unique vertex to an index
 47 |     index_t idx = 0;
 48 |     for (auto v: vert_set) {
 49 |         clk_dat.vert2index_map.emplace (v, idx++);
 50 |     }
 51 | 
 52 |     arma::uword nu = static_cast <arma::uword> (n);
 53 |     clk_dat.contig_mat = arma::zeros <arma::Mat <int> > (nu, nu);
 54 |     clk_dat.dmat.zeros (nu, nu);
 55 |     for (int i = 0; i < from.length (); i++) {
 56 |         arma::uword vf = static_cast <arma::uword> (
 57 |                                         clk_dat.vert2index_map.at (from [i])),
 58 |                     vt = static_cast <arma::uword> (
 59 |                                         clk_dat.vert2index_map.at (to [i]));
 60 |         clk_dat.contig_mat (vf, vt) = 1;
 61 |         //clk_dat.dmat (vf, vt) = d [i]; // NOPE - all dmat = 0 at start!
 62 |     }
 63 | }
 64 | 
 65 | //' clk_step
 66 | //'
 67 | //' @param ei The i'th edge of the full sorted list of edge weights
 68 | //' @noRd
 69 | size_t clk::clk_step (clk::CLKDat &clk_dat, size_t i) {
 70 |     // find shortest _all edges that connects the two clusters
 71 |     utils::OneEdge ei = clk_dat.edges_all [i];
 72 |     const size_t u = clk_dat.vert2index_map.at (ei.from),
 73 |                  v = clk_dat.vert2index_map.at (ei.to);
 74 |     const int cl_u = clk_dat.index2cl_map.at (u),
 75 |               cl_v = clk_dat.index2cl_map.at (v);
 76 | 
 77 |     // Find shortest edge (or longest for covariance) in MST that connects 
 78 |     // u and v:
 79 |     size_t mmin = INFINITE_INT, lmin = INFINITE_INT, the_edge = INFINITE_INT;
 80 |     double dlim = INFINITE_DOUBLE;
 81 |     if (!clk_dat.shortest) {
 82 |         dlim = -dlim;
 83 |     }
 84 | 
 85 |     for (size_t j = 0; j < clk_dat.edges_nn.size (); j++) {
 86 |         utils::OneEdge ej = clk_dat.edges_nn [j];
 87 |         size_t m = clk_dat.vert2index_map.at (ej.from),
 88 |                l = clk_dat.vert2index_map.at (ej.to);
 89 |         if (((clk_dat.index2cl_map.at (m) == cl_u &&
 90 |                         clk_dat.index2cl_map.at (l) == cl_v) ||
 91 |                     (clk_dat.index2cl_map.at (m) == cl_v &&
 92 |                      clk_dat.index2cl_map.at (l) == cl_u)) &&
 93 |                 ((clk_dat.shortest && ej.dist < dlim) ||
 94 |                  (!clk_dat.shortest && ej.dist > dlim))) {
 95 |             the_edge = j;
 96 |             mmin = m;
 97 |             lmin = l;
 98 |             dlim = ej.dist;
 99 |         }
100 |     }
101 |     if (fabs (dlim) == INFINITE_DOUBLE) {
102 |         Rcpp::stop ("minimal distance not able to be found");
103 |     }
104 | 
105 |     const int merge_to_id = clk_dat.index2cl_map.at (lmin);
106 |     utils::merge_clusters (clk_dat.contig_mat,
107 |             clk_dat.index2cl_map,
108 |             clk_dat.cl2index_map,
109 |             clk_dat.index2cl_map.at (mmin),
110 |             merge_to_id);
111 | 
112 |     for (auto cl: clk_dat.cl2index_map) {
113 |         if (cl.first != static_cast <int> (lmin) ||
114 |                 cl.first != static_cast <int> (mmin)) {
115 |             arma::uword clu = static_cast <arma::uword> (cl.first),
116 |                         lu = static_cast <arma::uword> (lmin),
117 |                         mu = static_cast <arma::uword> (mmin);
118 |             const double dl = clk_dat.dmat (clu, lu),
119 |                   dm = clk_dat.dmat (clu, mu);
120 |             double dtemp = dl;
121 |             if ((clk_dat.shortest && dm < dl) ||
122 |                     (!clk_dat.shortest && dm > dl)) {
123 |                 dtemp = dm;
124 |             }
125 |             clk_dat.dmat (clu, lu) = dtemp;
126 | 
127 |             if (clk_dat.contig_mat (clu, lu) == 1 ||
128 |                     clk_dat.contig_mat (clu, mu) == 1) {
129 |                 clk_dat.contig_mat (clu, lu) = 1;
130 |             }
131 |         }
132 |     } // end for over cl
133 | 
134 |     return the_edge;
135 | }
136 | 
137 | //' rcpp_clk
138 | //'
139 | //' Full-order complete linkage cluster redcap algorithm
140 | //'
141 | //' @noRd
142 | // [[Rcpp::export]]
143 | Rcpp::IntegerVector rcpp_clk (
144 |         const Rcpp::DataFrame gr_full,
145 |         const Rcpp::DataFrame gr,
146 |         const bool shortest,
147 |         const bool quiet)
148 | {
149 |     Rcpp::IntegerVector from_full_ref = gr_full ["from"];
150 |     Rcpp::IntegerVector to_full_ref = gr_full ["to"];
151 |     Rcpp::NumericVector d_full_ref = gr_full ["d"];
152 |     Rcpp::IntegerVector from_ref = gr ["from"];
153 |     Rcpp::IntegerVector to_ref = gr ["to"];
154 |     Rcpp::NumericVector d_ref = gr ["d"];
155 | 
156 |     // Rcpp classes are always passed by reference, so cloning is necessary to
157 |     // avoid modifying the original data.frames.
158 |     Rcpp::IntegerVector from_full = Rcpp::clone (from_full_ref);
159 |     Rcpp::IntegerVector to_full = Rcpp::clone (to_full_ref);
160 |     Rcpp::NumericVector d_full = Rcpp::clone (d_full_ref);
161 |     Rcpp::IntegerVector from = Rcpp::clone (from_ref);
162 |     Rcpp::IntegerVector to = Rcpp::clone (to_ref);
163 |     Rcpp::NumericVector d = Rcpp::clone (d_ref);
164 | 
165 |     // Index vectors are 1-indexed, so
166 |     from_full = from_full - 1;
167 |     to_full = to_full - 1;
168 |     from = from - 1;
169 |     to = to - 1;
170 | 
171 |     clk::CLKDat clk_dat;
172 |     clk_dat.shortest = shortest;
173 |     clk::clk_init (clk_dat, from_full, to_full, d_full, from, to, d);
174 |     if (clk_dat.shortest) {
175 |         clk_dat.dmat.fill (INFINITE_DOUBLE);
176 |     } else {
177 |         clk_dat.dmat.fill (-INFINITE_DOUBLE);
178 |     }
179 | 
180 |     const size_t n = clk_dat.edges_all.size ();
181 |     const bool really_quiet = !(!quiet && n > (100 * 100));
182 | 
183 |     std::vector <size_t> treevec;
184 |     for (size_t i = 0; i < n; i++) {
185 |         Rcpp::checkUserInterrupt ();
186 | 
187 |         utils::OneEdge ei = clk_dat.edges_all [i];
188 |         arma::uword u = static_cast <arma::uword> (
189 |                                     clk_dat.vert2index_map.at (ei.from)),
190 |                     v = static_cast <arma::uword> (
191 |                                     clk_dat.vert2index_map.at (ei.to));
192 | 
193 |         if (clk_dat.index2cl_map.at (u) != clk_dat.index2cl_map.at (v) &&
194 |                 clk_dat.contig_mat (u, v) == 1 &&
195 |                 ((clk_dat.shortest && ei.dist < clk_dat.dmat (u, v)) ||
196 |                  (!clk_dat.shortest && ei.dist > clk_dat.dmat (u, v)))) {
197 |             size_t the_edge = clk_step (clk_dat, i);
198 |             treevec.push_back (the_edge);
199 |         }
200 |         if (!really_quiet && i % 100 == 0) {
201 |             Rcpp::Rcout << "\rBuilding tree: " << i << " / " << n;
202 |             Rcpp::Rcout.flush ();
203 |         }
204 |     }
205 | 
206 |     if (!really_quiet) {
207 |         Rcpp::Rcout << "\rBuilding tree: " << n << " / " << n <<
208 |             " -> done" << std::endl;
209 |     }
210 | 
211 |     // treevec here in an index into a **sorted** version of (from, to , d)
212 |     return Rcpp::wrap (treevec);
213 | }
214 | 


--------------------------------------------------------------------------------
/src/clk.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "utils.h"
 4 | 
 5 | // --------- COMPLETE LINKAGE CLUSTER ----------------
 6 | 
 7 | namespace clk {
 8 | 
 9 | struct CLKDat {
10 |     bool shortest;
11 |     size_t n;
12 | 
13 |     std::vector <utils::OneEdge> edges_all, edges_nn;
14 | 
15 |     arma::Mat <int> contig_mat;
16 |     arma::Mat <double> dmat;
17 | 
18 |     int2indxset_map_t cl2index_map;
19 | 
20 |     indx2int_map_t index2cl_map, index2vert_map;
21 |     int2indx_map_t vert2index_map;
22 | };
23 | 
24 | void clk_init (CLKDat &clk_dat,
25 |         Rcpp::IntegerVector from_full,
26 |         Rcpp::IntegerVector to_full,
27 |         Rcpp::NumericVector d_full,
28 |         Rcpp::IntegerVector from,
29 |         Rcpp::IntegerVector to,
30 |         Rcpp::NumericVector d);
31 | 
32 | size_t clk_step (CLKDat &clk_dat, size_t i);
33 | 
34 | } // end namespace clk
35 | 
36 | Rcpp::IntegerVector rcpp_clk (
37 |         const Rcpp::DataFrame gr_full,
38 |         const Rcpp::DataFrame gr,
39 |         const bool shortest,
40 |         const bool quiet);
41 | 


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <limits>
 4 | #include <map>
 5 | #include <algorithm> // std::find
 6 | #include <vector>
 7 | #include <limits>
 8 | #include <random>
 9 | #include <string> // stoi
10 | #include <cmath> // round
11 | #include <unordered_set>
12 | 
13 | #include <RcppArmadillo.h>
14 | // [[Rcpp::depends(RcppArmadillo)]]
15 | 
16 | constexpr float INFINITE_FLOAT =  std::numeric_limits<float>::max ();
17 | constexpr double INFINITE_DOUBLE =  std::numeric_limits<double>::max ();
18 | constexpr int INFINITE_INT =  std::numeric_limits<int>::max ();
19 | 
20 | typedef size_t index_t;
21 | 
22 | typedef std::unordered_map <int, int> int2int_map_t;
23 | typedef std::unordered_map <index_t, int> indx2int_map_t;
24 | typedef std::unordered_map <int, index_t> int2indx_map_t;
25 | 
26 | typedef std::unordered_set <int> intset_t;
27 | typedef std::unordered_map <int, intset_t> int2intset_map_t;
28 | 
29 | typedef std::unordered_set <size_t> indxset_t;
30 | typedef std::unordered_map <int, indxset_t> int2indxset_map_t;
31 | 


--------------------------------------------------------------------------------
/src/cuttree.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "cuttree.h"
  3 | 
  4 | void cuttree::fill_edges (cuttree::TreeDat &tree,
  5 |         const std::vector <int> &from,
  6 |         const std::vector <int> &to,
  7 |         Rcpp::NumericVector &d) {
  8 |     std::unordered_map <int, int> vert2index_map;
  9 |     intset_t vert_set;
 10 |     for (size_t i = 0; i < from.size (); i++) {
 11 |         vert_set.emplace (from [i]);
 12 |         vert_set.emplace (to [i]);
 13 |     }
 14 | 
 15 |     int vert_num = 0;
 16 |     for (auto v: vert_set) {
 17 |         vert2index_map.emplace (v, vert_num++);
 18 |     }
 19 | 
 20 |     for (size_t i = 0; i < tree.edges.size (); i++) {
 21 |         cuttree::EdgeComponent this_edge;
 22 |         this_edge.d = d [static_cast <int> (i)];
 23 |         this_edge.cluster_num = 0;
 24 |         this_edge.from = vert2index_map.at (from [i]);
 25 |         this_edge.to = vert2index_map.at (to [i]);
 26 |         tree.edges [i] = this_edge;
 27 |     }
 28 | }
 29 | 
 30 | // Internal sum of squared deviations of specified cluster number (this is just
 31 | // the variance without the scaling by N)
 32 | double cuttree::calc_ss (const std::vector <cuttree::EdgeComponent> &edges,
 33 |         const int cluster_num) {
 34 |     double s2 = 0.0, s = 0.0;
 35 |     double count = 0.0;
 36 |     for (auto i: edges) {
 37 |         if (i.cluster_num == cluster_num) {
 38 |             s += i.d;
 39 |             s2 += i.d * i.d;
 40 |             count += 1.0;
 41 |         }
 42 |     }
 43 |     //return (s2 - s * s / count) / (count - 1.0); // variance
 44 |     return (s2 - s * s / count);
 45 | }
 46 | 
 47 | // Internal mean covariance of specified cluster number 
 48 | double cuttree::calc_covsum (const std::vector <cuttree::EdgeComponent> &edges,
 49 |         const int cluster_num) {
 50 |     double s = 0.0;
 51 |     double count = 0.0;
 52 |     for (auto i: edges) {
 53 |         if (i.cluster_num == cluster_num) {
 54 |             s += i.d;
 55 |             count += 1.0;
 56 |         }
 57 |     }
 58 |     return s / count;
 59 | }
 60 | 
 61 | size_t cuttree::cluster_size (const std::vector <cuttree::EdgeComponent> &edges,
 62 |         const int cluster_num) {
 63 |     size_t n = 0;
 64 |     for (auto i: edges) {
 65 |         if (i.cluster_num == cluster_num) {
 66 |             n++;
 67 |         }
 68 |     }
 69 |     return n;
 70 | }
 71 | 
 72 | // Build connected component of tree starting from first edge.
 73 | std::unordered_set <int> cuttree::build_one_tree (
 74 |         std::vector <cuttree::EdgeComponent> &edges) {
 75 |     std::unordered_set <int> tree;
 76 | 
 77 |     tree.emplace (edges [0].from);
 78 |     tree.emplace (edges [0].to);
 79 | 
 80 |     bool done = false;
 81 |     while (!done) {
 82 |         bool added = false;
 83 |         for (size_t j = 1; j < edges.size (); j++) {
 84 |             if (tree.find (edges [j].from) != tree.end () &&
 85 |                     tree.find (edges [j].to) == tree.end ()) {
 86 |                 tree.emplace (edges [j].to);
 87 |                 added = true;
 88 |             } else if (tree.find (edges [j].to) != tree.end () &&
 89 |                     tree.find (edges [j].from) == tree.end ()) {
 90 |                 tree.emplace (edges [j].from);
 91 |                 added = true;
 92 |             }
 93 |         }
 94 |         done = !added;
 95 |     }
 96 |     return tree;
 97 | }
 98 | 
 99 | cuttree::TwoSS cuttree::sum_component_ss (
100 |         const std::vector <cuttree::EdgeComponent> &edges,
101 |         const std::unordered_set <int> &tree_edges,
102 |         const bool shortest) {
103 |     double sa = 0.0, sa2 = 0.0, sb = 0.0, sb2 = 0.0, na = 0.0, nb = 0.0;
104 |     for (auto e: edges) {
105 |         if (tree_edges.find (e.from) != tree_edges.end ()) {
106 |             sa += e.d;
107 |             if (shortest) {
108 |                 sa2 += e.d * e.d;
109 |             }
110 |             na += 1.0;
111 |         } else {
112 |             sb += e.d;
113 |             if (shortest) {
114 |                 sb2 += e.d * e.d;
115 |             }
116 |             nb += 1.0;
117 |         }
118 |     }
119 | 
120 |     cuttree::TwoSS res;
121 |     // res.ss1 = (sa2 - sa * sa / na) / (na - 1.0); // variance
122 |     if (shortest) {
123 |         res.ss1 = (sa2 - sa * sa / na);
124 |         res.ss2 = (sb2 - sb * sb / nb);
125 |     } else { // covariances are mean values, *NOT* sums like SS values
126 |         res.ss1 = sa / na;
127 |         res.ss2 = sb / nb;
128 |     }
129 |     res.n1 = static_cast <int> (na);
130 |     res.n2 = static_cast <int> (nb);
131 |     return res;
132 | }
133 | 
134 | // Find the component split of edges in cluster_num which yields the lowest sum
135 | // of internal variance.
136 | cuttree::BestCut cuttree::find_min_cut (
137 |         const TreeDat &tree,
138 |         const int cluster_num,
139 |         const bool shortest) {
140 |     size_t n = cuttree::cluster_size (tree.edges, cluster_num);
141 | 
142 |     // fill component vector
143 |     std::vector <cuttree::EdgeComponent> cluster_edges (n);
144 |     size_t pos = 0;
145 |     for (auto e: tree.edges) {
146 |         if (e.cluster_num == cluster_num) {
147 |             cluster_edges [pos++] = e;
148 |         }
149 |     }
150 | 
151 |     std::vector <cuttree::EdgeComponent> edges_copy;
152 | 
153 |     // Remove each edge in turn
154 |     cuttree::BestCut the_cut;
155 |     the_cut.pos = the_cut.n1 = the_cut.n2 = INFINITE_INT;
156 |     the_cut.ss1 = the_cut.ss2 = INFINITE_DOUBLE;
157 |     the_cut.ss_diff = 0.0; // default, coz search is over max ss_diff
158 |     double ssmin = INFINITE_DOUBLE;
159 | 
160 |     // TODO: Rewrite this to just erase and re-insert a single edge each time
161 |     for (int i = 0; i < static_cast <int> (n); i++) {
162 |         edges_copy.resize (0);
163 |         edges_copy.shrink_to_fit ();
164 |         edges_copy.resize (n);
165 |         std::copy (cluster_edges.begin (), cluster_edges.end (),
166 |                 edges_copy.begin ());
167 |         edges_copy.erase (edges_copy.begin () + i);
168 | 
169 |         if (edges_copy.size () < cuttree::MIN_CLUSTER_SIZE) {
170 |             break;
171 |         }
172 | 
173 |         std::unordered_set <int> tree_edges = cuttree::build_one_tree (edges_copy);
174 | 
175 |         // only include groups with >= MIN_CLUSTER_SIZE members
176 |         if (tree_edges.size () >= cuttree::MIN_CLUSTER_SIZE &&
177 |                 tree_edges.size () < (edges_copy.size () -
178 |                                 cuttree::MIN_CLUSTER_SIZE - 1)) {
179 |             cuttree::TwoSS ss;
180 |             ss = cuttree::sum_component_ss (edges_copy, tree_edges, shortest);
181 | 
182 |             if ((ss.ss1 + ss.ss2) < ssmin) { // applies to both distances & cov
183 |                 ssmin = ss.ss1 + ss.ss2;
184 |                 the_cut.pos = i;
185 |                 the_cut.ss1 = ss.ss1;
186 |                 the_cut.ss2 = ss.ss2;
187 | 
188 |                 the_cut.n1 = ss.n1;
189 |                 the_cut.n2 = ss.n2;
190 | 
191 |                 the_cut.nodes.clear ();
192 |                 for (auto te: tree_edges) {
193 |                     the_cut.nodes.emplace (te);
194 |                 }
195 |             }
196 |         }
197 |     }
198 | 
199 |     if (the_cut.ss1 < INFINITE_DOUBLE) {
200 |         the_cut.ss_diff = cuttree::calc_ss (tree.edges, cluster_num) -
201 |             the_cut.ss1 - the_cut.ss2;
202 |     }
203 | 
204 |     return the_cut;
205 | }
206 | 
207 | //' rcpp_cut_tree
208 | //'
209 | //' Cut tree into specified number of clusters by minimising internal cluster
210 | //' variance.
211 | //'
212 | //' @param tree tree to be processed
213 | //'
214 | //' @return Vector of cluster IDs for each tree edge
215 | //' @noRd
216 | // [[Rcpp::export]]
217 | Rcpp::IntegerVector rcpp_cut_tree (const Rcpp::DataFrame tree, const int ncl,
218 |         const bool shortest, const bool quiet) {
219 |     Rcpp::IntegerVector from_in = tree ["from"];
220 |     Rcpp::IntegerVector to_in = tree ["to"];
221 |     Rcpp::NumericVector dref = tree ["d"];
222 | 
223 |     std::vector <int> from = Rcpp::as <std::vector <int> > (from_in);
224 |     std::vector <int> to = Rcpp::as <std::vector <int> > (to_in);
225 | 
226 |     cuttree::TreeDat tree_dat;
227 |     tree_dat.edges.resize (static_cast <size_t> (dref.size ()));
228 |     cuttree::fill_edges (tree_dat, from, to, dref);
229 | 
230 |     cuttree::BestCut the_cut = cuttree::find_min_cut (tree_dat, 0, shortest);
231 |     std::vector <double> ss_diff, ss1, ss2;
232 |     ss_diff.push_back (the_cut.ss_diff); // ss0 - ss1 - ss2
233 |     ss1.push_back (the_cut.ss1);
234 |     ss2.push_back (the_cut.ss2);
235 |     // map from index into ss vectors to actual cluster numbers
236 |     std::unordered_map <size_t, int> cluster_map;
237 |     cluster_map.emplace (0, 0);
238 | 
239 |     const bool really_quiet = !(!quiet && from_in.size () > 100);
240 | 
241 |     int num_clusters = 1;
242 |     // This loop fills the three vectors (ss_diff, ss1, ss2), as well as the
243 |     // cluster_map.
244 |     while (num_clusters < ncl) {
245 |         Rcpp::checkUserInterrupt ();
246 |         if (!really_quiet) {
247 |             Rcpp::Rcout << "\rNumber of clusters: " << num_clusters << " / " << ncl;
248 |             Rcpp::Rcout.flush ();
249 |         }
250 | 
251 |         auto mp = std::max_element (ss_diff.begin (), ss_diff.end ());
252 |         long int maxi_int = std::distance (ss_diff.begin (), mp);
253 |         size_t maxi = static_cast <size_t> (maxi_int);
254 |         if (cluster_map.find (maxi) == cluster_map.end ()) {
255 |             Rcpp::Rcout << "ss_diff has no max element in cluster_map" <<
256 |                 std::endl;
257 |         }
258 |         int clnum = cluster_map.at (maxi);
259 |         // maxi is index of cluster to be split
260 | 
261 |         if (ss_diff [maxi] == 0.0) { // no further cuts possible
262 |             break;
263 |         }
264 |         
265 |         the_cut = cuttree::find_min_cut (tree_dat, clnum, shortest);
266 |         // Break old clnum into 2:
267 |         int count = 0;
268 |         for (auto &e: tree_dat.edges) {
269 |             if (e.cluster_num == clnum) {
270 |                 if (count == the_cut.pos) {
271 |                     e.cluster_num = INFINITE_INT;
272 |                 } else if (the_cut.nodes.find (e.from) == the_cut.nodes.end ()) {
273 |                     e.cluster_num = num_clusters;
274 |                 }
275 |                 count++;
276 |             }
277 |         }
278 |         // find new best cut of now reduced cluster
279 |         the_cut = cuttree::find_min_cut (tree_dat, clnum, shortest);
280 | 
281 |         ss_diff [maxi] = the_cut.ss_diff;
282 |         ss1 [maxi] = the_cut.ss1;
283 |         ss2 [maxi] = the_cut.ss2;
284 |         // and also of new cluster
285 |         the_cut = cuttree::find_min_cut (tree_dat, num_clusters, shortest);
286 | 
287 |         ss_diff.push_back (the_cut.ss_diff);
288 |         ss1.push_back (the_cut.ss1);
289 |         ss2.push_back (the_cut.ss2);
290 | 
291 |         cluster_map.emplace (num_clusters, ss_diff.size () - 1);
292 | 
293 |         num_clusters++;
294 |     }
295 | 
296 |     if (!really_quiet) {
297 |         Rcpp::Rcout << " -> done" << std::endl;
298 |     }
299 | 
300 |     Rcpp::IntegerVector res (tree_dat.edges.size ());
301 |     for (int i = 0; i < static_cast <int> (tree_dat.edges.size ()); i++) {
302 |         if (tree_dat.edges [static_cast <size_t> (i)].cluster_num == INFINITE_INT) {
303 |             res [i] = NA_INTEGER;
304 |         } else {
305 |             res [i] = tree_dat.edges [static_cast <size_t> (i)].cluster_num;
306 |         }
307 |     }
308 |     return res;
309 | }
310 | 


--------------------------------------------------------------------------------
/src/cuttree.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <unordered_map>
 4 | 
 5 | namespace cuttree {
 6 | 
 7 | // clusters are of edges, so size = 2 => 3 nodes
 8 | constexpr int MIN_CLUSTER_SIZE = 2;
 9 | 
10 | struct EdgeComponent {
11 |     double d;
12 |     int from, to, cluster_num;
13 | };
14 | 
15 | struct TreeDat {
16 |     std::vector <EdgeComponent> edges;
17 | };
18 | 
19 | struct BestCut {
20 |     int pos, n1, n2;
21 |     double ss_diff, ss1, ss2;
22 |     std::unordered_set <int> nodes;
23 | };
24 | 
25 | struct TwoSS { // 2 sums-of-squares values
26 |     double ss1, ss2;
27 |     int n1, n2; // sizes of clusters
28 | };
29 | 
30 | void fill_edges (TreeDat &tree,
31 |         const std::vector <int> &from,
32 |         const std::vector <int> &to,
33 |         Rcpp::NumericVector &d);
34 | double calc_ss (const std::vector <EdgeComponent> &edges,
35 |         const int cluster_num);
36 | double calc_covsum (const std::vector <EdgeComponent> &edges,
37 |         const int cluster_num);
38 | size_t cluster_size (const std::vector <EdgeComponent> &edges,
39 |         const int cluster_num);
40 | std::unordered_set <int> build_one_tree (std::vector <EdgeComponent> &edges);
41 | 
42 | TwoSS sum_component_ss (const std::vector <EdgeComponent> &edges,
43 |         const std::unordered_set <int> &tree_edges, const bool shortest);
44 | BestCut find_min_cut (const TreeDat &tree, const int cluster_num,
45 |         const bool shortest);
46 | 
47 | } // end namespace cuttree
48 | 
49 | Rcpp::IntegerVector rcpp_cut_tree (const Rcpp::DataFrame tree, const int ncl,
50 |         const bool shortest, const bool quiet);
51 | 


--------------------------------------------------------------------------------
/src/full-init.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "utils.h"
  3 | #include "full-init.h"
  4 | 
  5 | // --------- FULL CLUSTER ----------------
  6 | 
  7 | void full_init::init (full_init::FullInitDat &clfull_dat,
  8 |         Rcpp::IntegerVector from,
  9 |         Rcpp::IntegerVector to,
 10 |         Rcpp::NumericVector d) {
 11 |     intset_t vert_set;
 12 |     for (int i = 0; i < from.size (); i++) {
 13 |         vert_set.emplace (from [i]);
 14 |         vert_set.emplace (to [i]);
 15 |     }
 16 |     clfull_dat.n = vert_set.size ();
 17 | 
 18 |     index_t idx = 0;
 19 |     for (auto v: vert_set) {
 20 |         clfull_dat.index2vert_map.emplace (idx, v);
 21 |         clfull_dat.vert2index_map.emplace (v, idx);
 22 |         clfull_dat.index2cl_map.emplace (idx++, -1);
 23 |     }
 24 | 
 25 |     clfull_dat.edges.clear ();
 26 |     clfull_dat.edges.resize (static_cast <size_t> (from.size ()));
 27 |     for (int i = 0; i < from.size (); i++) {
 28 |         utils::OneEdge here;
 29 |         here.from = from [i];
 30 |         here.to = to [i];
 31 |         here.dist = d [i];
 32 |         clfull_dat.edges [static_cast <size_t> (i)] = here;
 33 |     }
 34 | 
 35 |     clfull_dat.index_in_cluster.resize (clfull_dat.n);
 36 |     std::fill (clfull_dat.index_in_cluster.begin (),
 37 |             clfull_dat.index_in_cluster.end (), false);
 38 | }
 39 | 
 40 | 
 41 | void full_init::assign_first_edge (full_init::FullInitDat &clfull_dat) {
 42 |     int clnum = 0;
 43 |     index_t ei = 0;
 44 |     utils::OneEdge edge = clfull_dat.edges [ei];
 45 |     index_t ito = clfull_dat.vert2index_map.at (edge.to),
 46 |             ifrom = clfull_dat.vert2index_map.at (edge.from);
 47 | 
 48 |     clfull_dat.index2cl_map [ito] = clnum;
 49 |     clfull_dat.index2cl_map [ifrom] = clnum;
 50 |     clfull_dat.index_in_cluster [ito] = true;
 51 |     clfull_dat.index_in_cluster [ifrom] = true;
 52 | 
 53 |     intset_t cli;
 54 |     cli.insert (static_cast <int> (ito));
 55 |     cli.insert (static_cast <int> (ifrom));
 56 |     clfull_dat.cl2index_map.emplace (clnum, cli);
 57 | 
 58 |     clfull_dat.vert2cl_map.emplace (edge.to, clnum);
 59 |     clfull_dat.vert2cl_map.emplace (edge.from, clnum);
 60 | }
 61 | 
 62 | 
 63 | //' step
 64 | //'
 65 | //' All edges are initially in their own clusters. This merges edge#i with the
 66 | //' next closest edge
 67 | //'
 68 | //' @param ei The i'th edge of the sorted list of NN edge weights
 69 | //' @noRd
 70 | int full_init::step (full_init::FullInitDat &clfull_dat,
 71 |         const index_t ei, const int clnum) {
 72 |     bool from_in = false, to_in = false;
 73 |     utils::OneEdge edge = clfull_dat.edges [ei];
 74 |     index_t ito = clfull_dat.vert2index_map.at (edge.to),
 75 |             ifrom = clfull_dat.vert2index_map.at (edge.from);
 76 |     if (clfull_dat.index_in_cluster [ito]) {
 77 |         to_in = true;
 78 |     }
 79 |     if (clfull_dat.index_in_cluster [ifrom]) {
 80 |         from_in = true;
 81 |     }
 82 | 
 83 |     int clnum_i = clnum;
 84 | 
 85 |     if (from_in && to_in) {
 86 |         clnum_i = INFINITE_INT;
 87 |     } else {
 88 |         if (from_in) { // then to is not in cluster
 89 |             clnum_i = clfull_dat.index2cl_map [ifrom];
 90 |         } else if (to_in) {
 91 |             clnum_i = clfull_dat.index2cl_map [ito];
 92 |         }
 93 | 
 94 |         clfull_dat.index_in_cluster [ito] = true;
 95 |         clfull_dat.index_in_cluster [ifrom] = true;
 96 |         clfull_dat.index2cl_map [ifrom] = clnum_i;
 97 |         clfull_dat.index2cl_map [ito] = clnum_i;
 98 | 
 99 |         intset_t cli;
100 |         if (clfull_dat.cl2index_map.find (clnum_i) !=
101 |                 clfull_dat.cl2index_map.end ()) {
102 |             cli = clfull_dat.cl2index_map.at (clnum_i);
103 |         }
104 |         cli.insert (static_cast <int> (ito));
105 |         cli.insert (static_cast <int> (ifrom));
106 |         clfull_dat.cl2index_map [clnum_i] = cli;
107 | 
108 |         // These values may already be in the map here, but that's okay
109 |         clfull_dat.vert2cl_map.emplace (edge.to, clnum_i);
110 |         clfull_dat.vert2cl_map.emplace (edge.from, clnum_i);
111 |     }
112 | 
113 |     return clnum_i;
114 | }
115 | 
116 | //' fill_cl_edges
117 | //'
118 | //' Fill (arma) matrix of strongest/shortest connections between all clusters
119 | //' used to construct the hierarchical relationships
120 | //' @noRd
121 | void full_init::fill_cl_edges (full_init::FullInitDat &clfull_dat,
122 |         arma::Mat <double> &cl_edges, int num_clusters) {
123 |     int2intset_map_t vert_sets;
124 |     for (int i = 0; i < num_clusters; i++) {
125 |         intset_t verts;
126 |         for (auto vi: clfull_dat.vert2cl_map) {
127 |             if (vi.second == i) {
128 |                 verts.emplace (vi.first);
129 |             }
130 |         }
131 |         vert_sets.emplace (i, verts);
132 |     }
133 | 
134 |     // need a (sparse) matrix of all pairwise edge distances:
135 |     arma::uword nu = static_cast <arma::uword> (clfull_dat.n);
136 |     arma::Mat <double> vert_dists (nu, nu);
137 |     if (!clfull_dat.shortest) {
138 |         vert_dists.fill (INFINITE_DOUBLE);
139 |     }
140 |     for (auto ei: clfull_dat.edges) {
141 |         arma::uword i = static_cast <arma::uword> (
142 |                                     clfull_dat.vert2index_map.at (ei.from)),
143 |                     j = static_cast <arma::uword> (
144 |                                     clfull_dat.vert2index_map.at (ei.to));
145 |         vert_dists (i, j) = vert_dists (j, i) = ei.dist;
146 |     }
147 | 
148 |     for (int i = 0; i < (num_clusters - 1); i++) {
149 |         for (int j = (i + 1); j < num_clusters; j++)
150 |         {
151 |             intset_t verts_i = vert_sets.at (i),
152 |                      verts_j = vert_sets.at (j);
153 |             double max_d = 0.0;
154 |             if (!clfull_dat.shortest) {
155 |                 max_d = INFINITE_DOUBLE; // min covariance
156 |             }
157 |             for (auto vi: verts_i) {
158 |                 for (auto vj: verts_j)
159 |                 {
160 |                     arma::uword viu = static_cast <arma::uword> (
161 |                             clfull_dat.vert2index_map.at (vi)),
162 |                                 vju = static_cast <arma::uword> (
163 |                             clfull_dat.vert2index_map.at (vj));
164 |                     if ((clfull_dat.shortest &&
165 |                                 vert_dists (viu, vju) > max_d) ||
166 |                         (!clfull_dat.shortest &&
167 |                                 vert_dists (viu, vju) < max_d)) {
168 |                         max_d = vert_dists (viu, vju);
169 |                     }
170 |                 }
171 |             }
172 |             arma::uword iu = static_cast <arma::uword> (i),
173 |                         ju = static_cast <arma::uword> (j);
174 |             cl_edges (iu, ju) = cl_edges (ju, iu) = max_d;
175 |         }
176 |     }
177 | }
178 | 
179 | 
180 | //' rcpp_full_initial
181 | //'
182 | //' Initial allocation for full clustering
183 | //'
184 | //' @noRd
185 | // [[Rcpp::export]]
186 | Rcpp::IntegerVector rcpp_full_initial (
187 |         const Rcpp::DataFrame gr,
188 |         bool shortest) {
189 |     Rcpp::IntegerVector from_ref = gr ["from"];
190 |     Rcpp::IntegerVector to_ref = gr ["to"];
191 |     Rcpp::NumericVector d_ref = gr ["d"];
192 |     Rcpp::IntegerVector from = Rcpp::clone (from_ref);
193 |     Rcpp::IntegerVector to = Rcpp::clone (to_ref);
194 |     Rcpp::NumericVector d = Rcpp::clone (d_ref);
195 | 
196 |     // Index vectors are 1-indexed, so
197 |     from = from - 1;
198 |     to = to - 1;
199 | 
200 |     full_init::FullInitDat clfull_dat;
201 |     clfull_dat.shortest = shortest;
202 |     full_init::init (clfull_dat, from, to, d);
203 | 
204 |     full_init::assign_first_edge (clfull_dat);
205 |     int clnum = 1; // #1 assigned in assign_first_edge
206 |     index_t ei = 1; // index of next edge to be assigned
207 | 
208 |     while (clfull_dat.vert2cl_map.size () < clfull_dat.n) {
209 |         int clnum_i = full_init::step (clfull_dat, ei, clnum);
210 |         ei++;
211 |         if (clnum_i == clnum) {
212 |             clnum++;
213 |         }
214 |     }
215 | 
216 |     // Then construct the hierarchical relationships among clusters
217 |     arma::uword cu = static_cast <arma::uword> (clnum);
218 |     arma::Mat <double> cl_edges (cu, cu);
219 |     full_init::fill_cl_edges (clfull_dat, cl_edges, clnum);
220 | 
221 |     // Then construct vector mapping edges to cluster numbers
222 |     std::vector <int> clvec (clfull_dat.n);
223 |     for (auto ci: clfull_dat.vert2cl_map) {
224 |         clvec [static_cast <size_t> (ci.first)] = ci.second;
225 |     }
226 |     
227 |     return Rcpp::wrap (clvec);
228 | }
229 | 


--------------------------------------------------------------------------------
/src/full-init.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "utils.h"
 4 | 
 5 | // --------- FULL CLUSTER ----------------
 6 | 
 7 | /* All `index2cl` values are initally set to -1, and there are no `cl2index`
 8 |  * values.  There is also a single binary vector of `index_in_cluster`, initialy
 9 |  * set to `false`.
10 |  */
11 | 
12 | namespace full_init {
13 | 
14 | struct FullInitDat {
15 |     bool shortest;
16 |     size_t n;
17 | 
18 |     std::vector <utils::OneEdge> edges; // nearest neighbour edges only
19 |     std::vector <bool> index_in_cluster;
20 | 
21 |     int2int_map_t vert2cl_map;
22 |     int2indx_map_t vert2index_map;
23 |     indx2int_map_t index2cl_map, index2vert_map;
24 |     int2intset_map_t cl2index_map;
25 | };
26 | 
27 | void init (FullInitDat &clfull_dat,
28 |         Rcpp::IntegerVector from,
29 |         Rcpp::IntegerVector to,
30 |         Rcpp::NumericVector d);
31 | 
32 | void assign_first_edge (FullInitDat &clfull_dat);
33 | 
34 | int step (FullInitDat &clfull_dat, const index_t ei,
35 |         const int clnum);
36 | 
37 | void fill_cl_edges (FullInitDat &clfull_dat, arma::Mat <double> &cl_edges,
38 |         int num_clusters);
39 | 
40 | } // end namespace ex_init
41 | 
42 | Rcpp::IntegerVector rcpp_full_initial (
43 |         const Rcpp::DataFrame gr,
44 |         bool shortest);
45 | 


--------------------------------------------------------------------------------
/src/full-merge.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "full-merge.h"
  3 | 
  4 | // load data from rcpp_full_initial into the FullMergeDat struct. The gr data
  5 | // are pre-sorted by increasing d.
  6 | void full_merge::init (const Rcpp::DataFrame &gr,
  7 |         full_merge::FullMergeDat &cldat)
  8 | {
  9 |     Rcpp::IntegerVector from = gr ["from"];
 10 |     Rcpp::IntegerVector to = gr ["to"];
 11 |     Rcpp::NumericVector d = gr ["d"];
 12 |     Rcpp::IntegerVector clnum = gr ["cluster"];
 13 |     Rcpp::IntegerVector clfrom = gr ["cl_from"];
 14 |     Rcpp::IntegerVector clto = gr ["cl_to"];
 15 | 
 16 |     const size_t n = static_cast <size_t> (d.size ());
 17 |     
 18 |     cldat.edges.resize (n);
 19 |     std::unordered_map <int, std::unordered_set <double> > cl2dist_map;
 20 |     std::unordered_map <std::string, double> edge_dist_map;
 21 |     for (int i = 0; i < static_cast <int> (n); i++) {
 22 |         if (clnum [i] >= 0) { // edge in a cluster
 23 |             int clnum_i = clnum [i];
 24 | 
 25 |             std::unordered_set <double> distset;
 26 |             if (cl2dist_map.find (clnum_i) != cl2dist_map.end ())
 27 |                 distset = cl2dist_map.at (clnum_i);
 28 |             distset.emplace (d [i]);
 29 | 
 30 |             cl2dist_map [clnum_i] = distset;
 31 |         } else {
 32 |             // make set of unordered edge names; the actual edge_dist_map is a
 33 |             // dummy here, and serves just to get number of edges
 34 |             std::string eft = std::to_string (clfrom [i]) + "-" +
 35 |                               std::to_string (clto [i]),
 36 |                         etf = std::to_string (clto [i]) + "-" +
 37 |                               std::to_string (clfrom [i]);
 38 |             if (edge_dist_map.find (eft) == edge_dist_map.end () &&
 39 |                     edge_dist_map.find (etf) == edge_dist_map.end ())
 40 |                 edge_dist_map.emplace (eft, d [i]);
 41 |         }
 42 |     }
 43 | 
 44 |     cldat.edges.resize (edge_dist_map.size ());
 45 |     edge_dist_map.clear ();
 46 |     size_t edge_count = 0;
 47 |     for (int i = 0; i < static_cast <int> (n); i++) {
 48 |         if (clnum [i] < 0) { // edge not in a cluster
 49 |             utils::OneEdge edgei;
 50 |             // clfrom and clto hold cluster numbers, NOT vertex numbers
 51 |             edgei.from = clfrom [i];
 52 |             edgei.to = clto [i];
 53 |             edgei.dist = d [i];
 54 | 
 55 |             std::string eft = std::to_string (edgei.from) + "-" +
 56 |                               std::to_string (edgei.to),
 57 |                         etf = std::to_string (edgei.to) + "-" +
 58 |                               std::to_string (edgei.from);
 59 |             if (edge_dist_map.find (eft) == edge_dist_map.end () &&
 60 |                     edge_dist_map.find (etf) == edge_dist_map.end ())
 61 |             {
 62 |                 edge_dist_map.emplace (eft, d [i]);
 63 |                 cldat.edges [edge_count++] = edgei;
 64 |             } else if (edge_dist_map.find (etf) != edge_dist_map.end ())
 65 |             {
 66 |                 if ((cldat.shortest && d [i] < edge_dist_map.at (etf)) ||
 67 |                         (!cldat.shortest && d [i] > edge_dist_map.at (etf)))
 68 |                     edge_dist_map [etf] = d [i];
 69 |             } else
 70 |             {
 71 |                 if ((cldat.shortest && d [i] < edge_dist_map.at (eft)) ||
 72 |                         (!cldat.shortest && d [i] > edge_dist_map.at (eft)))
 73 |                     edge_dist_map [eft] = d [i];
 74 |             }
 75 |         }
 76 |     }
 77 |     // Then just loop over cldat.edges to update min distances
 78 |     for (auto ei: cldat.edges) {
 79 |         std::string eft = std::to_string (ei.from) + "-" +
 80 |                           std::to_string (ei.to);
 81 |         if ((cldat.shortest && edge_dist_map.at (eft) < ei.dist) ||
 82 |                 (!cldat.shortest && edge_dist_map.at (eft) > ei.dist))
 83 |             ei.dist = edge_dist_map.at (eft);
 84 |     }
 85 | 
 86 |     // Fill intra-cluster data:
 87 |     for (auto i: cl2dist_map) {
 88 |         std::unordered_set <double> distset = i.second;
 89 |         OneCluster cli;
 90 |         cli.id = i.first;
 91 |         cli.n = distset.size ();
 92 |         cli.dist_sum = 0.0;
 93 |         cli.dist_max = 0.0;
 94 |         if (!cldat.shortest)
 95 |             cli.dist_max = INFINITE_DOUBLE;
 96 |         for (auto di: distset) {
 97 |             cli.dist_sum += di;
 98 |             if ((cldat.shortest && di > cli.dist_max) ||
 99 |                     (!cldat.shortest && di < cli.dist_max))
100 |                 cli.dist_max = di;
101 |         }
102 |         cldat.clusters.emplace (i.first, cli);
103 | 
104 |         cldat.cl_remap.emplace (i.first, i.first);
105 |         intset_t members;
106 |         members.emplace (i.first);
107 |         cldat.cl_members.emplace (i.first, members);
108 |     }
109 | }
110 | 
111 | // merge cluster clfrom with clto; clfrom remains as it was but is no longer
112 | // indexed so simply ignored from that point on
113 | full_merge::OneMerge full_merge::merge_one_single (full_merge::FullMergeDat &cldat,
114 |         index_t ei) {
115 |     const int cl_from_i = cldat.cl_remap.at (cldat.edges [ei].from),
116 |               cl_to_i = cldat.cl_remap.at (cldat.edges [ei].to);
117 | 
118 |     full_merge::OneCluster clfrom = cldat.clusters.at (cl_from_i),
119 |                          clto = cldat.clusters.at (cl_to_i);
120 |     clto.n += clfrom.n;
121 |     clto.dist_sum += clfrom.dist_sum;
122 | 
123 |     if ((cldat.shortest && clfrom.dist_max > clto.dist_max) ||
124 |             (!cldat.shortest && clfrom.dist_max < clto.dist_max))
125 |         clto.dist_max = clfrom.dist_max;
126 | 
127 |     std::vector <utils::OneEdge> edges_from = clfrom.edges,
128 |                                  edges_to = clto.edges;
129 |     edges_to.insert (edges_to.end (), edges_from.begin (), edges_from.end ());
130 |     clto.edges.clear ();
131 |     clto.edges.shrink_to_fit ();
132 |     clto.edges = edges_to;
133 | 
134 |     cldat.clusters.erase (cl_from_i);
135 |     cldat.clusters [cl_to_i] = clto;
136 | 
137 |     cldat.cl_remap [cl_from_i] = cldat.cl_remap [cl_to_i];
138 |     intset_t members_f = cldat.cl_members.at (cl_from_i),
139 |              members_t = cldat.cl_members.at (cl_to_i);
140 |     members_t.insert (members_f.begin (), members_f.end ());
141 |     cldat.cl_members [cl_to_i] = members_t;
142 |     for (auto m: members_t)
143 |         cldat.cl_remap [m] = cldat.cl_remap [cl_to_i];
144 | 
145 |     full_merge::OneMerge the_merge;
146 |     the_merge.cli = cl_from_i;
147 |     the_merge.clj = cl_to_i;
148 |     the_merge.merge_dist = cldat.edges [ei].dist;
149 | 
150 |     return the_merge;
151 | }
152 | 
153 | // Each merge joins from to to; from remains unchanged but is no longer indexed.
154 | // Edges nevertheless always refer to original (non-merged) cluster numbers, so
155 | // need to be re-mapped via the cl_remap
156 | void full_merge::merge_single (full_merge::FullMergeDat &cldat) {
157 |     index_t edgei = 0;
158 |     while (cldat.clusters.size () > 1) {
159 |         int clfr = cldat.cl_remap.at (cldat.edges [edgei].from),
160 |             clto = cldat.cl_remap.at (cldat.edges [edgei].to);
161 |         if (clfr != clto) {
162 |             full_merge::OneMerge the_merge =
163 |                 full_merge::merge_one_single (cldat, edgei);
164 |             cldat.merges.push_back (the_merge);
165 |         }
166 |         edgei++;
167 |         if (edgei == cldat.edges.size ())
168 |             break;
169 |     }
170 | }
171 | 
172 | bool full_merge::avgdist_sorter_incr (const OneDist &lhs,
173 |         const OneDist &rhs) {
174 |     return lhs.value < rhs.value;
175 | }
176 | 
177 | bool full_merge::avgdist_sorter_decr (const OneDist &lhs,
178 |         const OneDist &rhs) {
179 |     return lhs.value > rhs.value;
180 | }
181 | 
182 | bool full_merge::maxdist_sorter_incr (const OneDist &lhs,
183 |         const OneDist &rhs) {
184 |     return lhs.d < rhs.d;
185 | }
186 | 
187 | bool full_merge::maxdist_sorter_decr (const OneDist &lhs,
188 |         const OneDist &rhs) {
189 |     return lhs.d > rhs.d;
190 | }
191 | 
192 | void full_merge::fill_avg_dists (full_merge::FullMergeDat &cldat,
193 |         full_merge::AvgDists &cl_dists) {
194 |     cl_dists.avg_dists.resize (cldat.edges.size ());
195 |     size_t nc = 0;
196 |     std::unordered_set <std::string> edgenames; // TODO: Remove
197 |     for (auto ei: cldat.edges) {
198 |         full_merge::OneDist onedist;
199 |         onedist.cli = ei.from;
200 |         onedist.clj = ei.to;
201 |         onedist.d = ei.dist;
202 |         onedist.di = cldat.clusters [ei.from].dist_sum;
203 |         onedist.dj = cldat.clusters [ei.to].dist_sum;
204 |         onedist.ni = cldat.clusters [ei.from].n;
205 |         onedist.nj = cldat.clusters [ei.to].n;
206 | 
207 |         onedist.value = (onedist.di + onedist.dj + onedist.d) /
208 |             static_cast <double> (onedist.ni + onedist.nj + 1);
209 | 
210 |         cl_dists.avg_dists [nc++] = onedist;
211 |     }
212 | 
213 |     if (cldat.shortest) {
214 |         std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (),
215 |                 &full_merge::avgdist_sorter_incr);
216 |     } else {
217 |         std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (),
218 |                 &full_merge::avgdist_sorter_decr);
219 |     }
220 | }
221 | 
222 | // Fill the cli_map and clj_map entries which map cluster numbers onto sets of
223 | // indices in cl_dists.avg_dists
224 | void full_merge::fill_cl_indx_maps (full_merge::AvgDists &cl_dists) {
225 |     cl_dists.cl_map.clear ();
226 |     for (size_t i = 0; i < cl_dists.avg_dists.size (); i++) {
227 |         indxset_t indxs;
228 |         const int cli = cl_dists.avg_dists [i].cli;
229 |         if (cl_dists.cl_map.find (cli) != cl_dists.cl_map.end ())
230 |             indxs = cl_dists.cl_map.at (cli);
231 |         indxs.emplace (i);
232 |         cl_dists.cl_map [cli] = indxs;
233 | 
234 |         indxs.clear ();
235 |         const int clj = cl_dists.avg_dists [i].clj;
236 |         if (cl_dists.cl_map.find (clj) != cl_dists.cl_map.end ())
237 |             indxs = cl_dists.cl_map.at (clj);
238 |         indxs.emplace (i);
239 |         cl_dists.cl_map [clj] = indxs;
240 |     }
241 | }
242 | 
243 | // Merging is based on AvgDists, which holds all possible pair-wise merges of
244 | // existing clusters. One merge combines the pair in one AvgDists item to make a
245 | // new one. The convention is to merge cli into clj, so cli disappears.
246 | // Importantly, this requires updating all other AvgDists.avg_dists items which
247 | // contain either one of the newly merged pairs. Indices from clusters to
248 | // AvgDists.avg_dists are kept in AvgDists.cli_map and .clj_map. The values of
249 | // the latter are updated to reflect merges, as are the entries of the new cli
250 | // in AvgDists.avg_dists.
251 | full_merge::OneMerge full_merge::merge_avg (full_merge::FullMergeDat &cldat,
252 |         full_merge::AvgDists &cl_dists)
253 | {
254 |     full_merge::OneDist the_dist = cl_dists.avg_dists [0];
255 |     const double dtot = the_dist.di + the_dist.dj + the_dist.d;
256 |     const size_t ntot = the_dist.ni + the_dist.nj + 1;
257 |     const double average = dtot / static_cast <double> (ntot);
258 |     const int cli = the_dist.cli,
259 |               clj = the_dist.clj;
260 |     double dmin = INFINITE_DOUBLE; // shortest connecting distance
261 |     if (!cldat.shortest) {
262 |         dmin = -dmin;
263 |     }
264 | 
265 |     indxset_t cli_indx = cl_dists.cl_map.at (cli),
266 |               clj_indx = cl_dists.cl_map.at (clj);
267 |     // update cli_indx & clj_indx entries, and get value of dmin
268 |     for (auto i: clj_indx) {
269 |         cl_dists.avg_dists [i].dj = dtot;
270 |         cl_dists.avg_dists [i].nj = ntot;
271 |         if (cl_dists.avg_dists [i].cli == cli)
272 |             cl_dists.avg_dists [i].cli = clj;
273 |         else if (cl_dists.avg_dists [i].clj == cli)
274 |             cl_dists.avg_dists [i].clj = clj;
275 |         if ((cldat.shortest && cl_dists.avg_dists [i].d < dmin) ||
276 |                 (!cldat.shortest && cl_dists.avg_dists [i].d > dmin))
277 |             dmin = cl_dists.avg_dists [i].d;
278 |     }
279 | 
280 |     for (auto i: cli_indx) {
281 |         cl_dists.avg_dists [i].di = dtot;
282 |         cl_dists.avg_dists [i].ni = ntot;
283 |         if (cl_dists.avg_dists [i].cli == cli) {
284 |             cl_dists.avg_dists [i].cli = clj;
285 |         } else if (cl_dists.avg_dists [i].clj == cli) {
286 |             cl_dists.avg_dists [i].clj = clj;
287 |         }
288 |         if ((cldat.shortest && cl_dists.avg_dists [i].d < dmin) ||
289 |                 (!cldat.shortest && cl_dists.avg_dists [i].d > dmin)) {
290 |             dmin = cl_dists.avg_dists [i].d;
291 |         }
292 |     }
293 |     // Then update all dmin and average dist values
294 |     for (auto i: clj_indx) {
295 |         cl_dists.avg_dists [i].d = dmin;
296 |         cl_dists.avg_dists [i].value =
297 |             (cl_dists.avg_dists [i].di + dtot + dmin) /
298 |             static_cast <double> (cl_dists.avg_dists [i].ni + ntot + 1);
299 |     }
300 |     for (auto i: cli_indx) {
301 |         cl_dists.avg_dists [i].d = dmin;
302 |         cl_dists.avg_dists [i].value =
303 |             (cl_dists.avg_dists [i].dj + dtot + dmin) /
304 |             static_cast <double> (cl_dists.avg_dists [i].nj + ntot + 1);
305 |     }
306 |     cl_dists.avg_dists.pop_front ();
307 | 
308 |     // These can now have reverse-duplicated entries because after merging A->B
309 |     // entries A->C and C->B will become B->C and C->B. There can also be D->A
310 |     // and D->B which will both become D->B.
311 |     std::vector <int> rm;
312 |     std::unordered_set <std::string> edge_names;
313 |     for (size_t i = 0; i < cl_dists.avg_dists.size (); i++) {
314 |         std::string cij = std::to_string (cl_dists.avg_dists [i].cli) + "-" +
315 |                           std::to_string (cl_dists.avg_dists [i].clj),
316 |                     cji = std::to_string (cl_dists.avg_dists [i].clj) + "-" +
317 |                           std::to_string (cl_dists.avg_dists [i].cli);
318 |         if (edge_names.find (cij) == edge_names.end () &&
319 |                 edge_names.find (cji) == edge_names.end ()) {
320 |             edge_names.emplace (cij);
321 |         } else {
322 |             rm.push_back (static_cast <int> (i));
323 |         }
324 |     }
325 |     std::sort (rm.begin (), rm.end (), std::greater <int> ());
326 |     for (auto i: rm) {
327 |         cl_dists.avg_dists.erase (cl_dists.avg_dists.begin () + i);
328 |     }
329 | 
330 |     if (cldat.shortest) {
331 |         std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (),
332 |                 &full_merge::avgdist_sorter_incr);
333 |     } else {
334 |         std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (),
335 |                 &full_merge::avgdist_sorter_decr);
336 |     }
337 | 
338 |     // Finally, update the cl_dists.cli_map & clj_map entries
339 |     fill_cl_indx_maps (cl_dists);
340 | 
341 |     full_merge::OneMerge the_merge;
342 |     the_merge.cli = cli;
343 |     the_merge.clj = clj;
344 |     the_merge.merge_dist = average;
345 | 
346 |     return the_merge;
347 | }
348 | 
349 | // Successively merge pairs of clusters which yield the lower average
350 | // intra-cluster edge distance
351 | void full_merge::avg (full_merge::FullMergeDat &cldat) {
352 |     AvgDists cl_dists;
353 |     full_merge::fill_avg_dists (cldat, cl_dists);
354 |     full_merge::fill_cl_indx_maps (cl_dists);
355 | 
356 |     while (cl_dists.avg_dists.size () > 1) {
357 |         full_merge::OneMerge the_merge = full_merge::merge_avg (cldat, cl_dists);
358 |         cldat.merges.push_back (the_merge);
359 |     }
360 | }
361 | 
362 | void full_merge::fill_max_dists (full_merge::FullMergeDat &cldat,
363 |         full_merge::AvgDists &cl_dists) {
364 |     cl_dists.avg_dists.resize (cldat.edges.size ());
365 |     size_t nc = 0;
366 |     std::unordered_set <std::string> edgenames; // TODO: Remove
367 |     for (auto ei: cldat.edges) {
368 |         full_merge::OneDist onedist;
369 |         onedist.cli = ei.from;
370 |         onedist.clj = ei.to;
371 |         onedist.d = ei.dist;
372 | 
373 |         cl_dists.avg_dists [nc++] = onedist;
374 |     }
375 | 
376 |     if (cldat.shortest) {
377 |         std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (),
378 |                 &full_merge::maxdist_sorter_incr);
379 |     } else {
380 |         std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (),
381 |                 &full_merge::maxdist_sorter_decr);
382 |     }
383 | }
384 | 
385 | void full_merge::max (full_merge::FullMergeDat &cldat) {
386 | }
387 | 
388 | 
389 | full_merge::OneMerge full_merge::merge_max (full_merge::FullMergeDat &cldat,
390 |         full_merge::AvgDists &cl_dists) {
391 | 
392 |     full_merge::OneDist the_dist = cl_dists.avg_dists [0];
393 |     const double dtot = the_dist.di + the_dist.dj + the_dist.d;
394 |     const size_t ntot = the_dist.ni + the_dist.nj + 1;
395 |     const double average = dtot / static_cast <double> (ntot);
396 |     const int cli = the_dist.cli,
397 |               clj = the_dist.clj;
398 |     double dmin = INFINITE_DOUBLE; // shortest connecting distance
399 |     if (!cldat.shortest) {
400 |         dmin = -dmin;
401 |     }
402 | 
403 |     indxset_t cli_indx = cl_dists.cl_map.at (cli),
404 |               clj_indx = cl_dists.cl_map.at (clj);
405 |     // update cli_indx & clj_indx entries, and get value of dmin
406 |     for (auto i: clj_indx) {
407 |         cl_dists.avg_dists [i].dj = dtot;
408 |         cl_dists.avg_dists [i].nj = ntot;
409 |         if (cl_dists.avg_dists [i].cli == cli)
410 |             cl_dists.avg_dists [i].cli = clj;
411 |         else if (cl_dists.avg_dists [i].clj == cli)
412 |             cl_dists.avg_dists [i].clj = clj;
413 |         if ((cldat.shortest && cl_dists.avg_dists [i].d < dmin) ||
414 |                 (!cldat.shortest && cl_dists.avg_dists [i].d > dmin)) {
415 |             dmin = cl_dists.avg_dists [i].d;
416 |         }
417 |     }
418 |     for (auto i: cli_indx) {
419 |         cl_dists.avg_dists [i].di = dtot;
420 |         cl_dists.avg_dists [i].ni = ntot;
421 |         if (cl_dists.avg_dists [i].cli == cli) {
422 |             cl_dists.avg_dists [i].cli = clj;
423 |         } else if (cl_dists.avg_dists [i].clj == cli) {
424 |             cl_dists.avg_dists [i].clj = clj;
425 |         }
426 |         if ((cldat.shortest && cl_dists.avg_dists [i].d < dmin) ||
427 |                 (!cldat.shortest && cl_dists.avg_dists [i].d > dmin)) {
428 |             dmin = cl_dists.avg_dists [i].d;
429 |         }
430 |     }
431 |     // Then update all dmin and average dist values
432 |     for (auto i: clj_indx) {
433 |         cl_dists.avg_dists [i].d = dmin;
434 |         cl_dists.avg_dists [i].value =
435 |             (cl_dists.avg_dists [i].di + dtot + dmin) /
436 |             static_cast <double> (cl_dists.avg_dists [i].ni + ntot + 1);
437 |     }
438 |     for (auto i: cli_indx) {
439 |         cl_dists.avg_dists [i].d = dmin;
440 |         cl_dists.avg_dists [i].value =
441 |             (cl_dists.avg_dists [i].dj + dtot + dmin) /
442 |             static_cast <double> (cl_dists.avg_dists [i].nj + ntot + 1);
443 |     }
444 |     cl_dists.avg_dists.pop_front ();
445 | 
446 |     // These can now have reverse-duplicated entries because after merging A->B
447 |     // entries A->C and C->B will become B->C and C->B. There can also be D->A
448 |     // and D->B which will both become D->B.
449 |     std::vector <int> rm;
450 |     std::unordered_set <std::string> edge_names;
451 |     for (size_t i = 0; i < cl_dists.avg_dists.size (); i++) {
452 |         std::string cij = std::to_string (cl_dists.avg_dists [i].cli) + "-" +
453 |                           std::to_string (cl_dists.avg_dists [i].clj),
454 |                     cji = std::to_string (cl_dists.avg_dists [i].clj) + "-" +
455 |                           std::to_string (cl_dists.avg_dists [i].cli);
456 |         if (edge_names.find (cij) == edge_names.end () &&
457 |                 edge_names.find (cji) == edge_names.end ()) {
458 |             edge_names.emplace (cij);
459 |         } else {
460 |             rm.push_back (static_cast <int> (i));
461 |         }
462 |     }
463 |     std::sort (rm.begin (), rm.end (), std::greater <int> ());
464 |     for (auto i: rm) {
465 |         cl_dists.avg_dists.erase (cl_dists.avg_dists.begin () + i);
466 |     }
467 | 
468 |     if (cldat.shortest) {
469 |         std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (),
470 |                 &full_merge::avgdist_sorter_incr);
471 |     } else {
472 |         std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (),
473 |                 &full_merge::avgdist_sorter_decr);
474 |     }
475 | 
476 |     // Finally, update the cl_dists.cli_map & clj_map entries
477 |     fill_cl_indx_maps (cl_dists);
478 | 
479 |     full_merge::OneMerge the_merge;
480 |     the_merge.cli = cli;
481 |     the_merge.clj = clj;
482 |     the_merge.merge_dist = average;
483 | 
484 | 
485 |     return the_merge;
486 | }
487 | 
488 | //' rcpp_full_merge
489 | //'
490 | //' Merge clusters generated by rcpp_full_initial to full hierarchy of all
491 | //' possible merges.
492 | //'
493 | //' @noRd
494 | // [[Rcpp::export]]
495 | Rcpp::NumericMatrix rcpp_full_merge (
496 |         const Rcpp::DataFrame gr,
497 |         const std::string linkage,
498 |         const bool shortest)
499 | {
500 |     full_merge::FullMergeDat clmerge_dat;
501 |     clmerge_dat.shortest = shortest;
502 |     full_merge::init (gr, clmerge_dat);
503 | 
504 |     if (utils::strfound (linkage, "single")) {
505 |         full_merge::merge_single (clmerge_dat);
506 |     } else if (utils::strfound (linkage, "average")) {
507 |         full_merge::avg (clmerge_dat);
508 |     } else if (utils::strfound (linkage, "max")) {
509 |         full_merge::max (clmerge_dat);
510 |     } else {
511 |         Rcpp::stop ("linkage not found for full_merge");
512 |     }
513 | 
514 |     const size_t n = clmerge_dat.merges.size ();
515 |     Rcpp::NumericMatrix res (static_cast <int> (n), 3);
516 |     for (size_t i = 0; i < n; i++) {
517 |         res (i, 0) = clmerge_dat.merges [i].cli;
518 |         res (i, 1) = clmerge_dat.merges [i].clj;
519 |         res (i, 2) = clmerge_dat.merges [i].merge_dist;
520 |     }
521 | 
522 |     std::vector <std::string> colnames (3);
523 |     colnames [0] = "from";
524 |     colnames [1] = "to";
525 |     colnames [2] = "dist";
526 |     Rcpp::List dimnames (2);
527 |     dimnames (1) = colnames;
528 |     res.attr ("dimnames") = dimnames;
529 | 
530 |     return res;
531 | }
532 | 


--------------------------------------------------------------------------------
/src/full-merge.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <deque>
 4 | 
 5 | #include "utils.h"
 6 | 
 7 | // Merge the clusters generated by the rcpp_full_initial. Separate class and
 8 | // routines to allow results from rcpp_full_initial to be returned and cached
 9 | // for subsequent re-merging.
10 | 
11 | namespace full_merge {
12 | 
13 | struct OneCluster {
14 |     int id;
15 |     size_t n;
16 |     double dist_sum, dist_max;
17 |     std::vector <utils::OneEdge> edges;
18 | };
19 | 
20 | struct OneMerge {
21 |     int cli, clj;
22 |     double merge_dist;
23 | };
24 | 
25 | struct FullMergeDat {
26 |     bool shortest;
27 |     std::unordered_map <int, int> cl_remap;
28 |     std::unordered_map <int, intset_t> cl_members;
29 |     std::unordered_map <int, OneCluster> clusters;
30 |     std::vector <utils::OneEdge> edges; // edges between clusters
31 |     std::vector <OneMerge> merges;
32 | };
33 | 
34 | struct OneDist {
35 |     int cli, clj;
36 |     size_t ni, nj;
37 |     double di, dj, d, value; 
38 |     // di, dj are dist_sums, d is min dist of connecting edge
39 | };
40 | 
41 | struct AvgDists {
42 |     std::unordered_map <int, indxset_t> cl_map;
43 |     std::deque <OneDist> avg_dists;
44 | };
45 | 
46 | void init (const Rcpp::DataFrame &gr, FullMergeDat &cldat);
47 | 
48 | OneMerge merge_one_single (FullMergeDat &cldat, index_t ei);
49 | void merge_single (FullMergeDat &cldat);
50 | 
51 | bool avgdist_sorter_incr (const OneDist &lhs, const OneDist &rhs);
52 | bool avgdist_sorter_decr (const OneDist &lhs, const OneDist &rhs);
53 | void fill_avg_dists (FullMergeDat &cldat, AvgDists &cl_dists);
54 | void fill_cl_indx_maps (AvgDists &cl_dists);
55 | OneMerge merge_avg (FullMergeDat &cldat, AvgDists &cl_dists);
56 | void avg (FullMergeDat &cldat);
57 | 
58 | bool maxdist_sorter_incr (const OneDist &lhs, const OneDist &rhs);
59 | bool maxdist_sorter_decr (const OneDist &lhs, const OneDist &rhs);
60 | void fill_max_dists (FullMergeDat &cldat, AvgDists &cl_dists);
61 | OneMerge merge_max (FullMergeDat &cldat, AvgDists &cl_dists);
62 | void max (FullMergeDat &cldat);
63 | 
64 | } // end namespace full_merge
65 | 
66 | Rcpp::NumericMatrix rcpp_full_merge (
67 |         const Rcpp::DataFrame gr,
68 |         const std::string method,
69 |         const bool shortest);
70 | 


--------------------------------------------------------------------------------
/src/mst.cpp:
--------------------------------------------------------------------------------
 1 | #include "mst.h"
 2 | 
 3 | std::vector <MSTEdge> mst (Rcpp::IntegerVector from,
 4 |         Rcpp::IntegerVector to,
 5 |         Rcpp::NumericVector d) {
 6 |     const size_t n = static_cast <size_t> (from.size ());
 7 | 
 8 |     std::vector <MSTEdge> edges (n);
 9 |     for (size_t i = 0; i < n; i++) {
10 |         MSTEdge ei;
11 |         ei.from = from (i);
12 |         ei.to = to (i);
13 |         ei.dist = d (i);
14 |         edges [i] = ei;
15 |     }
16 | 
17 |     std::vector <size_t> cl_id (n);
18 |     for (size_t i = 0; i < n; i++) {
19 |         cl_id [i] = i;
20 |     }
21 | 
22 |     std::sort (edges.begin (), edges.end ());
23 | 
24 |     std::vector <MSTEdge> result;
25 | 
26 |     for (MSTEdge e : edges) {
27 |         const size_t cl_from = cl_id [static_cast <size_t> (e.from)],
28 |             cl_to = cl_id [static_cast <size_t> (e.to)];
29 | 
30 |         if (cl_from != cl_to) {
31 |             result.push_back (e);
32 | 
33 |             const size_t cl_min = std::min (cl_from, cl_to),
34 |                   cl_max = std::max (cl_from, cl_to);
35 | 
36 |             for (size_t i = 0; i < n; i++) {
37 |                 if (cl_id [i] == cl_max) {
38 |                     cl_id [i] = cl_min;
39 |                 }
40 |             }
41 |         }
42 |     }
43 | 
44 |     return result;
45 | }
46 | 
47 | 
48 | //' rcpp_mst
49 | //'
50 | //' Minimum spanning tree
51 | //'
52 | //' @noRd
53 | // [[Rcpp::export]]
54 | Rcpp::DataFrame rcpp_mst (Rcpp::DataFrame input) {
55 |     Rcpp::IntegerVector from = input ["from"];
56 |     Rcpp::IntegerVector to = input ["to"];
57 |     Rcpp::NumericVector d = input ["d"];
58 | 
59 |     std::vector <MSTEdge> tree = mst (from, to, d);
60 | 
61 |     Rcpp::IntegerVector from_out (tree.size ());
62 |     Rcpp::IntegerVector to_out (tree.size ());
63 |     Rcpp::NumericVector d_out (tree.size ());
64 |     size_t i = 0;
65 |     for (auto t: tree) {
66 |         from_out (i) = t.from;
67 |         to_out (i) = t.to;
68 |         d_out (i) = t.dist;
69 |         i++;
70 |     }
71 | 
72 |     Rcpp::DataFrame res = Rcpp::DataFrame::create (
73 |         Rcpp::Named ("from") = from_out,
74 |         Rcpp::Named ("to") = to_out,
75 |         Rcpp::Named ("d") = d_out,
76 |         Rcpp::_["stringsAsFactors"] = false);
77 | 
78 |     return res;
79 | };
80 | 


--------------------------------------------------------------------------------
/src/mst.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <algorithm>
 4 | #include <Rcpp.h>
 5 | 
 6 | struct MSTEdge {
 7 |     int from, to;
 8 |     double dist;
 9 |     bool operator<(const MSTEdge& rhs) const { return dist < rhs.dist; }
10 | };
11 | 
12 | std::vector <MSTEdge> mst (Rcpp::IntegerVector from,
13 |         Rcpp::IntegerVector to,
14 |         Rcpp::NumericVector d);
15 | 


--------------------------------------------------------------------------------
/src/slk.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "utils.h"
  3 | #include "slk.h"
  4 | #include <algorithm>
  5 | 
  6 | // --------- SINGLE LINKAGE CLUSTER ----------------
  7 | 
  8 | //' rcpp_slk
  9 | //'
 10 | //' Full-order single linkage cluster redcap algorithm
 11 | //'
 12 | //' @noRd
 13 | // [[Rcpp::export]]
 14 | Rcpp::IntegerVector rcpp_slk (
 15 |         const Rcpp::DataFrame gr_full,
 16 |         const Rcpp::DataFrame gr,
 17 |         const bool shortest,
 18 |         const bool quiet) {
 19 |     Rcpp::IntegerVector from_full_ref = gr_full ["from"];
 20 |     Rcpp::IntegerVector to_full_ref = gr_full ["to"];
 21 |     Rcpp::NumericVector d_full = gr_full ["d"];
 22 |     Rcpp::IntegerVector from_ref = gr ["from"];
 23 |     Rcpp::IntegerVector to_ref = gr ["to"];
 24 |     Rcpp::NumericVector d = gr ["d"];
 25 | 
 26 |     // Rcpp classes are always passed by reference, so cloning is necessary to
 27 |     // avoid modifying the original data.frames.
 28 |     Rcpp::IntegerVector from_full = Rcpp::clone (from_full_ref);
 29 |     Rcpp::IntegerVector to_full = Rcpp::clone (to_full_ref);
 30 |     Rcpp::IntegerVector from = Rcpp::clone (from_ref);
 31 |     Rcpp::IntegerVector to = Rcpp::clone (to_ref);
 32 | 
 33 |     // Index vectors are 1-indexed, so
 34 |     from_full = from_full - 1;
 35 |     to_full = to_full - 1;
 36 |     from = from - 1;
 37 |     to = to - 1;
 38 | 
 39 |     arma::Mat <int> contig_mat;
 40 |     arma::Mat <double> d_mat;
 41 | 
 42 |     // index2cl and cl2index are dynamically updated with cluster memberships;
 43 |     // vert2index and index2vert are retained at initial values which map (from,
 44 |     // to) vectors to matrix indices. All operations are performed on matrices
 45 |     // directly, with membership re-traced at the end via index2vert_map.
 46 |     int2indxset_map_t cl2index_map;
 47 |     int2indx_map_t vert2index_map;
 48 |     indx2int_map_t index2vert_map, index2cl_map;
 49 | 
 50 |     size_t n = utils::sets_init (from, to, vert2index_map, index2vert_map,
 51 |                           index2cl_map, cl2index_map);
 52 | 
 53 |     utils_slk::mats_init (from, to, d, vert2index_map, contig_mat, d_mat,
 54 |             shortest);
 55 | 
 56 |     /* The contiguity matrix retains is shape, so is always indexed by the
 57 |      * (from, to) vectors. Merging clusters simply switches additional entries
 58 |      * from  0 to 1.
 59 |      */
 60 | 
 61 |     const bool really_quiet = !(!quiet && n > 100);
 62 | 
 63 |     indxset_t the_tree;
 64 |     size_t e = 0; // edge number in gr_full
 65 |     while (the_tree.size () < (n - 1)) {// tree has n - 1 edges
 66 |         Rcpp::checkUserInterrupt ();
 67 | 
 68 |         index_t ifrom = vert2index_map.at (from_full (e)),
 69 |                 ito = vert2index_map.at (to_full (e));
 70 |         if (index2cl_map.find (ifrom) != index2cl_map.end () &&
 71 |                 index2cl_map.find (ito) != index2cl_map.end ()) {
 72 |             int cfrom = index2cl_map.at (ifrom),
 73 |                 cto = index2cl_map.at (ito);
 74 |             if (cfrom != cto &&
 75 |                     contig_mat (static_cast <arma::uword> (ifrom),
 76 |                                 static_cast <arma::uword> (ito)) > 0) {
 77 |                 size_t ishort = utils::find_shortest_connection (from, to,
 78 |                         vert2index_map, d_mat, cl2index_map, cfrom, cto,
 79 |                         shortest);
 80 |                 the_tree.insert (ishort);
 81 |                 utils::merge_clusters (contig_mat,
 82 |                         index2cl_map, cl2index_map, cfrom, cto);
 83 |                 e = 0;
 84 |             } else {
 85 |                 e++;
 86 |             }
 87 |         } else {
 88 |             e++;
 89 |         }
 90 |         if (!really_quiet && the_tree.size () % 100 == 0) {
 91 |             Rcpp::Rcout << "\rBuilding tree: " << the_tree.size () << " / " <<
 92 |                 n - 1;
 93 |             Rcpp::Rcout.flush ();
 94 |         }
 95 |     }
 96 | 
 97 |     if (!really_quiet) {
 98 |         Rcpp::Rcout << "\rBuilding tree: " << the_tree.size () << " / " <<
 99 |             n - 1 << " -> done" << std::endl;
100 |     }
101 | 
102 |     std::vector <index_t> treevec (the_tree.begin (), the_tree.end ());
103 | 
104 |     return Rcpp::wrap (treevec);
105 | }
106 | 


--------------------------------------------------------------------------------
/src/slk.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // --------- SINGLE LINKAGE CLUSTER ----------------
 4 | 
 5 | Rcpp::IntegerVector rcpp_slk (
 6 |         const Rcpp::DataFrame gr_full,
 7 |         const Rcpp::DataFrame gr,
 8 |         const bool shortest,
 9 |         const bool quiet);
10 | 


--------------------------------------------------------------------------------
/src/spatialcluster_init.c:
--------------------------------------------------------------------------------
 1 | #include <R.h>
 2 | #include <Rinternals.h>
 3 | #include <stdlib.h> // for NULL
 4 | #include <R_ext/Rdynload.h>
 5 | 
 6 | /* FIXME: 
 7 |    Check these declarations against the C/Fortran source code.
 8 | */
 9 | 
10 | /* .Call calls */
11 | extern SEXP _spatialcluster_rcpp_alk(SEXP, SEXP, SEXP);
12 | extern SEXP _spatialcluster_rcpp_clk(SEXP, SEXP, SEXP, SEXP);
13 | extern SEXP _spatialcluster_rcpp_cut_tree(SEXP, SEXP, SEXP, SEXP);
14 | extern SEXP _spatialcluster_rcpp_full_initial(SEXP, SEXP);
15 | extern SEXP _spatialcluster_rcpp_full_merge(SEXP, SEXP, SEXP);
16 | extern SEXP _spatialcluster_rcpp_mst(SEXP);
17 | extern SEXP _spatialcluster_rcpp_slk(SEXP, SEXP, SEXP, SEXP);
18 | 
19 | static const R_CallMethodDef CallEntries[] = {
20 |     {"_spatialcluster_rcpp_alk",          (DL_FUNC) &_spatialcluster_rcpp_alk,          3},
21 |     {"_spatialcluster_rcpp_clk",          (DL_FUNC) &_spatialcluster_rcpp_clk,          4},
22 |     {"_spatialcluster_rcpp_cut_tree",     (DL_FUNC) &_spatialcluster_rcpp_cut_tree,     4},
23 |     {"_spatialcluster_rcpp_full_initial", (DL_FUNC) &_spatialcluster_rcpp_full_initial, 2},
24 |     {"_spatialcluster_rcpp_full_merge",   (DL_FUNC) &_spatialcluster_rcpp_full_merge,   3},
25 |     {"_spatialcluster_rcpp_mst",          (DL_FUNC) &_spatialcluster_rcpp_mst,          1},
26 |     {"_spatialcluster_rcpp_slk",          (DL_FUNC) &_spatialcluster_rcpp_slk,          4},
27 |     {NULL, NULL, 0}
28 | };
29 | 
30 | void R_init_spatialcluster(DllInfo *dll)
31 | {
32 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
33 |     R_useDynamicSymbols(dll, FALSE);
34 | }
35 | 


--------------------------------------------------------------------------------
/src/utils.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "utils.h"
  3 | #include <algorithm>
  4 | 
  5 | // Note that all matrices **CAN** be asymmetrical, and so are always indexed
  6 | // (from, to)
  7 | 
  8 | /* These routines all work with 2 lots of 2 main maps:
  9 |  * 1. vert2index and index2vert maps
 10 |  * 2. cl2index and index2cl maps
 11 |  *
 12 |  * The former map all vertices enumerated in the original from and to vectors to
 13 |  * sequential index numbers into the matrices (dists, contig_mat, whatever). The
 14 |  * latter are initially direct a->a maps of all indices to themselves. As
 15 |  * clusters merge, the values of index2cl maps are updated so that, for example,
 16 |  * index2cl(a)->b and index2cl(b)->b. The cl2index map then holds an
 17 |  * unordered_set of target indices for each cluster.
 18 |  */
 19 | 
 20 | bool utils::strfound (const std::string str, const std::string target) {
 21 |     bool found = false;
 22 |     if (str.find (target) != std::string::npos)
 23 |         found = true;
 24 |     return found;
 25 | }
 26 | 
 27 | size_t utils::sets_init (
 28 |         const Rcpp::IntegerVector &from,
 29 |         const Rcpp::IntegerVector &to,
 30 |         int2indx_map_t &vert2index_map,
 31 |         indx2int_map_t &index2vert_map,
 32 |         indx2int_map_t &index2cl_map,
 33 |         int2indxset_map_t &cl2index_map) {
 34 |     vert2index_map.clear ();
 35 |     index2vert_map.clear ();
 36 |     index2cl_map.clear ();
 37 |     cl2index_map.clear ();
 38 | 
 39 |     intset_t vert_set;
 40 |     for (int i = 0; i < from.size (); i++) {
 41 |         vert_set.emplace (from [i]);
 42 |         vert_set.emplace (to [i]);
 43 |     }
 44 |     int idx = 0;
 45 |     for (auto v: vert_set) {
 46 |         index2vert_map.emplace (idx, v);
 47 |         vert2index_map.emplace (v, idx++);
 48 |     }
 49 | 
 50 |     for (int i = 0; i < from.length (); i++) {
 51 |         size_t fi = vert2index_map.at (from [i]);
 52 |         indxset_t eset; // only one index for each vert at this stage
 53 |         eset.insert (fi);
 54 |         cl2index_map.emplace (fi, eset);
 55 |     }
 56 |     for (auto v: vert_set) {
 57 |         // all verts are their own clusters, so cast indx to cli
 58 |         int cli = static_cast <int> (vert2index_map.at (v));
 59 |         indxset_t eset;
 60 |         if (cl2index_map.find (cli) != cl2index_map.end ()) {
 61 |             eset = cl2index_map.at (cli);
 62 |         }
 63 |         eset.emplace (cli);
 64 |         cl2index_map.emplace (cli, eset);
 65 |         index2cl_map.emplace (static_cast <size_t> (cli), cli);
 66 |     }
 67 |     
 68 |     return static_cast <size_t> (vert_set.size ());
 69 | }
 70 | 
 71 | //' find shortest (or longest) connection between two clusters
 72 | //' @param from, to, d the columns of the edge graph
 73 | //' @param d_mat distance matrix between all edges (not between clusters!)
 74 | //' @param cl2vert_map map of list of all (from, to, d) edges for each cluster
 75 | //' @param cfrom Number of cluster which is to be merged
 76 | //' @param cto Number of cluster with which it is to be merged
 77 | //'
 78 | //' @return Index directly into from, to - **NOT** into the actual matrices!
 79 | //' @noRd
 80 | size_t utils::find_shortest_connection (
 81 |         const Rcpp::IntegerVector &from,
 82 |         const Rcpp::IntegerVector &to,
 83 |         const int2indx_map_t &vert2index_map,
 84 |         const arma::Mat <double> &d_mat,
 85 |         const int2indxset_map_t &cl2index_map,
 86 |         const int cfrom,
 87 |         const int cto,
 88 |         const bool shortest) {
 89 |     if (cl2index_map.find (cfrom) == cl2index_map.end ()) {
 90 |         Rcpp::stop ("cluster index not found");
 91 |     }
 92 |     if (cl2index_map.find (cto) == cl2index_map.end ()) {
 93 |         Rcpp::stop ("cluster index not found");
 94 |     }
 95 | 
 96 |     indxset_t index_i = cl2index_map.at (cfrom),
 97 |              index_j = cl2index_map.at (cto);
 98 | 
 99 |     double dlim = INFINITE_DOUBLE;
100 |     if (!shortest) {
101 |         dlim = -dlim;
102 |     }
103 |     size_t short_i = INFINITE_INT, short_j = INFINITE_INT;
104 | 
105 |     // from and to here are directional, so need to examine both directions
106 |     for (auto i: index_i) {
107 |         for (auto j: index_j) {
108 |             arma::uword ia = static_cast <arma::uword> (i),
109 |                         ja = static_cast <arma::uword> (j);
110 |             if ((shortest && d_mat (ia, ja) < dlim) ||
111 |                     (!shortest && d_mat (ia, ja) > dlim)) {
112 |                 dlim = d_mat (ia, ja);
113 |                 short_i = i;
114 |                 short_j = j;
115 |             } else if ((shortest && d_mat (ja, ia) < dlim) ||
116 |                     (!shortest && d_mat (ja, ia) > dlim)) {
117 |                 dlim = d_mat (ja, ia);
118 |                 short_i = j;
119 |                 short_j = i;
120 |             }
121 |         }
122 |     }
123 |     if (dlim == INFINITE_DOUBLE) {
124 |         Rcpp::stop ("no minimal distance; this should not happen");
125 |     }
126 | 
127 |     // convert short_i and short_j to a single edge 
128 |     // TODO: Make a std::map of vert2dist to avoid this loop
129 |     size_t shortest_edge = INFINITE_INT;
130 |     for (int i = 0; i < from.length (); i++) { // int for Rcpp index
131 |         if ((vert2index_map.at (from [i]) == short_i &&
132 |                 vert2index_map.at (to [i]) == short_j) ||
133 |             (vert2index_map.at (from [i]) == short_j &&
134 |                 vert2index_map.at (to [i]) == short_i)) {
135 |             shortest_edge = static_cast <size_t> (i);
136 |             break;
137 |         }
138 |     }
139 |     if (shortest_edge == INFINITE_INT) {
140 |         Rcpp::stop ("This shouldn't happen (in utils::find_shortest_connection)");
141 |     }
142 | 
143 |     return shortest_edge;
144 | }
145 | 
146 | //' merge two clusters in the contiguity matrix, reducing the size of the matrix
147 | //' by one row and column.
148 | //'
149 | //' @return A logical parameter indicating whether or not the newly formed
150 | //' cluster has any outgoing connections.
151 | //' @noRd
152 | void utils::merge_clusters (
153 |         arma::Mat <int> &contig_mat,
154 |         indx2int_map_t &index2cl_map,
155 |         int2indxset_map_t &cl2index_map,
156 |         const int cluster_from,
157 |         const int cluster_to) {
158 |     if (cluster_from < 0) {
159 |         Rcpp::stop ("cluster_from must be non-negative");
160 |     }
161 |     if (cluster_to < 0) {
162 |         Rcpp::stop ("cluster_to must be non-negative");
163 |     }
164 | 
165 |     arma::uword cfr = static_cast <arma::uword> (cluster_from),
166 |                 cto = static_cast <arma::uword> (cluster_to);
167 |     // Set all contig_mat (cluster_from, .) to 1
168 |     for (arma::uword i = 0; i < contig_mat.n_rows; i++) {
169 |         if (contig_mat (cfr, i) == 1 || contig_mat (i, cfr) == 1) {
170 |             contig_mat (cfr, i) = contig_mat (i, cfr) = 1;
171 |             contig_mat (cto, i) = contig_mat (i, cto) = 1;
172 |         }
173 |     }
174 | 
175 |     indxset_t idx_from = cl2index_map.at (cluster_from),
176 |               idx_to = cl2index_map.at (cluster_to);
177 | 
178 |     for (auto i: idx_from) {
179 |         for (auto j: idx_to) {
180 |             arma::uword ia = static_cast <arma::uword> (i),
181 |                         ja = static_cast <arma::uword> (j);
182 |             contig_mat (ia, ja) = contig_mat (ja, ia) = 1;
183 |         }
184 |     }
185 | 
186 |     // then re-number all cluster numbers in cl2index 
187 |     cl2index_map.erase (cluster_from);
188 |     cl2index_map.erase (cluster_to);
189 |     for (auto i: idx_from) {
190 |         idx_to.insert (i);
191 |     }
192 |     cl2index_map.emplace (cluster_to, idx_to);
193 |     // and in index2cl:
194 |     for (auto i: idx_from) {
195 |         index2cl_map.erase (i);
196 |         index2cl_map.emplace (i, cluster_to);
197 |     }
198 | }
199 | 
200 | //' initial contiguity and distance matrices. The contiguity matrix is between
201 | //' clusters, so is constantly modified, whereas the distance matrix is between
202 | //' edges, so is fixed at load time.
203 | //' @noRd
204 | void utils_slk::mats_init (
205 |         const Rcpp::IntegerVector &from,
206 |         const Rcpp::IntegerVector &to,
207 |         const Rcpp::NumericVector &d,
208 |         const int2indx_map_t &vert2index_map,
209 |         arma::Mat <int> &contig_mat,
210 |         arma::Mat <double> &d_mat,
211 |         bool shortest) {
212 |     // arma::uword = unsigned int
213 |     const arma::uword n = static_cast <arma::uword> (vert2index_map.size ());
214 | 
215 |     contig_mat = arma::zeros <arma::Mat <int> > (n, n);
216 |     //d_mat = arma::zeros <arma::Mat <double> > (n, n);
217 |     d_mat.resize (n, n);
218 |     if (shortest) {
219 |         d_mat.fill (INFINITE_DOUBLE);
220 |     } else {
221 |         d_mat.fill (-INFINITE_DOUBLE);
222 |     }
223 | 
224 |     for (int i = 0; i < from.length (); i++) {
225 |         arma::uword fi = static_cast <arma::uword> (vert2index_map.at (from [i])),
226 |                     ti = static_cast <arma::uword> (vert2index_map.at (to [i]));
227 |         contig_mat (fi, ti) = contig_mat (ti, fi) = 1;
228 |         d_mat (fi, ti) = d_mat (ti, fi) = d [i];
229 |     }
230 | }
231 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace utils {
 4 | 
 5 | bool strfound (const std::string str, const std::string target);
 6 | 
 7 | struct OneEdge {
 8 |     int from, to;
 9 |     double dist;
10 | };
11 | 
12 | size_t sets_init (
13 |         const Rcpp::IntegerVector &from,
14 |         const Rcpp::IntegerVector &to,
15 |         int2indx_map_t &vert2index_map,
16 |         indx2int_map_t &index2vert_map,
17 |         indx2int_map_t &index2cl_map,
18 |         int2indxset_map_t &cl2index_map);
19 | 
20 | size_t find_shortest_connection (
21 |         const Rcpp::IntegerVector &from,
22 |         const Rcpp::IntegerVector &to,
23 |         const int2indx_map_t &vert2index_map,
24 |         const arma::Mat <double> &d_mat,
25 |         const int2indxset_map_t &cl2index_map,
26 |         const int cfrom,
27 |         const int cto,
28 |         const bool shortest);
29 | 
30 | void merge_clusters (
31 |         arma::Mat <int> &contig_mat,
32 |         indx2int_map_t &index2cl_map,
33 |         int2indxset_map_t &cl2index_map,
34 |         const int merge_from,
35 |         const int merge_to);
36 | 
37 | } // end namespace utils
38 | 
39 | // These are only used in slk
40 | namespace utils_slk {
41 | 
42 | void mats_init (
43 |         const Rcpp::IntegerVector &from,
44 |         const Rcpp::IntegerVector &to,
45 |         const Rcpp::NumericVector &d,
46 |         const int2indx_map_t &vert2index_map,
47 |         arma::Mat <int> &contig_mat,
48 |         arma::Mat <double> &d_mat,
49 |         bool shortest);
50 | 
51 | void dmat_full_init (
52 |         const Rcpp::IntegerVector &from,
53 |         const Rcpp::IntegerVector &to,
54 |         const Rcpp::NumericVector &d,
55 |         const int2indx_map_t &vert2index_map,
56 |         arma::Mat <double> &d_mat,
57 |         bool shortest);
58 | 
59 | } // end namespace utils_slk
60 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library (testthat)
2 | library (spatialcluster)
3 | 
4 | test_check ("spatialcluster")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test-full.R:
--------------------------------------------------------------------------------
 1 | context ("full")
 2 | 
 3 | test_that ("structure", {
 4 |     set.seed (1)
 5 |     n <- 100
 6 |     xy <- matrix (runif (2 * n), ncol = 2)
 7 |     dmat <- matrix (runif (n^2), ncol = n)
 8 |     ncl <- 4
 9 |     scl <- scl_full (xy, dmat, ncl = ncl)
10 |     expect_is (scl, "scl")
11 |     expect_true (scl$pars$ncl == ncl)
12 |     expect_true (all (names (scl) %in%
13 |         c (
14 |             "tree", "merges", "ord", "nodes",
15 |             "pars", "statistics"
16 |         )))
17 |     cl <- scl$nodes$cluster [!is.na (scl$nodes$cluster)]
18 |     expect_true (length (unique (cl)) == ncl)
19 | })
20 | 
21 | test_that ("methods", {
22 |     set.seed (1)
23 |     n <- 100
24 |     xy <- matrix (runif (2 * n), ncol = 2)
25 |     dmat <- matrix (runif (n^2), ncol = n)
26 |     ncl <- 8
27 |     scl1 <- scl_full (xy, dmat, ncl = ncl, linkage = "single")
28 |     scl2 <- scl_full (xy, dmat, ncl = ncl, linkage = "average")
29 |     expect_true (!identical (scl1, scl2))
30 |     cl1 <- scl1$nodes$cluster [!is.na (scl1$nodes$cluster)]
31 |     expect_equal (length (unique (cl1)), ncl)
32 |     cl2 <- scl2$nodes$cluster [!is.na (scl2$nodes$cluster)]
33 |     expect_equal (length (unique (cl2)), ncl)
34 | })
35 | 
36 | test_that ("recluster", {
37 |     set.seed (1)
38 |     n <- 100
39 |     xy <- matrix (runif (2 * n), ncol = 2)
40 |     dmat <- matrix (runif (n^2), ncol = n)
41 |     scl <- scl_full (xy, dmat, ncl = 4)
42 |     scl1 <- scl_full (scl, ncl = 3)
43 |     scl2 <- scl_recluster (scl, ncl = 3)
44 |     expect_identical (scl1, scl2)
45 |     expect_error (
46 |         scl3 <- scl_redcap (scl, ncl = 3),
47 |         "scl_redcap can pass to scl_recluster only"
48 |     )
49 | })
50 | 


--------------------------------------------------------------------------------
/tests/testthat/test-plots.R:
--------------------------------------------------------------------------------
 1 | context ("plot")
 2 | 
 3 | test_that ("redcap plot", {
 4 |     set.seed (1)
 5 |     n <- 100
 6 |     xy <- matrix (runif (2 * n), ncol = 2)
 7 |     dmat <- matrix (runif (n^2), ncol = n)
 8 |     scl <- scl_redcap (xy, dmat, ncl = 4)
 9 |     g1 <- plot (scl)
10 |     expect_is (g1, "ggplot")
11 |     g2 <- plot (scl, hull_alpha = 0.5)
12 |     # The hulls are then contained in
13 |     h1 <- g1$layers [[2]]$data
14 |     h2 <- g2$layers [[2]]$data
15 |     expect_true (!identical (h1, h2))
16 | })
17 | 
18 | test_that ("full plot", {
19 |     set.seed (1)
20 |     n <- 100
21 |     xy <- matrix (runif (2 * n), ncol = 2)
22 |     dmat <- matrix (runif (n^2), ncol = n)
23 |     scl <- scl_full (xy, dmat, ncl = 4)
24 |     g1 <- plot (scl)
25 |     expect_is (g1, "ggplot")
26 |     g2 <- plot (scl, hull_alpha = 0.5)
27 |     # The hulls are then contained in
28 |     h1 <- g1$layers [[2]]$data
29 |     h2 <- g2$layers [[2]]$data
30 |     expect_true (!identical (h1, h2))
31 | })
32 | 
33 | test_that ("plot_merges", {
34 |     set.seed (1)
35 |     n <- 100
36 |     xy <- matrix (runif (2 * n), ncol = 2)
37 |     dmat <- matrix (runif (n^2), ncol = n)
38 |     scl <- scl_full (xy, dmat, ncl = 8)
39 |     expect_silent (plot_merges (scl))
40 |     graphics.off ()
41 |     expect_silent (plot_merges (scl, root_tree = TRUE))
42 |     graphics.off ()
43 |     scl <- scl_redcap (xy, dmat, ncl = 8)
44 |     expect_error (
45 |         plot_merges (scl),
46 |         "plot_merges can only be applied to scl objects"
47 |     )
48 | })
49 | 


--------------------------------------------------------------------------------
/tests/testthat/test-redcap.R:
--------------------------------------------------------------------------------
 1 | context ("redcap ")
 2 | 
 3 | test_that ("structure", {
 4 |     set.seed (1)
 5 |     n <- 100
 6 |     xy <- matrix (runif (2 * n), ncol = 2)
 7 |     dmat <- matrix (runif (n^2), ncol = n)
 8 |     scl <- scl_redcap (xy, dmat, ncl = 4)
 9 |     expect_is (scl, "scl")
10 |     expect_true (scl$pars$ncl >= 4)
11 |     expect_true (all (names (scl) %in%
12 |         c ("tree", "nodes", "pars", "statistics")))
13 |     expect_true (nrow (scl$tree) < n)
14 | })
15 | 
16 | test_that ("methods", {
17 |     set.seed (1)
18 |     n <- 100
19 |     xy <- matrix (runif (2 * n), ncol = 2)
20 |     dmat <- matrix (runif (n^2), ncol = n)
21 |     scl1 <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE)
22 |     scl2 <- scl_redcap (xy, dmat, ncl = 4, shortest = TRUE)
23 |     expect_true (!identical (scl1, scl2))
24 | 
25 |     # these are the default pars:
26 |     scl3 <- scl_redcap (xy, dmat, ncl = 4, full_order = TRUE)
27 |     expect_true (!identical (scl1, scl2))
28 | 
29 |     scl4 <- scl_redcap (xy, dmat, ncl = 4, linkage = "single")
30 |     expect_true (identical (scl3, scl4))
31 |     scl5 <- scl_redcap (xy, dmat, ncl = 4, linkage = "average")
32 |     expect_false (identical (scl4, scl5))
33 |     scl6 <- scl_redcap (xy, dmat, ncl = 4, linkage = "complete")
34 |     expect_false (identical (scl6, scl5))
35 |     expect_false (identical (scl6, scl4))
36 |     scl7 <- scl_redcap (xy, dmat, ncl = 4, full_order = FALSE)
37 |     expect_false (identical (scl7, scl4))
38 |     expect_false (identical (scl7, scl5))
39 |     expect_false (identical (scl7, scl6))
40 |     expect_error (
41 |         scl8 <- scl_redcap (xy, dmat,
42 |             ncl = 4,
43 |             linkage = "blah"
44 |         ),
45 |         "linkage must be one of"
46 |     )
47 | })
48 | 
49 | test_that ("recluster", {
50 |     set.seed (1)
51 |     n <- 100
52 |     xy <- matrix (runif (2 * n), ncol = 2)
53 |     dmat <- matrix (runif (n^2), ncol = n)
54 |     scl <- scl_redcap (xy, dmat, ncl = 4)
55 |     scl2 <- scl_recluster (scl, ncl = 3)
56 |     expect_message (
57 |         scl3 <- scl_redcap (scl, ncl = 3),
58 |         "scl_redcap is for initial cluster"
59 |     )
60 |     expect_identical (scl2, scl3)
61 |     expect_true (!identical (scl, scl2))
62 | })
63 | 


--------------------------------------------------------------------------------
/vignettes/makefile:
--------------------------------------------------------------------------------
 1 | LFILE = spatialcluster
 2 | 
 3 | all: knith
 4 | #all: knith open 
 5 | 
 6 | knith: $(LFILE).Rmd
 7 | 	echo "rmarkdown::render('$(LFILE).Rmd',output_file='$(LFILE).html')" | R --no-save -q
 8 | 
 9 | knitr: $(LFILE).Rmd
10 | 	echo "rmarkdown::render('$(LFILE).Rmd',rmarkdown::md_document(variant='markdown_github'))" | R --no-save -q
11 | 
12 | open: $(LFILE).html
13 | 	xdg-open $(LFILE).html &
14 | 
15 | clean:
16 | 	rm -rf *.html *.png 
17 | 


--------------------------------------------------------------------------------
/vignettes/spatialcluster.bib:
--------------------------------------------------------------------------------
  1 | @article{Guo2008,
  2 |     author = {D. Guo },
  3 |     title = {Regionalization with dynamically constrained agglomerative
  4 |         clustering and partitioning (REDCAP)},
  5 |     journal = {International Journal of Geographical Information Science},
  6 |     volume = {22},
  7 |     number = {7},
  8 |     pages = {801-823},
  9 |     year  = {2008},
 10 |     publisher = {Taylor & Francis},
 11 |     doi = {10.1080/13658810701674970},
 12 |     URL = {https://doi.org/10.1080/13658810701674970}
 13 | }
 14 | 
 15 | @incollection{Lu2009,
 16 |     title = "Spatial Clustering, Detection and Analysis of ",
 17 |     editor = "Kitchin, Rob  and Thrift, Nigel ",
 18 |     booktitle = "International Encyclopedia of Human Geography ",
 19 |     publisher = "Elsevier",
 20 |     edition = "",
 21 |     address = "Oxford",
 22 |     year = {2009},
 23 |     pages = {317 - 324},
 24 |     isbn = "978-0-08-044910-4",
 25 |     doi = "https://doi.org/10.1016/B978-008044910-4.00523-X",
 26 |     url = "https://www.sciencedirect.com/science/article/pii/B978008044910400523X",
 27 |     author = "Y. Lu",
 28 | }
 29 | 
 30 | @incollection{Han2001,
 31 |     author = {Han, Jiawei and Kamber, M and Tung, Anthony},
 32 |     year = {2001},
 33 |     month = {01},
 34 |     pages = {},
 35 |     title = {Spatial clustering methods in data mining: a survey},
 36 |     booktitle = {Data Mining and Knowledge Discovery - DATAMINE},
 37 |     url = "http://hanj.cs.illinois.edu/pdf/gkdbk01.pdf"
 38 | }
 39 | 
 40 | @article{Assuncao2006,
 41 |     author = {R.M. AssunÇão and M.C. Neves and G. Câmara and C. Da Costa Freitas},
 42 |     title = {Efficient regionalization techniques for socio‐economic
 43 |         geographical units using minimum spanning trees},
 44 |     journal = {International Journal of Geographical Information Science},
 45 |     volume = {20},
 46 |     number = {7},
 47 |     pages = {797-811},
 48 |     year  = {2006},
 49 |     publisher = {Taylor \& Francis},
 50 |     doi = {10.1080/13658810600665111},
 51 |     URL = {https://doi.org/10.1080/13658810600665111}
 52 | }
 53 | 
 54 | @article{Duque2007,
 55 |     author = {Juan Carlos Duque and Raúl Ramos and Jordi Suriñach},
 56 |     title ={Supervised Regionalization Methods: A Survey},
 57 |     journal = {International Regional Science Review},
 58 |     volume = {30},
 59 |     number = {3},
 60 |     pages = {195-220},
 61 |     year = {2007},
 62 |     doi = {10.1177/0160017607301605},
 63 |     URL = {https://doi.org/10.1177/0160017607301605}
 64 | }
 65 | 
 66 | @ARTICLE{Chavent2017,
 67 |     author = {Chavent, M. and Kuentz-Simonet, V. and Labenne, A. and Saracco J.},
 68 |     title = "ClustGeo: an R package for hierarchical clustering with spatial constraints",
 69 |     journal = {ArXiv e-prints},
 70 |     archivePrefix = "arXiv",
 71 |     eprint = {1707.03897},
 72 |     year = {2017},
 73 |     month = {jul},
 74 |     URL = {https://arxiv.org/abs/1707.03897}
 75 | }
 76 | 
 77 | @article{Bivand2015,
 78 |     author = {Roger Bivand and Gianfranco Piras},
 79 |     title = {Comparing Implementations of Estimation Methods for Spatial Econometrics},
 80 |     journal = {Journal of Statistical Software, Articles},
 81 |     volume = {63},
 82 |     number = {18},
 83 |     year = {2015},
 84 |     issn = {1548-7660},
 85 |     pages = {1--36},
 86 |     doi = {10.18637/jss.v063.i18},
 87 |     url = {https://www.jstatsoft.org/v063/i18},
 88 |     note = {Reference 1 of 2 for the spdep package}
 89 | }
 90 | 
 91 | @article{Bivand2013,
 92 |     author = {Roger Bivand and Jan Hauke and Tomasz Kossowski},
 93 |     title = {Computing the Jacobian in Gaussian Spatial Autoregressive Models:
 94 |         An Illustrated Comparison of Available Methods},
 95 |     journal = {Geographical Analysis},
 96 |     year = {2013},
 97 |     volume = {45},
 98 |     number = {2},
 99 |     pages = {150-179},
100 |     doi = {10.1111/gean.12008},
101 |     url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/gean.12008},
102 |     note = {Reference 2 of 2 for the spdep package}
103 | }
104 | 
105 | 


--------------------------------------------------------------------------------