├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ └── test-coverage.yaml ├── .gitignore ├── .hooks └── description ├── .pre-commit-config.yaml ├── DESCRIPTION ├── NAMESPACE ├── R ├── RcppExports.R ├── edges.R ├── plot-fns.R ├── scl-full.R ├── scl-redcap.R ├── spatialcluster-package.R ├── statistics.R ├── tree.R ├── utils.R └── zzz.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── codemeta.json ├── makefile ├── man ├── figures │ ├── README-cautionary-1.png │ └── README-full-single-1.png ├── plot.scl.Rd ├── plot_merges.Rd ├── scl_full.Rd ├── scl_recluster.Rd ├── scl_redcap.Rd └── spatialcluster.Rd ├── src ├── RcppExports.cpp ├── alk.cpp ├── alk.h ├── bst.h ├── clk.cpp ├── clk.h ├── common.h ├── cuttree.cpp ├── cuttree.h ├── full-init.cpp ├── full-init.h ├── full-merge.cpp ├── full-merge.h ├── mst.cpp ├── mst.h ├── slk.cpp ├── slk.h ├── spatialcluster_init.c ├── utils.cpp └── utils.h ├── tests ├── testthat.R └── testthat │ ├── test-full.R │ ├── test-plots.R │ └── test-redcap.R └── vignettes ├── makefile ├── spatialcluster.Rmd └── spatialcluster.bib /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^.*\.pdf$ 3 | ^README-.*\.png$ 4 | ^README\.Rmd$ 5 | ^README\.html$ 6 | ^\.Rproj\.user$ 7 | ^\.git$ 8 | ^\.github$ 9 | ^\.gitignore$ 10 | ^\.hooks$ 11 | ^\.pre-commit-config\.yaml$ 12 | ^_pkgdown.yml$ 13 | ^aaa* 14 | ^codemeta\.json$ 15 | ^inst/WORDLIST$ 16 | ^makefile$ 17 | ^vignettes/makefile$ 18 | docs/ 19 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macos-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | R_KEEP_PKG_SOURCE: yes 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | 35 | - uses: r-lib/actions/setup-pandoc@v2 36 | 37 | - uses: r-lib/actions/setup-r@v2 38 | with: 39 | r-version: ${{ matrix.config.r }} 40 | http-user-agent: ${{ matrix.config.http-user-agent }} 41 | use-public-rspm: true 42 | 43 | - uses: r-lib/actions/setup-r-dependencies@v2 44 | with: 45 | extra-packages: any::rcmdcheck 46 | needs: check 47 | 48 | - uses: r-lib/actions/check-r-package@v2 49 | with: 50 | upload-snapshots: true 51 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 52 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | permissions: 24 | contents: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | with: 36 | extra-packages: any::pkgdown, local::. 37 | needs: website 38 | 39 | - name: Build site 40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 41 | shell: Rscript {0} 42 | 43 | - name: Deploy to GitHub pages 🚀 44 | if: github.event_name != 'pull_request' 45 | uses: JamesIves/github-pages-deploy-action@v4.5.0 46 | with: 47 | clean: false 48 | branch: gh-pages 49 | folder: docs 50 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: test-coverage.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | test-coverage: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::covr, any::xml2 28 | needs: coverage 29 | 30 | - name: Test coverage 31 | run: | 32 | cov <- covr::package_coverage( 33 | quiet = FALSE, 34 | clean = FALSE, 35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 36 | ) 37 | covr::to_cobertura(cov) 38 | shell: Rscript {0} 39 | 40 | - uses: codecov/codecov-action@v4 41 | with: 42 | # Fail if error if not on PR, or if on PR and token is given 43 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} 44 | file: ./cobertura.xml 45 | plugin: noop 46 | disable_search: true 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | 49 | - name: Show testthat output 50 | if: always() 51 | run: | 52 | ## -------------------------------------------------------------------- 53 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 54 | shell: bash 55 | 56 | - name: Upload test results 57 | if: failure() 58 | uses: actions/upload-artifact@v4 59 | with: 60 | name: coverage-test-failures 61 | path: ${{ runner.temp }}/package 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | inst/doc 5 | aaa\.Rmd* 6 | # History files 7 | .Rhistory 8 | .Rapp.history 9 | # Session Data files 10 | .RData 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | # vim files 14 | .*.un~ 15 | .*.swp 16 | # compiled object files 17 | *.o 18 | *.so 19 | -------------------------------------------------------------------------------- /.hooks/description: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # only stop on main branch 4 | on_main <- identical (gert::git_branch (), "main") 5 | 6 | s <- gert::git_status() 7 | chk <- ("DESCRIPTION" %in% s$file && 8 | (s$status [s$file == "DESCRIPTION"] == "modified" | 9 | s$status [s$file == "DESCRIPTION"] == "new")) 10 | if (!chk & on_main) 11 | stop ("DESCRIPTION has not been updated") 12 | 13 | f <- file.path (rprojroot::find_root("DESCRIPTION"), "DESCRIPTION") 14 | x <- system2 ("git", args = c ("diff", "--cached", "-U0", f), stdout = TRUE) 15 | if (!any (grepl ("^\\+Version", x)) & on_main) 16 | stop ("Version number in DESCRIPTION has not been incremented") 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # All available hooks: https://pre-commit.com/hooks.html 2 | # R specific hooks: https://github.com/lorenzwalthert/precommit 3 | repos: 4 | - repo: https://github.com/lorenzwalthert/precommit 5 | rev: v0.4.3.9003 6 | hooks: 7 | - id: style-files 8 | args: [--style_pkg=spaceout, --style_fun=spaceout_style] 9 | additional_dependencies: 10 | - ropensci-review-tools/spaceout 11 | # - id: roxygenize 12 | # codemeta must be above use-tidy-description when both are used 13 | # - id: codemeta-description-updated 14 | - id: use-tidy-description 15 | - id: spell-check 16 | exclude: > 17 | (?x)^( 18 | .*\.[rR]| 19 | .*\.feather| 20 | .*\.jpeg| 21 | .*\.pdf| 22 | .*\.png| 23 | .*\.py| 24 | .*\.RData| 25 | .*\.rds| 26 | .*\.Rds| 27 | .*\.Rproj| 28 | .*\.sh| 29 | (.*/|)\.gitignore| 30 | (.*/|)\.gitlab-ci\.yml| 31 | (.*/|)\.lintr| 32 | (.*/|)\.pre-commit-.*| 33 | (.*/|)\.Rbuildignore| 34 | (.*/|)\.Renviron| 35 | (.*/|)\.Rprofile| 36 | (.*/|)\.travis\.yml| 37 | (.*/|)appveyor\.yml| 38 | (.*/|)NAMESPACE| 39 | (.*/|)renv/settings\.dcf| 40 | (.*/|)renv\.lock| 41 | (.*/|)WORDLIST| 42 | \.github/workflows/.*| 43 | data/.*| 44 | )$ 45 | # - id: lintr 46 | - id: readme-rmd-rendered 47 | - id: parsable-R 48 | - id: no-browser-statement 49 | - id: no-print-statement 50 | - id: no-debug-statement 51 | - id: deps-in-desc 52 | # - id: pkgdown 53 | - repo: https://github.com/pre-commit/pre-commit-hooks 54 | rev: v5.0.0 55 | hooks: 56 | - id: check-added-large-files 57 | args: ['--maxkb=200'] 58 | - id: file-contents-sorter 59 | files: '^\.Rbuildignore$' 60 | - id: end-of-file-fixer 61 | exclude: '\.Rd' 62 | - repo: https://github.com/pre-commit-ci/pre-commit-ci-config 63 | rev: v1.6.1 64 | hooks: 65 | # Only required when https://pre-commit.ci is used for config validation 66 | - id: check-pre-commit-ci-config 67 | - repo: local 68 | hooks: 69 | - id: forbid-to-commit 70 | name: Don't commit common R artifacts 71 | entry: Cannot commit .Rhistory, .RData, .Rds or .rds. 72 | language: fail 73 | files: '\.(Rhistory|RData|Rds|rds)$' 74 | # `exclude: ` to allow committing specific files 75 | - id: description version 76 | name: Version has been incremeneted in DESCRIPTION 77 | entry: .hooks/description 78 | language: script 79 | 80 | ci: 81 | autoupdate_schedule: monthly 82 | # skip: [pkgdown] 83 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: spatialcluster 2 | Title: R port of redcap 3 | Version: 0.2.0.017 4 | Authors@R: 5 | person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre")) 6 | Description: R port of redcap (Regionalization with dynamically 7 | constrained agglomerative clustering and partitioning). 8 | License: GPL-3 9 | URL: https://github.com/mpadge/spatialcluster 10 | BugReports: https://github.com/mpadge/spatialcluster/issues 11 | Depends: 12 | R (>= 4.1.0) 13 | Imports: 14 | alphahull, 15 | dplyr, 16 | ggplot2, 17 | ggthemes, 18 | methods, 19 | Rcpp (>= 0.12.6), 20 | tibble, 21 | tripack 22 | Suggests: 23 | dbscan, 24 | knitr, 25 | rmarkdown, 26 | roxygen2, 27 | testthat 28 | LinkingTo: 29 | Rcpp, 30 | RcppArmadillo 31 | VignetteBuilder: 32 | knitr 33 | Encoding: UTF-8 34 | LazyData: true 35 | NeedsCompilation: yes 36 | RoxygenNote: 7.3.2 37 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(plot,scl) 4 | export(plot_merges) 5 | export(scl_full) 6 | export(scl_recluster) 7 | export(scl_redcap) 8 | importFrom(Rcpp,evalCpp) 9 | useDynLib(spatialcluster, .registration = TRUE) 10 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #' rcpp_alk 5 | #' 6 | #' Full-order average linkage cluster redcap algorithm 7 | #' 8 | #' @noRd 9 | rcpp_alk <- function(gr, shortest, quiet) { 10 | .Call(`_spatialcluster_rcpp_alk`, gr, shortest, quiet) 11 | } 12 | 13 | #' clk_step 14 | #' 15 | #' @param ei The i'th edge of the full sorted list of edge weights 16 | #' @noRd 17 | NULL 18 | 19 | #' rcpp_clk 20 | #' 21 | #' Full-order complete linkage cluster redcap algorithm 22 | #' 23 | #' @noRd 24 | rcpp_clk <- function(gr_full, gr, shortest, quiet) { 25 | .Call(`_spatialcluster_rcpp_clk`, gr_full, gr, shortest, quiet) 26 | } 27 | 28 | #' rcpp_cut_tree 29 | #' 30 | #' Cut tree into specified number of clusters by minimising internal cluster 31 | #' variance. 32 | #' 33 | #' @param tree tree to be processed 34 | #' 35 | #' @return Vector of cluster IDs for each tree edge 36 | #' @noRd 37 | rcpp_cut_tree <- function(tree, ncl, shortest, quiet) { 38 | .Call(`_spatialcluster_rcpp_cut_tree`, tree, ncl, shortest, quiet) 39 | } 40 | 41 | #' step 42 | #' 43 | #' All edges are initially in their own clusters. This merges edge#i with the 44 | #' next closest edge 45 | #' 46 | #' @param ei The i'th edge of the sorted list of NN edge weights 47 | #' @noRd 48 | NULL 49 | 50 | #' fill_cl_edges 51 | #' 52 | #' Fill (arma) matrix of strongest/shortest connections between all clusters 53 | #' used to construct the hierarchical relationships 54 | #' @noRd 55 | NULL 56 | 57 | #' rcpp_full_initial 58 | #' 59 | #' Initial allocation for full clustering 60 | #' 61 | #' @noRd 62 | rcpp_full_initial <- function(gr, shortest) { 63 | .Call(`_spatialcluster_rcpp_full_initial`, gr, shortest) 64 | } 65 | 66 | #' rcpp_full_merge 67 | #' 68 | #' Merge clusters generated by rcpp_full_initial to full hierarchy of all 69 | #' possible merges. 70 | #' 71 | #' @noRd 72 | rcpp_full_merge <- function(gr, linkage, shortest) { 73 | .Call(`_spatialcluster_rcpp_full_merge`, gr, linkage, shortest) 74 | } 75 | 76 | #' rcpp_mst 77 | #' 78 | #' Minimum spanning tree 79 | #' 80 | #' @noRd 81 | rcpp_mst <- function(input) { 82 | .Call(`_spatialcluster_rcpp_mst`, input) 83 | } 84 | 85 | #' rcpp_slk 86 | #' 87 | #' Full-order single linkage cluster redcap algorithm 88 | #' 89 | #' @noRd 90 | rcpp_slk <- function(gr_full, gr, shortest, quiet) { 91 | .Call(`_spatialcluster_rcpp_slk`, gr_full, gr, shortest, quiet) 92 | } 93 | 94 | -------------------------------------------------------------------------------- /R/edges.R: -------------------------------------------------------------------------------- 1 | #' scl_edges_tri 2 | #' 3 | #' Generate triangulated nearest-neighbour edges between a set of input points 4 | #' 5 | #' @inheritParams scl_redcap 6 | #' @examples 7 | #' set.seed (1) 8 | #' n <- 100 9 | #' xy <- matrix (runif (2 * n), ncol = 2) 10 | #' edges <- scl_edges_tri (xy) 11 | #' @noRd 12 | scl_edges_tri <- function (xy, shortest = TRUE) { 13 | 14 | if (!inherits (xy, "data.frame")) { 15 | xy <- as.data.frame (xy) 16 | } 17 | if (ncol (xy) > 2) { 18 | xy <- dplyr::select (xy, c (x, y)) 19 | } 20 | names (xy) <- c ("x", "y") 21 | nbs <- tripack::tri.mesh (xy) |> 22 | tripack::neighbours () 23 | 24 | n <- length (nbs) 25 | edges <- lapply ( 26 | seq (n), 27 | function (i) cbind (i, nbs [[i]]) 28 | ) 29 | edges <- do.call (rbind, edges) 30 | 31 | edges <- tibble::tibble ( 32 | from = edges [, 1], 33 | to = edges [, 2] 34 | ) 35 | 36 | dxy <- as.matrix (stats::dist (xy)) 37 | 38 | append_dist_to_edges (edges, dxy, shortest) 39 | } 40 | 41 | #' scl_edges_nn 42 | #' 43 | #' Generate distance-based nearest-neighbour edges between a set of input 44 | #' points, ensuring that all edges connect to a single component. The minimal 45 | #' spanning tree is constructed from **spatial** distances, not from distances 46 | #' given in `dmat`. 47 | #' @param nnbs Number of nearest neighbours 48 | #' @inheritParams scl_redcap 49 | #' 50 | #' @return A `tibble` of `from` and `to` vertex indices for the minimal spanning 51 | #' tree edges, along with corresponding spatial distances calculated from 'xy'. 52 | #' 53 | #' @examples 54 | #' set.seed (1) 55 | #' n <- 100 56 | #' xy <- matrix (runif (2 * n), ncol = 2) 57 | #' scl_edges_nn (xy, nnbs = 3L) 58 | #' @noRd 59 | scl_edges_nn <- function (xy, nnbs, shortest = TRUE) { 60 | 61 | # Initially contruct with nnbs + 1, because the `d` matrix includes 62 | # self-distances of zero, which are subsequently removed. 63 | nnbs <- nnbs + 1 64 | d <- apply ( 65 | as.matrix (stats::dist (xy)), 66 | 2, 67 | function (i) order (i, decreasing = !shortest) [seq_len (nnbs)] 68 | ) 69 | 70 | edges <- tibble::tibble ( 71 | from = rep (as.integer (colnames (d)), 72 | each = nnbs 73 | ), 74 | to = as.vector (d) 75 | ) 76 | # rm self-edges: 77 | edges <- edges [which (edges$from != edges$to), ] 78 | 79 | # then ensure that the minimal spanning tree is included, to ensure all 80 | # nearest neighbour edges are connected in a single component. The distances 81 | # used for this MST are spatial distances, not from `dmat`. 82 | dxy <- as.matrix (stats::dist (xy)) 83 | 84 | n <- nrow (xy) 85 | edges_all <- tibble::tibble ( 86 | from = rep (seq_len (n), n), 87 | to = rep (seq_len (n), each = n), 88 | d = as.vector (dxy) 89 | ) |> 90 | dplyr::arrange (d, from, to) |> 91 | dplyr::filter (from != to) 92 | 93 | mst <- scl_spantree_ord1 (edges_all) 94 | # duplicate all of those: 95 | mst <- rbind ( 96 | mst [, c ("from", "to")], 97 | tibble::tibble (from = mst$to, to = mst$from) 98 | ) 99 | 100 | edges <- rbind (edges, mst) 101 | edges <- edges [which (!duplicated (edges)), ] 102 | 103 | # Then append final distances from `dxy` to the return value: 104 | edges <- append_dist_to_edges (edges, dxy, shortest) 105 | 106 | return (edges) 107 | } 108 | 109 | append_dist_to_edges <- function (edges, dmat, shortest) { 110 | index <- (edges$to - 1) * nrow (dmat) + edges$from 111 | edges$d <- dmat [index] 112 | 113 | if (shortest) { 114 | edges <- dplyr::arrange (edges, d) 115 | } else { 116 | edges <- dplyr::arrange (edges, dplyr::desc (d)) 117 | } 118 | 119 | return (edges) 120 | } 121 | 122 | #' scl_edges_all 123 | #' 124 | #' Generate full set of edges between a set of input points 125 | #' 126 | #' @param dmat Either a spatial distance matrix generated from 'xy', or a 127 | #' separate distance matrix passed as the 'dmat' parameter to the main 128 | #' \link{scl_redcap} function. 129 | #' @inheritParams scl_redcap 130 | #' @noRd 131 | scl_edges_all <- function (xy, dmat, shortest = TRUE) { 132 | 133 | n <- nrow (dmat) 134 | edges <- tibble::tibble ( 135 | from = rep (seq (n), times = n), 136 | to = rep (seq (n), each = n), 137 | d = as.vector (dmat) 138 | ) 139 | edges <- na.omit (edges) 140 | 141 | if (shortest) { 142 | edges <- dplyr::arrange (edges, d) 143 | } else { 144 | edges <- dplyr::arrange (edges, dplyr::desc (d)) 145 | } 146 | 147 | return (edges) 148 | } 149 | -------------------------------------------------------------------------------- /R/plot-fns.R: -------------------------------------------------------------------------------- 1 | #' scl_ahulls 2 | #' 3 | #' Calculate alpha hulls around clusters via the \pkg{alphahull} package 4 | #' 5 | #' @param tree Spanning tree obtained from \link{scl_redcap} 6 | #' @param xy Matrix of spatial coordinates of points indexed by \code{tree}. 7 | #' @param alpha Parameter used to create alpha hulls 8 | #' @return tibble of (id, x, y), where the coordinates trace the convex hulls 9 | #' for each cluster id 10 | #' @noRd 11 | scl_ahulls <- function (nodes, alpha = 0.1) { 12 | 13 | clnums <- unique (nodes$cluster [!is.na (nodes$cluster)]) 14 | bdry <- list () 15 | for (i in clnums) { 16 | if (length (which (nodes$cluster == i)) > 2) { 17 | xyi <- nodes |> 18 | dplyr::filter (cluster == i) |> 19 | dplyr::select (x, y) 20 | 21 | a <- alphahull::ashape (xyi, alpha = alpha)$edges |> 22 | data.frame () 23 | 24 | xy <- rbind ( 25 | data.frame (ind = a$ind1, x = a$x1, y = a$y1), 26 | data.frame (ind = a$ind2, x = a$x2, y = a$y2) 27 | ) |> 28 | unique () |> 29 | dplyr::arrange (ind) 30 | inds <- data.frame (ind1 = a$ind1, ind2 = a$ind2) 31 | # Then just have to wrap those around xy: 32 | # TODO: Find a better way to do this! 33 | ind_seq <- as.numeric (inds [1, ]) 34 | inds <- inds [-1, ] 35 | while (nrow (inds) > 0) { 36 | j <- which (inds$ind1 == utils::tail (ind_seq, n = 1)) 37 | if (length (j) > 0) { 38 | ind_seq <- c (ind_seq, inds [j, 2]) 39 | } else { 40 | j <- which (inds$ind2 == utils::tail (ind_seq, n = 1)) 41 | ind_seq <- c (ind_seq, inds [j, 1]) 42 | } 43 | inds <- inds [-j, , drop = FALSE] # nolint 44 | } 45 | xy <- xy [match (ind_seq, xy$ind), ] 46 | bdry [[length (bdry) + 1]] <- cbind (i, xy$x, xy$y) 47 | } 48 | } 49 | bdry <- data.frame (do.call (rbind, bdry)) 50 | names (bdry) <- c ("id", "x", "y") 51 | return (bdry) 52 | } 53 | 54 | #' plot.scl 55 | #' @method plot scl 56 | #' @param x object to be plotted 57 | #' @param hull_alpha alpha value of (non-)convex hulls, with default generating 58 | #' a convex hull, and smaller values generating concave hulls. (See 59 | #' ?alphashape::ashape for details). 60 | #' @param ... ignored here 61 | #' @family plot_fns 62 | #' @export 63 | #' @examples 64 | #' set.seed (1) 65 | #' n <- 100 66 | #' xy <- matrix (runif (2 * n), ncol = 2) 67 | #' dmat <- matrix (runif (n^2), ncol = n) 68 | #' scl <- scl_redcap (xy, dmat, ncl = 4) 69 | #' plot (scl) 70 | #' # Connect clusters according to highest (\code{shortest = FALSE}) values of 71 | #' # \code{dmat}: 72 | #' scl <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE, full_order = FALSE) 73 | #' plot (scl) 74 | plot.scl <- function (x, ..., hull_alpha = 1) { 75 | 76 | # Clusters are defined has having > 2 edges, so any with < 3 edges need to 77 | # be removed here: 78 | etab <- table (x$tree$cluster) 79 | clusters_to_rm <- as.integer (names (etab) [which (etab < 3)]) 80 | if (length (clusters_to_rm) > 0L) { 81 | x$nodes$cluster [x$nodes$cluster %in% clusters_to_rm] <- NA_integer_ 82 | } 83 | 84 | # Reset cluster numbers to sequence starting at 1: 85 | x$nodes$cluster <- match (x$nodes$cluster, sort (unique (x$nodes$cluster))) 86 | 87 | hull_alpha <- check_hull_alpha (hull_alpha) 88 | 89 | hulls <- scl_ahulls (x$nodes, alpha = hull_alpha) 90 | 91 | nc <- length (unique (x$nodes$cluster [!is.na (x$nodes$cluster)])) 92 | 93 | # clnum in cl_cols is + 1 because xy below increases cluster numbers by 1 to 94 | # allocate cl_num == 1 to unassigned points 95 | cl_cols <- grDevices::rainbow (nc) |> 96 | tibble::as_tibble () |> 97 | dplyr::mutate (cluster = seq (nc) + 1) |> 98 | dplyr::rename (col = value) 99 | 100 | xy <- x$nodes |> 101 | dplyr::mutate (cluster = ifelse (is.na (cluster), 1, cluster + 1)) |> 102 | dplyr::left_join (cl_cols, by = "cluster") |> 103 | dplyr::mutate (col = ifelse (is.na (col), "#333333FF", col)) 104 | 105 | y <- id <- NULL # suppress no visible binding warnings 106 | hull_aes <- ggplot2::aes (x = x, y = y, group = id) 107 | hull_width <- 0.5 108 | g <- ggplot2::ggplot (xy, ggplot2::aes (x = x, y = y)) + 109 | ggplot2::geom_point ( 110 | size = 5, color = xy$col, 111 | show.legend = FALSE 112 | ) + 113 | ggplot2::geom_polygon ( 114 | data = hulls, 115 | mapping = hull_aes, 116 | colour = cl_cols$col [hulls$id], 117 | fill = cl_cols$col [hulls$id], 118 | alpha = 0.1, 119 | linewidth = hull_width 120 | ) + 121 | ggthemes::theme_solarized () 122 | 123 | g 124 | } 125 | 126 | check_hull_alpha <- function (a) { 127 | 128 | if (length (a) > 1) { 129 | stop ("hull_alpha must be a single value") 130 | } 131 | if (!is.numeric (a)) { 132 | stop ("hull_alpha must be numeric") 133 | } 134 | 135 | if (a <= 0 || a > 1) { 136 | stop ("hull_alpha must be between 0 and 1") 137 | } 138 | 139 | return (a) 140 | } 141 | 142 | #' plot_merges 143 | #' 144 | #' Plot dendrogram of merges for \code{scl} object with \code{method = "full"}. 145 | #' @param x Object of class \code{scl} obtained with \code{method = "full"}. 146 | #' @param root_tree If \code{TRUE}, tree leaves are connected to bottom of plot, 147 | #' otherwise floating as determined by \link{plot.hclust}. 148 | #' @return Nothing (generates plot) 149 | #' @family plot_fns 150 | #' @export 151 | plot_merges <- function (x, root_tree = FALSE) { 152 | if (!(methods::is (x, "scl") && x$pars$method == "full")) { 153 | stop ( 154 | "plot_merges can only be applied to scl objects ", 155 | "generated with method = full" 156 | ) 157 | } 158 | 159 | hc <- structure (class = "hclust", .Data = list ()) 160 | merges <- convert_merges_to_hclust (x) 161 | hc$merge <- merges [, 1:2] 162 | hc$height <- merges [, 3] 163 | hc$order <- x$ord + 1 # it's 0-indexed 164 | hc$labels <- x$ord 165 | if (root_tree) { 166 | plot (stats::as.dendrogram (hc)) 167 | } else { 168 | plot (hc) 169 | } 170 | } 171 | 172 | convert_merges_to_hclust <- function (x) { 173 | mt <- as.matrix (x$merges [, c ("from", "to")]) + 1 174 | dists <- as.vector (x$merges$dist) 175 | indx <- sort (unique (as.vector (mt))) 176 | mt <- apply (mt, 2, function (i) match (i, indx)) 177 | merged <- d <- NULL 178 | map <- rep (NA, max (mt)) 179 | for (i in seq_len (nrow (mt))) { 180 | m1 <- mt [i, 1] 181 | m2 <- mt [i, 2] 182 | if (!m1 %in% merged) { 183 | merged <- c (merged, m1) 184 | mt [i, 1] <- -m1 185 | } else { 186 | mt [i, 1] <- map [m1] 187 | dists [i] <- dists [i] + dists [map [m1]] 188 | } 189 | map [m1] <- i 190 | 191 | if (!m2 %in% merged) { 192 | merged <- c (merged, m2) 193 | mt [i, 2] <- -m2 194 | } else { 195 | mt [i, 2] <- map [m2] 196 | dists [i] <- dists [i] + dists [map [m2]] 197 | } 198 | map [m2] <- i 199 | } 200 | cbind (mt, dists) 201 | } 202 | -------------------------------------------------------------------------------- /R/scl-full.R: -------------------------------------------------------------------------------- 1 | #' scl_full 2 | #' 3 | #' Full spatially-constrained clustering. 4 | #' 5 | #' @param linkage Either \code{"single"} or \code{"average"}. For covariance 6 | #' clustering, use \code{"single"} with `shortest = FALSE`. 7 | #' @inheritParams scl_redcap 8 | #' 9 | #' @family clustering_fns 10 | #' @export 11 | #' @examples 12 | #' n <- 100 13 | #' xy <- matrix (runif (2 * n), ncol = 2) 14 | #' dmat <- matrix (runif (n^2), ncol = n) 15 | #' scl <- scl_full (xy, dmat, ncl = 4) 16 | scl_full <- function (xy, 17 | dmat, 18 | ncl, 19 | linkage = "single", 20 | shortest = TRUE, 21 | nnbs = 6L) { 22 | 23 | linkage <- match.arg (tolower (linkage), c ("single", "average")) 24 | 25 | if (methods::is (xy, "scl")) { 26 | message ( 27 | "scl_full is for initial cluster construction; ", 28 | "passing to scl_recluster" 29 | ) 30 | scl_recluster_full (xy, ncl = ncl) 31 | } else { 32 | xy <- scl_tbl (xy) 33 | 34 | if (nnbs <= 0) { 35 | edges <- scl_edges_tri (xy, shortest = shortest) 36 | } else { 37 | edges <- scl_edges_nn (xy, nnbs = nnbs, shortest = shortest) 38 | } 39 | 40 | # cluster numbers can be joined with edges through either from or to: 41 | cl <- as.integer (rcpp_full_initial (edges, shortest) + 1) 42 | 43 | # make 3 vectors of cluster numbers: 44 | # 1. cl = cluster number for intra-cluster edges only; 45 | # 2. cl_from = Num of origin cluster for inter-cluster edges only; and 46 | # 3. cl_to = Num of destination cluster for inter-cluster edges only. 47 | from_cl <- cl [edges$from] 48 | to_cl <- cl [edges$to] 49 | indx <- which (from_cl == to_cl) 50 | cl_in <- cl_join_from <- cl_join_to <- rep (NA, nrow (edges)) 51 | cl_in [indx] <- from_cl [indx] 52 | indx <- which (from_cl != to_cl) 53 | cl_join_from [indx] <- from_cl [indx] 54 | cl_join_to [indx] <- to_cl [indx] 55 | 56 | edges$cluster <- cl_in - 1L # convert back to C++ 0-indexed values 57 | edges$cl_from <- cl_join_from - 1L 58 | edges$cl_to <- cl_join_to - 1L 59 | edges$cluster [is.na (edges$cluster)] <- -1L 60 | edges$cl_from [is.na (edges$cl_from)] <- -1L 61 | edges$cl_to [is.na (edges$cl_to)] <- -1L 62 | 63 | # Then replace the spatial distance in the edges table with the distance 64 | # from the data to use that as the basis for merging: 65 | edges <- append_dist_to_edges (edges, dmat, shortest = shortest) 66 | 67 | merges <- rcpp_full_merge ( 68 | edges, 69 | linkage = linkage, 70 | shortest = shortest 71 | ) |> data.frame () 72 | 73 | merges <- tibble::tibble ( 74 | from = as.integer (merges$from), 75 | to = as.integer (merges$to), 76 | dist = merges$dist 77 | ) 78 | 79 | # full_cluster_nodes just auto-merges the tree to the specified number, 80 | # but some of these may be clusters with only 2 members. These are 81 | # excluded here by iterating until the desired number is achieved in 82 | # which each cluster has >= 3 members: 83 | num_clusters <- 0 84 | ncl_trial <- ncl 85 | while (num_clusters < ncl) { 86 | 87 | nodes <- full_cluster_nodes (edges, merges, ncl_trial) 88 | num_clusters <- length (which (table (nodes$cluster) > 2)) 89 | ncl_trial <- ncl_trial + 1 90 | if (ncl_trial >= nrow (nodes)) { 91 | break 92 | } 93 | } 94 | nt <- sort (table (nodes$cluster), decreasing = TRUE) 95 | n <- as.integer (names (nt) [which (nt <= 2)]) 96 | nodes$cluster [nodes$cluster %in% n] <- NA 97 | 98 | # tree at that point has initial cluster numbers which must be 99 | # re-aligned with clusters from the nodal merges: 100 | tree <- edges |> dplyr::select (from, to, d, cluster) 101 | tree$cluster <- tree$cl_fr <- 102 | nodes$cluster [match (tree$from, nodes$node)] 103 | tree$cl_to <- nodes$cluster [match (tree$to, nodes$node)] 104 | tree$cluster [tree$cl_fr != tree$cl_to] <- NA 105 | 106 | pars <- list ( 107 | method = "full", 108 | ncl = ncl, 109 | linkage = linkage 110 | ) 111 | 112 | res <- structure ( 113 | list ( 114 | tree = dplyr::select ( 115 | tree, 116 | c (from, to, d, cluster) 117 | ), 118 | merges = merges, 119 | ord = order_merges (merges), 120 | nodes = dplyr::bind_cols (nodes, xy), 121 | pars = pars 122 | ), 123 | class = "scl" 124 | ) 125 | 126 | res <- scl_statistics (res) 127 | 128 | return (res) 129 | } 130 | } 131 | 132 | #' order_merges 133 | #' 134 | #' Order merges so they can be plotted as dendrogram 135 | #' @param merges output from rccp_full_merge 136 | #' @noRd 137 | order_merges <- function (merges) { 138 | 139 | merges <- as.matrix (merges) 140 | nodes <- merges [nrow (merges), c ("from", "to")] 141 | for (i in rev (seq_len (nrow (merges))) [-1]) { 142 | ii <- which (nodes == merges [i, 2]) 143 | n1 <- n2 <- NULL 144 | if (ii > 1) { 145 | n1 <- nodes [1:(ii - 1)] 146 | } 147 | if (ii <= length (nodes)) { 148 | n2 <- nodes [ii:length (nodes)] 149 | } 150 | nodes <- c (n1, merges [i, 1], n2) 151 | } 152 | return (as.numeric (nodes)) 153 | } 154 | 155 | #' full_cluster_nodes 156 | #' 157 | #' Transform edge and merge data into rectangle of nodes and cluster IDs 158 | #' @noRd 159 | full_cluster_nodes <- function (edges, merges, ncl) { 160 | 161 | edges$cluster [edges$cluster < 0] <- NA 162 | ncl_full <- length (unique (edges$cluster)) 163 | merge_tree <- merges [1:(ncl_full - ncl - 1), ] 164 | for (i in seq_len (nrow (merge_tree))) { 165 | edges$cluster [edges$cluster == merge_tree$from [i]] <- 166 | merge_tree$to [i] 167 | } 168 | 169 | node <- cluster <- NULL # rm undefined variable note 170 | all_nodes <- unique (c (edges$from, edges$to)) 171 | nodes <- tibble::tibble ( 172 | node = c (edges$from, edges$to), 173 | cluster = rep (edges$cluster, 2) 174 | ) |> 175 | dplyr::distinct () |> 176 | dplyr::arrange (node) |> 177 | dplyr::filter (!is.na (cluster)) 178 | 179 | # nodes can still be in multiple clusters, so these are set to NA 180 | dup_nodes <- unique (nodes$node [which (duplicated (nodes$node))]) 181 | nodes$cluster [nodes$node %in% dup_nodes] <- NA_integer_ 182 | nodes <- nodes [which (!duplicated (nodes)), ] 183 | 184 | # Plus nodes entirely in NA clusters can then be removed, and need to be 185 | # re-inserted: 186 | na_nodes <- all_nodes [which (!all_nodes %in% nodes$node)] 187 | if (length (na_nodes) > 0L) { 188 | na_nodes <- tibble::tibble ( 189 | node = na_nodes, 190 | cluster = rep (NA_integer_, length (na_nodes)) 191 | ) 192 | nodes <- rbind (nodes, na_nodes) 193 | } 194 | 195 | # re-order cluster numbers by frequencies 196 | nt <- sort (table (nodes$cluster), decreasing = TRUE) 197 | nodes$cluster <- as.integer (names (nt) [match (nodes$cluster, names (nt))]) 198 | 199 | return (nodes) 200 | } 201 | 202 | #' scl_recluster_full 203 | #' 204 | #' @noRd 205 | scl_recluster_full <- function (scl, ncl = ncl) { 206 | 207 | xy <- scl$nodes |> dplyr::select (x, y) 208 | num_clusters <- 0 209 | ncl_trial <- ncl 210 | 211 | while (num_clusters < ncl) { 212 | 213 | scl$nodes <- full_cluster_nodes (scl$tree, scl$merges, ncl_trial) 214 | num_clusters <- length (which (table (scl$nodes$cluster) > 2)) 215 | ncl_trial <- ncl_trial + 1 216 | if (ncl_trial >= nrow (scl$nodes)) { 217 | break 218 | } 219 | } 220 | nt <- sort (table (scl$nodes$cluster), decreasing = TRUE) 221 | n <- as.integer (names (nt) [which (nt <= 2)]) 222 | scl$nodes$cluster [scl$nodes$cluster %in% n] <- NA 223 | 224 | scl$nodes <- dplyr::bind_cols (scl$nodes, xy) 225 | 226 | return (scl) 227 | } 228 | -------------------------------------------------------------------------------- /R/scl-redcap.R: -------------------------------------------------------------------------------- 1 | #' scl_redcap 2 | #' 3 | #' Cluster spatial data with REDCAP (REgionalization with Dynamically 4 | #' Constrained Agglomerative clustering and Partitioning) routines. 5 | #' 6 | #' @param xy Rectangular structure (matrix, data.frame, tibble), containing 7 | #' coordinates of points to be clustered. 8 | #' @param dmat Square structure (matrix, data.frame, tibble) containing 9 | #' distances or equivalent metrics between all points in \code{xy}. If \code{xy} 10 | #' has \code{n} rows, then \code{dat} must have \code{n} rows and \code{n} 11 | #' columns. 12 | #' @param ncl Desired number of clusters. See description of `ncl_iterate` 13 | #' parameter for conditions under which actual number may be less than this 14 | #' value. 15 | #' @param full_order If \code{FALSE}, build spanning trees from first-order 16 | #' relationships only, otherwise build from full-order relationships (see Note). 17 | #' @param linkage One of \code{"single"}, \code{"average"}, or 18 | #' \code{"complete"}; see Note. 19 | #' @param shortest If \code{TRUE}, the \code{dmat} is interpreted as distances 20 | #' such that lower values are preferentially selected; if \code{FALSE}, then 21 | #' higher values of \code{dmat} are interpreted to indicate stronger 22 | #' relationships, as is the case for example with covariances. 23 | #' @param nnbs Number of nearest neighbours to be used in calculating clustering 24 | #' trees. Triangulation will be used if \code{nnbs <= 0}. 25 | #' @param iterate_ncl Actual numbers of clusters found may be less than the 26 | #' specified value of `ncl`, because clusters formed from < 3 edges are removed. 27 | #' If `iterate_ncl = FALSE` (the default), the value is returned with whatever 28 | #' number of actual clusters is found. Setting this parameter to `TRUE` forces 29 | #' the algorithm to iterate until the exact number of clusters has been found. 30 | #' For large data sets, this may result in considerable longer calculation 31 | #' times. 32 | #' @param quiet If `FALSE` (default), display progress information on screen. 33 | #' 34 | #' @return A object of class \code{scl} with \code{tree} containing the 35 | #' clustering scheme, and \code{xy} the original coordinate data of the 36 | #' clustered points. An additional component, \code{tree_rest}, enables the tree 37 | #' to be re-cut to a different number of clusters via \link{scl_recluster}, 38 | #' rather than calculating clusters anew. 39 | #' 40 | #' @note Please refer to the original REDCAP paper ('Regionalization with 41 | #' dynamically constrained agglomerative clustering and partitioning (REDCAP)', 42 | #' by D. Guo (2008), Int.J.Geo.Inf.Sci 22:801-823) for details of the 43 | #' \code{full_order} and \code{linkage} parameters. This paper clearly 44 | #' demonstrates the general inferiority of spanning trees constructed from 45 | #' first-order relationships. It is therefore strongly recommended that the 46 | #' default \code{full_order = TRUE} be used at all times. 47 | #' 48 | #' @family clustering_fns 49 | #' 50 | #' @examples 51 | #' n <- 100 52 | #' xy <- matrix (runif (2 * n), ncol = 2) 53 | #' dmat <- matrix (runif (n^2), ncol = n) 54 | #' scl <- scl_redcap (xy, dmat, ncl = 4) 55 | #' # Those clusters will by default be constructed by connecting edges with the 56 | #' # lowest (\code{shortest}) values of \code{dmat}, and will differ from 57 | #' scl <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE) 58 | #' # using 'full_order = FALSE' constructs clusters from first-order 59 | #' # relationships only; not recommended, but possible nevertheless: 60 | #' scl <- scl_redcap (xy, dmat, ncl = 4, full_order = FALSE) 61 | #' 62 | #' @export 63 | scl_redcap <- function (xy, 64 | dmat, 65 | ncl, 66 | full_order = TRUE, 67 | linkage = "single", 68 | shortest = TRUE, 69 | nnbs = 6L, 70 | iterate_ncl = FALSE, 71 | quiet = FALSE) { 72 | 73 | linkage <- scl_linkage_type (linkage) 74 | 75 | if (methods::is (xy, "scl")) { 76 | 77 | if (!identical (xy$pars$method, "redcap")) { 78 | stop ( 79 | "scl_redcap can pass to scl_recluster only for scl objects", 80 | " previously generated with scl_redcap" 81 | ) 82 | } 83 | 84 | message ( 85 | "scl_redcap is for initial cluster construction; ", 86 | "passing to scl_recluster" 87 | ) 88 | 89 | scl_recluster_redcap (xy, ncl = ncl, shortest = shortest) 90 | 91 | } else { 92 | 93 | xy <- scl_tbl (xy) 94 | 95 | if (nnbs <= 0) { 96 | edges_nn <- scl_edges_tri (xy, shortest = shortest) 97 | } else { 98 | edges_nn <- scl_edges_nn (xy, nnbs = nnbs, shortest = shortest) 99 | } 100 | 101 | if (!full_order) { 102 | 103 | tree_full <- scl_spantree_ord1 (edges_nn) [, c ("from", "to")] 104 | 105 | } else { 106 | 107 | if (linkage == "average") { 108 | 109 | tree_full <- scl_spantree_alk (edges_nn, shortest) 110 | 111 | } else { 112 | 113 | d_xy <- as.matrix (stats::dist (xy)) 114 | edges_all <- scl_edges_all (xy, d_xy, shortest) 115 | 116 | if (linkage == "single") { 117 | 118 | tree_full <- scl_spantree_slk ( 119 | edges_all, 120 | edges_nn, 121 | shortest = shortest, 122 | quiet = quiet 123 | ) 124 | 125 | } else if (linkage == "complete") { 126 | 127 | tree_full <- scl_spantree_clk ( 128 | edges_all, 129 | edges_nn, 130 | shortest = shortest, 131 | quiet = quiet 132 | ) 133 | 134 | } else { 135 | 136 | stop ( 137 | "linkage must be one of ", 138 | "(single, average, complete)" 139 | ) 140 | } 141 | } 142 | 143 | } 144 | 145 | # Then the critical stage of changing the distance metric on 'edges_nn' 146 | # from the spatial distances of 'd_xy' to the data-based distances in 147 | # 'dmat': 148 | edges_nn <- append_dist_to_edges (edges_nn, dmat, shortest = shortest) 149 | 150 | tree <- scl_cuttree ( 151 | tree_full, 152 | edges_nn, 153 | ncl, 154 | shortest = shortest, 155 | iterate_ncl = iterate_ncl, 156 | quiet = quiet 157 | ) 158 | 159 | # meta-data: 160 | clo <- c ("single", "full") [match (full_order, c (FALSE, TRUE))] 161 | pars <- list ( 162 | method = "redcap", 163 | ncl = ncl, 164 | cl_order = clo, 165 | linkage = linkage 166 | ) 167 | 168 | res <- structure ( 169 | list ( 170 | tree = tree, 171 | nodes = dplyr::bind_cols (tree_nodes (tree), xy), 172 | pars = pars 173 | ), 174 | class = "scl" 175 | ) 176 | 177 | res <- scl_statistics (res) 178 | 179 | return (res) 180 | } 181 | } 182 | 183 | # Match cluster numbers in edge tree to actual nodes 184 | tree_nodes <- function (tree) { 185 | 186 | node <- NULL # suppress no visible binding note 187 | 188 | res <- tibble::tibble ( 189 | node = c (tree$from, tree$to), 190 | cluster = rep (tree$cluster, 2) 191 | ) |> 192 | dplyr::distinct () |> 193 | dplyr::arrange (node) |> 194 | dplyr::filter (!is.na (cluster)) 195 | 196 | # remove clusters with < 3 members: 197 | res$cluster [res$cluster %in% which (table (res$cluster) < 3)] <- NA 198 | 199 | return (res) 200 | } 201 | 202 | #' scl_reccluster 203 | #' 204 | #' Re-cut a spatial cluster tree (\code{scl}) at a different number of clusters. 205 | #' 206 | #' @param scl An \code{scl} object returned from \link{scl_redcap}. 207 | #' @inheritParams scl_redcap 208 | #' 209 | #' @return Modified \code{scl} object in which \code{tree} is re-cut into 210 | #' \code{ncl} clusters. 211 | #' @family clustering_fns 212 | #' 213 | #' @examples 214 | #' n <- 100 215 | #' xy <- matrix (runif (2 * n), ncol = 2) 216 | #' dmat <- matrix (runif (n^2), ncol = n) 217 | #' scl <- scl_redcap (xy, dmat, ncl = 4) 218 | #' plot (scl) 219 | #' scl <- scl_recluster (scl, ncl = 5) 220 | #' plot (scl) 221 | #' 222 | #' @export 223 | scl_recluster <- function (scl, ncl, shortest = TRUE, quiet = FALSE) { 224 | 225 | if (!methods::is (scl, "scl")) { 226 | stop ( 227 | "scl_recluster can only be applied to 'scl' objects ", 228 | "returned from scl_redcap" 229 | ) 230 | } else if (identical (scl$pars$method, "redcap")) { 231 | scl_recluster_redcap (scl = scl, ncl = ncl, shortest = shortest) 232 | } else if (identical (scl$pars$method, "full")) { 233 | scl_recluster_full (scl = scl, ncl = ncl) 234 | } 235 | } 236 | 237 | scl_recluster_redcap <- function (scl, ncl, shortest = TRUE, quiet = FALSE) { 238 | 239 | from <- to <- d <- NULL # no visible binding messages 240 | 241 | tree_full <- scl$tree |> dplyr::select (from, to, d) 242 | 243 | if (shortest) { 244 | tree_full <- dplyr::arrange (tree_full, d) 245 | } else { 246 | tree_full <- dplyr::arrange (tree_full, dplyr::desc (d)) 247 | } 248 | 249 | tree_full$cluster <- rcpp_cut_tree (tree_full, ncl, 250 | shortest = shortest, 251 | quiet = quiet 252 | ) + 1 253 | 254 | pars <- scl$pars 255 | pars$ncl <- ncl 256 | 257 | structure ( 258 | list ( 259 | tree = tree_full, 260 | nodes = dplyr::bind_cols ( 261 | tree_nodes (tree_full), 262 | scl$nodes [, c ("x", "y")] 263 | ), 264 | pars = pars 265 | ), 266 | class = "scl" 267 | ) 268 | } 269 | -------------------------------------------------------------------------------- /R/spatialcluster-package.R: -------------------------------------------------------------------------------- 1 | #' spatialcluster. 2 | #' 3 | #' R port of redcap (Regionalization with dynamically constrained agglomerative 4 | #' clustering and partitioning). 5 | #' 6 | #' @name spatialcluster 7 | #' @family package 8 | #' @importFrom Rcpp evalCpp 9 | #' @useDynLib spatialcluster, .registration = TRUE 10 | "_PACKAGE" 11 | -------------------------------------------------------------------------------- /R/statistics.R: -------------------------------------------------------------------------------- 1 | #' scl_statistics 2 | #' 3 | #' @param scl Output of either \link{scl_redcap} or \link{scl_full}. 4 | #' 5 | #' @return A modified version of the input object with statistics appended. 6 | #' 7 | #' @noRd 8 | scl_statistics <- function (scl) { 9 | 10 | tree <- scl$tree |> 11 | dplyr::mutate (tf = paste0 (to, "-", from)) 12 | edges_in <- scl$tree [which (scl$tree$cluster >= 0), ] |> 13 | dplyr::mutate (tf = paste0 (to, "-", from)) 14 | tree <- tree [which (!tree$tf %in% edges_in$tf), ] 15 | 16 | # t.test (edges_in$d, tree$d, alternative = "greater", var.equal = TRUE) 17 | tt_global <- stats::t.test ( 18 | edges_in$d, 19 | tree$d, 20 | alternative = "less", 21 | var.equal = TRUE 22 | ) 23 | tt_global <- c (tt_global$statistic, tt_global$parameter, tt_global$p.value) 24 | names (tt_global) <- c ("statistic", "parameter", "p.value") 25 | 26 | tt_cl <- vapply ( 27 | sort (unique (scl$tree$cluster)), 28 | function (i) { 29 | index <- which (edges_in$cluster == i) 30 | if (length (index) <= 3) { 31 | res <- rep (NA, 3L) 32 | } else { 33 | tt <- stats::t.test (edges_in$d [index], tree$d, 34 | alternative = "less", 35 | var.equal = TRUE 36 | ) 37 | res <- c (tt$statistic, tt$parameter, tt$p.value) 38 | } 39 | names (res) <- c ("statistic", "parameter", "p.value") 40 | return (res) 41 | }, 42 | numeric (3) 43 | ) |> 44 | t () 45 | 46 | scl$statistics <- list (tt_global = tt_global, tt_clusters = tt_cl) 47 | 48 | return (scl) 49 | } 50 | -------------------------------------------------------------------------------- /R/tree.R: -------------------------------------------------------------------------------- 1 | #' scl_spantree_ord1 2 | #' 3 | #' Generate a spanning tree from first-order relationships expressed via a set 4 | #' of edges 5 | #' 6 | #' @param edges A set of edges resulting from \link{scl_edges}, which are sorted 7 | #' in ascending order according to user-specified data. The only aspect of that 8 | #' data which affect tree construction is this order, so only the set of 9 | #' \code{edges} are needed here 10 | #' 11 | #' @return A tree 12 | #' @noRd 13 | scl_spantree_ord1 <- function (edges) { 14 | 15 | tree <- rcpp_mst (edges) |> 16 | dplyr::arrange (from, to) |> 17 | tibble::tibble () 18 | 19 | return (tree) 20 | } 21 | 22 | #' scl_spantree_slk 23 | #' 24 | #' Generate a spanning tree from full-order, single linkage clustering (SLK) 25 | #' relationships expressed via a set of edges 26 | #' 27 | #' @param edges_all A set of ALL edges resulting from \link{scl_edges_all}, 28 | #' which are sorted in ascending order according to user-specified data. 29 | #' @param edges_nn A equivalent set of nearest neighbour edges only, resulting 30 | #' from \link{scl_edges_tri} or \link{scl_edges_nn}. 31 | #' 32 | #' @return A tree 33 | #' @noRd 34 | scl_spantree_slk <- function (edges_all, edges_nn, shortest, quiet = FALSE) { 35 | 36 | clusters <- rcpp_slk (edges_all, edges_nn, 37 | shortest = shortest, quiet = quiet 38 | ) + 1 39 | 40 | tibble::tibble ( 41 | from = edges_nn$from [clusters], 42 | to = edges_nn$to [clusters] 43 | ) 44 | } 45 | 46 | #' scl_spantree_alk 47 | #' 48 | #' Generate a spanning tree from full-order, average linkage clustering (ALK) 49 | #' relationships expressed via a set of edges 50 | #' 51 | #' @inheritParams scl_spantree_slk 52 | #' @noRd 53 | scl_spantree_alk <- function (edges, shortest, quiet = FALSE) { 54 | 55 | clusters <- rcpp_alk (edges, shortest = shortest, quiet = quiet) + 1 56 | tibble::tibble ( 57 | from = edges$from [clusters], 58 | to = edges$to [clusters] 59 | ) 60 | } 61 | 62 | #' scl_spantree_clk 63 | #' 64 | #' Generate a spanning tree from full-order, complete linkage clustering (CLK) 65 | #' relationships expressed via a set of edges 66 | #' 67 | #' @inheritParams scl_spantree_slk 68 | #' @noRd 69 | scl_spantree_clk <- function (edges_all, edges_nn, shortest, quiet = FALSE) { 70 | 71 | clusters <- rcpp_clk (edges_all, edges_nn, 72 | shortest = shortest, quiet = quiet 73 | ) + 1 74 | 75 | tibble::tibble ( 76 | from = edges_nn$from [clusters], 77 | to = edges_nn$to [clusters] 78 | ) 79 | } 80 | 81 | #' scl_cuttree 82 | #' 83 | #' Cut a tree generated with \link{scl_spantree} into a specified number of 84 | #' clusters or components 85 | #' 86 | #' @param tree result of \link{scl_spantree} 87 | #' @param edges A set of edges resulting from \link{scl_edges}, but with 88 | #' additional data specifying edge weights, distances, or desired properties 89 | #' from which to construct the tree 90 | #' @inheritParams scl_redcap 91 | #' 92 | #' @return Modified version of \code{tree}, including an additional column 93 | #' specifying the cluster number of each edge, with NA for edges that lie 94 | #' between clusters. 95 | #' 96 | #' @note The \code{rcpp_cut_tree} routine in \code{src/cuttree} includes 97 | #' \code{constexpr MIN_CLUSTER_SIZE = 3}. 98 | #' 99 | #' @noRd 100 | scl_cuttree <- function (tree, edges, ncl, shortest, 101 | iterate_ncl = FALSE, quiet = FALSE) { 102 | 103 | num_clusters <- 0 104 | ncl_trial <- ncl 105 | 106 | quiet <- !(!quiet & nrow (tree) > 100) 107 | 108 | while (num_clusters < ncl) { 109 | 110 | if (num_clusters > 0 && !quiet) { 111 | message ("Not enough clusters found; re-starting search.") 112 | } 113 | 114 | tree_temp <- dplyr::left_join (tree, edges, by = c ("from", "to")) 115 | tree_temp$cluster <- rcpp_cut_tree ( 116 | tree_temp, 117 | ncl = ncl_trial, 118 | shortest = shortest, 119 | quiet = quiet 120 | ) + 1L 121 | num_clusters <- length (which (table (tree_temp$cluster) > 2)) 122 | if (!quiet) { 123 | message ("Total clusters found with > 2 members: ", num_clusters) 124 | } 125 | ncl_trial <- ncl_trial + 1 126 | if (ncl_trial >= nrow (tree) || iterate_ncl) { 127 | break 128 | } 129 | } 130 | 131 | return (tree_temp) 132 | } 133 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #' scl_tbl 2 | #' 3 | #' Convert anything to a tibble 4 | #' 5 | #' @param xy A rectangular object containing the coordinates 6 | #' @return A tibble-ified version of the input, with coordinate columns 7 | #' identified and re-labelled "x" and "y" 8 | #' @noRd 9 | scl_tbl <- function (xy) { 10 | if (!inherits (xy, "data.frame")) { 11 | if (!is.numeric (xy)) { 12 | stop ("coordinates must be numeric") 13 | } 14 | if (is.vector (xy)) { 15 | stop ("coordinates require at least 2 columns") 16 | } 17 | xy <- data.frame (xy) 18 | } 19 | xi <- grep ("^x|^lon", names (xy), ignore.case = TRUE) 20 | yi <- grep ("^y|^lat", names (xy), ignore.case = TRUE) 21 | if (length (xi) == 1L && length (yi) == 1L) { 22 | names (xy) [xi] <- "x" 23 | names (xy) [yi] <- "y" 24 | } else if (ncol (xy) == 2) { 25 | colnames (xy) <- c ("x", "y") 26 | } else { 27 | stop ("Cannot determine unambiguous coordinate columns") 28 | } 29 | tibble::as_tibble (xy) 30 | } 31 | 32 | #' scl_linkage_type 33 | #' 34 | #' Convert \code{linkage} string arg to matching type 35 | #' @param linkage Type of linkage 36 | #' @return Strict match to one of three options 37 | #' @noRd 38 | scl_linkage_type <- function (linkage) { 39 | linkages <- c ("single", "average", "complete", "full") 40 | i <- grep (linkage, linkages, ignore.case = TRUE) 41 | if (length (i) == 0L) { 42 | stop ("linkage must be one of (single, average, complete, full)") 43 | } 44 | 45 | return (linkages [i]) 46 | } 47 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function (libname, pkgname) { # nolint 2 | 3 | # make data set names global to avoid CHECK notes 4 | utils::globalVariables (".") 5 | utils::globalVariables ("i") 6 | utils::globalVariables ("x") 7 | utils::globalVariables ("y") 8 | utils::globalVariables ("d") 9 | utils::globalVariables ("id") 10 | utils::globalVariables ("v") 11 | utils::globalVariables ("V1") 12 | utils::globalVariables ("V2") 13 | utils::globalVariables ("V3") 14 | utils::globalVariables ("from") 15 | utils::globalVariables ("to") 16 | utils::globalVariables ("clnum") 17 | utils::globalVariables ("cluster") 18 | utils::globalVariables ("merged") 19 | utils::globalVariables ("comp") 20 | utils::globalVariables ("value") 21 | utils::globalVariables ("na.omit") 22 | utils::globalVariables ("xfr") 23 | utils::globalVariables ("xto") 24 | utils::globalVariables ("yfr") 25 | utils::globalVariables ("yto") 26 | utils::globalVariables ("ind") 27 | 28 | invisible () 29 | } 30 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set ( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-" 12 | ) 13 | ``` 14 | 15 | [![R build status](https://github.com/mpadge/spatialcluster/workflows/R-CMD-check/badge.svg)](https://github.com/mpadge/spatialcluster/actions?query=workflow%3AR-CMD-check) 16 | [![Project Status: WIP](http://www.repostatus.org/badges/latest/wip.svg)](http://www.repostatus.org/#wip) 17 | [![codecov](https://codecov.io/gh/mpadge/spatialcluster/branch/master/graph/badge.svg)](https://codecov.io/gh/mpadge/spatialcluster) 18 | 19 | # spatialcluster 20 | 21 | An **R** package for spatially-constrained clustering using either distance or 22 | covariance matrices. "*Spatially-constrained*" means that the data from which 23 | clusters are to be formed also map on to spatial coordinates, and the 24 | constraint is that clusters must be spatially contiguous. 25 | 26 | The package includes both an implementation of the 27 | REDCAP collection of efficient yet approximate algorithms described in [D. Guo's 28 | 2008 paper, "Regionalization with dynamically constrained agglomerative 29 | clustering and 30 | partitioning."](https://www.tandfonline.com/doi/abs/10.1080/13658810701674970) 31 | (pdf available 32 | [here](https://pdfs.semanticscholar.org/ead1/7df8aaa1aed0e433b3ae1ec1ec5c7e785b2b.pdf)), 33 | with extension to covariance matrices, and a new technique for computing 34 | clusters using complete data sets. The package is also designed to analyse 35 | matrices of spatial interactions (counts, densities) between sets of origin and 36 | destination points. The spatial structure of interaction matrices is able to be 37 | statistically analysed to yield both global statistics for the overall spatial 38 | structure, and local statistics for individual clusters. 39 | 40 | 41 | ## Installation 42 | 43 | The easiest way to install `spatialcluster` is be enabling the [corresponding 44 | `r-universe`](https://mpadge.r-universe.dev/): 45 | 46 | ```{r r-univ, eval = FALSE} 47 | options (repos = c ( 48 | mpadge = "https://mpadge.r-universe.dev", 49 | CRAN = "https://cloud.r-project.org" 50 | )) 51 | ``` 52 | 53 | The package can then be installed as usual with, 54 | 55 | ```{r install, eval = FALSE} 56 | install.packges ("spatialcluster") 57 | ``` 58 | 59 | Alternatively, the package can also be installed using any of the following 60 | options: 61 | 62 | ```{r gh-installation, eval = FALSE} 63 | # install.packages("remotes") 64 | remotes::install_git ("https://codeberg.org/mpadge/spatialcluster") 65 | remotes::install_git ("https://git.sr.ht/~mpadge/spatialcluster") 66 | remotes::install_bitbucket ("mpadge/spatialcluster") 67 | remotes::install_gitlab ("mpadge/spatialcluster") 68 | remotes::install_github ("mpadge/spatialcluster") 69 | ``` 70 | 71 | ## Usage 72 | 73 | The two main functions, `scl_redcap()` and `scl_full()`, implement different 74 | algorithms for spatial clustering. The former implements the REDCAP collection 75 | of efficient yet approximate algorithms described in [D. Guo's 2008 paper, 76 | "Regionalization with dynamically constrained agglomerative clustering and 77 | partitioning."](https://www.tandfonline.com/doi/abs/10.1080/13658810701674970) 78 | (pdf available 79 | [here](https://pdfs.semanticscholar.org/ead1/7df8aaa1aed0e433b3ae1ec1ec5c7e785b2b.pdf)), 80 | with extension here to apply clustering to covariance matrices. These 81 | algorithms are computationally efficient yet generate only *approximate* 82 | estimates of underlying clusters. The second function, `scl_full()`, trades 83 | computational efficiency for accuracy, through generating clustering schemes 84 | using all available data. 85 | 86 | In short: 87 | 88 | - `scl_full()` should always be preferred as long as it returns results within 89 | a reasonable amount of time 90 | - `scl_redcap()` should be used only where data are too large for `scl_full()` 91 | to be run in a reasonable time. 92 | 93 | For clustering a group of `n` points, both of these functions require three 94 | main arguments: 95 | 96 | 1. A rectangular matrix of spatial coordinates of points to be clustered (`n` 97 | rows; at least 2 columns); 98 | 2. An `n`-by-`n` square matrix quantifying relationships between those points; 99 | 3. A single value (`ncl`) specifying the desired number of clusters. 100 | 101 | The following code demonstrates usage with randomly-generated data: 102 | ```{r} 103 | set.seed (1) 104 | n <- 100 105 | xy <- matrix (runif (2 * n), ncol = 2) 106 | dmat <- matrix (runif (n^2), ncol = n) 107 | ``` 108 | 109 | The load the package and call the function: 110 | 111 | ```{r full-single, echo = TRUE, eval = TRUE} 112 | library (spatialcluster) 113 | scl <- scl_full (xy, dmat, ncl = 8) 114 | plot (scl) 115 | ``` 116 | 117 | Both functions return a `list` with the following components: 118 | 119 | ```{r list-components} 120 | names (scl) 121 | ``` 122 | 123 | - `tree` details distances and cluster numbers for all pairwise comparisons 124 | between objects. 125 | - `merges` details increasing distances at which each pair of objects was 126 | merged into a single cluster. 127 | - `ord` provides the order of the merges (for `scl_full()` only). 128 | - `nodes` records the spatial coordinates of each point (node) of the input 129 | data. 130 | - `pars` retains the parameters used to call the clustering function. 131 | - `statsitics` returns the clustering statistics, both for individual clusters 132 | and an overall global statistic for the clustering scheme as a whole. 133 | 134 | See the "_Get Started_" vignette for more details. 135 | 136 | ## A Cautionary Note 137 | 138 | The following plot compares the results of applying four different clustering 139 | algorithms to the same data. 140 | 141 | ```{r cautionary, eval = TRUE, fig.width = 7, fig.height = 7} 142 | library (ggplot2) 143 | library (gridExtra) 144 | scl <- scl_full (xy, dmat, ncl = 8, linkage = "single") 145 | p1 <- plot (scl) + ggtitle ("full-single") 146 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "single") 147 | p2 <- plot (scl) + ggtitle ("redcap-single") 148 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "average") 149 | p3 <- plot (scl) + ggtitle ("redcap-average") 150 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "complete") 151 | p4 <- plot (scl) + ggtitle ("redcap-complete") 152 | 153 | grid.arrange (p1, p2, p3, p4, ncol = 2) 154 | ``` 155 | 156 | 157 | This example illustrates the universal danger in all clustering algorithms: they 158 | can not fail to produce results, even when the data fed to them are definitely 159 | devoid of any information as in this example. Clustering algorithms should only 160 | be applied to reflect a very specific hypothesis for why data should be 161 | clustered in the first place; spatial clustering algorithms should only be 162 | applied to reflect two very specific hypothesis for (i) why data should be 163 | clustered at all, and (ii) why those clusters should manifest a spatial 164 | pattern. 165 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | [![R build 5 | status](https://github.com/mpadge/spatialcluster/workflows/R-CMD-check/badge.svg)](https://github.com/mpadge/spatialcluster/actions?query=workflow%3AR-CMD-check) 6 | [![Project Status: 7 | WIP](http://www.repostatus.org/badges/latest/wip.svg)](http://www.repostatus.org/#wip) 8 | [![codecov](https://codecov.io/gh/mpadge/spatialcluster/branch/master/graph/badge.svg)](https://codecov.io/gh/mpadge/spatialcluster) 9 | 10 | # spatialcluster 11 | 12 | An **R** package for spatially-constrained clustering using either 13 | distance or covariance matrices. “*Spatially-constrained*” means that 14 | the data from which clusters are to be formed also map on to spatial 15 | coordinates, and the constraint is that clusters must be spatially contiguous. 16 | 17 | The package includes both an implementation of the REDCAP collection of 18 | efficient yet approximate algorithms described in [D. Guo’s 2008 paper, 19 | “Regionalization with dynamically constrained agglomerative clustering 20 | and 21 | partitioning.”](https://www.tandfonline.com/doi/abs/10.1080/13658810701674970) 22 | (pdf available 23 | [here](https://pdfs.semanticscholar.org/ead1/7df8aaa1aed0e433b3ae1ec1ec5c7e785b2b.pdf)), 24 | with extension to covariance matrices, and a new technique for computing 25 | clusters using complete data sets. The package is also designed to 26 | analyse matrices of spatial interactions (counts, densities) between 27 | sets of origin and destination points. The spatial structure of 28 | interaction matrices is able to be statistically analysed to yield both 29 | global statistics for the overall spatial structure, and local 30 | statistics for individual clusters. 31 | 32 | ## Installation 33 | 34 | The easiest way to install `spatialcluster` is be enabling the 35 | [corresponding `r-universe`](https://mpadge.r-universe.dev/): 36 | 37 | ``` r 38 | options (repos = c ( 39 | mpadge = "https://mpadge.r-universe.dev", 40 | CRAN = "https://cloud.r-project.org" 41 | )) 42 | ``` 43 | 44 | The package can then be installed as usual with, 45 | 46 | ``` r 47 | install.packges ("spatialcluster") 48 | ``` 49 | 50 | Alternatively, the package can also be installed using any of the 51 | following options: 52 | 53 | ``` r 54 | # install.packages("remotes") 55 | remotes::install_git ("https://codeberg.org/mpadge/spatialcluster") 56 | remotes::install_git ("https://git.sr.ht/~mpadge/spatialcluster") 57 | remotes::install_bitbucket ("mpadge/spatialcluster") 58 | remotes::install_gitlab ("mpadge/spatialcluster") 59 | remotes::install_github ("mpadge/spatialcluster") 60 | ``` 61 | 62 | ## Usage 63 | 64 | The two main functions, `scl_redcap()` and `scl_full()`, implement 65 | different algorithms for spatial clustering. The former implements the 66 | REDCAP collection of efficient yet approximate algorithms described in 67 | [D. Guo’s 2008 paper, “Regionalization with dynamically constrained 68 | agglomerative clustering and 69 | partitioning.”](https://www.tandfonline.com/doi/abs/10.1080/13658810701674970) 70 | (pdf available 71 | [here](https://pdfs.semanticscholar.org/ead1/7df8aaa1aed0e433b3ae1ec1ec5c7e785b2b.pdf)), 72 | with extension here to apply clustering to covariance matrices. These 73 | algorithms are computationally efficient yet generate only *approximate* 74 | estimates of underlying clusters. The second function, `scl_full()`, 75 | trades computational efficiency for accuracy, through generating 76 | clustering schemes using all available data. 77 | 78 | In short: 79 | 80 | - `scl_full()` should always be preferred as long as it returns results 81 | within a reasonable amount of time 82 | - `scl_redcap()` should be used only where data are too large for 83 | `scl_full()` to be run in a reasonable time. 84 | 85 | For clustering a group of `n` points, both of these functions require three 86 | main arguments: 87 | 88 | 1. A rectangular matrix of spatial coordinates of points to be clustered 89 | (`n` rows; at least 2 columns); 90 | 2. An `n`-by-`n` square matrix quantifying relationships between those 91 | points; 92 | 3. A single value (`ncl`) specifying the desired number of clusters. 93 | 94 | The following code demonstrates usage with randomly-generated data: 95 | 96 | ``` r 97 | set.seed (1) 98 | n <- 100 99 | xy <- matrix (runif (2 * n), ncol = 2) 100 | dmat <- matrix (runif (n^2), ncol = n) 101 | ``` 102 | 103 | The load the package and call the function: 104 | 105 | ``` r 106 | library (spatialcluster) 107 | scl <- scl_full (xy, dmat, ncl = 8) 108 | plot (scl) 109 | ``` 110 | 111 | ![](man/figures/README-full-single-1.png) 112 | 113 | Both functions return a `list` with the following components: 114 | 115 | ``` r 116 | names (scl) 117 | #> [1] "tree" "merges" "ord" "nodes" "pars" 118 | #> [6] "statistics" 119 | ``` 120 | 121 | - `tree` details distances and cluster numbers for all pairwise 122 | comparisons between objects. 123 | - `merges` details increasing distances at which each pair of objects 124 | was merged into a single cluster. 125 | - `ord` provides the order of the merges (for `scl_full()` only). 126 | - `nodes` records the spatial coordinates of each point (node) of the 127 | input data. 128 | - `pars` retains the parameters used to call the clustering function. 129 | - `statsitics` returns the clustering statistics, both for individual 130 | clusters and an overall global statistic for the clustering scheme as 131 | a whole. 132 | 133 | See the "_Get Started_" vignette for more details. 134 | 135 | ## A Cautionary Note 136 | 137 | The following plot compares the results of applying four different 138 | clustering algorithms to the same data. 139 | 140 | ``` r 141 | library (ggplot2) 142 | library (gridExtra) 143 | scl <- scl_full (xy, dmat, ncl = 8, linkage = "single") 144 | p1 <- plot (scl) + ggtitle ("full-single") 145 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "single") 146 | p2 <- plot (scl) + ggtitle ("redcap-single") 147 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "average") 148 | p3 <- plot (scl) + ggtitle ("redcap-average") 149 | scl <- scl_redcap (xy, dmat, ncl = 8, linkage = "complete") 150 | p4 <- plot (scl) + ggtitle ("redcap-complete") 151 | 152 | grid.arrange (p1, p2, p3, p4, ncol = 2) 153 | ``` 154 | 155 | ![](man/figures/README-cautionary-1.png) 156 | 157 | This example illustrates the universal danger in all clustering 158 | algorithms: they can not fail to produce results, even when the data fed 159 | to them are definitely devoid of any information as in this example. 160 | Clustering algorithms should only be applied to reflect a very specific 161 | hypothesis for why data should be clustered in the first place; spatial 162 | clustering algorithms should only be applied to reflect two very 163 | specific hypothesis for (i) why data should be clustered at all, and 164 | (ii) why those clusters should manifest a spatial pattern. 165 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | title: spatialcluster 2 | url: https://mpadge.github.io/spatialcluster 3 | 4 | template: 5 | params: 6 | bootswatch: cerulean 7 | 8 | reference: 9 | - title: Package 10 | contents: 11 | - has_concept("package") 12 | - title: Clustering Functions 13 | contents: 14 | - has_concept("clustering_fns") 15 | - title: Plotting Functions 16 | contents: 17 | - has_concept("plot_fns") 18 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", 3 | "@type": "SoftwareSourceCode", 4 | "identifier": "spatialcluster", 5 | "description": "R port of redcap (Regionalization with dynamically constrained agglomerative clustering and partitioning).", 6 | "name": "spatialcluster: R port of redcap", 7 | "codeRepository": "https://github.com/mpadge/spatialcluster", 8 | "issueTracker": "https://github.com/mpadge/spatialcluster/issues", 9 | "license": "https://spdx.org/licenses/GPL-3.0", 10 | "version": "0.2.0.017", 11 | "programmingLanguage": { 12 | "@type": "ComputerLanguage", 13 | "name": "R", 14 | "url": "https://r-project.org" 15 | }, 16 | "runtimePlatform": "R version 4.4.2 (2024-10-31)", 17 | "author": [ 18 | { 19 | "@type": "Person", 20 | "givenName": "Mark", 21 | "familyName": "Padgham", 22 | "email": "mark.padgham@email.com" 23 | } 24 | ], 25 | "maintainer": [ 26 | { 27 | "@type": "Person", 28 | "givenName": "Mark", 29 | "familyName": "Padgham", 30 | "email": "mark.padgham@email.com" 31 | } 32 | ], 33 | "softwareSuggestions": [ 34 | { 35 | "@type": "SoftwareApplication", 36 | "identifier": "dbscan", 37 | "name": "dbscan", 38 | "provider": { 39 | "@id": "https://cran.r-project.org", 40 | "@type": "Organization", 41 | "name": "Comprehensive R Archive Network (CRAN)", 42 | "url": "https://cran.r-project.org" 43 | }, 44 | "sameAs": "https://CRAN.R-project.org/package=dbscan" 45 | }, 46 | { 47 | "@type": "SoftwareApplication", 48 | "identifier": "knitr", 49 | "name": "knitr", 50 | "provider": { 51 | "@id": "https://cran.r-project.org", 52 | "@type": "Organization", 53 | "name": "Comprehensive R Archive Network (CRAN)", 54 | "url": "https://cran.r-project.org" 55 | }, 56 | "sameAs": "https://CRAN.R-project.org/package=knitr" 57 | }, 58 | { 59 | "@type": "SoftwareApplication", 60 | "identifier": "rmarkdown", 61 | "name": "rmarkdown", 62 | "provider": { 63 | "@id": "https://cran.r-project.org", 64 | "@type": "Organization", 65 | "name": "Comprehensive R Archive Network (CRAN)", 66 | "url": "https://cran.r-project.org" 67 | }, 68 | "sameAs": "https://CRAN.R-project.org/package=rmarkdown" 69 | }, 70 | { 71 | "@type": "SoftwareApplication", 72 | "identifier": "roxygen2", 73 | "name": "roxygen2", 74 | "provider": { 75 | "@id": "https://cran.r-project.org", 76 | "@type": "Organization", 77 | "name": "Comprehensive R Archive Network (CRAN)", 78 | "url": "https://cran.r-project.org" 79 | }, 80 | "sameAs": "https://CRAN.R-project.org/package=roxygen2" 81 | }, 82 | { 83 | "@type": "SoftwareApplication", 84 | "identifier": "testthat", 85 | "name": "testthat", 86 | "provider": { 87 | "@id": "https://cran.r-project.org", 88 | "@type": "Organization", 89 | "name": "Comprehensive R Archive Network (CRAN)", 90 | "url": "https://cran.r-project.org" 91 | }, 92 | "sameAs": "https://CRAN.R-project.org/package=testthat" 93 | } 94 | ], 95 | "softwareRequirements": { 96 | "1": { 97 | "@type": "SoftwareApplication", 98 | "identifier": "R", 99 | "name": "R", 100 | "version": ">= 4.1.0" 101 | }, 102 | "2": { 103 | "@type": "SoftwareApplication", 104 | "identifier": "alphahull", 105 | "name": "alphahull", 106 | "provider": { 107 | "@id": "https://cran.r-project.org", 108 | "@type": "Organization", 109 | "name": "Comprehensive R Archive Network (CRAN)", 110 | "url": "https://cran.r-project.org" 111 | }, 112 | "sameAs": "https://CRAN.R-project.org/package=alphahull" 113 | }, 114 | "3": { 115 | "@type": "SoftwareApplication", 116 | "identifier": "dplyr", 117 | "name": "dplyr", 118 | "provider": { 119 | "@id": "https://cran.r-project.org", 120 | "@type": "Organization", 121 | "name": "Comprehensive R Archive Network (CRAN)", 122 | "url": "https://cran.r-project.org" 123 | }, 124 | "sameAs": "https://CRAN.R-project.org/package=dplyr" 125 | }, 126 | "4": { 127 | "@type": "SoftwareApplication", 128 | "identifier": "ggplot2", 129 | "name": "ggplot2", 130 | "provider": { 131 | "@id": "https://cran.r-project.org", 132 | "@type": "Organization", 133 | "name": "Comprehensive R Archive Network (CRAN)", 134 | "url": "https://cran.r-project.org" 135 | }, 136 | "sameAs": "https://CRAN.R-project.org/package=ggplot2" 137 | }, 138 | "5": { 139 | "@type": "SoftwareApplication", 140 | "identifier": "ggthemes", 141 | "name": "ggthemes", 142 | "provider": { 143 | "@id": "https://cran.r-project.org", 144 | "@type": "Organization", 145 | "name": "Comprehensive R Archive Network (CRAN)", 146 | "url": "https://cran.r-project.org" 147 | }, 148 | "sameAs": "https://CRAN.R-project.org/package=ggthemes" 149 | }, 150 | "6": { 151 | "@type": "SoftwareApplication", 152 | "identifier": "methods", 153 | "name": "methods" 154 | }, 155 | "7": { 156 | "@type": "SoftwareApplication", 157 | "identifier": "Rcpp", 158 | "name": "Rcpp", 159 | "version": ">= 0.12.6", 160 | "provider": { 161 | "@id": "https://cran.r-project.org", 162 | "@type": "Organization", 163 | "name": "Comprehensive R Archive Network (CRAN)", 164 | "url": "https://cran.r-project.org" 165 | }, 166 | "sameAs": "https://CRAN.R-project.org/package=Rcpp" 167 | }, 168 | "8": { 169 | "@type": "SoftwareApplication", 170 | "identifier": "tibble", 171 | "name": "tibble", 172 | "provider": { 173 | "@id": "https://cran.r-project.org", 174 | "@type": "Organization", 175 | "name": "Comprehensive R Archive Network (CRAN)", 176 | "url": "https://cran.r-project.org" 177 | }, 178 | "sameAs": "https://CRAN.R-project.org/package=tibble" 179 | }, 180 | "9": { 181 | "@type": "SoftwareApplication", 182 | "identifier": "tripack", 183 | "name": "tripack", 184 | "provider": { 185 | "@id": "https://cran.r-project.org", 186 | "@type": "Organization", 187 | "name": "Comprehensive R Archive Network (CRAN)", 188 | "url": "https://cran.r-project.org" 189 | }, 190 | "sameAs": "https://CRAN.R-project.org/package=tripack" 191 | }, 192 | "SystemRequirements": {} 193 | }, 194 | "fileSize": "17667.689KB", 195 | "readme": "https://github.com/mpadge/spatialcluster/blob/main/README.md", 196 | "contIntegration": [ 197 | "https://github.com/mpadge/spatialcluster/actions?query=workflow%3AR-CMD-check", 198 | "https://codecov.io/gh/mpadge/spatialcluster" 199 | ], 200 | "developmentStatus": "http://www.repostatus.org/#wip", 201 | "keywords": [ 202 | "clustering-algorithm", 203 | "cluster", 204 | "r", 205 | "spatial" 206 | ] 207 | } 208 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build check doc readme test 2 | 3 | all: doc readme build check 4 | 5 | build: doc 6 | R CMD build . 7 | 8 | #check: build 9 | # R CMD check spatialcluster*tar.gz 10 | 11 | clean: 12 | -rm -f spatialcluster*tar.gz 13 | -rm -fr spatialcluster.Rcheck 14 | -rm -fr src/*.{o,so} 15 | 16 | doc: clean 17 | Rscript -e 'devtools::document()' 18 | 19 | readme: 20 | Rscript -e 'rmarkdown::render("README.Rmd")' 21 | 22 | test: 23 | Rscript -e 'devtools::test()' 24 | 25 | check: 26 | Rscript -e 'library(pkgcheck); checks <- pkgcheck(); print(checks); summary (checks)' 27 | 28 | install: clean 29 | R CMD INSTALL . 30 | -------------------------------------------------------------------------------- /man/figures/README-cautionary-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpadge/spatialcluster/ae90bfc66f4b523c0c9988175810da5d3fe60986/man/figures/README-cautionary-1.png -------------------------------------------------------------------------------- /man/figures/README-full-single-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mpadge/spatialcluster/ae90bfc66f4b523c0c9988175810da5d3fe60986/man/figures/README-full-single-1.png -------------------------------------------------------------------------------- /man/plot.scl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot-fns.R 3 | \name{plot.scl} 4 | \alias{plot.scl} 5 | \title{plot.scl} 6 | \usage{ 7 | \method{plot}{scl}(x, ..., hull_alpha = 1) 8 | } 9 | \arguments{ 10 | \item{x}{object to be plotted} 11 | 12 | \item{...}{ignored here} 13 | 14 | \item{hull_alpha}{alpha value of (non-)convex hulls, with default generating 15 | a convex hull, and smaller values generating concave hulls. (See 16 | ?alphashape::ashape for details).} 17 | } 18 | \description{ 19 | plot.scl 20 | } 21 | \examples{ 22 | set.seed (1) 23 | n <- 100 24 | xy <- matrix (runif (2 * n), ncol = 2) 25 | dmat <- matrix (runif (n^2), ncol = n) 26 | scl <- scl_redcap (xy, dmat, ncl = 4) 27 | plot (scl) 28 | # Connect clusters according to highest (\code{shortest = FALSE}) values of 29 | # \code{dmat}: 30 | scl <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE, full_order = FALSE) 31 | plot (scl) 32 | } 33 | \seealso{ 34 | Other plot_fns: 35 | \code{\link{plot_merges}()} 36 | } 37 | \concept{plot_fns} 38 | -------------------------------------------------------------------------------- /man/plot_merges.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot-fns.R 3 | \name{plot_merges} 4 | \alias{plot_merges} 5 | \title{plot_merges} 6 | \usage{ 7 | plot_merges(x, root_tree = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{Object of class \code{scl} obtained with \code{method = "full"}.} 11 | 12 | \item{root_tree}{If \code{TRUE}, tree leaves are connected to bottom of plot, 13 | otherwise floating as determined by \link{plot.hclust}.} 14 | } 15 | \value{ 16 | Nothing (generates plot) 17 | } 18 | \description{ 19 | Plot dendrogram of merges for \code{scl} object with \code{method = "full"}. 20 | } 21 | \seealso{ 22 | Other plot_fns: 23 | \code{\link{plot.scl}()} 24 | } 25 | \concept{plot_fns} 26 | -------------------------------------------------------------------------------- /man/scl_full.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scl-full.R 3 | \name{scl_full} 4 | \alias{scl_full} 5 | \title{scl_full} 6 | \usage{ 7 | scl_full(xy, dmat, ncl, linkage = "single", shortest = TRUE, nnbs = 6L) 8 | } 9 | \arguments{ 10 | \item{xy}{Rectangular structure (matrix, data.frame, tibble), containing 11 | coordinates of points to be clustered.} 12 | 13 | \item{dmat}{Square structure (matrix, data.frame, tibble) containing 14 | distances or equivalent metrics between all points in \code{xy}. If \code{xy} 15 | has \code{n} rows, then \code{dat} must have \code{n} rows and \code{n} 16 | columns.} 17 | 18 | \item{ncl}{Desired number of clusters. See description of `ncl_iterate` 19 | parameter for conditions under which actual number may be less than this 20 | value.} 21 | 22 | \item{linkage}{Either \code{"single"} or \code{"average"}. For covariance 23 | clustering, use \code{"single"} with `shortest = FALSE`.} 24 | 25 | \item{shortest}{If \code{TRUE}, the \code{dmat} is interpreted as distances 26 | such that lower values are preferentially selected; if \code{FALSE}, then 27 | higher values of \code{dmat} are interpreted to indicate stronger 28 | relationships, as is the case for example with covariances.} 29 | 30 | \item{nnbs}{Number of nearest neighbours to be used in calculating clustering 31 | trees. Triangulation will be used if \code{nnbs <= 0}.} 32 | } 33 | \description{ 34 | Full spatially-constrained clustering. 35 | } 36 | \examples{ 37 | n <- 100 38 | xy <- matrix (runif (2 * n), ncol = 2) 39 | dmat <- matrix (runif (n^2), ncol = n) 40 | scl <- scl_full (xy, dmat, ncl = 4) 41 | } 42 | \seealso{ 43 | Other clustering_fns: 44 | \code{\link{scl_recluster}()}, 45 | \code{\link{scl_redcap}()} 46 | } 47 | \concept{clustering_fns} 48 | -------------------------------------------------------------------------------- /man/scl_recluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scl-redcap.R 3 | \name{scl_recluster} 4 | \alias{scl_recluster} 5 | \title{scl_reccluster} 6 | \usage{ 7 | scl_recluster(scl, ncl, shortest = TRUE, quiet = FALSE) 8 | } 9 | \arguments{ 10 | \item{scl}{An \code{scl} object returned from \link{scl_redcap}.} 11 | 12 | \item{ncl}{Desired number of clusters. See description of `ncl_iterate` 13 | parameter for conditions under which actual number may be less than this 14 | value.} 15 | 16 | \item{shortest}{If \code{TRUE}, the \code{dmat} is interpreted as distances 17 | such that lower values are preferentially selected; if \code{FALSE}, then 18 | higher values of \code{dmat} are interpreted to indicate stronger 19 | relationships, as is the case for example with covariances.} 20 | 21 | \item{quiet}{If `FALSE` (default), display progress information on screen.} 22 | } 23 | \value{ 24 | Modified \code{scl} object in which \code{tree} is re-cut into 25 | \code{ncl} clusters. 26 | } 27 | \description{ 28 | Re-cut a spatial cluster tree (\code{scl}) at a different number of clusters. 29 | } 30 | \examples{ 31 | n <- 100 32 | xy <- matrix (runif (2 * n), ncol = 2) 33 | dmat <- matrix (runif (n^2), ncol = n) 34 | scl <- scl_redcap (xy, dmat, ncl = 4) 35 | plot (scl) 36 | scl <- scl_recluster (scl, ncl = 5) 37 | plot (scl) 38 | 39 | } 40 | \seealso{ 41 | Other clustering_fns: 42 | \code{\link{scl_full}()}, 43 | \code{\link{scl_redcap}()} 44 | } 45 | \concept{clustering_fns} 46 | -------------------------------------------------------------------------------- /man/scl_redcap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scl-redcap.R 3 | \name{scl_redcap} 4 | \alias{scl_redcap} 5 | \title{scl_redcap} 6 | \usage{ 7 | scl_redcap( 8 | xy, 9 | dmat, 10 | ncl, 11 | full_order = TRUE, 12 | linkage = "single", 13 | shortest = TRUE, 14 | nnbs = 6L, 15 | iterate_ncl = FALSE, 16 | quiet = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{xy}{Rectangular structure (matrix, data.frame, tibble), containing 21 | coordinates of points to be clustered.} 22 | 23 | \item{dmat}{Square structure (matrix, data.frame, tibble) containing 24 | distances or equivalent metrics between all points in \code{xy}. If \code{xy} 25 | has \code{n} rows, then \code{dat} must have \code{n} rows and \code{n} 26 | columns.} 27 | 28 | \item{ncl}{Desired number of clusters. See description of `ncl_iterate` 29 | parameter for conditions under which actual number may be less than this 30 | value.} 31 | 32 | \item{full_order}{If \code{FALSE}, build spanning trees from first-order 33 | relationships only, otherwise build from full-order relationships (see Note).} 34 | 35 | \item{linkage}{One of \code{"single"}, \code{"average"}, or 36 | \code{"complete"}; see Note.} 37 | 38 | \item{shortest}{If \code{TRUE}, the \code{dmat} is interpreted as distances 39 | such that lower values are preferentially selected; if \code{FALSE}, then 40 | higher values of \code{dmat} are interpreted to indicate stronger 41 | relationships, as is the case for example with covariances.} 42 | 43 | \item{nnbs}{Number of nearest neighbours to be used in calculating clustering 44 | trees. Triangulation will be used if \code{nnbs <= 0}.} 45 | 46 | \item{iterate_ncl}{Actual numbers of clusters found may be less than the 47 | specified value of `ncl`, because clusters formed from < 3 edges are removed. 48 | If `iterate_ncl = FALSE` (the default), the value is returned with whatever 49 | number of actual clusters is found. Setting this parameter to `TRUE` forces 50 | the algorithm to iterate until the exact number of clusters has been found. 51 | For large data sets, this may result in considerable longer calculation 52 | times.} 53 | 54 | \item{quiet}{If `FALSE` (default), display progress information on screen.} 55 | } 56 | \value{ 57 | A object of class \code{scl} with \code{tree} containing the 58 | clustering scheme, and \code{xy} the original coordinate data of the 59 | clustered points. An additional component, \code{tree_rest}, enables the tree 60 | to be re-cut to a different number of clusters via \link{scl_recluster}, 61 | rather than calculating clusters anew. 62 | } 63 | \description{ 64 | Cluster spatial data with REDCAP (REgionalization with Dynamically 65 | Constrained Agglomerative clustering and Partitioning) routines. 66 | } 67 | \note{ 68 | Please refer to the original REDCAP paper ('Regionalization with 69 | dynamically constrained agglomerative clustering and partitioning (REDCAP)', 70 | by D. Guo (2008), Int.J.Geo.Inf.Sci 22:801-823) for details of the 71 | \code{full_order} and \code{linkage} parameters. This paper clearly 72 | demonstrates the general inferiority of spanning trees constructed from 73 | first-order relationships. It is therefore strongly recommended that the 74 | default \code{full_order = TRUE} be used at all times. 75 | } 76 | \examples{ 77 | n <- 100 78 | xy <- matrix (runif (2 * n), ncol = 2) 79 | dmat <- matrix (runif (n^2), ncol = n) 80 | scl <- scl_redcap (xy, dmat, ncl = 4) 81 | # Those clusters will by default be constructed by connecting edges with the 82 | # lowest (\code{shortest}) values of \code{dmat}, and will differ from 83 | scl <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE) 84 | # using 'full_order = FALSE' constructs clusters from first-order 85 | # relationships only; not recommended, but possible nevertheless: 86 | scl <- scl_redcap (xy, dmat, ncl = 4, full_order = FALSE) 87 | 88 | } 89 | \seealso{ 90 | Other clustering_fns: 91 | \code{\link{scl_full}()}, 92 | \code{\link{scl_recluster}()} 93 | } 94 | \concept{clustering_fns} 95 | -------------------------------------------------------------------------------- /man/spatialcluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spatialcluster-package.R 3 | \docType{package} 4 | \name{spatialcluster} 5 | \alias{spatialcluster-package} 6 | \alias{spatialcluster} 7 | \title{spatialcluster.} 8 | \description{ 9 | R port of redcap (Regionalization with dynamically constrained agglomerative 10 | clustering and partitioning). 11 | } 12 | \seealso{ 13 | Useful links: 14 | \itemize{ 15 | \item \url{https://github.com/mpadge/spatialcluster} 16 | \item Report bugs at \url{https://github.com/mpadge/spatialcluster/issues} 17 | } 18 | 19 | } 20 | \author{ 21 | \strong{Maintainer}: Mark Padgham \email{mark.padgham@email.com} 22 | 23 | } 24 | \concept{package} 25 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // rcpp_alk 15 | Rcpp::IntegerVector rcpp_alk(const Rcpp::DataFrame gr, const bool shortest, const bool quiet); 16 | RcppExport SEXP _spatialcluster_rcpp_alk(SEXP grSEXP, SEXP shortestSEXP, SEXP quietSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP); 21 | Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP); 22 | Rcpp::traits::input_parameter< const bool >::type quiet(quietSEXP); 23 | rcpp_result_gen = Rcpp::wrap(rcpp_alk(gr, shortest, quiet)); 24 | return rcpp_result_gen; 25 | END_RCPP 26 | } 27 | // rcpp_clk 28 | Rcpp::IntegerVector rcpp_clk(const Rcpp::DataFrame gr_full, const Rcpp::DataFrame gr, const bool shortest, const bool quiet); 29 | RcppExport SEXP _spatialcluster_rcpp_clk(SEXP gr_fullSEXP, SEXP grSEXP, SEXP shortestSEXP, SEXP quietSEXP) { 30 | BEGIN_RCPP 31 | Rcpp::RObject rcpp_result_gen; 32 | Rcpp::RNGScope rcpp_rngScope_gen; 33 | Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr_full(gr_fullSEXP); 34 | Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP); 35 | Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP); 36 | Rcpp::traits::input_parameter< const bool >::type quiet(quietSEXP); 37 | rcpp_result_gen = Rcpp::wrap(rcpp_clk(gr_full, gr, shortest, quiet)); 38 | return rcpp_result_gen; 39 | END_RCPP 40 | } 41 | // rcpp_cut_tree 42 | Rcpp::IntegerVector rcpp_cut_tree(const Rcpp::DataFrame tree, const int ncl, const bool shortest, const bool quiet); 43 | RcppExport SEXP _spatialcluster_rcpp_cut_tree(SEXP treeSEXP, SEXP nclSEXP, SEXP shortestSEXP, SEXP quietSEXP) { 44 | BEGIN_RCPP 45 | Rcpp::RObject rcpp_result_gen; 46 | Rcpp::RNGScope rcpp_rngScope_gen; 47 | Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type tree(treeSEXP); 48 | Rcpp::traits::input_parameter< const int >::type ncl(nclSEXP); 49 | Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP); 50 | Rcpp::traits::input_parameter< const bool >::type quiet(quietSEXP); 51 | rcpp_result_gen = Rcpp::wrap(rcpp_cut_tree(tree, ncl, shortest, quiet)); 52 | return rcpp_result_gen; 53 | END_RCPP 54 | } 55 | // rcpp_full_initial 56 | Rcpp::IntegerVector rcpp_full_initial(const Rcpp::DataFrame gr, bool shortest); 57 | RcppExport SEXP _spatialcluster_rcpp_full_initial(SEXP grSEXP, SEXP shortestSEXP) { 58 | BEGIN_RCPP 59 | Rcpp::RObject rcpp_result_gen; 60 | Rcpp::RNGScope rcpp_rngScope_gen; 61 | Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP); 62 | Rcpp::traits::input_parameter< bool >::type shortest(shortestSEXP); 63 | rcpp_result_gen = Rcpp::wrap(rcpp_full_initial(gr, shortest)); 64 | return rcpp_result_gen; 65 | END_RCPP 66 | } 67 | // rcpp_full_merge 68 | Rcpp::NumericMatrix rcpp_full_merge(const Rcpp::DataFrame gr, const std::string linkage, const bool shortest); 69 | RcppExport SEXP _spatialcluster_rcpp_full_merge(SEXP grSEXP, SEXP linkageSEXP, SEXP shortestSEXP) { 70 | BEGIN_RCPP 71 | Rcpp::RObject rcpp_result_gen; 72 | Rcpp::RNGScope rcpp_rngScope_gen; 73 | Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP); 74 | Rcpp::traits::input_parameter< const std::string >::type linkage(linkageSEXP); 75 | Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP); 76 | rcpp_result_gen = Rcpp::wrap(rcpp_full_merge(gr, linkage, shortest)); 77 | return rcpp_result_gen; 78 | END_RCPP 79 | } 80 | // rcpp_mst 81 | Rcpp::DataFrame rcpp_mst(Rcpp::DataFrame input); 82 | RcppExport SEXP _spatialcluster_rcpp_mst(SEXP inputSEXP) { 83 | BEGIN_RCPP 84 | Rcpp::RObject rcpp_result_gen; 85 | Rcpp::RNGScope rcpp_rngScope_gen; 86 | Rcpp::traits::input_parameter< Rcpp::DataFrame >::type input(inputSEXP); 87 | rcpp_result_gen = Rcpp::wrap(rcpp_mst(input)); 88 | return rcpp_result_gen; 89 | END_RCPP 90 | } 91 | // rcpp_slk 92 | Rcpp::IntegerVector rcpp_slk(const Rcpp::DataFrame gr_full, const Rcpp::DataFrame gr, const bool shortest, const bool quiet); 93 | RcppExport SEXP _spatialcluster_rcpp_slk(SEXP gr_fullSEXP, SEXP grSEXP, SEXP shortestSEXP, SEXP quietSEXP) { 94 | BEGIN_RCPP 95 | Rcpp::RObject rcpp_result_gen; 96 | Rcpp::RNGScope rcpp_rngScope_gen; 97 | Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr_full(gr_fullSEXP); 98 | Rcpp::traits::input_parameter< const Rcpp::DataFrame >::type gr(grSEXP); 99 | Rcpp::traits::input_parameter< const bool >::type shortest(shortestSEXP); 100 | Rcpp::traits::input_parameter< const bool >::type quiet(quietSEXP); 101 | rcpp_result_gen = Rcpp::wrap(rcpp_slk(gr_full, gr, shortest, quiet)); 102 | return rcpp_result_gen; 103 | END_RCPP 104 | } 105 | -------------------------------------------------------------------------------- /src/alk.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "utils.h" 3 | #include "alk.h" 4 | 5 | // --------- AVERAGE LINKAGE CLUSTER ---------------- 6 | 7 | void alk::alk_init (alk::ALKDat &alk_dat, 8 | BinarySearchTree &tree, 9 | Rcpp::IntegerVector from, 10 | Rcpp::IntegerVector to, 11 | Rcpp::NumericVector d) { 12 | 13 | size_t n = utils::sets_init (from, to, alk_dat.vert2index_map, 14 | alk_dat.index2vert_map, alk_dat.index2cl_map, 15 | alk_dat.cl2index_map); 16 | alk_dat.n = n; 17 | 18 | intset_t vert_set; 19 | 20 | // Get set of unique vertices, and store binary tree of edge distances 21 | for (int i = 0; i < from.size (); i++) { 22 | vert_set.emplace (from [i]); 23 | vert_set.emplace (to [i]); 24 | tree.insert (d [i]); 25 | } 26 | // Construct vert2index_map to map each unique vertex to an index 27 | index_t idx = 0; 28 | for (auto v: vert_set) { 29 | alk_dat.vert2index_map.emplace (v, idx++); 30 | } 31 | 32 | // Construct idx2edgewt_map and edgewt2idx_pair_map 33 | for (int i = 0; i < from.size (); i++) { 34 | index_t fi = alk_dat.vert2index_map.at (from [i]), 35 | ti = alk_dat.vert2index_map.at (to [i]); 36 | alk_dat.edgewt2idx_pair_map.emplace (d [i], std::make_pair (fi, ti)); 37 | 38 | // idx2edgewt_map.second is an unordered set that needs to be expanded 39 | std::unordered_set wtset; 40 | if (alk_dat.idx2edgewt_map.find (fi) == 41 | alk_dat.idx2edgewt_map.end ()) { 42 | wtset.emplace (d [i]); 43 | alk_dat.idx2edgewt_map.emplace (fi, wtset); 44 | } else { 45 | wtset = alk_dat.idx2edgewt_map.at (fi); 46 | wtset.emplace (d [i]); 47 | alk_dat.idx2edgewt_map [fi] = wtset; 48 | } 49 | 50 | // repeat same for "to" vertex 51 | if (alk_dat.idx2edgewt_map.find (ti) == 52 | alk_dat.idx2edgewt_map.end ()) { 53 | wtset.emplace (d [i]); 54 | alk_dat.idx2edgewt_map.emplace (ti, wtset); 55 | } else { 56 | wtset = alk_dat.idx2edgewt_map.at (ti); 57 | wtset.emplace (d [i]); 58 | alk_dat.idx2edgewt_map [ti] = wtset; 59 | } 60 | } 61 | 62 | arma::uword nu = static_cast (n); 63 | alk_dat.contig_mat = arma::zeros > (nu, nu); 64 | alk_dat.num_edges = arma::ones > (nu, nu); 65 | alk_dat.avg_dist.set_size (nu, nu); 66 | //alk_dat.avg_dist.fill (INFINITE_DOUBLE); 67 | alk_dat.avg_dist.fill (0.0); 68 | alk_dat.dmat.set_size (nu, nu); 69 | if (alk_dat.shortest) { 70 | alk_dat.dmat.fill (INFINITE_DOUBLE); 71 | } else { 72 | alk_dat.dmat.fill (-INFINITE_DOUBLE); 73 | } 74 | for (int i = 0; i < from.length (); i++) { 75 | arma::uword vf = static_cast ( 76 | alk_dat.vert2index_map.at (from [i])), 77 | vt = static_cast ( 78 | alk_dat.vert2index_map.at (to [i])); 79 | alk_dat.contig_mat (vf, vt) = 1; 80 | alk_dat.num_edges (vf, vt) = 1; 81 | //alk_dat.avg_dist (vf, vt) = 0.0; 82 | alk_dat.avg_dist (vf, vt) = d [i]; 83 | alk_dat.dmat (vf, vt) = d [i]; 84 | } 85 | } 86 | 87 | // update both idx2edgewt and edgewt2idx maps to reflect merging of cluster m 88 | // into cluster l (using Guo's original notation there). The cl2index 89 | // and index2cl maps are updated in `merge_clusters` 90 | void alk::update_edgewt_maps (alk::ALKDat &alk_dat, index_t m, index_t l) { 91 | std::unordered_set wtsl = alk_dat.idx2edgewt_map.at (l), 92 | wtsm = alk_dat.idx2edgewt_map.at (m); 93 | for (auto w: wtsm) { 94 | wtsl.insert (w); 95 | } 96 | alk_dat.idx2edgewt_map.erase (m); 97 | alk_dat.idx2edgewt_map.erase (l); 98 | alk_dat.idx2edgewt_map.emplace (l, wtsl); 99 | 100 | for (auto w: wtsl) { 101 | std::pair pr = alk_dat.edgewt2idx_pair_map.at (w); 102 | bool update = true; 103 | if (pr.first == m) { 104 | pr.first = l; 105 | } else if (pr.second == m) { 106 | pr.second = l; 107 | } else { 108 | update = false; 109 | } 110 | 111 | if (update) { 112 | alk_dat.edgewt2idx_pair_map [w] = pr; 113 | } 114 | } 115 | 116 | // Any edgewt2idx pairs with entries of m also have to be re-mapped to l 117 | // TODO: Is there a better way to do this? 118 | std::unordered_set wts; 119 | for (auto w: alk_dat.edgewt2idx_pair_map) { 120 | if (w.second.first == m || w.second.second == m) { 121 | wts.emplace (w.first); 122 | } 123 | } 124 | if (wts.size () > 0) { 125 | for (auto w: wts) { 126 | std::pair pr = alk_dat.edgewt2idx_pair_map.at (w); 127 | if (pr.first == m) { 128 | pr.first = l; 129 | } 130 | if (pr.second == m) { 131 | pr.second = l; 132 | } 133 | alk_dat.edgewt2idx_pair_map [w] = pr; 134 | } 135 | } 136 | } 137 | 138 | size_t alk::alk_step (alk::ALKDat &alk_dat, 139 | BinarySearchTree &tree, 140 | Rcpp::IntegerVector from, 141 | Rcpp::IntegerVector to, 142 | Rcpp::NumericVector d) { 143 | // Step through to find the minimal-distance edge that (i) connects 144 | // different clusters, (ii) represents contiguous clusters, and (iii) has 145 | // distance greater than the average dist between those 2 clusters. 146 | // T used to step through successive min values: 147 | double edge_dist = tree.treeMin(); 148 | tree_node * node = tree.getRoot (); 149 | node = tree.getNode (node, edge_dist); 150 | std::pair pr = 151 | alk_dat.edgewt2idx_pair_map.at (edge_dist); 152 | index_t l = pr.first, m = pr.second; 153 | arma::uword lu = static_cast (l), 154 | mu = static_cast (m); 155 | while (l == m || alk_dat.contig_mat (lu, mu) == 0 || 156 | edge_dist < alk_dat.avg_dist (lu, mu)) { 157 | node = tree.nextHi (node); 158 | if (node == nullptr) { 159 | Rcpp::stop ("can not go past highest node"); 160 | } 161 | edge_dist = node->data; 162 | pr = alk_dat.edgewt2idx_pair_map.at (edge_dist); 163 | l = pr.first; 164 | m = pr.second; 165 | lu = static_cast (l); 166 | mu = static_cast (m); 167 | } 168 | int li = static_cast (l), mi = static_cast (m); 169 | 170 | size_t ishort = utils::find_shortest_connection (from, to, 171 | alk_dat.vert2index_map, alk_dat.dmat, 172 | alk_dat.cl2index_map, mi, li, alk_dat.shortest); 173 | // ishort is return value; an index into (from, to) 174 | utils::merge_clusters (alk_dat.contig_mat, 175 | alk_dat.index2cl_map, 176 | alk_dat.cl2index_map, mi, li); 177 | update_edgewt_maps (alk_dat, m, l); 178 | 179 | /* Cluster numbers start off here the same as vertex numbers, and so are 180 | * initially simple indices into the vert-by-vert matrices (contig_mat, 181 | * dmat, avg_dist, num_edges). As cluster form, numbers merge to one of the 182 | * pre-existing ones, so are still indexed into these same matrices which do 183 | * not change size. Cluster merging simply means that previous rows and 184 | * columns of these matrices will no longer be indexed, and all new indices 185 | * are derived from constantly updated values of index2cl_map and 186 | * cl2index_map. 187 | */ 188 | 189 | for (auto cl: alk_dat.cl2index_map) { 190 | if (cl.first != static_cast (l) || 191 | cl.first != static_cast (m)) { 192 | arma::uword clu = static_cast (cl.first); 193 | const double tempd_l = alk_dat.avg_dist (clu, lu), 194 | tempd_m = alk_dat.avg_dist (clu, mu); 195 | const int nedges_l = alk_dat.num_edges (clu, lu), 196 | nedges_m = alk_dat.num_edges (clu, mu); 197 | 198 | alk_dat.avg_dist (clu, lu) = 199 | (tempd_l * nedges_l + tempd_m * nedges_m) / 200 | static_cast (nedges_l + nedges_m); 201 | alk_dat.num_edges (clu, lu) = nedges_l + nedges_m; 202 | 203 | if (alk_dat.contig_mat (clu, lu) == 1 || 204 | alk_dat.contig_mat (clu, mu) == 1) { 205 | alk_dat.contig_mat (clu, lu) = 1; 206 | if (tempd_l > 0.0) { 207 | tree.remove (tempd_l); 208 | } 209 | if (tempd_m > 0.0) { 210 | tree.remove (tempd_m); 211 | } 212 | 213 | double tempd = alk_dat.avg_dist (clu, lu); 214 | if (tempd > 0.0) { 215 | tree.insert (tempd); 216 | alk_dat.edgewt2idx_pair_map [tempd] = 217 | std::make_pair (cl.first, l); 218 | 219 | std::unordered_set wtset; 220 | index_t cli_idx = static_cast (cl.first); 221 | if (alk_dat.idx2edgewt_map.find (cli_idx) == 222 | alk_dat.idx2edgewt_map.end ()) { 223 | wtset.clear (); 224 | } else { 225 | wtset = alk_dat.idx2edgewt_map.at (cli_idx); 226 | } 227 | wtset.emplace (tempd); 228 | alk_dat.idx2edgewt_map.erase (cli_idx); 229 | alk_dat.idx2edgewt_map.emplace (cli_idx, wtset); 230 | } 231 | } // end if C(c, l) = 1 or C(c, m) = 1 in Guo's terminology 232 | } // end if cl.first != (cfrom, cto) 233 | } // end for over cl 234 | 235 | return ishort; 236 | } 237 | 238 | 239 | //' rcpp_alk 240 | //' 241 | //' Full-order average linkage cluster redcap algorithm 242 | //' 243 | //' @noRd 244 | // [[Rcpp::export]] 245 | Rcpp::IntegerVector rcpp_alk ( 246 | const Rcpp::DataFrame gr, 247 | const bool shortest, 248 | const bool quiet) 249 | { 250 | Rcpp::IntegerVector from_ref = gr ["from"]; 251 | Rcpp::IntegerVector to_ref = gr ["to"]; 252 | Rcpp::NumericVector d = gr ["d"]; 253 | // Rcpp classes are always passed by reference, so cloning is necessary to 254 | // avoid modifying the original data.frames. 255 | Rcpp::IntegerVector from = Rcpp::clone (from_ref); 256 | Rcpp::IntegerVector to = Rcpp::clone (to_ref); 257 | // Index vectors are 1-indexed, so 258 | from = from - 1; 259 | to = to - 1; 260 | 261 | alk::ALKDat alk_dat; 262 | alk_dat.shortest = shortest; 263 | BinarySearchTree tree; 264 | alk::alk_init (alk_dat, tree, from, to, d); 265 | const size_t n = alk_dat.n; 266 | const bool really_quiet = !(!quiet && n > 100); 267 | 268 | std::unordered_set the_tree; 269 | while (the_tree.size () < (n - 1)) { // tree has n - 1 edges 270 | Rcpp::checkUserInterrupt (); 271 | 272 | size_t ishort = alk::alk_step (alk_dat, tree, from, to, d); 273 | the_tree.insert (ishort); 274 | 275 | if (!really_quiet && the_tree.size () % 100 == 0) { 276 | Rcpp::Rcout << "\rBuilding tree: " << the_tree.size () << " / " << 277 | n - 1; 278 | Rcpp::Rcout.flush (); 279 | } 280 | } 281 | if (!really_quiet) { 282 | Rcpp::Rcout << "\rBuilding tree: " << the_tree.size () << " / " << 283 | n - 1 << " -> done" << std::endl; 284 | } 285 | 286 | std::vector treevec (the_tree.begin (), the_tree.end ()); 287 | 288 | return Rcpp::wrap (treevec); 289 | } 290 | -------------------------------------------------------------------------------- /src/alk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // --------- AVERAGE LINKAGE CLUSTER ---------------- 4 | 5 | #include "bst.h" 6 | 7 | /* The main matrices (contig, num_edges, dmat, avg_dist) are all referenced by 8 | * direct indices throughout, not by vertex numbers. The latter are mapped to 9 | * the former by vert2index_map. Note that index2vert_map is not used for this 10 | * routine, but exists as dummy to pass to `sets_init` 11 | * 12 | * The index2cl and cl2index then associate those indices with clusters which 13 | * are themselves also direct indices into the matrices. Cluster merging simply 14 | * re-directs multiple indices onto the same cluster (index) numbers. 15 | * 16 | * The binary tree only returns minimal distances which need to be associated 17 | * with particular pairs of clusters. This is done with the final map, 18 | * edgewt2idx_pair, where the pair of indices is into clusters, requiring this 19 | * map to be constantly updated. This updating requires in turn a reverse map, 20 | * idx2edgewt, so that the weight associated with any pre-merge cluster can 21 | * be obtained, and the edgewt2idx clusters for that weight updated. 22 | */ 23 | 24 | namespace alk { 25 | 26 | struct ALKDat { 27 | bool shortest; 28 | size_t n; 29 | 30 | std::unordered_map > edgewt2idx_pair_map; 32 | std::unordered_map > 33 | idx2edgewt_map; // all wts associated with that cluster 34 | 35 | arma::Mat contig_mat, num_edges; 36 | arma::Mat dmat, avg_dist; 37 | 38 | int2indxset_map_t cl2index_map; 39 | indx2int_map_t index2cl_map, index2vert_map; 40 | int2indx_map_t vert2index_map; 41 | }; 42 | 43 | void alk_init (ALKDat &alk_dat, 44 | BinarySearchTree &tree, 45 | Rcpp::IntegerVector from, 46 | Rcpp::IntegerVector to, 47 | Rcpp::NumericVector d); 48 | 49 | void update_edgewt_maps (ALKDat &alk_dat, index_t l, index_t m); 50 | 51 | size_t alk_step (ALKDat &alk_dat, 52 | BinarySearchTree &tree, 53 | Rcpp::IntegerVector from, 54 | Rcpp::IntegerVector to, 55 | Rcpp::NumericVector d); 56 | 57 | } // end namespace alk 58 | 59 | Rcpp::IntegerVector rcpp_alk ( 60 | const Rcpp::DataFrame gr, 61 | const bool shortest, 62 | const bool quiet); 63 | -------------------------------------------------------------------------------- /src/bst.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | // Templating with recursive pointers is much harder than simply changing the 7 | // typedef, and makes the code much less readable 8 | typedef double data_type; 9 | 10 | struct tree_node 11 | { 12 | tree_node * lo; 13 | tree_node * hi; 14 | tree_node * parent; 15 | data_type data; 16 | }; 17 | 18 | class BinarySearchTree 19 | { 20 | private: 21 | tree_node * root; 22 | data_type tmin (tree_node * node); 23 | void clear_node (tree_node * node); 24 | tree_node * removeNode (tree_node * node, data_type value); 25 | 26 | public: 27 | BinarySearchTree () 28 | { 29 | root = nullptr; 30 | } 31 | ~BinarySearchTree () 32 | { 33 | treeClear (); 34 | } 35 | void insert (data_type); 36 | void remove (data_type value); 37 | data_type treeMin (); 38 | 39 | tree_node * getRoot (); 40 | tree_node * getNode (tree_node * node, data_type value); 41 | tree_node * treeMinTree (); 42 | tree_node * tminTree (tree_node * node); 43 | 44 | tree_node * nextHi (tree_node * node); 45 | 46 | void treeClear (); 47 | }; 48 | 49 | void BinarySearchTree::insert (data_type d) 50 | { 51 | tree_node * t = new tree_node; 52 | tree_node * parent; 53 | t->data = d; 54 | t->lo = nullptr; 55 | t->hi = nullptr; 56 | parent = nullptr; 57 | 58 | if (root == nullptr) 59 | root = t; 60 | else 61 | { 62 | tree_node * node; 63 | node = root; 64 | while (node != nullptr) 65 | { 66 | parent = node; 67 | if (t->data > node->data) 68 | node = node->hi; 69 | else 70 | node = node->lo; 71 | } 72 | 73 | t->parent = parent; 74 | if (t->data < parent->data) 75 | parent->lo = t; 76 | else 77 | parent->hi = t; 78 | } 79 | } 80 | 81 | void BinarySearchTree::remove (data_type value) 82 | { 83 | root = removeNode (root, value); 84 | } 85 | 86 | // recursive private member function: 87 | tree_node * BinarySearchTree::removeNode (tree_node * node, data_type value) 88 | { 89 | if (node == nullptr) 90 | return node; 91 | 92 | if (value < node->data) { 93 | node->lo = removeNode (node->lo, value); 94 | } else if (value > node->data) { 95 | node->hi = removeNode (node->hi, value); 96 | } else { 97 | if (node->lo == nullptr && node->hi == nullptr) { // no children 98 | delete node; 99 | node = nullptr; 100 | } 101 | else if (node->lo == nullptr) { // 1 child: hi 102 | tree_node * temp = node; 103 | node->hi->parent = node->parent; 104 | node = node->hi; 105 | delete temp; 106 | } 107 | else if (node->hi == nullptr) { // 1 childe: lo 108 | tree_node * temp = node; 109 | node->lo->parent = node->parent; 110 | node = node->lo; 111 | delete temp; 112 | } 113 | else // 2 children 114 | { 115 | tree_node * temp = tminTree (node->hi); 116 | node->data = temp->data; 117 | node->hi = removeNode (node->hi, temp->data); 118 | } 119 | } 120 | return node; // then the root node which needs to be updated 121 | } 122 | 123 | data_type BinarySearchTree::treeMin () 124 | { 125 | return tmin (root); 126 | } 127 | 128 | data_type BinarySearchTree::tmin (tree_node * node) 129 | { 130 | while (node->lo != nullptr) 131 | node = node->lo; 132 | 133 | return node->data; 134 | } 135 | 136 | tree_node * BinarySearchTree::treeMinTree () 137 | { 138 | return tminTree (root); 139 | } 140 | 141 | tree_node * BinarySearchTree::tminTree (tree_node * node) 142 | { 143 | while (node->lo != nullptr) 144 | node = node->lo; 145 | 146 | return node; 147 | } 148 | 149 | void BinarySearchTree::treeClear () 150 | { 151 | clear_node (root); 152 | } 153 | 154 | void BinarySearchTree::clear_node (tree_node * node) 155 | { 156 | if (node != nullptr) 157 | { 158 | clear_node (node->lo); 159 | clear_node (node->hi); 160 | delete node; 161 | } 162 | } 163 | 164 | tree_node * BinarySearchTree::getRoot () 165 | { 166 | tree_node * node = root; 167 | return node; 168 | } 169 | 170 | tree_node * BinarySearchTree::getNode (tree_node * node, data_type value) 171 | { 172 | if (node == nullptr) 173 | { 174 | //std::cout << "value = " << value << 175 | // " does not exist in the tree" << std::endl; 176 | return node; 177 | } 178 | 179 | if (value < node->data) 180 | return getNode (node->lo, value); 181 | else if (value > node->data) 182 | return getNode (node->hi, value); 183 | else 184 | return node; 185 | } 186 | 187 | tree_node * BinarySearchTree::nextHi (tree_node * node) 188 | { 189 | if (node->hi != nullptr) 190 | return tminTree (node->hi); 191 | 192 | tree_node * y = node->parent; 193 | while (y != nullptr && node == y->hi) 194 | { 195 | node = y; 196 | y = y->parent; 197 | } 198 | //if (y == nullptr) 199 | // std::cout << "already at max value" << std::endl; 200 | return y; 201 | } 202 | -------------------------------------------------------------------------------- /src/clk.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "clk.h" 3 | 4 | // --------- COMPLETE LINKAGE CLUSTER ---------------- 5 | 6 | void clk::clk_init (clk::CLKDat &clk_dat, 7 | Rcpp::IntegerVector from_full, 8 | Rcpp::IntegerVector to_full, 9 | Rcpp::NumericVector d_full, 10 | Rcpp::IntegerVector from, 11 | Rcpp::IntegerVector to, 12 | Rcpp::NumericVector d) { 13 | size_t n = utils::sets_init (from, to, clk_dat.vert2index_map, 14 | clk_dat.index2vert_map, clk_dat.index2cl_map, 15 | clk_dat.cl2index_map); 16 | clk_dat.n = n; 17 | 18 | clk_dat.edges_all.clear (); 19 | clk_dat.edges_all.resize (static_cast (from_full.size ())); 20 | for (int i = 0; i < from_full.size (); i++) { 21 | utils::OneEdge here; 22 | here.from = from_full [i]; 23 | here.to = to_full [i]; 24 | here.dist = d_full [i]; 25 | clk_dat.edges_all [static_cast (i)] = here; 26 | } 27 | // These edges are already passed in sorted form, so no need to explicitly 28 | // sort here. 29 | 30 | clk_dat.edges_nn.clear (); 31 | clk_dat.edges_nn.resize (static_cast (from.size ())); 32 | for (int i = 0; i < from.size (); i++) { 33 | utils::OneEdge here; 34 | here.from = from [i]; 35 | here.to = to [i]; 36 | here.dist = d [i]; 37 | clk_dat.edges_nn [static_cast (i)] = here; 38 | } 39 | 40 | // Get set of unique vertices, and store binary tree of edge distances 41 | intset_t vert_set; 42 | for (int i = 0; i < from.size (); i++) { 43 | vert_set.emplace (from [i]); 44 | vert_set.emplace (to [i]); 45 | } 46 | // Construct vert2index_map to map each unique vertex to an index 47 | index_t idx = 0; 48 | for (auto v: vert_set) { 49 | clk_dat.vert2index_map.emplace (v, idx++); 50 | } 51 | 52 | arma::uword nu = static_cast (n); 53 | clk_dat.contig_mat = arma::zeros > (nu, nu); 54 | clk_dat.dmat.zeros (nu, nu); 55 | for (int i = 0; i < from.length (); i++) { 56 | arma::uword vf = static_cast ( 57 | clk_dat.vert2index_map.at (from [i])), 58 | vt = static_cast ( 59 | clk_dat.vert2index_map.at (to [i])); 60 | clk_dat.contig_mat (vf, vt) = 1; 61 | //clk_dat.dmat (vf, vt) = d [i]; // NOPE - all dmat = 0 at start! 62 | } 63 | } 64 | 65 | //' clk_step 66 | //' 67 | //' @param ei The i'th edge of the full sorted list of edge weights 68 | //' @noRd 69 | size_t clk::clk_step (clk::CLKDat &clk_dat, size_t i) { 70 | // find shortest _all edges that connects the two clusters 71 | utils::OneEdge ei = clk_dat.edges_all [i]; 72 | const size_t u = clk_dat.vert2index_map.at (ei.from), 73 | v = clk_dat.vert2index_map.at (ei.to); 74 | const int cl_u = clk_dat.index2cl_map.at (u), 75 | cl_v = clk_dat.index2cl_map.at (v); 76 | 77 | // Find shortest edge (or longest for covariance) in MST that connects 78 | // u and v: 79 | size_t mmin = INFINITE_INT, lmin = INFINITE_INT, the_edge = INFINITE_INT; 80 | double dlim = INFINITE_DOUBLE; 81 | if (!clk_dat.shortest) { 82 | dlim = -dlim; 83 | } 84 | 85 | for (size_t j = 0; j < clk_dat.edges_nn.size (); j++) { 86 | utils::OneEdge ej = clk_dat.edges_nn [j]; 87 | size_t m = clk_dat.vert2index_map.at (ej.from), 88 | l = clk_dat.vert2index_map.at (ej.to); 89 | if (((clk_dat.index2cl_map.at (m) == cl_u && 90 | clk_dat.index2cl_map.at (l) == cl_v) || 91 | (clk_dat.index2cl_map.at (m) == cl_v && 92 | clk_dat.index2cl_map.at (l) == cl_u)) && 93 | ((clk_dat.shortest && ej.dist < dlim) || 94 | (!clk_dat.shortest && ej.dist > dlim))) { 95 | the_edge = j; 96 | mmin = m; 97 | lmin = l; 98 | dlim = ej.dist; 99 | } 100 | } 101 | if (fabs (dlim) == INFINITE_DOUBLE) { 102 | Rcpp::stop ("minimal distance not able to be found"); 103 | } 104 | 105 | const int merge_to_id = clk_dat.index2cl_map.at (lmin); 106 | utils::merge_clusters (clk_dat.contig_mat, 107 | clk_dat.index2cl_map, 108 | clk_dat.cl2index_map, 109 | clk_dat.index2cl_map.at (mmin), 110 | merge_to_id); 111 | 112 | for (auto cl: clk_dat.cl2index_map) { 113 | if (cl.first != static_cast (lmin) || 114 | cl.first != static_cast (mmin)) { 115 | arma::uword clu = static_cast (cl.first), 116 | lu = static_cast (lmin), 117 | mu = static_cast (mmin); 118 | const double dl = clk_dat.dmat (clu, lu), 119 | dm = clk_dat.dmat (clu, mu); 120 | double dtemp = dl; 121 | if ((clk_dat.shortest && dm < dl) || 122 | (!clk_dat.shortest && dm > dl)) { 123 | dtemp = dm; 124 | } 125 | clk_dat.dmat (clu, lu) = dtemp; 126 | 127 | if (clk_dat.contig_mat (clu, lu) == 1 || 128 | clk_dat.contig_mat (clu, mu) == 1) { 129 | clk_dat.contig_mat (clu, lu) = 1; 130 | } 131 | } 132 | } // end for over cl 133 | 134 | return the_edge; 135 | } 136 | 137 | //' rcpp_clk 138 | //' 139 | //' Full-order complete linkage cluster redcap algorithm 140 | //' 141 | //' @noRd 142 | // [[Rcpp::export]] 143 | Rcpp::IntegerVector rcpp_clk ( 144 | const Rcpp::DataFrame gr_full, 145 | const Rcpp::DataFrame gr, 146 | const bool shortest, 147 | const bool quiet) 148 | { 149 | Rcpp::IntegerVector from_full_ref = gr_full ["from"]; 150 | Rcpp::IntegerVector to_full_ref = gr_full ["to"]; 151 | Rcpp::NumericVector d_full_ref = gr_full ["d"]; 152 | Rcpp::IntegerVector from_ref = gr ["from"]; 153 | Rcpp::IntegerVector to_ref = gr ["to"]; 154 | Rcpp::NumericVector d_ref = gr ["d"]; 155 | 156 | // Rcpp classes are always passed by reference, so cloning is necessary to 157 | // avoid modifying the original data.frames. 158 | Rcpp::IntegerVector from_full = Rcpp::clone (from_full_ref); 159 | Rcpp::IntegerVector to_full = Rcpp::clone (to_full_ref); 160 | Rcpp::NumericVector d_full = Rcpp::clone (d_full_ref); 161 | Rcpp::IntegerVector from = Rcpp::clone (from_ref); 162 | Rcpp::IntegerVector to = Rcpp::clone (to_ref); 163 | Rcpp::NumericVector d = Rcpp::clone (d_ref); 164 | 165 | // Index vectors are 1-indexed, so 166 | from_full = from_full - 1; 167 | to_full = to_full - 1; 168 | from = from - 1; 169 | to = to - 1; 170 | 171 | clk::CLKDat clk_dat; 172 | clk_dat.shortest = shortest; 173 | clk::clk_init (clk_dat, from_full, to_full, d_full, from, to, d); 174 | if (clk_dat.shortest) { 175 | clk_dat.dmat.fill (INFINITE_DOUBLE); 176 | } else { 177 | clk_dat.dmat.fill (-INFINITE_DOUBLE); 178 | } 179 | 180 | const size_t n = clk_dat.edges_all.size (); 181 | const bool really_quiet = !(!quiet && n > (100 * 100)); 182 | 183 | std::vector treevec; 184 | for (size_t i = 0; i < n; i++) { 185 | Rcpp::checkUserInterrupt (); 186 | 187 | utils::OneEdge ei = clk_dat.edges_all [i]; 188 | arma::uword u = static_cast ( 189 | clk_dat.vert2index_map.at (ei.from)), 190 | v = static_cast ( 191 | clk_dat.vert2index_map.at (ei.to)); 192 | 193 | if (clk_dat.index2cl_map.at (u) != clk_dat.index2cl_map.at (v) && 194 | clk_dat.contig_mat (u, v) == 1 && 195 | ((clk_dat.shortest && ei.dist < clk_dat.dmat (u, v)) || 196 | (!clk_dat.shortest && ei.dist > clk_dat.dmat (u, v)))) { 197 | size_t the_edge = clk_step (clk_dat, i); 198 | treevec.push_back (the_edge); 199 | } 200 | if (!really_quiet && i % 100 == 0) { 201 | Rcpp::Rcout << "\rBuilding tree: " << i << " / " << n; 202 | Rcpp::Rcout.flush (); 203 | } 204 | } 205 | 206 | if (!really_quiet) { 207 | Rcpp::Rcout << "\rBuilding tree: " << n << " / " << n << 208 | " -> done" << std::endl; 209 | } 210 | 211 | // treevec here in an index into a **sorted** version of (from, to , d) 212 | return Rcpp::wrap (treevec); 213 | } 214 | -------------------------------------------------------------------------------- /src/clk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "utils.h" 4 | 5 | // --------- COMPLETE LINKAGE CLUSTER ---------------- 6 | 7 | namespace clk { 8 | 9 | struct CLKDat { 10 | bool shortest; 11 | size_t n; 12 | 13 | std::vector edges_all, edges_nn; 14 | 15 | arma::Mat contig_mat; 16 | arma::Mat dmat; 17 | 18 | int2indxset_map_t cl2index_map; 19 | 20 | indx2int_map_t index2cl_map, index2vert_map; 21 | int2indx_map_t vert2index_map; 22 | }; 23 | 24 | void clk_init (CLKDat &clk_dat, 25 | Rcpp::IntegerVector from_full, 26 | Rcpp::IntegerVector to_full, 27 | Rcpp::NumericVector d_full, 28 | Rcpp::IntegerVector from, 29 | Rcpp::IntegerVector to, 30 | Rcpp::NumericVector d); 31 | 32 | size_t clk_step (CLKDat &clk_dat, size_t i); 33 | 34 | } // end namespace clk 35 | 36 | Rcpp::IntegerVector rcpp_clk ( 37 | const Rcpp::DataFrame gr_full, 38 | const Rcpp::DataFrame gr, 39 | const bool shortest, 40 | const bool quiet); 41 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include // std::find 6 | #include 7 | #include 8 | #include 9 | #include // stoi 10 | #include // round 11 | #include 12 | 13 | #include 14 | // [[Rcpp::depends(RcppArmadillo)]] 15 | 16 | constexpr float INFINITE_FLOAT = std::numeric_limits::max (); 17 | constexpr double INFINITE_DOUBLE = std::numeric_limits::max (); 18 | constexpr int INFINITE_INT = std::numeric_limits::max (); 19 | 20 | typedef size_t index_t; 21 | 22 | typedef std::unordered_map int2int_map_t; 23 | typedef std::unordered_map indx2int_map_t; 24 | typedef std::unordered_map int2indx_map_t; 25 | 26 | typedef std::unordered_set intset_t; 27 | typedef std::unordered_map int2intset_map_t; 28 | 29 | typedef std::unordered_set indxset_t; 30 | typedef std::unordered_map int2indxset_map_t; 31 | -------------------------------------------------------------------------------- /src/cuttree.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "cuttree.h" 3 | 4 | void cuttree::fill_edges (cuttree::TreeDat &tree, 5 | const std::vector &from, 6 | const std::vector &to, 7 | Rcpp::NumericVector &d) { 8 | std::unordered_map vert2index_map; 9 | intset_t vert_set; 10 | for (size_t i = 0; i < from.size (); i++) { 11 | vert_set.emplace (from [i]); 12 | vert_set.emplace (to [i]); 13 | } 14 | 15 | int vert_num = 0; 16 | for (auto v: vert_set) { 17 | vert2index_map.emplace (v, vert_num++); 18 | } 19 | 20 | for (size_t i = 0; i < tree.edges.size (); i++) { 21 | cuttree::EdgeComponent this_edge; 22 | this_edge.d = d [static_cast (i)]; 23 | this_edge.cluster_num = 0; 24 | this_edge.from = vert2index_map.at (from [i]); 25 | this_edge.to = vert2index_map.at (to [i]); 26 | tree.edges [i] = this_edge; 27 | } 28 | } 29 | 30 | // Internal sum of squared deviations of specified cluster number (this is just 31 | // the variance without the scaling by N) 32 | double cuttree::calc_ss (const std::vector &edges, 33 | const int cluster_num) { 34 | double s2 = 0.0, s = 0.0; 35 | double count = 0.0; 36 | for (auto i: edges) { 37 | if (i.cluster_num == cluster_num) { 38 | s += i.d; 39 | s2 += i.d * i.d; 40 | count += 1.0; 41 | } 42 | } 43 | //return (s2 - s * s / count) / (count - 1.0); // variance 44 | return (s2 - s * s / count); 45 | } 46 | 47 | // Internal mean covariance of specified cluster number 48 | double cuttree::calc_covsum (const std::vector &edges, 49 | const int cluster_num) { 50 | double s = 0.0; 51 | double count = 0.0; 52 | for (auto i: edges) { 53 | if (i.cluster_num == cluster_num) { 54 | s += i.d; 55 | count += 1.0; 56 | } 57 | } 58 | return s / count; 59 | } 60 | 61 | size_t cuttree::cluster_size (const std::vector &edges, 62 | const int cluster_num) { 63 | size_t n = 0; 64 | for (auto i: edges) { 65 | if (i.cluster_num == cluster_num) { 66 | n++; 67 | } 68 | } 69 | return n; 70 | } 71 | 72 | // Build connected component of tree starting from first edge. 73 | std::unordered_set cuttree::build_one_tree ( 74 | std::vector &edges) { 75 | std::unordered_set tree; 76 | 77 | tree.emplace (edges [0].from); 78 | tree.emplace (edges [0].to); 79 | 80 | bool done = false; 81 | while (!done) { 82 | bool added = false; 83 | for (size_t j = 1; j < edges.size (); j++) { 84 | if (tree.find (edges [j].from) != tree.end () && 85 | tree.find (edges [j].to) == tree.end ()) { 86 | tree.emplace (edges [j].to); 87 | added = true; 88 | } else if (tree.find (edges [j].to) != tree.end () && 89 | tree.find (edges [j].from) == tree.end ()) { 90 | tree.emplace (edges [j].from); 91 | added = true; 92 | } 93 | } 94 | done = !added; 95 | } 96 | return tree; 97 | } 98 | 99 | cuttree::TwoSS cuttree::sum_component_ss ( 100 | const std::vector &edges, 101 | const std::unordered_set &tree_edges, 102 | const bool shortest) { 103 | double sa = 0.0, sa2 = 0.0, sb = 0.0, sb2 = 0.0, na = 0.0, nb = 0.0; 104 | for (auto e: edges) { 105 | if (tree_edges.find (e.from) != tree_edges.end ()) { 106 | sa += e.d; 107 | if (shortest) { 108 | sa2 += e.d * e.d; 109 | } 110 | na += 1.0; 111 | } else { 112 | sb += e.d; 113 | if (shortest) { 114 | sb2 += e.d * e.d; 115 | } 116 | nb += 1.0; 117 | } 118 | } 119 | 120 | cuttree::TwoSS res; 121 | // res.ss1 = (sa2 - sa * sa / na) / (na - 1.0); // variance 122 | if (shortest) { 123 | res.ss1 = (sa2 - sa * sa / na); 124 | res.ss2 = (sb2 - sb * sb / nb); 125 | } else { // covariances are mean values, *NOT* sums like SS values 126 | res.ss1 = sa / na; 127 | res.ss2 = sb / nb; 128 | } 129 | res.n1 = static_cast (na); 130 | res.n2 = static_cast (nb); 131 | return res; 132 | } 133 | 134 | // Find the component split of edges in cluster_num which yields the lowest sum 135 | // of internal variance. 136 | cuttree::BestCut cuttree::find_min_cut ( 137 | const TreeDat &tree, 138 | const int cluster_num, 139 | const bool shortest) { 140 | size_t n = cuttree::cluster_size (tree.edges, cluster_num); 141 | 142 | // fill component vector 143 | std::vector cluster_edges (n); 144 | size_t pos = 0; 145 | for (auto e: tree.edges) { 146 | if (e.cluster_num == cluster_num) { 147 | cluster_edges [pos++] = e; 148 | } 149 | } 150 | 151 | std::vector edges_copy; 152 | 153 | // Remove each edge in turn 154 | cuttree::BestCut the_cut; 155 | the_cut.pos = the_cut.n1 = the_cut.n2 = INFINITE_INT; 156 | the_cut.ss1 = the_cut.ss2 = INFINITE_DOUBLE; 157 | the_cut.ss_diff = 0.0; // default, coz search is over max ss_diff 158 | double ssmin = INFINITE_DOUBLE; 159 | 160 | // TODO: Rewrite this to just erase and re-insert a single edge each time 161 | for (int i = 0; i < static_cast (n); i++) { 162 | edges_copy.resize (0); 163 | edges_copy.shrink_to_fit (); 164 | edges_copy.resize (n); 165 | std::copy (cluster_edges.begin (), cluster_edges.end (), 166 | edges_copy.begin ()); 167 | edges_copy.erase (edges_copy.begin () + i); 168 | 169 | if (edges_copy.size () < cuttree::MIN_CLUSTER_SIZE) { 170 | break; 171 | } 172 | 173 | std::unordered_set tree_edges = cuttree::build_one_tree (edges_copy); 174 | 175 | // only include groups with >= MIN_CLUSTER_SIZE members 176 | if (tree_edges.size () >= cuttree::MIN_CLUSTER_SIZE && 177 | tree_edges.size () < (edges_copy.size () - 178 | cuttree::MIN_CLUSTER_SIZE - 1)) { 179 | cuttree::TwoSS ss; 180 | ss = cuttree::sum_component_ss (edges_copy, tree_edges, shortest); 181 | 182 | if ((ss.ss1 + ss.ss2) < ssmin) { // applies to both distances & cov 183 | ssmin = ss.ss1 + ss.ss2; 184 | the_cut.pos = i; 185 | the_cut.ss1 = ss.ss1; 186 | the_cut.ss2 = ss.ss2; 187 | 188 | the_cut.n1 = ss.n1; 189 | the_cut.n2 = ss.n2; 190 | 191 | the_cut.nodes.clear (); 192 | for (auto te: tree_edges) { 193 | the_cut.nodes.emplace (te); 194 | } 195 | } 196 | } 197 | } 198 | 199 | if (the_cut.ss1 < INFINITE_DOUBLE) { 200 | the_cut.ss_diff = cuttree::calc_ss (tree.edges, cluster_num) - 201 | the_cut.ss1 - the_cut.ss2; 202 | } 203 | 204 | return the_cut; 205 | } 206 | 207 | //' rcpp_cut_tree 208 | //' 209 | //' Cut tree into specified number of clusters by minimising internal cluster 210 | //' variance. 211 | //' 212 | //' @param tree tree to be processed 213 | //' 214 | //' @return Vector of cluster IDs for each tree edge 215 | //' @noRd 216 | // [[Rcpp::export]] 217 | Rcpp::IntegerVector rcpp_cut_tree (const Rcpp::DataFrame tree, const int ncl, 218 | const bool shortest, const bool quiet) { 219 | Rcpp::IntegerVector from_in = tree ["from"]; 220 | Rcpp::IntegerVector to_in = tree ["to"]; 221 | Rcpp::NumericVector dref = tree ["d"]; 222 | 223 | std::vector from = Rcpp::as > (from_in); 224 | std::vector to = Rcpp::as > (to_in); 225 | 226 | cuttree::TreeDat tree_dat; 227 | tree_dat.edges.resize (static_cast (dref.size ())); 228 | cuttree::fill_edges (tree_dat, from, to, dref); 229 | 230 | cuttree::BestCut the_cut = cuttree::find_min_cut (tree_dat, 0, shortest); 231 | std::vector ss_diff, ss1, ss2; 232 | ss_diff.push_back (the_cut.ss_diff); // ss0 - ss1 - ss2 233 | ss1.push_back (the_cut.ss1); 234 | ss2.push_back (the_cut.ss2); 235 | // map from index into ss vectors to actual cluster numbers 236 | std::unordered_map cluster_map; 237 | cluster_map.emplace (0, 0); 238 | 239 | const bool really_quiet = !(!quiet && from_in.size () > 100); 240 | 241 | int num_clusters = 1; 242 | // This loop fills the three vectors (ss_diff, ss1, ss2), as well as the 243 | // cluster_map. 244 | while (num_clusters < ncl) { 245 | Rcpp::checkUserInterrupt (); 246 | if (!really_quiet) { 247 | Rcpp::Rcout << "\rNumber of clusters: " << num_clusters << " / " << ncl; 248 | Rcpp::Rcout.flush (); 249 | } 250 | 251 | auto mp = std::max_element (ss_diff.begin (), ss_diff.end ()); 252 | long int maxi_int = std::distance (ss_diff.begin (), mp); 253 | size_t maxi = static_cast (maxi_int); 254 | if (cluster_map.find (maxi) == cluster_map.end ()) { 255 | Rcpp::Rcout << "ss_diff has no max element in cluster_map" << 256 | std::endl; 257 | } 258 | int clnum = cluster_map.at (maxi); 259 | // maxi is index of cluster to be split 260 | 261 | if (ss_diff [maxi] == 0.0) { // no further cuts possible 262 | break; 263 | } 264 | 265 | the_cut = cuttree::find_min_cut (tree_dat, clnum, shortest); 266 | // Break old clnum into 2: 267 | int count = 0; 268 | for (auto &e: tree_dat.edges) { 269 | if (e.cluster_num == clnum) { 270 | if (count == the_cut.pos) { 271 | e.cluster_num = INFINITE_INT; 272 | } else if (the_cut.nodes.find (e.from) == the_cut.nodes.end ()) { 273 | e.cluster_num = num_clusters; 274 | } 275 | count++; 276 | } 277 | } 278 | // find new best cut of now reduced cluster 279 | the_cut = cuttree::find_min_cut (tree_dat, clnum, shortest); 280 | 281 | ss_diff [maxi] = the_cut.ss_diff; 282 | ss1 [maxi] = the_cut.ss1; 283 | ss2 [maxi] = the_cut.ss2; 284 | // and also of new cluster 285 | the_cut = cuttree::find_min_cut (tree_dat, num_clusters, shortest); 286 | 287 | ss_diff.push_back (the_cut.ss_diff); 288 | ss1.push_back (the_cut.ss1); 289 | ss2.push_back (the_cut.ss2); 290 | 291 | cluster_map.emplace (num_clusters, ss_diff.size () - 1); 292 | 293 | num_clusters++; 294 | } 295 | 296 | if (!really_quiet) { 297 | Rcpp::Rcout << " -> done" << std::endl; 298 | } 299 | 300 | Rcpp::IntegerVector res (tree_dat.edges.size ()); 301 | for (int i = 0; i < static_cast (tree_dat.edges.size ()); i++) { 302 | if (tree_dat.edges [static_cast (i)].cluster_num == INFINITE_INT) { 303 | res [i] = NA_INTEGER; 304 | } else { 305 | res [i] = tree_dat.edges [static_cast (i)].cluster_num; 306 | } 307 | } 308 | return res; 309 | } 310 | -------------------------------------------------------------------------------- /src/cuttree.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace cuttree { 6 | 7 | // clusters are of edges, so size = 2 => 3 nodes 8 | constexpr int MIN_CLUSTER_SIZE = 2; 9 | 10 | struct EdgeComponent { 11 | double d; 12 | int from, to, cluster_num; 13 | }; 14 | 15 | struct TreeDat { 16 | std::vector edges; 17 | }; 18 | 19 | struct BestCut { 20 | int pos, n1, n2; 21 | double ss_diff, ss1, ss2; 22 | std::unordered_set nodes; 23 | }; 24 | 25 | struct TwoSS { // 2 sums-of-squares values 26 | double ss1, ss2; 27 | int n1, n2; // sizes of clusters 28 | }; 29 | 30 | void fill_edges (TreeDat &tree, 31 | const std::vector &from, 32 | const std::vector &to, 33 | Rcpp::NumericVector &d); 34 | double calc_ss (const std::vector &edges, 35 | const int cluster_num); 36 | double calc_covsum (const std::vector &edges, 37 | const int cluster_num); 38 | size_t cluster_size (const std::vector &edges, 39 | const int cluster_num); 40 | std::unordered_set build_one_tree (std::vector &edges); 41 | 42 | TwoSS sum_component_ss (const std::vector &edges, 43 | const std::unordered_set &tree_edges, const bool shortest); 44 | BestCut find_min_cut (const TreeDat &tree, const int cluster_num, 45 | const bool shortest); 46 | 47 | } // end namespace cuttree 48 | 49 | Rcpp::IntegerVector rcpp_cut_tree (const Rcpp::DataFrame tree, const int ncl, 50 | const bool shortest, const bool quiet); 51 | -------------------------------------------------------------------------------- /src/full-init.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "utils.h" 3 | #include "full-init.h" 4 | 5 | // --------- FULL CLUSTER ---------------- 6 | 7 | void full_init::init (full_init::FullInitDat &clfull_dat, 8 | Rcpp::IntegerVector from, 9 | Rcpp::IntegerVector to, 10 | Rcpp::NumericVector d) { 11 | intset_t vert_set; 12 | for (int i = 0; i < from.size (); i++) { 13 | vert_set.emplace (from [i]); 14 | vert_set.emplace (to [i]); 15 | } 16 | clfull_dat.n = vert_set.size (); 17 | 18 | index_t idx = 0; 19 | for (auto v: vert_set) { 20 | clfull_dat.index2vert_map.emplace (idx, v); 21 | clfull_dat.vert2index_map.emplace (v, idx); 22 | clfull_dat.index2cl_map.emplace (idx++, -1); 23 | } 24 | 25 | clfull_dat.edges.clear (); 26 | clfull_dat.edges.resize (static_cast (from.size ())); 27 | for (int i = 0; i < from.size (); i++) { 28 | utils::OneEdge here; 29 | here.from = from [i]; 30 | here.to = to [i]; 31 | here.dist = d [i]; 32 | clfull_dat.edges [static_cast (i)] = here; 33 | } 34 | 35 | clfull_dat.index_in_cluster.resize (clfull_dat.n); 36 | std::fill (clfull_dat.index_in_cluster.begin (), 37 | clfull_dat.index_in_cluster.end (), false); 38 | } 39 | 40 | 41 | void full_init::assign_first_edge (full_init::FullInitDat &clfull_dat) { 42 | int clnum = 0; 43 | index_t ei = 0; 44 | utils::OneEdge edge = clfull_dat.edges [ei]; 45 | index_t ito = clfull_dat.vert2index_map.at (edge.to), 46 | ifrom = clfull_dat.vert2index_map.at (edge.from); 47 | 48 | clfull_dat.index2cl_map [ito] = clnum; 49 | clfull_dat.index2cl_map [ifrom] = clnum; 50 | clfull_dat.index_in_cluster [ito] = true; 51 | clfull_dat.index_in_cluster [ifrom] = true; 52 | 53 | intset_t cli; 54 | cli.insert (static_cast (ito)); 55 | cli.insert (static_cast (ifrom)); 56 | clfull_dat.cl2index_map.emplace (clnum, cli); 57 | 58 | clfull_dat.vert2cl_map.emplace (edge.to, clnum); 59 | clfull_dat.vert2cl_map.emplace (edge.from, clnum); 60 | } 61 | 62 | 63 | //' step 64 | //' 65 | //' All edges are initially in their own clusters. This merges edge#i with the 66 | //' next closest edge 67 | //' 68 | //' @param ei The i'th edge of the sorted list of NN edge weights 69 | //' @noRd 70 | int full_init::step (full_init::FullInitDat &clfull_dat, 71 | const index_t ei, const int clnum) { 72 | bool from_in = false, to_in = false; 73 | utils::OneEdge edge = clfull_dat.edges [ei]; 74 | index_t ito = clfull_dat.vert2index_map.at (edge.to), 75 | ifrom = clfull_dat.vert2index_map.at (edge.from); 76 | if (clfull_dat.index_in_cluster [ito]) { 77 | to_in = true; 78 | } 79 | if (clfull_dat.index_in_cluster [ifrom]) { 80 | from_in = true; 81 | } 82 | 83 | int clnum_i = clnum; 84 | 85 | if (from_in && to_in) { 86 | clnum_i = INFINITE_INT; 87 | } else { 88 | if (from_in) { // then to is not in cluster 89 | clnum_i = clfull_dat.index2cl_map [ifrom]; 90 | } else if (to_in) { 91 | clnum_i = clfull_dat.index2cl_map [ito]; 92 | } 93 | 94 | clfull_dat.index_in_cluster [ito] = true; 95 | clfull_dat.index_in_cluster [ifrom] = true; 96 | clfull_dat.index2cl_map [ifrom] = clnum_i; 97 | clfull_dat.index2cl_map [ito] = clnum_i; 98 | 99 | intset_t cli; 100 | if (clfull_dat.cl2index_map.find (clnum_i) != 101 | clfull_dat.cl2index_map.end ()) { 102 | cli = clfull_dat.cl2index_map.at (clnum_i); 103 | } 104 | cli.insert (static_cast (ito)); 105 | cli.insert (static_cast (ifrom)); 106 | clfull_dat.cl2index_map [clnum_i] = cli; 107 | 108 | // These values may already be in the map here, but that's okay 109 | clfull_dat.vert2cl_map.emplace (edge.to, clnum_i); 110 | clfull_dat.vert2cl_map.emplace (edge.from, clnum_i); 111 | } 112 | 113 | return clnum_i; 114 | } 115 | 116 | //' fill_cl_edges 117 | //' 118 | //' Fill (arma) matrix of strongest/shortest connections between all clusters 119 | //' used to construct the hierarchical relationships 120 | //' @noRd 121 | void full_init::fill_cl_edges (full_init::FullInitDat &clfull_dat, 122 | arma::Mat &cl_edges, int num_clusters) { 123 | int2intset_map_t vert_sets; 124 | for (int i = 0; i < num_clusters; i++) { 125 | intset_t verts; 126 | for (auto vi: clfull_dat.vert2cl_map) { 127 | if (vi.second == i) { 128 | verts.emplace (vi.first); 129 | } 130 | } 131 | vert_sets.emplace (i, verts); 132 | } 133 | 134 | // need a (sparse) matrix of all pairwise edge distances: 135 | arma::uword nu = static_cast (clfull_dat.n); 136 | arma::Mat vert_dists (nu, nu); 137 | if (!clfull_dat.shortest) { 138 | vert_dists.fill (INFINITE_DOUBLE); 139 | } 140 | for (auto ei: clfull_dat.edges) { 141 | arma::uword i = static_cast ( 142 | clfull_dat.vert2index_map.at (ei.from)), 143 | j = static_cast ( 144 | clfull_dat.vert2index_map.at (ei.to)); 145 | vert_dists (i, j) = vert_dists (j, i) = ei.dist; 146 | } 147 | 148 | for (int i = 0; i < (num_clusters - 1); i++) { 149 | for (int j = (i + 1); j < num_clusters; j++) 150 | { 151 | intset_t verts_i = vert_sets.at (i), 152 | verts_j = vert_sets.at (j); 153 | double max_d = 0.0; 154 | if (!clfull_dat.shortest) { 155 | max_d = INFINITE_DOUBLE; // min covariance 156 | } 157 | for (auto vi: verts_i) { 158 | for (auto vj: verts_j) 159 | { 160 | arma::uword viu = static_cast ( 161 | clfull_dat.vert2index_map.at (vi)), 162 | vju = static_cast ( 163 | clfull_dat.vert2index_map.at (vj)); 164 | if ((clfull_dat.shortest && 165 | vert_dists (viu, vju) > max_d) || 166 | (!clfull_dat.shortest && 167 | vert_dists (viu, vju) < max_d)) { 168 | max_d = vert_dists (viu, vju); 169 | } 170 | } 171 | } 172 | arma::uword iu = static_cast (i), 173 | ju = static_cast (j); 174 | cl_edges (iu, ju) = cl_edges (ju, iu) = max_d; 175 | } 176 | } 177 | } 178 | 179 | 180 | //' rcpp_full_initial 181 | //' 182 | //' Initial allocation for full clustering 183 | //' 184 | //' @noRd 185 | // [[Rcpp::export]] 186 | Rcpp::IntegerVector rcpp_full_initial ( 187 | const Rcpp::DataFrame gr, 188 | bool shortest) { 189 | Rcpp::IntegerVector from_ref = gr ["from"]; 190 | Rcpp::IntegerVector to_ref = gr ["to"]; 191 | Rcpp::NumericVector d_ref = gr ["d"]; 192 | Rcpp::IntegerVector from = Rcpp::clone (from_ref); 193 | Rcpp::IntegerVector to = Rcpp::clone (to_ref); 194 | Rcpp::NumericVector d = Rcpp::clone (d_ref); 195 | 196 | // Index vectors are 1-indexed, so 197 | from = from - 1; 198 | to = to - 1; 199 | 200 | full_init::FullInitDat clfull_dat; 201 | clfull_dat.shortest = shortest; 202 | full_init::init (clfull_dat, from, to, d); 203 | 204 | full_init::assign_first_edge (clfull_dat); 205 | int clnum = 1; // #1 assigned in assign_first_edge 206 | index_t ei = 1; // index of next edge to be assigned 207 | 208 | while (clfull_dat.vert2cl_map.size () < clfull_dat.n) { 209 | int clnum_i = full_init::step (clfull_dat, ei, clnum); 210 | ei++; 211 | if (clnum_i == clnum) { 212 | clnum++; 213 | } 214 | } 215 | 216 | // Then construct the hierarchical relationships among clusters 217 | arma::uword cu = static_cast (clnum); 218 | arma::Mat cl_edges (cu, cu); 219 | full_init::fill_cl_edges (clfull_dat, cl_edges, clnum); 220 | 221 | // Then construct vector mapping edges to cluster numbers 222 | std::vector clvec (clfull_dat.n); 223 | for (auto ci: clfull_dat.vert2cl_map) { 224 | clvec [static_cast (ci.first)] = ci.second; 225 | } 226 | 227 | return Rcpp::wrap (clvec); 228 | } 229 | -------------------------------------------------------------------------------- /src/full-init.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "utils.h" 4 | 5 | // --------- FULL CLUSTER ---------------- 6 | 7 | /* All `index2cl` values are initally set to -1, and there are no `cl2index` 8 | * values. There is also a single binary vector of `index_in_cluster`, initialy 9 | * set to `false`. 10 | */ 11 | 12 | namespace full_init { 13 | 14 | struct FullInitDat { 15 | bool shortest; 16 | size_t n; 17 | 18 | std::vector edges; // nearest neighbour edges only 19 | std::vector index_in_cluster; 20 | 21 | int2int_map_t vert2cl_map; 22 | int2indx_map_t vert2index_map; 23 | indx2int_map_t index2cl_map, index2vert_map; 24 | int2intset_map_t cl2index_map; 25 | }; 26 | 27 | void init (FullInitDat &clfull_dat, 28 | Rcpp::IntegerVector from, 29 | Rcpp::IntegerVector to, 30 | Rcpp::NumericVector d); 31 | 32 | void assign_first_edge (FullInitDat &clfull_dat); 33 | 34 | int step (FullInitDat &clfull_dat, const index_t ei, 35 | const int clnum); 36 | 37 | void fill_cl_edges (FullInitDat &clfull_dat, arma::Mat &cl_edges, 38 | int num_clusters); 39 | 40 | } // end namespace ex_init 41 | 42 | Rcpp::IntegerVector rcpp_full_initial ( 43 | const Rcpp::DataFrame gr, 44 | bool shortest); 45 | -------------------------------------------------------------------------------- /src/full-merge.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "full-merge.h" 3 | 4 | // load data from rcpp_full_initial into the FullMergeDat struct. The gr data 5 | // are pre-sorted by increasing d. 6 | void full_merge::init (const Rcpp::DataFrame &gr, 7 | full_merge::FullMergeDat &cldat) 8 | { 9 | Rcpp::IntegerVector from = gr ["from"]; 10 | Rcpp::IntegerVector to = gr ["to"]; 11 | Rcpp::NumericVector d = gr ["d"]; 12 | Rcpp::IntegerVector clnum = gr ["cluster"]; 13 | Rcpp::IntegerVector clfrom = gr ["cl_from"]; 14 | Rcpp::IntegerVector clto = gr ["cl_to"]; 15 | 16 | const size_t n = static_cast (d.size ()); 17 | 18 | cldat.edges.resize (n); 19 | std::unordered_map > cl2dist_map; 20 | std::unordered_map edge_dist_map; 21 | for (int i = 0; i < static_cast (n); i++) { 22 | if (clnum [i] >= 0) { // edge in a cluster 23 | int clnum_i = clnum [i]; 24 | 25 | std::unordered_set distset; 26 | if (cl2dist_map.find (clnum_i) != cl2dist_map.end ()) 27 | distset = cl2dist_map.at (clnum_i); 28 | distset.emplace (d [i]); 29 | 30 | cl2dist_map [clnum_i] = distset; 31 | } else { 32 | // make set of unordered edge names; the actual edge_dist_map is a 33 | // dummy here, and serves just to get number of edges 34 | std::string eft = std::to_string (clfrom [i]) + "-" + 35 | std::to_string (clto [i]), 36 | etf = std::to_string (clto [i]) + "-" + 37 | std::to_string (clfrom [i]); 38 | if (edge_dist_map.find (eft) == edge_dist_map.end () && 39 | edge_dist_map.find (etf) == edge_dist_map.end ()) 40 | edge_dist_map.emplace (eft, d [i]); 41 | } 42 | } 43 | 44 | cldat.edges.resize (edge_dist_map.size ()); 45 | edge_dist_map.clear (); 46 | size_t edge_count = 0; 47 | for (int i = 0; i < static_cast (n); i++) { 48 | if (clnum [i] < 0) { // edge not in a cluster 49 | utils::OneEdge edgei; 50 | // clfrom and clto hold cluster numbers, NOT vertex numbers 51 | edgei.from = clfrom [i]; 52 | edgei.to = clto [i]; 53 | edgei.dist = d [i]; 54 | 55 | std::string eft = std::to_string (edgei.from) + "-" + 56 | std::to_string (edgei.to), 57 | etf = std::to_string (edgei.to) + "-" + 58 | std::to_string (edgei.from); 59 | if (edge_dist_map.find (eft) == edge_dist_map.end () && 60 | edge_dist_map.find (etf) == edge_dist_map.end ()) 61 | { 62 | edge_dist_map.emplace (eft, d [i]); 63 | cldat.edges [edge_count++] = edgei; 64 | } else if (edge_dist_map.find (etf) != edge_dist_map.end ()) 65 | { 66 | if ((cldat.shortest && d [i] < edge_dist_map.at (etf)) || 67 | (!cldat.shortest && d [i] > edge_dist_map.at (etf))) 68 | edge_dist_map [etf] = d [i]; 69 | } else 70 | { 71 | if ((cldat.shortest && d [i] < edge_dist_map.at (eft)) || 72 | (!cldat.shortest && d [i] > edge_dist_map.at (eft))) 73 | edge_dist_map [eft] = d [i]; 74 | } 75 | } 76 | } 77 | // Then just loop over cldat.edges to update min distances 78 | for (auto ei: cldat.edges) { 79 | std::string eft = std::to_string (ei.from) + "-" + 80 | std::to_string (ei.to); 81 | if ((cldat.shortest && edge_dist_map.at (eft) < ei.dist) || 82 | (!cldat.shortest && edge_dist_map.at (eft) > ei.dist)) 83 | ei.dist = edge_dist_map.at (eft); 84 | } 85 | 86 | // Fill intra-cluster data: 87 | for (auto i: cl2dist_map) { 88 | std::unordered_set distset = i.second; 89 | OneCluster cli; 90 | cli.id = i.first; 91 | cli.n = distset.size (); 92 | cli.dist_sum = 0.0; 93 | cli.dist_max = 0.0; 94 | if (!cldat.shortest) 95 | cli.dist_max = INFINITE_DOUBLE; 96 | for (auto di: distset) { 97 | cli.dist_sum += di; 98 | if ((cldat.shortest && di > cli.dist_max) || 99 | (!cldat.shortest && di < cli.dist_max)) 100 | cli.dist_max = di; 101 | } 102 | cldat.clusters.emplace (i.first, cli); 103 | 104 | cldat.cl_remap.emplace (i.first, i.first); 105 | intset_t members; 106 | members.emplace (i.first); 107 | cldat.cl_members.emplace (i.first, members); 108 | } 109 | } 110 | 111 | // merge cluster clfrom with clto; clfrom remains as it was but is no longer 112 | // indexed so simply ignored from that point on 113 | full_merge::OneMerge full_merge::merge_one_single (full_merge::FullMergeDat &cldat, 114 | index_t ei) { 115 | const int cl_from_i = cldat.cl_remap.at (cldat.edges [ei].from), 116 | cl_to_i = cldat.cl_remap.at (cldat.edges [ei].to); 117 | 118 | full_merge::OneCluster clfrom = cldat.clusters.at (cl_from_i), 119 | clto = cldat.clusters.at (cl_to_i); 120 | clto.n += clfrom.n; 121 | clto.dist_sum += clfrom.dist_sum; 122 | 123 | if ((cldat.shortest && clfrom.dist_max > clto.dist_max) || 124 | (!cldat.shortest && clfrom.dist_max < clto.dist_max)) 125 | clto.dist_max = clfrom.dist_max; 126 | 127 | std::vector edges_from = clfrom.edges, 128 | edges_to = clto.edges; 129 | edges_to.insert (edges_to.end (), edges_from.begin (), edges_from.end ()); 130 | clto.edges.clear (); 131 | clto.edges.shrink_to_fit (); 132 | clto.edges = edges_to; 133 | 134 | cldat.clusters.erase (cl_from_i); 135 | cldat.clusters [cl_to_i] = clto; 136 | 137 | cldat.cl_remap [cl_from_i] = cldat.cl_remap [cl_to_i]; 138 | intset_t members_f = cldat.cl_members.at (cl_from_i), 139 | members_t = cldat.cl_members.at (cl_to_i); 140 | members_t.insert (members_f.begin (), members_f.end ()); 141 | cldat.cl_members [cl_to_i] = members_t; 142 | for (auto m: members_t) 143 | cldat.cl_remap [m] = cldat.cl_remap [cl_to_i]; 144 | 145 | full_merge::OneMerge the_merge; 146 | the_merge.cli = cl_from_i; 147 | the_merge.clj = cl_to_i; 148 | the_merge.merge_dist = cldat.edges [ei].dist; 149 | 150 | return the_merge; 151 | } 152 | 153 | // Each merge joins from to to; from remains unchanged but is no longer indexed. 154 | // Edges nevertheless always refer to original (non-merged) cluster numbers, so 155 | // need to be re-mapped via the cl_remap 156 | void full_merge::merge_single (full_merge::FullMergeDat &cldat) { 157 | index_t edgei = 0; 158 | while (cldat.clusters.size () > 1) { 159 | int clfr = cldat.cl_remap.at (cldat.edges [edgei].from), 160 | clto = cldat.cl_remap.at (cldat.edges [edgei].to); 161 | if (clfr != clto) { 162 | full_merge::OneMerge the_merge = 163 | full_merge::merge_one_single (cldat, edgei); 164 | cldat.merges.push_back (the_merge); 165 | } 166 | edgei++; 167 | if (edgei == cldat.edges.size ()) 168 | break; 169 | } 170 | } 171 | 172 | bool full_merge::avgdist_sorter_incr (const OneDist &lhs, 173 | const OneDist &rhs) { 174 | return lhs.value < rhs.value; 175 | } 176 | 177 | bool full_merge::avgdist_sorter_decr (const OneDist &lhs, 178 | const OneDist &rhs) { 179 | return lhs.value > rhs.value; 180 | } 181 | 182 | bool full_merge::maxdist_sorter_incr (const OneDist &lhs, 183 | const OneDist &rhs) { 184 | return lhs.d < rhs.d; 185 | } 186 | 187 | bool full_merge::maxdist_sorter_decr (const OneDist &lhs, 188 | const OneDist &rhs) { 189 | return lhs.d > rhs.d; 190 | } 191 | 192 | void full_merge::fill_avg_dists (full_merge::FullMergeDat &cldat, 193 | full_merge::AvgDists &cl_dists) { 194 | cl_dists.avg_dists.resize (cldat.edges.size ()); 195 | size_t nc = 0; 196 | std::unordered_set edgenames; // TODO: Remove 197 | for (auto ei: cldat.edges) { 198 | full_merge::OneDist onedist; 199 | onedist.cli = ei.from; 200 | onedist.clj = ei.to; 201 | onedist.d = ei.dist; 202 | onedist.di = cldat.clusters [ei.from].dist_sum; 203 | onedist.dj = cldat.clusters [ei.to].dist_sum; 204 | onedist.ni = cldat.clusters [ei.from].n; 205 | onedist.nj = cldat.clusters [ei.to].n; 206 | 207 | onedist.value = (onedist.di + onedist.dj + onedist.d) / 208 | static_cast (onedist.ni + onedist.nj + 1); 209 | 210 | cl_dists.avg_dists [nc++] = onedist; 211 | } 212 | 213 | if (cldat.shortest) { 214 | std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (), 215 | &full_merge::avgdist_sorter_incr); 216 | } else { 217 | std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (), 218 | &full_merge::avgdist_sorter_decr); 219 | } 220 | } 221 | 222 | // Fill the cli_map and clj_map entries which map cluster numbers onto sets of 223 | // indices in cl_dists.avg_dists 224 | void full_merge::fill_cl_indx_maps (full_merge::AvgDists &cl_dists) { 225 | cl_dists.cl_map.clear (); 226 | for (size_t i = 0; i < cl_dists.avg_dists.size (); i++) { 227 | indxset_t indxs; 228 | const int cli = cl_dists.avg_dists [i].cli; 229 | if (cl_dists.cl_map.find (cli) != cl_dists.cl_map.end ()) 230 | indxs = cl_dists.cl_map.at (cli); 231 | indxs.emplace (i); 232 | cl_dists.cl_map [cli] = indxs; 233 | 234 | indxs.clear (); 235 | const int clj = cl_dists.avg_dists [i].clj; 236 | if (cl_dists.cl_map.find (clj) != cl_dists.cl_map.end ()) 237 | indxs = cl_dists.cl_map.at (clj); 238 | indxs.emplace (i); 239 | cl_dists.cl_map [clj] = indxs; 240 | } 241 | } 242 | 243 | // Merging is based on AvgDists, which holds all possible pair-wise merges of 244 | // existing clusters. One merge combines the pair in one AvgDists item to make a 245 | // new one. The convention is to merge cli into clj, so cli disappears. 246 | // Importantly, this requires updating all other AvgDists.avg_dists items which 247 | // contain either one of the newly merged pairs. Indices from clusters to 248 | // AvgDists.avg_dists are kept in AvgDists.cli_map and .clj_map. The values of 249 | // the latter are updated to reflect merges, as are the entries of the new cli 250 | // in AvgDists.avg_dists. 251 | full_merge::OneMerge full_merge::merge_avg (full_merge::FullMergeDat &cldat, 252 | full_merge::AvgDists &cl_dists) 253 | { 254 | full_merge::OneDist the_dist = cl_dists.avg_dists [0]; 255 | const double dtot = the_dist.di + the_dist.dj + the_dist.d; 256 | const size_t ntot = the_dist.ni + the_dist.nj + 1; 257 | const double average = dtot / static_cast (ntot); 258 | const int cli = the_dist.cli, 259 | clj = the_dist.clj; 260 | double dmin = INFINITE_DOUBLE; // shortest connecting distance 261 | if (!cldat.shortest) { 262 | dmin = -dmin; 263 | } 264 | 265 | indxset_t cli_indx = cl_dists.cl_map.at (cli), 266 | clj_indx = cl_dists.cl_map.at (clj); 267 | // update cli_indx & clj_indx entries, and get value of dmin 268 | for (auto i: clj_indx) { 269 | cl_dists.avg_dists [i].dj = dtot; 270 | cl_dists.avg_dists [i].nj = ntot; 271 | if (cl_dists.avg_dists [i].cli == cli) 272 | cl_dists.avg_dists [i].cli = clj; 273 | else if (cl_dists.avg_dists [i].clj == cli) 274 | cl_dists.avg_dists [i].clj = clj; 275 | if ((cldat.shortest && cl_dists.avg_dists [i].d < dmin) || 276 | (!cldat.shortest && cl_dists.avg_dists [i].d > dmin)) 277 | dmin = cl_dists.avg_dists [i].d; 278 | } 279 | 280 | for (auto i: cli_indx) { 281 | cl_dists.avg_dists [i].di = dtot; 282 | cl_dists.avg_dists [i].ni = ntot; 283 | if (cl_dists.avg_dists [i].cli == cli) { 284 | cl_dists.avg_dists [i].cli = clj; 285 | } else if (cl_dists.avg_dists [i].clj == cli) { 286 | cl_dists.avg_dists [i].clj = clj; 287 | } 288 | if ((cldat.shortest && cl_dists.avg_dists [i].d < dmin) || 289 | (!cldat.shortest && cl_dists.avg_dists [i].d > dmin)) { 290 | dmin = cl_dists.avg_dists [i].d; 291 | } 292 | } 293 | // Then update all dmin and average dist values 294 | for (auto i: clj_indx) { 295 | cl_dists.avg_dists [i].d = dmin; 296 | cl_dists.avg_dists [i].value = 297 | (cl_dists.avg_dists [i].di + dtot + dmin) / 298 | static_cast (cl_dists.avg_dists [i].ni + ntot + 1); 299 | } 300 | for (auto i: cli_indx) { 301 | cl_dists.avg_dists [i].d = dmin; 302 | cl_dists.avg_dists [i].value = 303 | (cl_dists.avg_dists [i].dj + dtot + dmin) / 304 | static_cast (cl_dists.avg_dists [i].nj + ntot + 1); 305 | } 306 | cl_dists.avg_dists.pop_front (); 307 | 308 | // These can now have reverse-duplicated entries because after merging A->B 309 | // entries A->C and C->B will become B->C and C->B. There can also be D->A 310 | // and D->B which will both become D->B. 311 | std::vector rm; 312 | std::unordered_set edge_names; 313 | for (size_t i = 0; i < cl_dists.avg_dists.size (); i++) { 314 | std::string cij = std::to_string (cl_dists.avg_dists [i].cli) + "-" + 315 | std::to_string (cl_dists.avg_dists [i].clj), 316 | cji = std::to_string (cl_dists.avg_dists [i].clj) + "-" + 317 | std::to_string (cl_dists.avg_dists [i].cli); 318 | if (edge_names.find (cij) == edge_names.end () && 319 | edge_names.find (cji) == edge_names.end ()) { 320 | edge_names.emplace (cij); 321 | } else { 322 | rm.push_back (static_cast (i)); 323 | } 324 | } 325 | std::sort (rm.begin (), rm.end (), std::greater ()); 326 | for (auto i: rm) { 327 | cl_dists.avg_dists.erase (cl_dists.avg_dists.begin () + i); 328 | } 329 | 330 | if (cldat.shortest) { 331 | std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (), 332 | &full_merge::avgdist_sorter_incr); 333 | } else { 334 | std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (), 335 | &full_merge::avgdist_sorter_decr); 336 | } 337 | 338 | // Finally, update the cl_dists.cli_map & clj_map entries 339 | fill_cl_indx_maps (cl_dists); 340 | 341 | full_merge::OneMerge the_merge; 342 | the_merge.cli = cli; 343 | the_merge.clj = clj; 344 | the_merge.merge_dist = average; 345 | 346 | return the_merge; 347 | } 348 | 349 | // Successively merge pairs of clusters which yield the lower average 350 | // intra-cluster edge distance 351 | void full_merge::avg (full_merge::FullMergeDat &cldat) { 352 | AvgDists cl_dists; 353 | full_merge::fill_avg_dists (cldat, cl_dists); 354 | full_merge::fill_cl_indx_maps (cl_dists); 355 | 356 | while (cl_dists.avg_dists.size () > 1) { 357 | full_merge::OneMerge the_merge = full_merge::merge_avg (cldat, cl_dists); 358 | cldat.merges.push_back (the_merge); 359 | } 360 | } 361 | 362 | void full_merge::fill_max_dists (full_merge::FullMergeDat &cldat, 363 | full_merge::AvgDists &cl_dists) { 364 | cl_dists.avg_dists.resize (cldat.edges.size ()); 365 | size_t nc = 0; 366 | std::unordered_set edgenames; // TODO: Remove 367 | for (auto ei: cldat.edges) { 368 | full_merge::OneDist onedist; 369 | onedist.cli = ei.from; 370 | onedist.clj = ei.to; 371 | onedist.d = ei.dist; 372 | 373 | cl_dists.avg_dists [nc++] = onedist; 374 | } 375 | 376 | if (cldat.shortest) { 377 | std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (), 378 | &full_merge::maxdist_sorter_incr); 379 | } else { 380 | std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (), 381 | &full_merge::maxdist_sorter_decr); 382 | } 383 | } 384 | 385 | void full_merge::max (full_merge::FullMergeDat &cldat) { 386 | } 387 | 388 | 389 | full_merge::OneMerge full_merge::merge_max (full_merge::FullMergeDat &cldat, 390 | full_merge::AvgDists &cl_dists) { 391 | 392 | full_merge::OneDist the_dist = cl_dists.avg_dists [0]; 393 | const double dtot = the_dist.di + the_dist.dj + the_dist.d; 394 | const size_t ntot = the_dist.ni + the_dist.nj + 1; 395 | const double average = dtot / static_cast (ntot); 396 | const int cli = the_dist.cli, 397 | clj = the_dist.clj; 398 | double dmin = INFINITE_DOUBLE; // shortest connecting distance 399 | if (!cldat.shortest) { 400 | dmin = -dmin; 401 | } 402 | 403 | indxset_t cli_indx = cl_dists.cl_map.at (cli), 404 | clj_indx = cl_dists.cl_map.at (clj); 405 | // update cli_indx & clj_indx entries, and get value of dmin 406 | for (auto i: clj_indx) { 407 | cl_dists.avg_dists [i].dj = dtot; 408 | cl_dists.avg_dists [i].nj = ntot; 409 | if (cl_dists.avg_dists [i].cli == cli) 410 | cl_dists.avg_dists [i].cli = clj; 411 | else if (cl_dists.avg_dists [i].clj == cli) 412 | cl_dists.avg_dists [i].clj = clj; 413 | if ((cldat.shortest && cl_dists.avg_dists [i].d < dmin) || 414 | (!cldat.shortest && cl_dists.avg_dists [i].d > dmin)) { 415 | dmin = cl_dists.avg_dists [i].d; 416 | } 417 | } 418 | for (auto i: cli_indx) { 419 | cl_dists.avg_dists [i].di = dtot; 420 | cl_dists.avg_dists [i].ni = ntot; 421 | if (cl_dists.avg_dists [i].cli == cli) { 422 | cl_dists.avg_dists [i].cli = clj; 423 | } else if (cl_dists.avg_dists [i].clj == cli) { 424 | cl_dists.avg_dists [i].clj = clj; 425 | } 426 | if ((cldat.shortest && cl_dists.avg_dists [i].d < dmin) || 427 | (!cldat.shortest && cl_dists.avg_dists [i].d > dmin)) { 428 | dmin = cl_dists.avg_dists [i].d; 429 | } 430 | } 431 | // Then update all dmin and average dist values 432 | for (auto i: clj_indx) { 433 | cl_dists.avg_dists [i].d = dmin; 434 | cl_dists.avg_dists [i].value = 435 | (cl_dists.avg_dists [i].di + dtot + dmin) / 436 | static_cast (cl_dists.avg_dists [i].ni + ntot + 1); 437 | } 438 | for (auto i: cli_indx) { 439 | cl_dists.avg_dists [i].d = dmin; 440 | cl_dists.avg_dists [i].value = 441 | (cl_dists.avg_dists [i].dj + dtot + dmin) / 442 | static_cast (cl_dists.avg_dists [i].nj + ntot + 1); 443 | } 444 | cl_dists.avg_dists.pop_front (); 445 | 446 | // These can now have reverse-duplicated entries because after merging A->B 447 | // entries A->C and C->B will become B->C and C->B. There can also be D->A 448 | // and D->B which will both become D->B. 449 | std::vector rm; 450 | std::unordered_set edge_names; 451 | for (size_t i = 0; i < cl_dists.avg_dists.size (); i++) { 452 | std::string cij = std::to_string (cl_dists.avg_dists [i].cli) + "-" + 453 | std::to_string (cl_dists.avg_dists [i].clj), 454 | cji = std::to_string (cl_dists.avg_dists [i].clj) + "-" + 455 | std::to_string (cl_dists.avg_dists [i].cli); 456 | if (edge_names.find (cij) == edge_names.end () && 457 | edge_names.find (cji) == edge_names.end ()) { 458 | edge_names.emplace (cij); 459 | } else { 460 | rm.push_back (static_cast (i)); 461 | } 462 | } 463 | std::sort (rm.begin (), rm.end (), std::greater ()); 464 | for (auto i: rm) { 465 | cl_dists.avg_dists.erase (cl_dists.avg_dists.begin () + i); 466 | } 467 | 468 | if (cldat.shortest) { 469 | std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (), 470 | &full_merge::avgdist_sorter_incr); 471 | } else { 472 | std::sort (cl_dists.avg_dists.begin (), cl_dists.avg_dists.end (), 473 | &full_merge::avgdist_sorter_decr); 474 | } 475 | 476 | // Finally, update the cl_dists.cli_map & clj_map entries 477 | fill_cl_indx_maps (cl_dists); 478 | 479 | full_merge::OneMerge the_merge; 480 | the_merge.cli = cli; 481 | the_merge.clj = clj; 482 | the_merge.merge_dist = average; 483 | 484 | 485 | return the_merge; 486 | } 487 | 488 | //' rcpp_full_merge 489 | //' 490 | //' Merge clusters generated by rcpp_full_initial to full hierarchy of all 491 | //' possible merges. 492 | //' 493 | //' @noRd 494 | // [[Rcpp::export]] 495 | Rcpp::NumericMatrix rcpp_full_merge ( 496 | const Rcpp::DataFrame gr, 497 | const std::string linkage, 498 | const bool shortest) 499 | { 500 | full_merge::FullMergeDat clmerge_dat; 501 | clmerge_dat.shortest = shortest; 502 | full_merge::init (gr, clmerge_dat); 503 | 504 | if (utils::strfound (linkage, "single")) { 505 | full_merge::merge_single (clmerge_dat); 506 | } else if (utils::strfound (linkage, "average")) { 507 | full_merge::avg (clmerge_dat); 508 | } else if (utils::strfound (linkage, "max")) { 509 | full_merge::max (clmerge_dat); 510 | } else { 511 | Rcpp::stop ("linkage not found for full_merge"); 512 | } 513 | 514 | const size_t n = clmerge_dat.merges.size (); 515 | Rcpp::NumericMatrix res (static_cast (n), 3); 516 | for (size_t i = 0; i < n; i++) { 517 | res (i, 0) = clmerge_dat.merges [i].cli; 518 | res (i, 1) = clmerge_dat.merges [i].clj; 519 | res (i, 2) = clmerge_dat.merges [i].merge_dist; 520 | } 521 | 522 | std::vector colnames (3); 523 | colnames [0] = "from"; 524 | colnames [1] = "to"; 525 | colnames [2] = "dist"; 526 | Rcpp::List dimnames (2); 527 | dimnames (1) = colnames; 528 | res.attr ("dimnames") = dimnames; 529 | 530 | return res; 531 | } 532 | -------------------------------------------------------------------------------- /src/full-merge.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "utils.h" 6 | 7 | // Merge the clusters generated by the rcpp_full_initial. Separate class and 8 | // routines to allow results from rcpp_full_initial to be returned and cached 9 | // for subsequent re-merging. 10 | 11 | namespace full_merge { 12 | 13 | struct OneCluster { 14 | int id; 15 | size_t n; 16 | double dist_sum, dist_max; 17 | std::vector edges; 18 | }; 19 | 20 | struct OneMerge { 21 | int cli, clj; 22 | double merge_dist; 23 | }; 24 | 25 | struct FullMergeDat { 26 | bool shortest; 27 | std::unordered_map cl_remap; 28 | std::unordered_map cl_members; 29 | std::unordered_map clusters; 30 | std::vector edges; // edges between clusters 31 | std::vector merges; 32 | }; 33 | 34 | struct OneDist { 35 | int cli, clj; 36 | size_t ni, nj; 37 | double di, dj, d, value; 38 | // di, dj are dist_sums, d is min dist of connecting edge 39 | }; 40 | 41 | struct AvgDists { 42 | std::unordered_map cl_map; 43 | std::deque avg_dists; 44 | }; 45 | 46 | void init (const Rcpp::DataFrame &gr, FullMergeDat &cldat); 47 | 48 | OneMerge merge_one_single (FullMergeDat &cldat, index_t ei); 49 | void merge_single (FullMergeDat &cldat); 50 | 51 | bool avgdist_sorter_incr (const OneDist &lhs, const OneDist &rhs); 52 | bool avgdist_sorter_decr (const OneDist &lhs, const OneDist &rhs); 53 | void fill_avg_dists (FullMergeDat &cldat, AvgDists &cl_dists); 54 | void fill_cl_indx_maps (AvgDists &cl_dists); 55 | OneMerge merge_avg (FullMergeDat &cldat, AvgDists &cl_dists); 56 | void avg (FullMergeDat &cldat); 57 | 58 | bool maxdist_sorter_incr (const OneDist &lhs, const OneDist &rhs); 59 | bool maxdist_sorter_decr (const OneDist &lhs, const OneDist &rhs); 60 | void fill_max_dists (FullMergeDat &cldat, AvgDists &cl_dists); 61 | OneMerge merge_max (FullMergeDat &cldat, AvgDists &cl_dists); 62 | void max (FullMergeDat &cldat); 63 | 64 | } // end namespace full_merge 65 | 66 | Rcpp::NumericMatrix rcpp_full_merge ( 67 | const Rcpp::DataFrame gr, 68 | const std::string method, 69 | const bool shortest); 70 | -------------------------------------------------------------------------------- /src/mst.cpp: -------------------------------------------------------------------------------- 1 | #include "mst.h" 2 | 3 | std::vector mst (Rcpp::IntegerVector from, 4 | Rcpp::IntegerVector to, 5 | Rcpp::NumericVector d) { 6 | const size_t n = static_cast (from.size ()); 7 | 8 | std::vector edges (n); 9 | for (size_t i = 0; i < n; i++) { 10 | MSTEdge ei; 11 | ei.from = from (i); 12 | ei.to = to (i); 13 | ei.dist = d (i); 14 | edges [i] = ei; 15 | } 16 | 17 | std::vector cl_id (n); 18 | for (size_t i = 0; i < n; i++) { 19 | cl_id [i] = i; 20 | } 21 | 22 | std::sort (edges.begin (), edges.end ()); 23 | 24 | std::vector result; 25 | 26 | for (MSTEdge e : edges) { 27 | const size_t cl_from = cl_id [static_cast (e.from)], 28 | cl_to = cl_id [static_cast (e.to)]; 29 | 30 | if (cl_from != cl_to) { 31 | result.push_back (e); 32 | 33 | const size_t cl_min = std::min (cl_from, cl_to), 34 | cl_max = std::max (cl_from, cl_to); 35 | 36 | for (size_t i = 0; i < n; i++) { 37 | if (cl_id [i] == cl_max) { 38 | cl_id [i] = cl_min; 39 | } 40 | } 41 | } 42 | } 43 | 44 | return result; 45 | } 46 | 47 | 48 | //' rcpp_mst 49 | //' 50 | //' Minimum spanning tree 51 | //' 52 | //' @noRd 53 | // [[Rcpp::export]] 54 | Rcpp::DataFrame rcpp_mst (Rcpp::DataFrame input) { 55 | Rcpp::IntegerVector from = input ["from"]; 56 | Rcpp::IntegerVector to = input ["to"]; 57 | Rcpp::NumericVector d = input ["d"]; 58 | 59 | std::vector tree = mst (from, to, d); 60 | 61 | Rcpp::IntegerVector from_out (tree.size ()); 62 | Rcpp::IntegerVector to_out (tree.size ()); 63 | Rcpp::NumericVector d_out (tree.size ()); 64 | size_t i = 0; 65 | for (auto t: tree) { 66 | from_out (i) = t.from; 67 | to_out (i) = t.to; 68 | d_out (i) = t.dist; 69 | i++; 70 | } 71 | 72 | Rcpp::DataFrame res = Rcpp::DataFrame::create ( 73 | Rcpp::Named ("from") = from_out, 74 | Rcpp::Named ("to") = to_out, 75 | Rcpp::Named ("d") = d_out, 76 | Rcpp::_["stringsAsFactors"] = false); 77 | 78 | return res; 79 | }; 80 | -------------------------------------------------------------------------------- /src/mst.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | struct MSTEdge { 7 | int from, to; 8 | double dist; 9 | bool operator<(const MSTEdge& rhs) const { return dist < rhs.dist; } 10 | }; 11 | 12 | std::vector mst (Rcpp::IntegerVector from, 13 | Rcpp::IntegerVector to, 14 | Rcpp::NumericVector d); 15 | -------------------------------------------------------------------------------- /src/slk.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "utils.h" 3 | #include "slk.h" 4 | #include 5 | 6 | // --------- SINGLE LINKAGE CLUSTER ---------------- 7 | 8 | //' rcpp_slk 9 | //' 10 | //' Full-order single linkage cluster redcap algorithm 11 | //' 12 | //' @noRd 13 | // [[Rcpp::export]] 14 | Rcpp::IntegerVector rcpp_slk ( 15 | const Rcpp::DataFrame gr_full, 16 | const Rcpp::DataFrame gr, 17 | const bool shortest, 18 | const bool quiet) { 19 | Rcpp::IntegerVector from_full_ref = gr_full ["from"]; 20 | Rcpp::IntegerVector to_full_ref = gr_full ["to"]; 21 | Rcpp::NumericVector d_full = gr_full ["d"]; 22 | Rcpp::IntegerVector from_ref = gr ["from"]; 23 | Rcpp::IntegerVector to_ref = gr ["to"]; 24 | Rcpp::NumericVector d = gr ["d"]; 25 | 26 | // Rcpp classes are always passed by reference, so cloning is necessary to 27 | // avoid modifying the original data.frames. 28 | Rcpp::IntegerVector from_full = Rcpp::clone (from_full_ref); 29 | Rcpp::IntegerVector to_full = Rcpp::clone (to_full_ref); 30 | Rcpp::IntegerVector from = Rcpp::clone (from_ref); 31 | Rcpp::IntegerVector to = Rcpp::clone (to_ref); 32 | 33 | // Index vectors are 1-indexed, so 34 | from_full = from_full - 1; 35 | to_full = to_full - 1; 36 | from = from - 1; 37 | to = to - 1; 38 | 39 | arma::Mat contig_mat; 40 | arma::Mat d_mat; 41 | 42 | // index2cl and cl2index are dynamically updated with cluster memberships; 43 | // vert2index and index2vert are retained at initial values which map (from, 44 | // to) vectors to matrix indices. All operations are performed on matrices 45 | // directly, with membership re-traced at the end via index2vert_map. 46 | int2indxset_map_t cl2index_map; 47 | int2indx_map_t vert2index_map; 48 | indx2int_map_t index2vert_map, index2cl_map; 49 | 50 | size_t n = utils::sets_init (from, to, vert2index_map, index2vert_map, 51 | index2cl_map, cl2index_map); 52 | 53 | utils_slk::mats_init (from, to, d, vert2index_map, contig_mat, d_mat, 54 | shortest); 55 | 56 | /* The contiguity matrix retains is shape, so is always indexed by the 57 | * (from, to) vectors. Merging clusters simply switches additional entries 58 | * from 0 to 1. 59 | */ 60 | 61 | const bool really_quiet = !(!quiet && n > 100); 62 | 63 | indxset_t the_tree; 64 | size_t e = 0; // edge number in gr_full 65 | while (the_tree.size () < (n - 1)) {// tree has n - 1 edges 66 | Rcpp::checkUserInterrupt (); 67 | 68 | index_t ifrom = vert2index_map.at (from_full (e)), 69 | ito = vert2index_map.at (to_full (e)); 70 | if (index2cl_map.find (ifrom) != index2cl_map.end () && 71 | index2cl_map.find (ito) != index2cl_map.end ()) { 72 | int cfrom = index2cl_map.at (ifrom), 73 | cto = index2cl_map.at (ito); 74 | if (cfrom != cto && 75 | contig_mat (static_cast (ifrom), 76 | static_cast (ito)) > 0) { 77 | size_t ishort = utils::find_shortest_connection (from, to, 78 | vert2index_map, d_mat, cl2index_map, cfrom, cto, 79 | shortest); 80 | the_tree.insert (ishort); 81 | utils::merge_clusters (contig_mat, 82 | index2cl_map, cl2index_map, cfrom, cto); 83 | e = 0; 84 | } else { 85 | e++; 86 | } 87 | } else { 88 | e++; 89 | } 90 | if (!really_quiet && the_tree.size () % 100 == 0) { 91 | Rcpp::Rcout << "\rBuilding tree: " << the_tree.size () << " / " << 92 | n - 1; 93 | Rcpp::Rcout.flush (); 94 | } 95 | } 96 | 97 | if (!really_quiet) { 98 | Rcpp::Rcout << "\rBuilding tree: " << the_tree.size () << " / " << 99 | n - 1 << " -> done" << std::endl; 100 | } 101 | 102 | std::vector treevec (the_tree.begin (), the_tree.end ()); 103 | 104 | return Rcpp::wrap (treevec); 105 | } 106 | -------------------------------------------------------------------------------- /src/slk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // --------- SINGLE LINKAGE CLUSTER ---------------- 4 | 5 | Rcpp::IntegerVector rcpp_slk ( 6 | const Rcpp::DataFrame gr_full, 7 | const Rcpp::DataFrame gr, 8 | const bool shortest, 9 | const bool quiet); 10 | -------------------------------------------------------------------------------- /src/spatialcluster_init.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | 6 | /* FIXME: 7 | Check these declarations against the C/Fortran source code. 8 | */ 9 | 10 | /* .Call calls */ 11 | extern SEXP _spatialcluster_rcpp_alk(SEXP, SEXP, SEXP); 12 | extern SEXP _spatialcluster_rcpp_clk(SEXP, SEXP, SEXP, SEXP); 13 | extern SEXP _spatialcluster_rcpp_cut_tree(SEXP, SEXP, SEXP, SEXP); 14 | extern SEXP _spatialcluster_rcpp_full_initial(SEXP, SEXP); 15 | extern SEXP _spatialcluster_rcpp_full_merge(SEXP, SEXP, SEXP); 16 | extern SEXP _spatialcluster_rcpp_mst(SEXP); 17 | extern SEXP _spatialcluster_rcpp_slk(SEXP, SEXP, SEXP, SEXP); 18 | 19 | static const R_CallMethodDef CallEntries[] = { 20 | {"_spatialcluster_rcpp_alk", (DL_FUNC) &_spatialcluster_rcpp_alk, 3}, 21 | {"_spatialcluster_rcpp_clk", (DL_FUNC) &_spatialcluster_rcpp_clk, 4}, 22 | {"_spatialcluster_rcpp_cut_tree", (DL_FUNC) &_spatialcluster_rcpp_cut_tree, 4}, 23 | {"_spatialcluster_rcpp_full_initial", (DL_FUNC) &_spatialcluster_rcpp_full_initial, 2}, 24 | {"_spatialcluster_rcpp_full_merge", (DL_FUNC) &_spatialcluster_rcpp_full_merge, 3}, 25 | {"_spatialcluster_rcpp_mst", (DL_FUNC) &_spatialcluster_rcpp_mst, 1}, 26 | {"_spatialcluster_rcpp_slk", (DL_FUNC) &_spatialcluster_rcpp_slk, 4}, 27 | {NULL, NULL, 0} 28 | }; 29 | 30 | void R_init_spatialcluster(DllInfo *dll) 31 | { 32 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 33 | R_useDynamicSymbols(dll, FALSE); 34 | } 35 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "utils.h" 3 | #include 4 | 5 | // Note that all matrices **CAN** be asymmetrical, and so are always indexed 6 | // (from, to) 7 | 8 | /* These routines all work with 2 lots of 2 main maps: 9 | * 1. vert2index and index2vert maps 10 | * 2. cl2index and index2cl maps 11 | * 12 | * The former map all vertices enumerated in the original from and to vectors to 13 | * sequential index numbers into the matrices (dists, contig_mat, whatever). The 14 | * latter are initially direct a->a maps of all indices to themselves. As 15 | * clusters merge, the values of index2cl maps are updated so that, for example, 16 | * index2cl(a)->b and index2cl(b)->b. The cl2index map then holds an 17 | * unordered_set of target indices for each cluster. 18 | */ 19 | 20 | bool utils::strfound (const std::string str, const std::string target) { 21 | bool found = false; 22 | if (str.find (target) != std::string::npos) 23 | found = true; 24 | return found; 25 | } 26 | 27 | size_t utils::sets_init ( 28 | const Rcpp::IntegerVector &from, 29 | const Rcpp::IntegerVector &to, 30 | int2indx_map_t &vert2index_map, 31 | indx2int_map_t &index2vert_map, 32 | indx2int_map_t &index2cl_map, 33 | int2indxset_map_t &cl2index_map) { 34 | vert2index_map.clear (); 35 | index2vert_map.clear (); 36 | index2cl_map.clear (); 37 | cl2index_map.clear (); 38 | 39 | intset_t vert_set; 40 | for (int i = 0; i < from.size (); i++) { 41 | vert_set.emplace (from [i]); 42 | vert_set.emplace (to [i]); 43 | } 44 | int idx = 0; 45 | for (auto v: vert_set) { 46 | index2vert_map.emplace (idx, v); 47 | vert2index_map.emplace (v, idx++); 48 | } 49 | 50 | for (int i = 0; i < from.length (); i++) { 51 | size_t fi = vert2index_map.at (from [i]); 52 | indxset_t eset; // only one index for each vert at this stage 53 | eset.insert (fi); 54 | cl2index_map.emplace (fi, eset); 55 | } 56 | for (auto v: vert_set) { 57 | // all verts are their own clusters, so cast indx to cli 58 | int cli = static_cast (vert2index_map.at (v)); 59 | indxset_t eset; 60 | if (cl2index_map.find (cli) != cl2index_map.end ()) { 61 | eset = cl2index_map.at (cli); 62 | } 63 | eset.emplace (cli); 64 | cl2index_map.emplace (cli, eset); 65 | index2cl_map.emplace (static_cast (cli), cli); 66 | } 67 | 68 | return static_cast (vert_set.size ()); 69 | } 70 | 71 | //' find shortest (or longest) connection between two clusters 72 | //' @param from, to, d the columns of the edge graph 73 | //' @param d_mat distance matrix between all edges (not between clusters!) 74 | //' @param cl2vert_map map of list of all (from, to, d) edges for each cluster 75 | //' @param cfrom Number of cluster which is to be merged 76 | //' @param cto Number of cluster with which it is to be merged 77 | //' 78 | //' @return Index directly into from, to - **NOT** into the actual matrices! 79 | //' @noRd 80 | size_t utils::find_shortest_connection ( 81 | const Rcpp::IntegerVector &from, 82 | const Rcpp::IntegerVector &to, 83 | const int2indx_map_t &vert2index_map, 84 | const arma::Mat &d_mat, 85 | const int2indxset_map_t &cl2index_map, 86 | const int cfrom, 87 | const int cto, 88 | const bool shortest) { 89 | if (cl2index_map.find (cfrom) == cl2index_map.end ()) { 90 | Rcpp::stop ("cluster index not found"); 91 | } 92 | if (cl2index_map.find (cto) == cl2index_map.end ()) { 93 | Rcpp::stop ("cluster index not found"); 94 | } 95 | 96 | indxset_t index_i = cl2index_map.at (cfrom), 97 | index_j = cl2index_map.at (cto); 98 | 99 | double dlim = INFINITE_DOUBLE; 100 | if (!shortest) { 101 | dlim = -dlim; 102 | } 103 | size_t short_i = INFINITE_INT, short_j = INFINITE_INT; 104 | 105 | // from and to here are directional, so need to examine both directions 106 | for (auto i: index_i) { 107 | for (auto j: index_j) { 108 | arma::uword ia = static_cast (i), 109 | ja = static_cast (j); 110 | if ((shortest && d_mat (ia, ja) < dlim) || 111 | (!shortest && d_mat (ia, ja) > dlim)) { 112 | dlim = d_mat (ia, ja); 113 | short_i = i; 114 | short_j = j; 115 | } else if ((shortest && d_mat (ja, ia) < dlim) || 116 | (!shortest && d_mat (ja, ia) > dlim)) { 117 | dlim = d_mat (ja, ia); 118 | short_i = j; 119 | short_j = i; 120 | } 121 | } 122 | } 123 | if (dlim == INFINITE_DOUBLE) { 124 | Rcpp::stop ("no minimal distance; this should not happen"); 125 | } 126 | 127 | // convert short_i and short_j to a single edge 128 | // TODO: Make a std::map of vert2dist to avoid this loop 129 | size_t shortest_edge = INFINITE_INT; 130 | for (int i = 0; i < from.length (); i++) { // int for Rcpp index 131 | if ((vert2index_map.at (from [i]) == short_i && 132 | vert2index_map.at (to [i]) == short_j) || 133 | (vert2index_map.at (from [i]) == short_j && 134 | vert2index_map.at (to [i]) == short_i)) { 135 | shortest_edge = static_cast (i); 136 | break; 137 | } 138 | } 139 | if (shortest_edge == INFINITE_INT) { 140 | Rcpp::stop ("This shouldn't happen (in utils::find_shortest_connection)"); 141 | } 142 | 143 | return shortest_edge; 144 | } 145 | 146 | //' merge two clusters in the contiguity matrix, reducing the size of the matrix 147 | //' by one row and column. 148 | //' 149 | //' @return A logical parameter indicating whether or not the newly formed 150 | //' cluster has any outgoing connections. 151 | //' @noRd 152 | void utils::merge_clusters ( 153 | arma::Mat &contig_mat, 154 | indx2int_map_t &index2cl_map, 155 | int2indxset_map_t &cl2index_map, 156 | const int cluster_from, 157 | const int cluster_to) { 158 | if (cluster_from < 0) { 159 | Rcpp::stop ("cluster_from must be non-negative"); 160 | } 161 | if (cluster_to < 0) { 162 | Rcpp::stop ("cluster_to must be non-negative"); 163 | } 164 | 165 | arma::uword cfr = static_cast (cluster_from), 166 | cto = static_cast (cluster_to); 167 | // Set all contig_mat (cluster_from, .) to 1 168 | for (arma::uword i = 0; i < contig_mat.n_rows; i++) { 169 | if (contig_mat (cfr, i) == 1 || contig_mat (i, cfr) == 1) { 170 | contig_mat (cfr, i) = contig_mat (i, cfr) = 1; 171 | contig_mat (cto, i) = contig_mat (i, cto) = 1; 172 | } 173 | } 174 | 175 | indxset_t idx_from = cl2index_map.at (cluster_from), 176 | idx_to = cl2index_map.at (cluster_to); 177 | 178 | for (auto i: idx_from) { 179 | for (auto j: idx_to) { 180 | arma::uword ia = static_cast (i), 181 | ja = static_cast (j); 182 | contig_mat (ia, ja) = contig_mat (ja, ia) = 1; 183 | } 184 | } 185 | 186 | // then re-number all cluster numbers in cl2index 187 | cl2index_map.erase (cluster_from); 188 | cl2index_map.erase (cluster_to); 189 | for (auto i: idx_from) { 190 | idx_to.insert (i); 191 | } 192 | cl2index_map.emplace (cluster_to, idx_to); 193 | // and in index2cl: 194 | for (auto i: idx_from) { 195 | index2cl_map.erase (i); 196 | index2cl_map.emplace (i, cluster_to); 197 | } 198 | } 199 | 200 | //' initial contiguity and distance matrices. The contiguity matrix is between 201 | //' clusters, so is constantly modified, whereas the distance matrix is between 202 | //' edges, so is fixed at load time. 203 | //' @noRd 204 | void utils_slk::mats_init ( 205 | const Rcpp::IntegerVector &from, 206 | const Rcpp::IntegerVector &to, 207 | const Rcpp::NumericVector &d, 208 | const int2indx_map_t &vert2index_map, 209 | arma::Mat &contig_mat, 210 | arma::Mat &d_mat, 211 | bool shortest) { 212 | // arma::uword = unsigned int 213 | const arma::uword n = static_cast (vert2index_map.size ()); 214 | 215 | contig_mat = arma::zeros > (n, n); 216 | //d_mat = arma::zeros > (n, n); 217 | d_mat.resize (n, n); 218 | if (shortest) { 219 | d_mat.fill (INFINITE_DOUBLE); 220 | } else { 221 | d_mat.fill (-INFINITE_DOUBLE); 222 | } 223 | 224 | for (int i = 0; i < from.length (); i++) { 225 | arma::uword fi = static_cast (vert2index_map.at (from [i])), 226 | ti = static_cast (vert2index_map.at (to [i])); 227 | contig_mat (fi, ti) = contig_mat (ti, fi) = 1; 228 | d_mat (fi, ti) = d_mat (ti, fi) = d [i]; 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace utils { 4 | 5 | bool strfound (const std::string str, const std::string target); 6 | 7 | struct OneEdge { 8 | int from, to; 9 | double dist; 10 | }; 11 | 12 | size_t sets_init ( 13 | const Rcpp::IntegerVector &from, 14 | const Rcpp::IntegerVector &to, 15 | int2indx_map_t &vert2index_map, 16 | indx2int_map_t &index2vert_map, 17 | indx2int_map_t &index2cl_map, 18 | int2indxset_map_t &cl2index_map); 19 | 20 | size_t find_shortest_connection ( 21 | const Rcpp::IntegerVector &from, 22 | const Rcpp::IntegerVector &to, 23 | const int2indx_map_t &vert2index_map, 24 | const arma::Mat &d_mat, 25 | const int2indxset_map_t &cl2index_map, 26 | const int cfrom, 27 | const int cto, 28 | const bool shortest); 29 | 30 | void merge_clusters ( 31 | arma::Mat &contig_mat, 32 | indx2int_map_t &index2cl_map, 33 | int2indxset_map_t &cl2index_map, 34 | const int merge_from, 35 | const int merge_to); 36 | 37 | } // end namespace utils 38 | 39 | // These are only used in slk 40 | namespace utils_slk { 41 | 42 | void mats_init ( 43 | const Rcpp::IntegerVector &from, 44 | const Rcpp::IntegerVector &to, 45 | const Rcpp::NumericVector &d, 46 | const int2indx_map_t &vert2index_map, 47 | arma::Mat &contig_mat, 48 | arma::Mat &d_mat, 49 | bool shortest); 50 | 51 | void dmat_full_init ( 52 | const Rcpp::IntegerVector &from, 53 | const Rcpp::IntegerVector &to, 54 | const Rcpp::NumericVector &d, 55 | const int2indx_map_t &vert2index_map, 56 | arma::Mat &d_mat, 57 | bool shortest); 58 | 59 | } // end namespace utils_slk 60 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library (testthat) 2 | library (spatialcluster) 3 | 4 | test_check ("spatialcluster") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-full.R: -------------------------------------------------------------------------------- 1 | context ("full") 2 | 3 | test_that ("structure", { 4 | set.seed (1) 5 | n <- 100 6 | xy <- matrix (runif (2 * n), ncol = 2) 7 | dmat <- matrix (runif (n^2), ncol = n) 8 | ncl <- 4 9 | scl <- scl_full (xy, dmat, ncl = ncl) 10 | expect_is (scl, "scl") 11 | expect_true (scl$pars$ncl == ncl) 12 | expect_true (all (names (scl) %in% 13 | c ( 14 | "tree", "merges", "ord", "nodes", 15 | "pars", "statistics" 16 | ))) 17 | cl <- scl$nodes$cluster [!is.na (scl$nodes$cluster)] 18 | expect_true (length (unique (cl)) == ncl) 19 | }) 20 | 21 | test_that ("methods", { 22 | set.seed (1) 23 | n <- 100 24 | xy <- matrix (runif (2 * n), ncol = 2) 25 | dmat <- matrix (runif (n^2), ncol = n) 26 | ncl <- 8 27 | scl1 <- scl_full (xy, dmat, ncl = ncl, linkage = "single") 28 | scl2 <- scl_full (xy, dmat, ncl = ncl, linkage = "average") 29 | expect_true (!identical (scl1, scl2)) 30 | cl1 <- scl1$nodes$cluster [!is.na (scl1$nodes$cluster)] 31 | expect_equal (length (unique (cl1)), ncl) 32 | cl2 <- scl2$nodes$cluster [!is.na (scl2$nodes$cluster)] 33 | expect_equal (length (unique (cl2)), ncl) 34 | }) 35 | 36 | test_that ("recluster", { 37 | set.seed (1) 38 | n <- 100 39 | xy <- matrix (runif (2 * n), ncol = 2) 40 | dmat <- matrix (runif (n^2), ncol = n) 41 | scl <- scl_full (xy, dmat, ncl = 4) 42 | scl1 <- scl_full (scl, ncl = 3) 43 | scl2 <- scl_recluster (scl, ncl = 3) 44 | expect_identical (scl1, scl2) 45 | expect_error ( 46 | scl3 <- scl_redcap (scl, ncl = 3), 47 | "scl_redcap can pass to scl_recluster only" 48 | ) 49 | }) 50 | -------------------------------------------------------------------------------- /tests/testthat/test-plots.R: -------------------------------------------------------------------------------- 1 | context ("plot") 2 | 3 | test_that ("redcap plot", { 4 | set.seed (1) 5 | n <- 100 6 | xy <- matrix (runif (2 * n), ncol = 2) 7 | dmat <- matrix (runif (n^2), ncol = n) 8 | scl <- scl_redcap (xy, dmat, ncl = 4) 9 | g1 <- plot (scl) 10 | expect_is (g1, "ggplot") 11 | g2 <- plot (scl, hull_alpha = 0.5) 12 | # The hulls are then contained in 13 | h1 <- g1$layers [[2]]$data 14 | h2 <- g2$layers [[2]]$data 15 | expect_true (!identical (h1, h2)) 16 | }) 17 | 18 | test_that ("full plot", { 19 | set.seed (1) 20 | n <- 100 21 | xy <- matrix (runif (2 * n), ncol = 2) 22 | dmat <- matrix (runif (n^2), ncol = n) 23 | scl <- scl_full (xy, dmat, ncl = 4) 24 | g1 <- plot (scl) 25 | expect_is (g1, "ggplot") 26 | g2 <- plot (scl, hull_alpha = 0.5) 27 | # The hulls are then contained in 28 | h1 <- g1$layers [[2]]$data 29 | h2 <- g2$layers [[2]]$data 30 | expect_true (!identical (h1, h2)) 31 | }) 32 | 33 | test_that ("plot_merges", { 34 | set.seed (1) 35 | n <- 100 36 | xy <- matrix (runif (2 * n), ncol = 2) 37 | dmat <- matrix (runif (n^2), ncol = n) 38 | scl <- scl_full (xy, dmat, ncl = 8) 39 | expect_silent (plot_merges (scl)) 40 | graphics.off () 41 | expect_silent (plot_merges (scl, root_tree = TRUE)) 42 | graphics.off () 43 | scl <- scl_redcap (xy, dmat, ncl = 8) 44 | expect_error ( 45 | plot_merges (scl), 46 | "plot_merges can only be applied to scl objects" 47 | ) 48 | }) 49 | -------------------------------------------------------------------------------- /tests/testthat/test-redcap.R: -------------------------------------------------------------------------------- 1 | context ("redcap ") 2 | 3 | test_that ("structure", { 4 | set.seed (1) 5 | n <- 100 6 | xy <- matrix (runif (2 * n), ncol = 2) 7 | dmat <- matrix (runif (n^2), ncol = n) 8 | scl <- scl_redcap (xy, dmat, ncl = 4) 9 | expect_is (scl, "scl") 10 | expect_true (scl$pars$ncl >= 4) 11 | expect_true (all (names (scl) %in% 12 | c ("tree", "nodes", "pars", "statistics"))) 13 | expect_true (nrow (scl$tree) < n) 14 | }) 15 | 16 | test_that ("methods", { 17 | set.seed (1) 18 | n <- 100 19 | xy <- matrix (runif (2 * n), ncol = 2) 20 | dmat <- matrix (runif (n^2), ncol = n) 21 | scl1 <- scl_redcap (xy, dmat, ncl = 4, shortest = FALSE) 22 | scl2 <- scl_redcap (xy, dmat, ncl = 4, shortest = TRUE) 23 | expect_true (!identical (scl1, scl2)) 24 | 25 | # these are the default pars: 26 | scl3 <- scl_redcap (xy, dmat, ncl = 4, full_order = TRUE) 27 | expect_true (!identical (scl1, scl2)) 28 | 29 | scl4 <- scl_redcap (xy, dmat, ncl = 4, linkage = "single") 30 | expect_true (identical (scl3, scl4)) 31 | scl5 <- scl_redcap (xy, dmat, ncl = 4, linkage = "average") 32 | expect_false (identical (scl4, scl5)) 33 | scl6 <- scl_redcap (xy, dmat, ncl = 4, linkage = "complete") 34 | expect_false (identical (scl6, scl5)) 35 | expect_false (identical (scl6, scl4)) 36 | scl7 <- scl_redcap (xy, dmat, ncl = 4, full_order = FALSE) 37 | expect_false (identical (scl7, scl4)) 38 | expect_false (identical (scl7, scl5)) 39 | expect_false (identical (scl7, scl6)) 40 | expect_error ( 41 | scl8 <- scl_redcap (xy, dmat, 42 | ncl = 4, 43 | linkage = "blah" 44 | ), 45 | "linkage must be one of" 46 | ) 47 | }) 48 | 49 | test_that ("recluster", { 50 | set.seed (1) 51 | n <- 100 52 | xy <- matrix (runif (2 * n), ncol = 2) 53 | dmat <- matrix (runif (n^2), ncol = n) 54 | scl <- scl_redcap (xy, dmat, ncl = 4) 55 | scl2 <- scl_recluster (scl, ncl = 3) 56 | expect_message ( 57 | scl3 <- scl_redcap (scl, ncl = 3), 58 | "scl_redcap is for initial cluster" 59 | ) 60 | expect_identical (scl2, scl3) 61 | expect_true (!identical (scl, scl2)) 62 | }) 63 | -------------------------------------------------------------------------------- /vignettes/makefile: -------------------------------------------------------------------------------- 1 | LFILE = spatialcluster 2 | 3 | all: knith 4 | #all: knith open 5 | 6 | knith: $(LFILE).Rmd 7 | echo "rmarkdown::render('$(LFILE).Rmd',output_file='$(LFILE).html')" | R --no-save -q 8 | 9 | knitr: $(LFILE).Rmd 10 | echo "rmarkdown::render('$(LFILE).Rmd',rmarkdown::md_document(variant='markdown_github'))" | R --no-save -q 11 | 12 | open: $(LFILE).html 13 | xdg-open $(LFILE).html & 14 | 15 | clean: 16 | rm -rf *.html *.png 17 | -------------------------------------------------------------------------------- /vignettes/spatialcluster.bib: -------------------------------------------------------------------------------- 1 | @article{Guo2008, 2 | author = {D. Guo }, 3 | title = {Regionalization with dynamically constrained agglomerative 4 | clustering and partitioning (REDCAP)}, 5 | journal = {International Journal of Geographical Information Science}, 6 | volume = {22}, 7 | number = {7}, 8 | pages = {801-823}, 9 | year = {2008}, 10 | publisher = {Taylor & Francis}, 11 | doi = {10.1080/13658810701674970}, 12 | URL = {https://doi.org/10.1080/13658810701674970} 13 | } 14 | 15 | @incollection{Lu2009, 16 | title = "Spatial Clustering, Detection and Analysis of ", 17 | editor = "Kitchin, Rob and Thrift, Nigel ", 18 | booktitle = "International Encyclopedia of Human Geography ", 19 | publisher = "Elsevier", 20 | edition = "", 21 | address = "Oxford", 22 | year = {2009}, 23 | pages = {317 - 324}, 24 | isbn = "978-0-08-044910-4", 25 | doi = "https://doi.org/10.1016/B978-008044910-4.00523-X", 26 | url = "https://www.sciencedirect.com/science/article/pii/B978008044910400523X", 27 | author = "Y. Lu", 28 | } 29 | 30 | @incollection{Han2001, 31 | author = {Han, Jiawei and Kamber, M and Tung, Anthony}, 32 | year = {2001}, 33 | month = {01}, 34 | pages = {}, 35 | title = {Spatial clustering methods in data mining: a survey}, 36 | booktitle = {Data Mining and Knowledge Discovery - DATAMINE}, 37 | url = "http://hanj.cs.illinois.edu/pdf/gkdbk01.pdf" 38 | } 39 | 40 | @article{Assuncao2006, 41 | author = {R.M. AssunÇão and M.C. Neves and G. Câmara and C. Da Costa Freitas}, 42 | title = {Efficient regionalization techniques for socio‐economic 43 | geographical units using minimum spanning trees}, 44 | journal = {International Journal of Geographical Information Science}, 45 | volume = {20}, 46 | number = {7}, 47 | pages = {797-811}, 48 | year = {2006}, 49 | publisher = {Taylor \& Francis}, 50 | doi = {10.1080/13658810600665111}, 51 | URL = {https://doi.org/10.1080/13658810600665111} 52 | } 53 | 54 | @article{Duque2007, 55 | author = {Juan Carlos Duque and Raúl Ramos and Jordi Suriñach}, 56 | title ={Supervised Regionalization Methods: A Survey}, 57 | journal = {International Regional Science Review}, 58 | volume = {30}, 59 | number = {3}, 60 | pages = {195-220}, 61 | year = {2007}, 62 | doi = {10.1177/0160017607301605}, 63 | URL = {https://doi.org/10.1177/0160017607301605} 64 | } 65 | 66 | @ARTICLE{Chavent2017, 67 | author = {Chavent, M. and Kuentz-Simonet, V. and Labenne, A. and Saracco J.}, 68 | title = "ClustGeo: an R package for hierarchical clustering with spatial constraints", 69 | journal = {ArXiv e-prints}, 70 | archivePrefix = "arXiv", 71 | eprint = {1707.03897}, 72 | year = {2017}, 73 | month = {jul}, 74 | URL = {https://arxiv.org/abs/1707.03897} 75 | } 76 | 77 | @article{Bivand2015, 78 | author = {Roger Bivand and Gianfranco Piras}, 79 | title = {Comparing Implementations of Estimation Methods for Spatial Econometrics}, 80 | journal = {Journal of Statistical Software, Articles}, 81 | volume = {63}, 82 | number = {18}, 83 | year = {2015}, 84 | issn = {1548-7660}, 85 | pages = {1--36}, 86 | doi = {10.18637/jss.v063.i18}, 87 | url = {https://www.jstatsoft.org/v063/i18}, 88 | note = {Reference 1 of 2 for the spdep package} 89 | } 90 | 91 | @article{Bivand2013, 92 | author = {Roger Bivand and Jan Hauke and Tomasz Kossowski}, 93 | title = {Computing the Jacobian in Gaussian Spatial Autoregressive Models: 94 | An Illustrated Comparison of Available Methods}, 95 | journal = {Geographical Analysis}, 96 | year = {2013}, 97 | volume = {45}, 98 | number = {2}, 99 | pages = {150-179}, 100 | doi = {10.1111/gean.12008}, 101 | url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/gean.12008}, 102 | note = {Reference 2 of 2 for the spdep package} 103 | } 104 | 105 | --------------------------------------------------------------------------------