├── .github ├── .gitignore ├── ISSUE_TEMPLATE │ ├── config.yml │ └── issue_template.md ├── workflows │ ├── issue.yml │ ├── pkgdown.yaml │ ├── stale-actions.yml │ ├── R-CMD-check.yaml │ └── docker-image.yml └── dependabot.yml ├── .gitignore ├── data ├── Boston.rda └── ionosphere.rda ├── LICENSE ├── R ├── package.R ├── RcppExports.R ├── dist_knn_index_dist.R ├── knn_index_dist.R ├── kernelknnCV.R ├── dist_kernelknn.R ├── utils.R └── kernelknn.R ├── tests ├── testthat.R └── testthat │ ├── helper-function_for_tests.R │ ├── test-distance_metrics.R │ ├── test-dist_knn_index_dist.R │ ├── test-kernelknn_cross_valid.R │ ├── test-utils.R │ ├── test-knn_index_dist.R │ ├── test-distmat_kernelknn.R │ └── test-kernelknn.R ├── _pkgdown.yml ├── src ├── Makevars ├── Makevars.win ├── init.c ├── RcppExports.cpp └── distance_metrics.cpp ├── man ├── normalized.Rd ├── func_shuffle.Rd ├── func_tbl.Rd ├── func_tbl_dist.Rd ├── class_folds.Rd ├── regr_folds.Rd ├── FUN_kernels.Rd ├── switch.ops.Rd ├── func_categorical_preds.Rd ├── FUNCTION_weights.Rd ├── Boston.Rd ├── distMat.knn.index.dist.Rd ├── knn.index.dist.Rd ├── ionosphere.Rd ├── KernelKnn.Rd ├── distMat.KernelKnn.Rd └── KernelKnnCV.Rd ├── NAMESPACE ├── inst └── CITATION ├── DESCRIPTION ├── Dockerfile ├── README.md ├── NEWS.md └── vignettes ├── regression_using_the_housing_data.Rmd ├── binary_classification_using_the_ionosphere_data.Rmd └── image_classification_using_MNIST_CIFAR_data.Rmd /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/ 2 | .Rbuildignore 3 | .Rhistory 4 | docs 5 | -------------------------------------------------------------------------------- /data/Boston.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/KernelKnn/HEAD/data/Boston.rda -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2016 2 | COPYRIGHT HOLDER: Mouselimis Lampros 3 | -------------------------------------------------------------------------------- /data/ionosphere.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/KernelKnn/HEAD/data/ionosphere.rda -------------------------------------------------------------------------------- /R/package.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib KernelKnn, .registration = TRUE 2 | #' @importFrom Rcpp evalCpp 3 | NULL -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(KernelKnn) 3 | 4 | test_check("KernelKnn") 5 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://mlampros.github.io/KernelKnn/ 2 | 3 | template: 4 | bootstrap: 5 5 | bootswatch: cerulean 6 | 7 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = 
-I../inst/include -I. $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD -DARMA_USE_CURRENT 2 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = -I../inst/include -I. $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD -DARMA_USE_CURRENT 2 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # For more info see: https://docs.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 2 | 3 | blank_issues_enabled: true 4 | -------------------------------------------------------------------------------- /man/normalized.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{normalized} 4 | \alias{normalized} 5 | \title{this function normalizes the data} 6 | \usage{ 7 | normalized(x) 8 | } 9 | \description{ 10 | this function normalizes the data 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/func_shuffle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_shuffle} 4 | \alias{func_shuffle} 5 | \title{shuffle data} 6 | \usage{ 7 | func_shuffle(vec, times = 10) 8 | } 9 | \description{ 10 | this function shuffles the items of a vector 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/func_tbl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_tbl} 4 | \alias{func_tbl} 5 | \title{this function returns a table of probabilities for each label} 6 | \usage{ 7 | func_tbl(DF, W, labels) 8 | } 9 | \description{ 10 | this function returns a table of probabilities for each label 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/func_tbl_dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_tbl_dist} 4 | \alias{func_tbl_dist} 5 | \title{this function returns the probabilities in case of classification} 6 | \usage{ 7 | func_tbl_dist(DF, Levels) 8 | } 9 | \description{ 10 | this function returns the probabilities in case of classification 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(KernelKnn) 4 | export(KernelKnnCV) 5 | export(distMat.KernelKnn) 6 | export(distMat.knn.index.dist) 7 | export(knn.index.dist) 8 | importFrom(Rcpp,evalCpp) 9 | importFrom(stats,dist) 10 | importFrom(stats,model.matrix) 11 | 
importFrom(utils,combn) 12 | importFrom(utils,setTxtProgressBar) 13 | importFrom(utils,txtProgressBar) 14 | useDynLib(KernelKnn, .registration = TRUE) 15 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("Please cite the package in your publications:") 2 | 3 | year <- sub("-.*", "", meta$Date) 4 | note <- sprintf("R package version %s", meta$Version) 5 | 6 | bibentry( 7 | bibtype = "Manual", 8 | title = "{KernelKnn}: Kernel k Nearest Neighbors", 9 | author = person("Lampros", "Mouselimis"), 10 | year = year, 11 | note = note, 12 | url = "https://CRAN.R-project.org/package=KernelKnn" 13 | ) 14 | 15 | 16 | -------------------------------------------------------------------------------- /man/class_folds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{class_folds} 4 | \alias{class_folds} 5 | \title{stratified folds (in classification) [ detailed information about class_folds in the FeatureSelection package ]} 6 | \usage{ 7 | class_folds(folds, RESP) 8 | } 9 | \description{ 10 | this function creates stratified folds in binary and multiclass classification 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/regr_folds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{regr_folds} 4 | \alias{regr_folds} 5 | \title{create folds (in regression) [ detailed information about class_folds in the FeatureSelection package ]} 6 | \usage{ 7 | regr_folds(folds, RESP) 8 | } 9 | \description{ 10 | this function creates both stratified and non-stratified folds in regression 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/FUN_kernels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{FUN_kernels} 4 | \alias{FUN_kernels} 5 | \title{performs kernel smoothing using a bandwidth. Besides using a kernel there is also the option to combine kernels} 6 | \usage{ 7 | FUN_kernels(kernel, W, h) 8 | } 9 | \description{ 10 | performs kernel smoothing using a bandwidth. 
Besides using a kernel there is also the option to combine kernels 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/switch.ops.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{switch.ops} 4 | \alias{switch.ops} 5 | \title{Arithmetic operations on lists} 6 | \usage{ 7 | switch.ops(LST, MODE = "ADD") 8 | } 9 | \description{ 10 | Arithmetic operations on lists 11 | } 12 | \references{ 13 | https://www.cs.toronto.edu/~duvenaud/cookbook/ 14 | 15 | https://raw.githubusercontent.com/duvenaud/phd-thesis/master/kernels.pdf 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/func_categorical_preds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_categorical_preds} 4 | \alias{func_categorical_preds} 5 | \title{OPTION to convert categorical features TO either numeric [ if levels more than 32] OR to dummy variables [ if levels less than 32 ]} 6 | \usage{ 7 | func_categorical_preds(prepr_categ) 8 | } 9 | \description{ 10 | OPTION to convert categorical features TO either numeric [ if levels more than 32] OR to dummy variables [ if levels less than 32 ] 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | knn_index_dist_rcpp <- function(MATRIX, TEST_DATA, k, method, threads, p, eps = 1.0e-6) { 5 | .Call(`_KernelKnn_knn_index_dist_rcpp`, MATRIX, TEST_DATA, k, method, threads, p, eps) 6 | } 7 | 8 | DIST_MATRIX_knn <- function(DIST_MAT, TEST_IDX = NULL, is_min = TRUE, k = 5L, threads = 1L, rcpp_list_names = FALSE) { 9 | .Call(`_KernelKnn_DIST_MATRIX_knn`, DIST_MAT, TEST_IDX, is_min, k, threads, rcpp_list_names) 10 | } 11 | 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report or feature request 3 | about: Describe a bug you've encountered or make a case for a new feature 4 | --- 5 | 6 | Please briefly describe your problem and what output you expect. 
If you have a question, please state it as clearly as possible. 7 | 8 | Please include a minimal reproducible example 9 | 10 | Please give a brief description of the problem 11 | 12 | Please add your Operating System (e.g., Windows10, Macintosh, Linux) and the R version that you use (e.g., 4.0.2) 13 | -------------------------------------------------------------------------------- /man/FUNCTION_weights.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{FUNCTION_weights} 4 | \alias{FUNCTION_weights} 5 | \title{this function is used as a kernel-function-identifier [ takes the distances and a weights-kernel (in form of a function) and returns weights ]} 6 | \usage{ 7 | FUNCTION_weights(W_dist_matrix, weights_function, eps = 1e-06) 8 | } 9 | \description{ 10 | this function is used as a kernel-function-identifier [ takes the distances and a weights-kernel (in form of a function) and returns weights ] 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /.github/workflows/issue.yml: -------------------------------------------------------------------------------- 1 | # For more info see: https://github.com/Renato66/auto-label 2 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret 3 | 4 | name: Labeling new issue 5 | on: 6 | issues: 7 | types: ['opened'] 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: Renato66/auto-label@v3 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | ignore-comments: true 16 | labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help","how can i"]}' 17 | labels-not-allowed: '["documentation","duplicate","good first issue","help wanted","invalid"]' 18 | default-labels: '["triage"]' 19 | -------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | #include <R.h> 2 | #include <Rinternals.h> 3 | #include <stdlib.h> // for NULL 4 | #include <R_ext/Rdynload.h> 5 | 6 | /* FIXME: 7 | Check these declarations against the C/Fortran source code.
8 | */ 9 | 10 | /* .Call calls */ 11 | extern SEXP _KernelKnn_DIST_MATRIX_knn(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 12 | extern SEXP _KernelKnn_knn_index_dist_rcpp(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 13 | 14 | static const R_CallMethodDef CallEntries[] = { 15 | {"_KernelKnn_DIST_MATRIX_knn", (DL_FUNC) &_KernelKnn_DIST_MATRIX_knn, 6}, 16 | {"_KernelKnn_knn_index_dist_rcpp", (DL_FUNC) &_KernelKnn_knn_index_dist_rcpp, 7}, 17 | {NULL, NULL, 0} 18 | }; 19 | 20 | void R_init_KernelKnn(DllInfo *dll) 21 | { 22 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 23 | R_useDynamicSymbols(dll, FALSE); 24 | } 25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Monitor Docker dependencies 4 | - package-ecosystem: "docker" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | open-pull-requests-limit: 3 9 | reviewers: 10 | - "mlampros" 11 | assignees: 12 | - "mlampros" 13 | labels: 14 | - "dependencies" 15 | - "docker" 16 | commit-message: 17 | prefix: "docker" 18 | include: "scope" 19 | 20 | # Monitor GitHub Actions (if you have any workflows) 21 | - package-ecosystem: "github-actions" 22 | directory: "/" 23 | schedule: 24 | interval: "monthly" 25 | open-pull-requests-limit: 5 26 | reviewers: 27 | - "mlampros" 28 | assignees: 29 | - "mlampros" 30 | labels: 31 | - "dependencies" 32 | - "github-actions" 33 | commit-message: 34 | prefix: "ci" 35 | include: "scope" 36 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: KernelKnn 2 | Type: Package 3 | Title: Kernel k Nearest Neighbors 4 | Version: 1.1.6 5 | Date: 2025-09-14 6 | Authors@R: c( person(given = "Lampros", family = "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-8024-1546")), person(given = "Matthew", family = "Parks", email = "parksmatthewm@gmail.com", role = "ctb", comment = "Github Contributor") ) 7 | BugReports: https://github.com/mlampros/KernelKnn/issues 8 | URL: https://github.com/mlampros/KernelKnn, https://mlampros.github.io/KernelKnn/ 9 | Description: Extends the simple k-nearest neighbors algorithm by incorporating numerous kernel functions and a variety of distance metrics. The package takes advantage of 'RcppArmadillo' to speed up the calculation of distances between observations. 
10 | License: MIT + file LICENSE 11 | LazyData: TRUE 12 | Encoding: UTF-8 13 | Depends: R(>= 2.10.0) 14 | Imports: Rcpp (>= 0.12.5) 15 | LinkingTo: Rcpp, RcppArmadillo 16 | Suggests: 17 | testthat, 18 | covr, 19 | knitr, 20 | rmarkdown 21 | VignetteBuilder: knitr 22 | RoxygenNote: 7.3.2 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/rstudio:devel 2 | LABEL maintainer='Lampros Mouselimis' 3 | 4 | RUN export DEBIAN_FRONTEND=noninteractive; apt-get -y update && \ 5 | apt-get install -y git-core pandoc libssl-dev libcurl4-openssl-dev && \ 6 | apt-get install -y sudo && \ 7 | apt-get install -y libarmadillo-dev && \ 8 | apt-get install -y libblas-dev && \ 9 | apt-get install -y liblapack-dev && \ 10 | apt-get install -y libarpack++2-dev && \ 11 | apt-get install -y gfortran && \ 12 | apt-get install -y libxml2-dev && \ 13 | apt-get install -y libssh2-1-dev && \ 14 | apt-get install -y zlib1g-dev && \ 15 | R -e "install.packages('devtools', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \ 16 | R -e "install.packages(c( 'Rcpp', 'RcppArmadillo', 'testthat', 'covr', 'knitr', 'rmarkdown', 'remotes' ), repos = 'https://cloud.r-project.org/' )" && \ 17 | apt-get autoremove -y && \ 18 | apt-get clean && \ 19 | rm -rf /var/lib/apt/lists/* 20 | 21 | ADD http://www.random.org/strings/?num=10&len=8&digits=on&upperalpha=on&loweralpha=on&unique=on&format=plain&rnd=new uuid 22 | ARG BUILD_DATE 23 | 24 | RUN echo "$BUILD_DATE" 25 | RUN R -e "remotes::install_github('mlampros/KernelKnn', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" 26 | 27 | ENV USER=rstudio 28 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | permissions: 24 | contents: write 25 | steps: 26 | - uses: actions/checkout@v5 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | with: 36 | extra-packages: any::pkgdown, local::. 
37 | needs: website 38 | 39 | - name: Build site 40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 41 | shell: Rscript {0} 42 | 43 | - name: Deploy to GitHub pages 🚀 44 | if: github.event_name != 'pull_request' 45 | uses: JamesIves/github-pages-deploy-action@v4.7.3 46 | with: 47 | clean: false 48 | branch: gh-pages 49 | folder: docs 50 | -------------------------------------------------------------------------------- /.github/workflows/stale-actions.yml: -------------------------------------------------------------------------------- 1 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret 2 | 3 | name: "Mark or close stale issues and PRs" 4 | 5 | on: 6 | schedule: 7 | - cron: "00 * * * *" 8 | 9 | jobs: 10 | stale: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/stale@v10 14 | with: 15 | repo-token: ${{ secrets.GITHUB_TOKEN }} 16 | days-before-stale: 12 17 | days-before-close: 7 18 | stale-issue-message: "This is Robo-lampros because the Human-lampros is lazy. This issue has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs. Feel free to re-open a closed issue and the Human-lampros will respond." 19 | stale-pr-message: "This is Robo-lampros because the Human-lampros is lazy. This PR has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs." 20 | close-issue-message: "This issue was automatically closed because of being stale. Feel free to re-open a closed issue and the Human-lampros will respond." 21 | close-pr-message: "This PR was automatically closed because of being stale." 22 | stale-pr-label: "stale" 23 | stale-issue-label: "stale" 24 | exempt-issue-labels: "bug,enhancement,pinned,security,pending,work_in_progress" 25 | exempt-pr-labels: "bug,enhancement,pinned,security,pending,work_in_progress" 26 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macos-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | R_KEEP_PKG_SOURCE: yes 31 | 32 | steps: 33 | - uses: actions/checkout@v5 34 | 35 | - uses: r-lib/actions/setup-pandoc@v2 36 | 37 | - uses: r-lib/actions/setup-r@v2 38 | with: 39 | r-version: ${{ matrix.config.r }} 40 | http-user-agent: ${{ matrix.config.http-user-agent }} 41 | use-public-rspm: true 42 | 43 | - uses: r-lib/actions/setup-r-dependencies@v2 44 | with: 45 | extra-packages: any::rcmdcheck 46 | needs: check 47 | 48 | - uses: r-lib/actions/check-r-package@v2 49 | with: 50 | upload-snapshots: true 51 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 52 | -------------------------------------------------------------------------------- /tests/testthat/helper-function_for_tests.R: -------------------------------------------------------------------------------- 1 | #=================================================================================================================================== 2 | 3 | # REGRESSION 4 | #----------- 5 | 6 | data(Boston, package = 'KernelKnn') 7 | 8 | X = Boston[, -dim(Boston)[2]] 9 | xtr = X[1:350, ] 10 | xte = X[351:nrow(X), ] 11 | y1 = Boston[1:350, dim(Boston)[2]] 12 | y = Boston[, dim(Boston)[2]] 13 | 14 | 15 | # CLASSIFICATION 16 | #--------------- 17 | 18 | data(ionosphere, package = 'KernelKnn') 19 | 20 | singular_mat = ionosphere 21 | ionosphere1 = ionosphere[, -2] 22 | ionosphere = ionosphere[, -c(2, ncol(ionosphere))] # remove second column which has a single unique value 23 | xtr_class = ionosphere[1:200, ] 24 | xte_class = ionosphere[201:nrow(ionosphere), ] 25 | 26 | X_class = ionosphere1[, -dim(ionosphere1)[2]] 27 | xtr_class = X_class[1:200, ] 28 | xte_class = X_class[201:nrow(X_class), ] 29 | y1_class = ionosphere1[1:200, dim(ionosphere1)[2]] 30 | y1_class_ext = ionosphere1[, dim(ionosphere1)[2]] 31 | 32 | 33 | # DISTANCE MATRIX (REGRESSION) 34 | 35 | DIST_obj = stats::dist(X, method = "euclidean") 36 | DIST_mat = as.matrix(DIST_obj) 37 | 38 | DIST_obj_class = stats::dist(ionosphere, method = "euclidean") 39 | DIST_mat_class = as.matrix(DIST_obj_class) 40 | 41 | 42 | # utility function ( lappend ) 43 | #----------------------------- 44 | 45 | lappend <- function(lst, ...){ # lappend() function to append vector to list 46 | 47 | lst <- c(lst, list(...)) 48 | 49 | return(lst) 50 | } 51 | 52 | #=================================================================================================================================== -------------------------------------------------------------------------------- /man/Boston.Rd: -------------------------------------------------------------------------------- 1 | \name{Boston} 2 | \alias{Boston} 3 | \docType{data} 4 | \title{ 5 | Boston Housing Data (Regression) 6 | } 7 | \description{ 8 | housing values in suburbs of Boston 9 | } 10 | \usage{data(Boston)} 11 | \format{ 12 | A data frame with 506 Instances and 
14 attributes (including the class attribute, "medv") 13 | \describe{ 14 | \item{\code{crim}}{per capita crime rate by town} 15 | \item{\code{zn}}{proportion of residential land zoned for lots over 25,000 sq.ft.} 16 | \item{\code{indus}}{proportion of non-retail business acres per town} 17 | \item{\code{chas}}{Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)} 18 | \item{\code{nox}}{nitric oxides concentration (parts per 10 million)} 19 | \item{\code{rm}}{average number of rooms per dwelling} 20 | \item{\code{age}}{proportion of owner-occupied units built prior to 1940} 21 | \item{\code{dis}}{weighted distances to five Boston employment centres} 22 | \item{\code{rad}}{index of accessibility to radial highways} 23 | \item{\code{tax}}{full-value property-tax rate per $10,000} 24 | \item{\code{ptratio}}{pupil-teacher ratio by town} 25 | \item{\code{black}}{1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town} 26 | \item{\code{lstat}}{percentage of lower status of the population} 27 | \item{\code{medv}}{Median value of owner-occupied homes in $1000's} 28 | } 29 | } 30 | \source{ 31 | This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University. 32 | 33 | Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. 34 | } 35 | \references{ 36 | https://archive.ics.uci.edu/ml/datasets/Housing 37 | } 38 | \examples{ 39 | 40 | data(Boston) 41 | 42 | X = Boston[, -ncol(Boston)] 43 | 44 | y = Boston[, ncol(Boston)] 45 | } 46 | \keyword{datasets} 47 | -------------------------------------------------------------------------------- /man/distMat.knn.index.dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dist_knn_index_dist.R 3 | \name{distMat.knn.index.dist} 4 | \alias{distMat.knn.index.dist} 5 | \title{indices and distances of k-nearest-neighbors using a distance matrix} 6 | \usage{ 7 | distMat.knn.index.dist( 8 | DIST_mat, 9 | TEST_indices = NULL, 10 | k = 5, 11 | threads = 1, 12 | minimize = T 13 | ) 14 | } 15 | \arguments{ 16 | \item{DIST_mat}{a distance matrix (square matrix) having a diagonal filled with either zero's (\emph{0}) or NA's (\emph{missing values})} 17 | 18 | \item{TEST_indices}{a numeric vector specifying the indices of the test data in the distance matrix (row-wise or column-wise). If the parameter equals NULL then no test data is included in the distance matrix} 19 | 20 | \item{k}{an integer specifying the k-nearest-neighbors} 21 | 22 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 23 | 24 | \item{minimize}{either TRUE or FALSE. If TRUE then lower values will be considered as relevant for the k-nearest search, otherwise higher values.} 25 | } 26 | \value{ 27 | a list of length 2. The first sublist returns the indices and the second the distances of the k nearest neighbors for each observation. 28 | If TEST_indices is NULL the number of rows of each sublist equals the number of rows in the DIST_mat data. If TEST_indices is not NULL the number of rows of each sublist equals the length of the input TEST_indices. 29 | } 30 | \description{ 31 | indices and distances of k-nearest-neighbors using a distance matrix 32 | } 33 | \details{ 34 | This function takes a number of arguments and it returns the indices and distances of the k-nearest-neighbors for each observation.
If TEST_indices is NULL then the indices-distances for the DIST_mat will be returned, whereas if TEST_indices is not NULL then the indices-distances for the test data only will be returned. 35 | } 36 | \examples{ 37 | 38 | data(Boston) 39 | 40 | X = Boston[, -ncol(Boston)] 41 | 42 | dist_obj = dist(X) 43 | 44 | dist_mat = as.matrix(dist_obj) 45 | 46 | out = distMat.knn.index.dist(dist_mat, TEST_indices = NULL, k = 5) 47 | 48 | } 49 | \author{ 50 | Lampros Mouselimis 51 | } 52 | -------------------------------------------------------------------------------- /man/knn.index.dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/knn_index_dist.R 3 | \name{knn.index.dist} 4 | \alias{knn.index.dist} 5 | \title{indices and distances of k-nearest-neighbors} 6 | \usage{ 7 | knn.index.dist( 8 | data, 9 | TEST_data = NULL, 10 | k = 5, 11 | method = "euclidean", 12 | transf_categ_cols = F, 13 | threads = 1, 14 | p = k 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data.frame or matrix} 19 | 20 | \item{TEST_data}{a data.frame or matrix (it can be also NULL)} 21 | 22 | \item{k}{an integer specifying the k-nearest-neighbors} 23 | 24 | \item{method}{a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'} 25 | 26 | \item{transf_categ_cols}{a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables} 27 | 28 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 29 | 30 | \item{p}{a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k'} 31 | } 32 | \value{ 33 | a list of length 2. The first sublist returns the indices and the second the distances of the k nearest neighbors for each observation. 34 | If TEST_data is NULL the number of rows of each sublist equals the number of rows in the train data. If TEST_data is not NULL the number of rows of each sublist equals the number of rows in the TEST data. 35 | } 36 | \description{ 37 | This function returns the k nearest indices and distances of each observation 38 | } 39 | \details{ 40 | This function takes a number of arguments and it returns the indices and distances of the k-nearest-neighbors for each observation. If TEST_data is NULL then the indices-distances for the train data will be returned, whereas if TEST_data is not NULL then the indices-distances for the TEST_data will be returned.
41 | } 42 | \examples{ 43 | 44 | data(Boston) 45 | 46 | X = Boston[, -ncol(Boston)] 47 | 48 | out = knn.index.dist(X, TEST_data = NULL, k = 4, method = 'euclidean', threads = 1) 49 | 50 | } 51 | \author{ 52 | Lampros Mouselimis 53 | } 54 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include <RcppArmadillo.h> 5 | #include <Rcpp.h> 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // knn_index_dist_rcpp 15 | Rcpp::List knn_index_dist_rcpp(arma::mat& MATRIX, arma::mat& TEST_DATA, int k, std::string& method, int threads, double p, double eps); 16 | RcppExport SEXP _KernelKnn_knn_index_dist_rcpp(SEXP MATRIXSEXP, SEXP TEST_DATASEXP, SEXP kSEXP, SEXP methodSEXP, SEXP threadsSEXP, SEXP pSEXP, SEXP epsSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< arma::mat& >::type MATRIX(MATRIXSEXP); 21 | Rcpp::traits::input_parameter< arma::mat& >::type TEST_DATA(TEST_DATASEXP); 22 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 23 | Rcpp::traits::input_parameter< std::string& >::type method(methodSEXP); 24 | Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); 25 | Rcpp::traits::input_parameter< double >::type p(pSEXP); 26 | Rcpp::traits::input_parameter< double >::type eps(epsSEXP); 27 | rcpp_result_gen = Rcpp::wrap(knn_index_dist_rcpp(MATRIX, TEST_DATA, k, method, threads, p, eps)); 28 | return rcpp_result_gen; 29 | END_RCPP 30 | } 31 | // DIST_MATRIX_knn 32 | Rcpp::List DIST_MATRIX_knn(arma::mat& DIST_MAT, Rcpp::Nullable<Rcpp::NumericVector> TEST_IDX, bool is_min, int k, int threads, bool rcpp_list_names); 33 | RcppExport SEXP _KernelKnn_DIST_MATRIX_knn(SEXP DIST_MATSEXP, SEXP TEST_IDXSEXP, SEXP is_minSEXP, SEXP kSEXP, SEXP threadsSEXP, SEXP rcpp_list_namesSEXP) { 34 | BEGIN_RCPP 35 | Rcpp::RObject rcpp_result_gen; 36 | Rcpp::RNGScope rcpp_rngScope_gen; 37 | Rcpp::traits::input_parameter< arma::mat& >::type DIST_MAT(DIST_MATSEXP); 38 | Rcpp::traits::input_parameter< Rcpp::Nullable<Rcpp::NumericVector> >::type TEST_IDX(TEST_IDXSEXP); 39 | Rcpp::traits::input_parameter< bool >::type is_min(is_minSEXP); 40 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 41 | Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); 42 | Rcpp::traits::input_parameter< bool >::type rcpp_list_names(rcpp_list_namesSEXP); 43 | rcpp_result_gen = Rcpp::wrap(DIST_MATRIX_knn(DIST_MAT, TEST_IDX, is_min, k, threads, rcpp_list_names)); 44 | return rcpp_result_gen; 45 | END_RCPP 46 | } 47 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | 2 | #.......................................................................................................... 3 | # build, push and cache the docker image.
I have to adjust the following in case of a different repository: 4 | # - I have to add the 'BUILD_DATE' arg in the Dockerfile 5 | # - I have to create a DOCKER_PASSWORD (use the docker token) in the 'Settings' tab of the repository 6 | # - This github action also updates the dockerhub readme file 7 | # References: 8 | # - https://github.com/mlampros/IceSat2R/blob/master/.github/workflows/docker_image.yml 9 | # - https://github.com/orgs/community/discussions/25768#discussioncomment-3249184 10 | #.......................................................................................................... 11 | 12 | on: 13 | push: 14 | branches: [main, master] 15 | pull_request: 16 | branches: [main, master] 17 | 18 | name: docker_img 19 | 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | - name: Check Out Repo 26 | uses: actions/checkout@v5 27 | 28 | - id: string 29 | uses: ASzc/change-string-case-action@v6 30 | with: 31 | string: ${{ github.event.repository.name }} 32 | 33 | - name: Login to Docker Hub 34 | uses: docker/login-action@v3 35 | with: 36 | username: ${{ github.repository_owner }} 37 | password: ${{ secrets.DOCKER_PASSWORD }} 38 | 39 | - name: Set up Docker Buildx 40 | id: buildx 41 | uses: docker/setup-buildx-action@v3 42 | 43 | - name: Build and push 44 | uses: docker/build-push-action@v6 45 | with: 46 | context: ./ 47 | build-args: BUILD_DATE="$(date -u +'%Y-%m-%dT%H:%M:%SZ')" 48 | file: ./Dockerfile 49 | builder: ${{ steps.buildx.outputs.name }} 50 | push: true 51 | tags: ${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:rstudiodev 52 | cache-from: type=registry,ref=${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:buildcache 53 | cache-to: type=registry,ref=${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:buildcache,mode=max 54 | 55 | - name: Update Docker Hub Description 56 | uses: peter-evans/dockerhub-description@v4 57 | with: 58 | username: ${{ github.repository_owner }} 59 | password: ${{ secrets.DOCKER_PASSWORD }} 60 | repository: ${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }} 61 | short-description: ${{ github.event.repository.description }} 62 | readme-filepath: ./README.md 63 | -------------------------------------------------------------------------------- /man/ionosphere.Rd: -------------------------------------------------------------------------------- 1 | \name{ionosphere} 2 | \alias{ionosphere} 3 | \docType{data} 4 | \title{ 5 | Johns Hopkins University Ionosphere database (binary classification) 6 | } 7 | \description{ 8 | This radar data was collected by a system in Goose Bay, Labrador. 9 | This system consists of a phased array of 16 high-frequency 10 | antennas with a total transmitted power on the order of 6.4 kilowatts. See the paper for more details. The targets were free 11 | electrons in the ionosphere. "Good" radar returns are those showing evidence of some type of structure in the ionosphere. 12 | "Bad" returns are those that do not; their signals pass through the ionosphere. Received signals were processed using an autocorrelation 13 | function whose arguments are the time of a pulse and the pulse number. There were 17 pulse numbers for the Goose Bay system. 14 | Instances in this database are described by 2 attributes per pulse number, corresponding to the complex values returned by the 15 | function resulting from the complex electromagnetic signal.
16 | } 17 | \usage{data(ionosphere)} 18 | \format{ 19 | A data frame with 351 Instances and 35 attributes (including the class attribute, "class") 20 | } 21 | \details{ 22 | Sigillito, V. G., Wing, S. P., Hutton, L. V., Baker, K. B. (1989). Classification of radar returns from the ionosphere using neural networks. 23 | Johns Hopkins APL Technical Digest, 10, 262-266. 24 | 25 | They investigated using backprop and the perceptron training algorithm on this database. Using the first 200 instances for training, which 26 | were carefully split almost 50 percent positive and 50 percent negative, they found that a "linear" perceptron attained 90.7 percent, 27 | a "non-linear" perceptron attained 92 percent, and backprop an average of over 96 percent accuracy on the remaining 150 test instances, 28 | consisting of 123 "good" and only 24 "bad" instances. (There was a counting error or some mistake somewhere; there are a total of 351 rather 29 | than 350 instances in this domain.) Accuracy on "good" instances was much higher than for "bad" instances. Backprop was tested with several 30 | different numbers of hidden units (in [0,15]) and incremental results were also reported (corresponding to how well the different variants of 31 | backprop did after a periodic number of epochs). David Aha (aha@ics.uci.edu) briefly investigated this database. He found that nearest neighbor 32 | attains an accuracy of 92.1 percent, that Ross Quinlan's C4 algorithm attains 94.0 percent (no windowing), and that IB3 (Aha & Kibler, IJCAI-1989) 33 | attained 96.7 percent (parameter settings: 70 percent and 80 percent for acceptance and dropping respectively). 34 | } 35 | \source{ 36 | Donor: Vince Sigillito (vgs@aplcen.apl.jhu.edu) 37 | 38 | Date: 1989 39 | 40 | Source: Space Physics Group 41 | 42 | Applied Physics Laboratory 43 | 44 | Johns Hopkins University 45 | 46 | Johns Hopkins Road 47 | 48 | Laurel, MD 20723 49 | } 50 | \references{ 51 | https://archive.ics.uci.edu/ml/datasets/Ionosphere 52 | } 53 | \examples{ 54 | 55 | data(ionosphere) 56 | 57 | X = ionosphere[, -ncol(ionosphere)] 58 | 59 | y = ionosphere[, ncol(ionosphere)] 60 | } 61 | \keyword{datasets} 62 | -------------------------------------------------------------------------------- /man/KernelKnn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kernelknn.R 3 | \name{KernelKnn} 4 | \alias{KernelKnn} 5 | \title{kernel k-nearest-neighbors} 6 | \usage{ 7 | KernelKnn( 8 | data, 9 | TEST_data = NULL, 10 | y, 11 | k = 5, 12 | h = 1, 13 | method = "euclidean", 14 | weights_function = NULL, 15 | regression = F, 16 | transf_categ_cols = F, 17 | threads = 1, 18 | extrema = F, 19 | Levels = NULL, 20 | p = k 21 | ) 22 | } 23 | \arguments{ 24 | \item{data}{a data frame or matrix} 25 | 26 | \item{TEST_data}{a data frame or matrix (it can be also NULL)} 27 | 28 | \item{y}{a numeric vector (in classification the labels must be numeric from 1:Inf)} 29 | 30 | \item{k}{an integer specifying the k-nearest-neighbors} 31 | 32 | \item{h}{the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0)} 33 | 34 | \item{method}{a string specifying the method. 
Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'} 35 | 36 | \item{weights_function}{there are various ways of specifying the kernel function. See the details section.} 37 | 38 | \item{regression}{a boolean (TRUE,FALSE) specifying if regression or classification should be performed} 39 | 40 | \item{transf_categ_cols}{a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables} 41 | 42 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 43 | 44 | \item{extrema}{if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal)} 45 | 46 | \item{Levels}{a numeric vector. In case of classification the unique levels of the response variable are necessary} 47 | 48 | \item{p}{a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k'} 49 | } 50 | \value{ 51 | a vector (if regression is TRUE), or a data frame with class probabilities (if regression is FALSE) 52 | } 53 | \description{ 54 | This function utilizes kernel k nearest neighbors to predict new observations 55 | } 56 | \details{ 57 | This function takes a number of arguments and it returns the predicted values. If TEST_data is NULL then the predictions for the train data will be returned, whereas if TEST_data is not NULL then the predictions for the TEST_data will be returned. 58 | There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 3rd option : a user defined kernel function 59 | } 60 | \examples{ 61 | 62 | data(Boston) 63 | 64 | X = Boston[, -ncol(Boston)] 65 | y = Boston[, ncol(Boston)] 66 | 67 | out = KernelKnn(X, TEST_data = NULL, y, k = 5, method = 'euclidean', regression = TRUE) 68 | 69 | } 70 | \author{ 71 | Lampros Mouselimis 72 | } 73 | -------------------------------------------------------------------------------- /R/dist_knn_index_dist.R: -------------------------------------------------------------------------------- 1 | 2 | #' indices and distances of k-nearest-neighbors using a distance matrix 3 | #' 4 | #' @param DIST_mat a distance matrix (square matrix) having a diagonal filled with either zero's (\emph{0}) or NA's (\emph{missing values}) 5 | #' @param TEST_indices a numeric vector specifying the indices of the test data in the distance matrix (row-wise or column-wise). If the parameter equals NULL then no test data is included in the distance matrix 6 | #' @param k an integer specifying the k-nearest-neighbors 7 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 8 | #' @param minimize either TRUE or FALSE. 
If TRUE then lower values will be considered as relevant for the k-nearest search, otherwise higher values. 9 | #' @return a list of length 2. The first sublist returns the indices and the second the distances of the k nearest neighbors for each observation. 10 | #' If TEST_indices is NULL the number of rows of each sublist equals the number of rows in the DIST_mat data. If TEST_indices is not NULL the number of rows of each sublist equals the length of the input TEST_indices. 11 | #' @author Lampros Mouselimis 12 | #' @details 13 | #' This function takes a number of arguments and it returns the indices and distances of the k-nearest-neighbors for each observation. If TEST_indices is NULL then the indices-distances for the DIST_mat will be returned, whereas if TEST_indices is not NULL then the indices-distances for the test data only will be returned. 14 | #' @export 15 | #' @examples 16 | #' 17 | #' data(Boston) 18 | #' 19 | #' X = Boston[, -ncol(Boston)] 20 | #' 21 | #' dist_obj = dist(X) 22 | #' 23 | #' dist_mat = as.matrix(dist_obj) 24 | #' 25 | #' out = distMat.knn.index.dist(dist_mat, TEST_indices = NULL, k = 5) 26 | #' 27 | 28 | 29 | distMat.knn.index.dist = function(DIST_mat, TEST_indices = NULL, k = 5, threads = 1, minimize = T) { 30 | 31 | if (!is.matrix(DIST_mat)) stop("the 'DIST_mat' parameter should be of type matrix") 32 | if (nrow(DIST_mat) != ncol(DIST_mat)) stop("the input 'DIST_mat' should be a square matrix with number of rows equal to number of columns") 33 | DIAG = diag(DIST_mat) 34 | nas = all(is.na(DIAG)) 35 | if (nas) { 36 | diag(DIST_mat) = 0 } # set diagonal to 0.0 if equal to NA 37 | else { 38 | if (sum(DIAG) != 0) { 39 | stop("the diagonal of the distance matrix must be a vector of zeros or NA's") 40 | } 41 | } 42 | if (!is.null(TEST_indices)) { 43 | if (!inherits(TEST_indices, c("numeric", "integer"))) stop("the 'TEST_indices' parameter should be a numeric vector") 44 | if (max(TEST_indices) > nrow(DIST_mat)) stop('the maximum number of the TEST_indices is greater than the rows of the input distance matrix') 45 | tr_idx = 1:nrow(DIST_mat) 46 | tr_idx = tr_idx[-TEST_indices] 47 | if (!(min(TEST_indices) > max(tr_idx))) stop("The minimum index of the 'TEST_indices' parameter must be greater than the maximum index of the 'DIST_mat' train data! Make sure that the 'TEST_indices' consist of the last indices of the 'DIST_mat' parameter!") 48 | } 49 | if (!is.numeric(k) || is.null(k) || (k >= nrow(DIST_mat)) || k < 1) stop('k must be of type integer, greater than 0 and less than nrow(DIST_mat)') 50 | if (abs(k - round(k)) > 0) { 51 | k = round(k) 52 | warning('k is float and will be rounded to : ', call.
= F, expr = k)} 53 | if (any(is.na(DIST_mat))) stop('the DIST_mat includes missing values') 54 | if (!inherits(minimize, "logical")) stop("the 'minimize' parameter should be either TRUE or FALSE") 55 | 56 | res = DIST_MATRIX_knn(DIST_mat, TEST_indices, minimize, k, threads, T) 57 | 58 | return(res) 59 | } 60 | 61 | 62 | -------------------------------------------------------------------------------- /man/distMat.KernelKnn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dist_kernelknn.R 3 | \name{distMat.KernelKnn} 4 | \alias{distMat.KernelKnn} 5 | \title{kernel k-nearest-neighbors using a distance matrix} 6 | \usage{ 7 | distMat.KernelKnn( 8 | DIST_mat, 9 | TEST_indices = NULL, 10 | y, 11 | k = 5, 12 | h = 1, 13 | weights_function = NULL, 14 | regression = F, 15 | threads = 1, 16 | extrema = F, 17 | Levels = NULL, 18 | minimize = T 19 | ) 20 | } 21 | \arguments{ 22 | \item{DIST_mat}{a distance matrix (square matrix) having a \emph{diagonal} filled with either zero's (\emph{0}) or NA's (\emph{missing values})} 23 | 24 | \item{TEST_indices}{a numeric vector specifying the indices of the test data in the distance matrix (row-wise or column-wise). If the parameter equals NULL then no test data is included in the distance matrix} 25 | 26 | \item{y}{a numeric vector (in classification the labels must be numeric from 1:Inf). It is assumed that if the \emph{TEST_indices} is not NULL then the length of \emph{y} equals to the rows of the train data \emph{( nrow(DIST_mat) - length(TEST_indices) )}, otherwise \emph{length(y) == nrow(DIST_mat)}.} 27 | 28 | \item{k}{an integer specifying the k-nearest-neighbors} 29 | 30 | \item{h}{the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0)} 31 | 32 | \item{weights_function}{there are various ways of specifying the kernel function. See the details section.} 33 | 34 | \item{regression}{a boolean (TRUE,FALSE) specifying if regression or classification should be performed} 35 | 36 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 37 | 38 | \item{extrema}{if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal)} 39 | 40 | \item{Levels}{a numeric vector. In case of classification the unique levels of the response variable are necessary} 41 | 42 | \item{minimize}{either TRUE or FALSE. If TRUE then lower values will be considered as relevant for the k-nearest search, otherwise higher values.} 43 | } 44 | \value{ 45 | a vector (if regression is TRUE), or a data frame with class probabilities (if regression is FALSE) 46 | } 47 | \description{ 48 | kernel k-nearest-neighbors using a distance matrix 49 | } 50 | \details{ 51 | This function takes a distance matrix (square matrix where the diagonal is filled with \emph{0} or \emph{NA}) as input. If the \emph{TEST_indices} parameter is NULL then the predictions for the train data will be returned, whereas if the \emph{TEST_indices} parameter is not NULL then the predictions for the test data will be returned. 52 | There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. 
The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 3rd option : a user defined kernel function 53 | } 54 | \examples{ 55 | 56 | data(Boston) 57 | 58 | X = Boston[, -ncol(Boston)] 59 | y = Boston[, ncol(Boston)] 60 | 61 | dist_obj = dist(X) 62 | 63 | dist_mat = as.matrix(dist_obj) 64 | 65 | out = distMat.KernelKnn(dist_mat, TEST_indices = NULL, y, k = 5, regression = TRUE) 66 | 67 | } 68 | \author{ 69 | Lampros Mouselimis 70 | } 71 | -------------------------------------------------------------------------------- /man/KernelKnnCV.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kernelknnCV.R 3 | \name{KernelKnnCV} 4 | \alias{KernelKnnCV} 5 | \title{kernel-k-nearest-neighbors using cross-validation} 6 | \usage{ 7 | KernelKnnCV( 8 | data, 9 | y, 10 | k = 5, 11 | folds = 5, 12 | h = 1, 13 | method = "euclidean", 14 | weights_function = NULL, 15 | regression = F, 16 | transf_categ_cols = F, 17 | threads = 1, 18 | extrema = F, 19 | Levels = NULL, 20 | seed_num = 1, 21 | p = k 22 | ) 23 | } 24 | \arguments{ 25 | \item{data}{a data frame or matrix} 26 | 27 | \item{y}{a numeric vector (in classification the labels must be numeric from 1:Inf)} 28 | 29 | \item{k}{an integer specifying the k-nearest-neighbors} 30 | 31 | \item{folds}{the number of cross validation folds (must be greater than 1)} 32 | 33 | \item{h}{the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0)} 34 | 35 | \item{method}{a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'} 36 | 37 | \item{weights_function}{there are various ways of specifying the kernel function. See the details section.} 38 | 39 | \item{regression}{a boolean (TRUE,FALSE) specifying if regression or classification should be performed} 40 | 41 | \item{transf_categ_cols}{a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables} 42 | 43 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 44 | 45 | \item{extrema}{if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal)} 46 | 47 | \item{Levels}{a numeric vector. In case of classification the unique levels of the response variable are necessary} 48 | 49 | \item{seed_num}{a numeric value specifying the seed of the random number generator} 50 | 51 | \item{p}{a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k'} 52 | } 53 | \value{ 54 | a list of length 2. The first sublist is a list of predictions (the length of the list equals the number of the folds). The second sublist is a list with the indices for each fold. 
55 | } 56 | \description{ 57 | This function performs kernel k nearest neighbors regression and classification using cross validation 58 | } 59 | \details{ 60 | This function takes a number of arguments (including the number of cross-validation-folds) and it returns predicted values and indices for each fold. 61 | There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 3rd option : a user defined kernel function 62 | } 63 | \examples{ 64 | 65 | \dontrun{ 66 | data(ionosphere) 67 | 68 | X = ionosphere[, -c(2, ncol(ionosphere))] 69 | y = as.numeric(ionosphere[, ncol(ionosphere)]) 70 | 71 | out = KernelKnnCV(X, y, k = 5, folds = 3, regression = FALSE, Levels = unique(y)) 72 | } 73 | } 74 | \author{ 75 | Lampros Mouselimis 76 | } 77 | -------------------------------------------------------------------------------- /tests/testthat/test-distance_metrics.R: -------------------------------------------------------------------------------- 1 | #============================================================================================================================================================== 2 | 3 | context('Distance metrics rcpp') 4 | 5 | # TRAIN data 6 | 7 | testthat::test_that("in case that the TEST data is an empty matrix (is.null) : the result for each metric is a list, the length of the output is 2 [meaning 2 sublists] and the nrows in total for the two 8 | 9 | sublists [ indices and distances ] are 2 times the TRAIN data", { 10 | 11 | test_dat = matrix(, nrow = 0, ncol = 0) 12 | 13 | lap_regr = lapply(c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 14 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'), function(x) knn_index_dist_rcpp(as.matrix(xtr), test_dat, 5, x, 1, 1, eps = 1.0e-6)) 15 | 16 | lap_class = lapply(c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 17 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'), function(x) knn_index_dist_rcpp(as.matrix(xtr_class), test_dat, 5, x, 1, 1, eps = 1.0e-6)) 18 | 19 | out1 = sum(unlist(lapply(lap_regr, is.list))) == length(lap_regr) && sum(unlist(lapply(lap_class, is.list))) == length(lap_class) # sublists are lists 20 | 21 | out2 = mean(unlist(lapply(lap_regr, length))) == 2 && mean(unlist(lapply(lap_class, length))) == 2 # length of sublists are 2 22 | 23 | out3 = mean(unlist(lapply(lap_regr, function(x) nrow(x$train_knn_dist) + nrow(x$train_knn_idx)))) == nrow(xtr) * 2 && 24 | 25 | mean(unlist(lapply(lap_class, function(x) nrow(x$train_knn_dist) + nrow(x$train_knn_idx)))) == nrow(xtr_class) * 2 # output nrow for idx and distance is nrow(train_data) * 2 [ as I sum both sublists ] 26 | 27 | testthat::expect_true(sum(c(out1, out2, out3)) == 3) 28 | }) 29 | 30 | 31 | 32 | # TEST data 33 | 34 | testthat::test_that("in case that the TEST data is not an 
empty matrix : the result for each metric is a list, the length of the output is 2 [meaning 2 sublists] and the nrows in total for the two 35 | 36 | sublists [ indices and distances ] are 2 times the TEST data", { 37 | 38 | test_dat = matrix(, nrow = 0, ncol = 0) 39 | 40 | lap_regr = lapply(c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 41 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'), function(x) knn_index_dist_rcpp(as.matrix(xtr), as.matrix(xte), 5, x, 1, 1, eps = 1.0e-6)) 42 | 43 | lap_class = lapply(c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 44 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'), function(x) knn_index_dist_rcpp(as.matrix(xtr_class), as.matrix(xte_class), 5, x, 1, 1, eps = 1.0e-6)) 45 | 46 | out1 = sum(unlist(lapply(lap_regr, is.list))) == length(lap_regr) && sum(unlist(lapply(lap_class, is.list))) == length(lap_class) # sublists are lists 47 | 48 | out2 = mean(unlist(lapply(lap_regr, length))) == 2 && mean(unlist(lapply(lap_class, length))) == 2 # length of sublists are 2 49 | 50 | out3 = mean(unlist(lapply(lap_regr, function(x) nrow(x$test_knn_dist) + nrow(x$test_knn_idx)))) == nrow(xte) * 2 && 51 | 52 | mean(unlist(lapply(lap_class, function(x) nrow(x$test_knn_dist) + nrow(x$test_knn_idx)))) == nrow(xte_class) * 2 # output nrow for idx and distance is nrow(train_data) * 2 [ as I sum both sublists ] 53 | 54 | testthat::expect_true(sum(c(out1, out2, out3)) == 3) 55 | }) 56 | 57 | 58 | #============================================================================================================================================================== 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![R-CMD-check](https://github.com/mlampros/KernelKnn/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/mlampros/KernelKnn/actions/workflows/R-CMD-check.yaml) 3 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/KernelKnn)](http://cran.r-project.org/package=KernelKnn) 4 | [![Downloads](http://cranlogs.r-pkg.org/badges/grand-total/KernelKnn?color=blue)](http://www.r-pkg.org/pkg/KernelKnn) 5 | [![](https://img.shields.io/docker/automated/mlampros/kernelknn.svg)](https://hub.docker.com/r/mlampros/kernelknn) 6 | [![status](https://tinyverse.netlify.app/badge/KernelKnn)](https://CRAN.R-project.org/package=KernelKnn) 7 | 8 | 9 | ## KernelKnn 10 |
11 | 12 | The KernelKnn package extends the simple k-nearest neighbors algorithm by incorporating numerous kernel functions and a variety of distance metrics. The package takes advantage of 'RcppArmadillo' to speed up the calculation of distances between observations. More details on the functionality of KernelKnn can be found in the [blog-post](http://mlampros.github.io/2016/07/10/KernelKnn/) and in the package Vignettes ( *scroll down for information on how to use the* **docker image** ). 13 |

14 | 15 | To install the package from CRAN use, 16 | 17 | ```R 18 | 19 | install.packages("KernelKnn") 20 | 21 | 22 | ``` 23 |
24 | 25 | or download the latest version from Github using the *pak* package, 26 | 27 |
28 | 29 | ```R 30 | 31 | pak::pak('mlampros/KernelKnn') 32 | 33 | ``` 34 |
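Once installed, a minimal usage sketch looks as follows (kernel k-nearest-neighbors regression on the *Boston* data that ships with the package; see the Vignettes and the function documentation for the full set of arguments),

```R

library(KernelKnn)

data(Boston, package = 'KernelKnn')

X = scale(Boston[, -ncol(Boston)])     # scale the predictors (the output depends on distances)
y = Boston[, ncol(Boston)]             # numeric response

idx_test = 401:nrow(X)                 # hold out some of the observations

preds = KernelKnn(X[-idx_test, ], TEST_data = X[idx_test, ], y[-idx_test], k = 5,

                  method = 'euclidean', weights_function = 'biweight', regression = TRUE)

```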
35 | 36 | Use the following link to report bugs/issues, 37 |

38 | 39 | [https://github.com/mlampros/KernelKnn/issues](https://github.com/mlampros/KernelKnn/issues) 40 | 41 | 42 |
43 | 44 | **UPDATE 29-11-2019** 45 | 46 |
47 | 48 | **Docker images** of the *KernelKnn* package are available to download from my [dockerhub](https://hub.docker.com/r/mlampros/kernelknn) account. The images come with *Rstudio* and the *R-development* version (latest) installed. The whole process was tested on Ubuntu 18.04. To **pull** & **run** the image do the following, 49 | 50 |
51 | 52 | ```R 53 | 54 | docker pull mlampros/kernelknn:rstudiodev 55 | 56 | docker run -d --name rstudio_dev -e USER=rstudio -e PASSWORD=give_here_your_password --rm -p 8787:8787 mlampros/kernelknn:rstudiodev 57 | 58 | ``` 59 | 60 |
61 | 62 | The user can also **bind** a home directory / folder to the image to use its files by specifying the **-v** option, 63 | 64 |
65 | 66 | ```R 67 | 68 | docker run -d --name rstudio_dev -e USER=rstudio -e PASSWORD=give_here_your_password --rm -p 8787:8787 -v /home/YOUR_DIR:/home/rstudio/YOUR_DIR mlampros/kernelknn:rstudiodev 69 | 70 | 71 | ``` 72 | 73 |
74 | 75 | In the latter case you might first have to grant write permissions for the **YOUR_DIR** directory (this is not always necessary) using, 76 | 77 |
78 | 79 | ```R 80 | 81 | chmod -R 777 /home/YOUR_DIR 82 | 83 | 84 | ``` 85 | 86 |
86 | 87 | 88 | The **USER** defaults to *rstudio* but you have to set a **PASSWORD** of your preference (see [https://rocker-project.org](https://rocker-project.org/) for more information). 89 | 90 |
91 | 92 | Open your web-browser and, depending on where the docker image was *built / run*, give, 93 | 94 |
95 | 96 | **1st. Option** on your personal computer, 97 | 98 |
99 | 100 | ```R 101 | http://0.0.0.0:8787 102 | 103 | ``` 104 | 105 |
106 | 107 | **2nd. Option** on a cloud instance, 108 | 109 |
110 | 111 | ```R 112 | http://Public DNS:8787 113 | 114 | ``` 115 | 116 |
116 | 117 | 118 | to access the Rstudio console, where you can enter your username and password. 119 | 120 |
121 | 122 | ### **Citation:** 123 | 124 | If you use the **KernelKnn** R package in your paper or research, please cite [https://CRAN.R-project.org/package=KernelKnn/citation.html](https://CRAN.R-project.org/package=KernelKnn/citation.html): 125 | 126 |
127 | 128 | ```R 129 | @Manual{, 130 | title = {{KernelKnn}: Kernel k Nearest Neighbors}, 131 | author = {Lampros Mouselimis}, 132 | year = {2025}, 133 | note = {R package version 1.1.6}, 134 | url = {https://CRAN.R-project.org/package=KernelKnn}, 135 | } 136 | ``` 137 | 138 |
139 | 140 | -------------------------------------------------------------------------------- /tests/testthat/test-dist_knn_index_dist.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | context("kernel knn list of distances and indices using a distance matrix") 4 | 5 | 6 | #================= 7 | # Error handling 8 | #================= 9 | 10 | 11 | testthat::test_that("it returns an error if the input distance object is not of type matrix", { 12 | 13 | tmp_df = as.data.frame(DIST_mat) 14 | 15 | testthat::expect_error(distMat.knn.index.dist(tmp_df, TEST_indices = NULL, k = 5, threads = 1, minimize = T)) 16 | }) 17 | 18 | 19 | testthat::test_that("it returns an error if the input distance matrix is not square", { 20 | 21 | testthat::expect_error(distMat.knn.index.dist(DIST_mat[, -ncol(DIST_mat)], TEST_indices = NULL, k = 5, threads = 1, minimize = T)) 22 | }) 23 | 24 | 25 | testthat::test_that("it returns an error if the diagonal of the distance matrix is other than 0's or NA's", { 26 | 27 | TMP_DIAG = DIST_mat 28 | 29 | diag(TMP_DIAG) = -1 30 | 31 | testthat::expect_error(distMat.knn.index.dist(TMP_DIAG, TEST_indices = NULL, k = 5, threads = 1, minimize = T)) 32 | }) 33 | 34 | 35 | testthat::test_that("it returns an error if the TEST_indices parameter is not of type numeric or integer", { 36 | 37 | invalid_tst_idx = letters[1:100] 38 | 39 | testthat::expect_error(distMat.knn.index.dist(DIST_mat, TEST_indices = invalid_tst_idx, k = 5, threads = 1, minimize = T)) 40 | }) 41 | 42 | 43 | testthat::test_that("it returns an error if the maximum index of the TEST_indices parameter is greater than the rows of the distance matrix", { 44 | 45 | invalid_tst_idx = 1:(nrow(DIST_mat) + 10) 46 | 47 | invalid_tst_idx = (nrow(DIST_mat)-100):length(invalid_tst_idx) 48 | 49 | testthat::expect_error(distMat.knn.index.dist(DIST_mat, TEST_indices = invalid_tst_idx, k = 5, threads = 1, minimize = T)) 50 | }) 51 | 52 | testthat::test_that("it returns an error if k is NULL", { 53 | 54 | testthat::expect_error(distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = NULL, threads = 1, minimize = T)) 55 | }) 56 | 57 | 58 | testthat::test_that("it returns an error if k is a character", { 59 | 60 | testthat::expect_error( distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = "invalid", threads = 1, minimize = T) ) 61 | }) 62 | 63 | 64 | testthat::test_that("it returns an error if k is greater or equal to the number of rows of the distance matrix", { 65 | 66 | testthat::expect_error( distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = nrow(DIST_mat), threads = 1, minimize = T) ) 67 | }) 68 | 69 | 70 | testthat::test_that("it returns an error if k is less than 1", { 71 | 72 | testthat::expect_error(distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = -1, threads = 1, minimize = T) ) 73 | }) 74 | 75 | 76 | testthat::test_that("it returns a warning if k is a float", { 77 | 78 | testthat::expect_warning( distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = 1.5, threads = 1, minimize = T) ) 79 | }) 80 | 81 | 82 | testthat::test_that("it returns an error if the minimize parameter is not a boolean", { 83 | 84 | testthat::expect_error( distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = 5, threads = 1, minimize = 'T') ) 85 | }) 86 | 87 | 88 | 89 | # testing of distMat.knn.index.dist 90 | #---------------------------------- 91 | 92 | 93 | testthat::test_that("it returns the correct output if TEST_indices is NULL", { 94 | 95 | res = 
distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = 5, threads = 1, minimize = T) 96 | 97 | testthat::expect_true( inherits(res, "list") && length(res) == 2 && mean(unlist(lapply(res, nrow))) == nrow(DIST_mat)) 98 | }) 99 | 100 | 101 | testthat::test_that("it returns the correct output if TEST_indices is not NULL", { 102 | 103 | idxs = (nrow(DIST_mat) - 100): nrow(DIST_mat) 104 | 105 | res = distMat.knn.index.dist(DIST_mat, TEST_indices = idxs, k = 5, threads = 1, minimize = T) 106 | 107 | testthat::expect_true( inherits(res, "list") && length(res) == 2 && mean(unlist(lapply(res, nrow))) == length(idxs) ) 108 | }) 109 | 110 | 111 | testthat::test_that("it returns the correct output if TEST_indices is NULL and the main diagonal is NA's", { 112 | 113 | tmp_dist = DIST_mat 114 | 115 | diag(tmp_dist) = NA 116 | 117 | res = distMat.knn.index.dist(tmp_dist, TEST_indices = NULL, k = 5, threads = 1, minimize = T) 118 | 119 | testthat::expect_true( inherits(res, "list") && length(res) == 2 && mean(unlist(lapply(res, nrow))) == nrow(tmp_dist)) 120 | }) 121 | 122 | 123 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | ## KernelKnn 1.1.7 3 | 4 | * I removed the `SystemRequirements` from the DESCRIPTION file. 5 | 6 | 7 | ## KernelKnn 1.1.6 8 | 9 | * I updated the `Makevars` and `Makevars.win` files by adding `-DARMA_USE_CURRENT` (see issue: https://github.com/RcppCore/RcppArmadillo/issues/476) 10 | * I removed the `-mthreads` compilation option from the "Makevars.win" file 11 | * I removed the "CXX_STD = CXX11" from the "Makevars" files, and the "[[Rcpp::plugins(cpp11)]]" from the "distance_metrics.cpp" file due to the following NOTE from CRAN, "NOTE Specified C++11: please drop specification unless essential" (see also: https://www.tidyverse.org/blog/2023/03/cran-checks-compiled-code/#note-regarding-systemrequirements-c11) 12 | 13 | 14 | ## KernelKnn 1.1.5 15 | 16 | * I added the order *'p'* of the "minkowski" method as a new parameter to the *'KernelKnn()'*, *'KernelKnnCV()'* and *'knn.index.dist()'* functions. 
It defaults to 'k' (see https://github.com/mlampros/KernelKnn/issues/9) 17 | * I added test cases for the *'KernelKnn()'* and *'knn.index.dist()'* functions 18 | 19 | 20 | ## KernelKnn 1.1.4 21 | 22 | * The pull request 7 fixed a bug in the checking of the Levels argument (see https://github.com/mlampros/KernelKnn/pull/7) 23 | * I fixed an omission of the column names in case of classification in the *KernelKnn()* and *distMat.KernelKnn()* functions (see https://github.com/mlampros/KernelKnn/issues/8) 24 | 25 | 26 | ## KernelKnn 1.1.3 27 | 28 | * I updated the References section of the *switch.ops()* function in the *utils.R* file which explains how the combination of the kernels works 29 | * I added an error case in all functions that make use of the 'Levels' parameter if the 'Levels' do not match the unique 'y' labels 30 | * I removed the *distMat.KernelKnnCV()* function (and the *tests/test-dist_kernelknnCV.R* file) because based on the current implementation of the *distMat.KernelKnn()* function the *TEST_indices* parameter *must* consist of the *last indices* of the input *DIST_mat* distance matrix and this is not the case if we run cross-validation (see [issue 5](https://github.com/mlampros/KernelKnn/issues/5)) 31 | 32 | 33 | ## KernelKnn 1.1.2 34 | 35 | * I've fixed an error in the *CITATION* file 36 | 37 | 38 | ## KernelKnn 1.1.1 39 | 40 | * I've added the *CITATION* file in the *inst* directory 41 | 42 | 43 | ## KernelKnn 1.1.0 44 | 45 | * I fixed the *"failure: the condition has length > 1"* CRAN error which appeared mainly due to the misuse of the base *class()* function in multiple code snippets in the package (for more info on this matter see: https://developer.r-project.org/Blog/public/2019/11/09/when-you-think-class.-think-again/index.html) 46 | 47 | 48 | ## KernelKnn 1.0.9 49 | 50 | I added a test case to check equality of the results between *KernelKnnCV* and *distMat.KernelKnnCV* functions 51 | 52 | 53 | ## KernelKnn 1.0.8 54 | 55 | I added the *DARMA_64BIT_WORD* flag in the Makevars file to allow the package to process big datasets 56 | 57 | 58 | ## KernelKnn 1.0.7 59 | 60 | I modified the *input_dist_mat* function of the *distance_metrics.cpp* file due to a bug. 61 | I modified the *distMat.KernelKnn* function so that it does not return an error if the number of rows of the *DIST_mat* distance matrix is not equal to the length of *y* (added comments in the function documentation). 62 | 63 | 64 | ## KernelKnn 1.0.6 65 | 66 | In this version the following functions/parameters were added: 67 | 68 | * *seed_num* : parameter in *KernelKnnCV* and *distMat.KernelKnnCV* cross-validation functions, which specifies the seed of R's random number generator 69 | * *distMat.KernelKnn* : this function performs kernel k-nearest-neighbor search by using a *distance matrix* as input 70 | * *distMat.knn.index.dist* : this function returns the indices and distances for k-nearest neighbors using a distance matrix 71 | * *distMat.KernelKnnCV* : this function performs cross-validated kernel k-nearest-neighbor search using a distance matrix as input 72 | 73 | I also modified the *OpenMP* clauses of the .cpp file to address the ASAN errors. 74 | 75 | 76 | ## KernelKnn 1.0.5 77 | 78 | I removed *OpenImageR* and *irlba* as package dependencies.
I also added an *init.c* file in the *src* folder due to a change in CRAN submissions for compiled code [ *references* : http://stackoverflow.com/questions/42313373/r-cmd-check-note-found-no-calls-to-r-registerroutines-r-usedynamicsymbols, https://github.com/RcppCore/Rcpp/issues/636 ] 79 | 80 | 81 | ## KernelKnn 1.0.4 82 | 83 | I added a try-catch Rcpp function to make the calculation of singular covariance matrices possible, as suggested in https://github.com/mlampros/KernelKnn/issues/1 84 | 85 | 86 | ## KernelKnn 1.0.3 87 | 88 | Reimplementation of the Rcpp function due to ASAN-memory-errors 89 | 90 | 91 | ## KernelKnn 1.0.2 92 | 93 | I updated the Description file with a URL and a BugReports web-address. 94 | 95 | 96 | ## KernelKnn 1.0.1 97 | 98 | Currently, software platforms like OSX do not support openMP, thus I've made openMP optional for all cpp functions. 99 | 100 | 101 | ## KernelKnn 1.0.0 102 | 103 | -------------------------------------------------------------------------------- /tests/testthat/test-kernelknn_cross_valid.R: -------------------------------------------------------------------------------- 1 | 2 | context("Cross validate kernelknn") 3 | 4 | 5 | # shuffle data 6 | 7 | testthat::test_that("shuffle data takes a vector as input and returns a vector as output", { 8 | 9 | y = c(1:50) 10 | 11 | testthat::expect_true(is.vector(func_shuffle(y, times = 10))) 12 | }) 13 | 14 | testthat::test_that("the length of the input vector equals the length of the output vector", { 15 | 16 | y = c(1:50) 17 | 18 | output = func_shuffle(y, times = 10) 19 | 20 | testthat::expect_true(length(y) == length(output)) 21 | }) 22 | 23 | 24 | # classification folds 25 | 26 | testthat::test_that("throws an error if the RESP is not a factor", { 27 | 28 | y = c(1:10) 29 | 30 | testthat::expect_error(class_folds(5, y), "RESP must be a factor") 31 | }) 32 | 33 | 34 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", { 35 | 36 | y = as.factor(sample(1:5, 100, replace = T)) 37 | 38 | testthat::expect_length(class_folds(5, y), 5) 39 | }) 40 | 41 | # regression folds 42 | 43 | testthat::test_that("throws an error if the RESP is a factor (regr_folds is meant for regression)", { 44 | 45 | y = as.factor(c(1:50)) 46 | 47 | testthat::expect_error(regr_folds(5, y), "this function is meant for regression for classification use the 'class_folds' function") 48 | }) 49 | 50 | 51 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", { 52 | 53 | y = sample(1:5, 100, replace = T) 54 | 55 | testthat::expect_length(regr_folds(5, y), 5) 56 | }) 57 | 58 | 59 | # KernelKnnCV function 60 | 61 | testthat::test_that("it returns an error if y is NULL", { 62 | 63 | testthat::expect_error( KernelKnnCV(X, y = NULL, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 64 | }) 65 | 66 | testthat::test_that("it returns an error if y is not numeric", { 67 | 68 | testthat::expect_error( KernelKnnCV(X, y = list(y), k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 69 | }) 70 | 71 | 72 | testthat::test_that("it returns an error if missing values are present in the data", { 73 | 74 | tmp_dat = X 75 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 76 | 77 | testthat::expect_error( KernelKnnCV(tmp_dat, y = y, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 78 | }) 79 | 80 | 81 | testthat::test_that("it returns an error if missing values are present in the
response variable", { 82 | 83 | tmp_dat = y 84 | tmp_dat[sample(1:length(tmp_dat), 10)] = NA 85 | 86 | testthat::expect_error( KernelKnnCV(X, y = tmp_dat, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 87 | }) 88 | 89 | 90 | testthat::test_that("it returns an error if the length of y is not equal to the number of rows of the train data", { 91 | 92 | testthat::expect_error( KernelKnnCV(X, y[1:(length(y)-10)], k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 93 | }) 94 | 95 | 96 | testthat::test_that("it returns an error if regression = F and there are unique labels less than 1", { 97 | 98 | testthat::expect_error( KernelKnnCV(X_class, as.numeric(y1_class_ext) - 1, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = F) ) 99 | }) 100 | 101 | 102 | testthat::test_that("it returns an error if folds < 2", { 103 | 104 | testthat::expect_error( KernelKnnCV(X_class, as.numeric(y1_class_ext), k = 5, folds = 1, h = 1.0, method = 'euclidean', weights_function = NULL, regression = F, Levels = unique(y1_class_ext)) ) 105 | }) 106 | 107 | 108 | testthat::test_that("it returns an error if regression is not TRUE or FALSE", { 109 | 110 | testthat::expect_error( KernelKnnCV(X_class, as.numeric(y1_class_ext), k = 5, folds = 3, h = 1.0, method = 'euclidean', weights_function = NULL, regression = 'F', Levels = unique(y1_class_ext)) ) 111 | }) 112 | 113 | 114 | testthat::test_that("it returns an error if each fold has less than 5 observations", { 115 | 116 | testthat::expect_error( KernelKnnCV(X, y = y, k = 5, folds = 100, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 117 | }) 118 | 119 | 120 | # test KernelKnnCV functionality 121 | 122 | testthat::test_that("it returns a list of length 2 where the length of the unlisted sublists equal the length of the train data, if REGRESSION = TRUE", { 123 | 124 | res = KernelKnnCV(X, y, k = 5, folds = 3, method = 'euclidean', weights_function = NULL, regression = T, Levels = NULL) 125 | 126 | lap_pr = sum(unlist(lapply(res$preds, length))) 127 | 128 | lap_idx = sum(unlist(lapply(res$folds, length))) 129 | 130 | testthat::expect_true( lap_pr == nrow(X) && lap_idx == nrow(X) ) 131 | }) 132 | 133 | 134 | testthat::test_that("it returns a list of length 2 where the length of the unlisted sublists equal the length of the train data, if REGRESSION = FALSE", { 135 | 136 | res = KernelKnnCV(X_class, as.numeric(y1_class_ext), k = 5, folds = 3, method = 'euclidean', weights_function = NULL, regression = F, Levels = as.numeric(unique(y1_class_ext))) 137 | 138 | lap_pr = sum(unlist(lapply(res$preds, nrow))) 139 | 140 | lap_idx = sum(unlist(lapply(res$folds, length))) 141 | 142 | testthat::expect_true( lap_pr == nrow(X_class) && lap_idx == nrow(X_class) ) 143 | }) 144 | 145 | -------------------------------------------------------------------------------- /R/knn_index_dist.R: -------------------------------------------------------------------------------- 1 | 2 | #' indices and distances of k-nearest-neighbors 3 | #' 4 | #' This function returns the k nearest indices and distances of each observation 5 | #' 6 | #' @param data a data.frame or matrix 7 | #' @param TEST_data a data.frame or matrix (it can be also NULL) 8 | #' @param k an integer specifying the k-nearest-neighbors 9 | #' @param method a string specifying the method. 
Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient' 10 | #' @param transf_categ_cols a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables 11 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 12 | #' @param p a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k' 13 | #' @return a list of length 2. The first sublist returns the indices and the second the distances of the k nearest neighbors for each observation. 14 | #' If TEST_data is NULL the number of rows of each sublist equals the number of rows in the train data. If TEST_data is not NULL the number of rows of each sublist equals the number of rows in the TEST data. 15 | #' @author Lampros Mouselimis 16 | #' @details 17 | #' This function takes a number of arguments and it returns the indices and distances of the k-nearest-neighbors for each observation. If TEST_data is NULL then the indices-distances for the train data will be returned, whereas if TEST_data is not NULL then the indices-distances for the TEST_data will be returned. 18 | #' @export 19 | #' @examples 20 | #' 21 | #' data(Boston) 22 | #' 23 | #' X = Boston[, -ncol(Boston)] 24 | #' 25 | #' out = knn.index.dist(X, TEST_data = NULL, k = 4, method = 'euclidean', threads = 1) 26 | #' 27 | 28 | 29 | knn.index.dist = function(data, TEST_data = NULL, k = 5, method = 'euclidean', transf_categ_cols = F, threads = 1, p = k) { 30 | 31 | categorical_data_present = sapply(data, function(x) is.factor(x) || is.character(x)) 32 | 33 | if (sum(categorical_data_present) && !transf_categ_cols) stop('Categorical columns present in data. 
These should be either converted to numeric or the function should be run with transf_categ_cols = TRUE') 34 | if (!is.numeric(k) || is.null(k) || (k >= nrow(data)) || k < 1) stop('k must be of type integer, greater than 0 and less than nrow(train)') 35 | if (!is.character(method) || is.null(method) || !method %in% c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 36 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient')) 37 | stop("method must be of type character and one of 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 38 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'") 39 | if (any(is.na(data))) stop('the data or the response variable includes missing values') 40 | if (!is.null(TEST_data) && any(is.na(TEST_data))) stop('the TEST_data includes missing values') 41 | if (method %in% c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient') && !all(unlist(lapply(1:ncol(data), function(x) sum(c(0,1) %in% unique(data[, x])) == 2)))) 42 | stop("methods : 'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' require the data to be in binary form e.g 0,1") 43 | if (!is.null(TEST_data) && method %in% c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient') && !all(unlist(lapply(1:ncol(TEST_data), function(x) sum(c(0,1) %in% unique(TEST_data[, x])) == 2)))) 44 | stop("methods : 'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' require the TEST_data to be in binary form e.g 0,1") 45 | 46 | if (!is.null(TEST_data) && ncol(data) != ncol(TEST_data)) stop('the number of columns in train and test data differ') 47 | 48 | #---------------------------------------------------------------------------------------------------- 49 | 50 | # check if any of the variables is categorical, if TRUE then convert categorical predictors to either dummy variables or numeric variables [ depending on the number of levels ] 51 | 52 | if (transf_categ_cols) { 53 | 54 | if (is.null(TEST_data)) { 55 | 56 | data = func_categorical_preds(data)} 57 | 58 | else { 59 | 60 | tmp_dat = func_categorical_preds(rbind(data, TEST_data)) 61 | 62 | data = tmp_dat[1:nrow(data), ] 63 | 64 | TEST_data = tmp_dat[(nrow(data) + 1):nrow(tmp_dat), ] 65 | } 66 | } 67 | #---------------------------------------------------------------------------------------------------- 68 | 69 | if (is.null(TEST_data)) { 70 | 71 | mat = matrix(, nrow = 0, ncol = 0) 72 | 73 | if (!is.matrix(data)) data = as.matrix(data) 74 | 75 | res = knn_index_dist_rcpp(data, mat, k = k, method = method, threads = threads, p = p) 76 | } 77 | 78 | else { 79 | 80 | if (!is.matrix(data)) data = as.matrix(data) 81 | if (!is.matrix(TEST_data)) TEST_data = as.matrix(TEST_data) 82 | 83 | res = knn_index_dist_rcpp(data, TEST_data, k = k, method = method, threads = threads, p = p) 84 | } 85 | 86 | return(res) 87 | } 88 | 89 | 90 | -------------------------------------------------------------------------------- /R/kernelknnCV.R: -------------------------------------------------------------------------------- 1 | 2 | #' kernel-k-nearest-neighbors using cross-validation 3 | #' 4 | #' This function performs kernel k nearest neighbors regression and classification using cross validation 5 | #' 6 | #' @param data a data frame or matrix 7 | #' @param y a numeric vector (in classification the labels must be numeric 
from 1:Inf) 8 | #' @param k an integer specifying the k-nearest-neighbors 9 | #' @param folds the number of cross validation folds (must be greater than 1) 10 | #' @param h the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0) 11 | #' @param method a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient' 12 | #' @param weights_function there are various ways of specifying the kernel function. See the details section. 13 | #' @param regression a boolean (TRUE,FALSE) specifying if regression or classification should be performed 14 | #' @param transf_categ_cols a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables 15 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 16 | #' @param extrema if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal) 17 | #' @param Levels a numeric vector. In case of classification the unique levels of the response variable are necessary 18 | #' @param seed_num a numeric value specifying the seed of the random number generator 19 | #' @param p a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k' 20 | #' @return a list of length 2. The first sublist is a list of predictions (the length of the list equals the number of the folds). The second sublist is a list with the indices for each fold. 21 | #' @author Lampros Mouselimis 22 | #' @details 23 | #' This function takes a number of arguments (including the number of cross-validation-folds) and it returns predicted values and indices for each fold. 24 | #' There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 
3rd option : a user defined kernel function 25 | #' @export 26 | #' @importFrom utils txtProgressBar 27 | #' @importFrom utils setTxtProgressBar 28 | #' @examples 29 | #' 30 | #' \dontrun{ 31 | #' data(ionosphere) 32 | #' 33 | #' X = ionosphere[, -c(2, ncol(ionosphere))] 34 | #' y = as.numeric(ionosphere[, ncol(ionosphere)]) 35 | #' 36 | #' out = KernelKnnCV(X, y, k = 5, folds = 3, regression = FALSE, Levels = unique(y)) 37 | #' } 38 | 39 | 40 | KernelKnnCV = function(data, y, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL, seed_num = 1, p = k) { 41 | 42 | if (length(y) != nrow(data)) stop('the size of the data and y differ') 43 | if (!is.logical(regression)) stop('the regression argument should be either TRUE or FALSE') 44 | if (any(is.na(data)) || any(is.na(y))) stop('the data or the response variable includes missing values') 45 | if (is.null(y)) stop('the response variable should be numeric') 46 | if (is.integer(y)) y = as.numeric(y) 47 | if (!is.numeric(y)) stop('in both regression and classification the response variable should be numeric or integer and in classification it should start from 1') 48 | if (!regression && any(unique(y) < 1)) stop('the response variable values should begin from 1') 49 | if (folds < 2) stop('the number of folds should be at least 2') 50 | 51 | start = Sys.time() 52 | 53 | if (regression) { 54 | 55 | set.seed(seed_num) 56 | n_folds = regr_folds(folds, y)} 57 | 58 | else { 59 | 60 | set.seed(seed_num) 61 | n_folds = class_folds(folds, as.factor(y)) 62 | } 63 | 64 | if (!all(unlist(lapply(n_folds, length)) > 5)) stop('Each fold has less than 5 observations. Consider decreasing the number of folds or increasing the size of the data.') 65 | 66 | tmp_fit = list() 67 | 68 | cat('\n') ; cat('cross-validation starts ..', '\n') 69 | 70 | pb <- txtProgressBar(min = 0, max = folds, style = 3); cat('\n') 71 | 72 | for (i in 1:folds) { 73 | 74 | tmp_fit[[i]] = KernelKnn(data = data[unlist(n_folds[-i]), ], 75 | TEST_data = data[unlist(n_folds[i]), ], 76 | y = y[unlist(n_folds[-i])], 77 | k = k, 78 | h = h, 79 | method = method, 80 | weights_function = weights_function, 81 | regression = regression, 82 | transf_categ_cols = transf_categ_cols, 83 | threads = threads, 84 | extrema = extrema, 85 | Levels = Levels, 86 | p = p) 87 | 88 | setTxtProgressBar(pb, i) 89 | } 90 | 91 | close(pb); cat('\n') 92 | 93 | end = Sys.time() 94 | 95 | t = end - start 96 | 97 | cat('time to complete :', t, attributes(t)$units, '\n'); cat('\n'); 98 | 99 | return(list(preds = tmp_fit, folds = n_folds)) 100 | } 101 | 102 | 103 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | #================================================================================================================================================================ 2 | 3 | context("Utils testing") 4 | 5 | 6 | # 'normalized' function 7 | 8 | testthat::test_that("normalize data takes a vector as input and returns a vector as output", { 9 | 10 | y = c(1:50) 11 | 12 | testthat::expect_true(is.vector(normalized(y))) 13 | }) 14 | 15 | testthat::test_that("the length of the input vector equals the length of the output vector", { 16 | 17 | y = c(1:50) 18 | 19 | output = normalized(y) 20 | 21 | testthat::expect_true(length(y) == length(output)) 22 | }) 23 | 24 | 25 | # 'func_tbl_dist' 
function 26 | 27 | testthat::test_that("the output of the func_tbl_dist function has : same columns as the number of the unique levels, same rows as the initial matrix and all rows sum to 1.0", { 28 | 29 | unq_levels = 1:3 30 | 31 | tmp = matrix(sample(unq_levels, 40, replace = T), 8, 5) 32 | 33 | res = func_tbl_dist(tmp, unq_levels) 34 | 35 | testthat::expect_true(ncol(res) == length(unq_levels) && nrow(res) == nrow(tmp) && mean(rowSums(res)) == 1.0) 36 | }) 37 | 38 | 39 | # 'func_tbl' function 40 | 41 | testthat::test_that("the output of the func_tbl function has : same columns as the number of the unique levels, same rows as the initial matrix and all rows sum to 1.0", { 42 | 43 | unq_levels = 1:3 44 | 45 | W = matrix(rep(0.2, 40), 8, 5) 46 | 47 | tmp = matrix(sample(unq_levels, 40, replace = T), 8, 5) 48 | 49 | res = func_tbl(tmp, W, unq_levels) 50 | 51 | testthat::expect_true(ncol(res) == length(unq_levels) && nrow(res) == nrow(tmp) && mean(rowSums(res)) == 1.0) 52 | }) 53 | 54 | 55 | # 'FUNCTION_weights' function 56 | 57 | testthat::test_that("the output of the FUNCTION_weights is the multiplication of a distance matrix with a kernel function. It returns a matrix with the same dimensions 58 | 59 | as the distance matrix and the rows sum to 1.0", { 60 | 61 | func = function(x) { 0.25 * x } 62 | 63 | tmp = matrix(sample(1:10, 16, replace = T), 4, 4) 64 | 65 | res = FUNCTION_weights(tmp, func, eps = 0.0) 66 | 67 | testthat::expect_true(ncol(res) == ncol(tmp) && nrow(res) == nrow(tmp) && mean(rowSums(res)) == 1.0) 68 | }) 69 | 70 | 71 | 72 | # 'switch_secondary' function 73 | 74 | testthat::test_that("the output of the switch_secondary is the multiplication of a distance matrix with a known kernel such as 'uniform', 'triangular' etc. 75 | 76 | It returns a non-empty matrix with the same dimensions as the distance matrix", { 77 | 78 | W = matrix(1:25, 5, 5) 79 | 80 | lap = lapply(c('uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'), 81 | 82 | function(x) switch_secondary(x, W, 1.0)) 83 | 84 | dims = unlist(lapply(lap, function(x) sum(dim(x)) == sum(dim(W)))) 85 | 86 | testthat::expect_true(sum(dims) == length(lap)) 87 | }) 88 | 89 | 90 | # switch.ops function 91 | 92 | testthat::test_that("returns an error if the sublists are not matrices or data frames", { 93 | 94 | tmp = list(vec1 = 1:10, vec2 = 11:20) 95 | 96 | testthat::expect_error( switch.ops(tmp, MODE = 'ADD') ) 97 | }) 98 | 99 | 100 | # 'FUN_kernels' function 101 | 102 | testthat::test_that("returns an error if the combination of the kernels is incorrect", { 103 | 104 | testthat::expect_error( FUN_kernels('uniform_triangular', matrix(runif(100), 10, 10), h = 1.0) ) 105 | }) 106 | 107 | testthat::test_that("returns an error if the combination of the kernels includes an unknown kernel", { 108 | 109 | testthat::expect_error( FUN_kernels('uniform_invalid_ADD', matrix(runif(100), 10, 10), h = 1.0) ) 110 | }) 111 | 112 | testthat::test_that("the output of the FUN_kernels is the multiplication of a distance matrix with a known kernel such as 'uniform', 'triangular' etc. In addition here 113 | 114 | ADD and MULTiply different kernels. 
It returns a non-empty matrix with the same dimensions as the distance matrix", { 115 | 116 | W = matrix(1:25, 5, 5) 117 | 118 | lap = lapply(c('uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 119 | 120 | 'exponential', 'uniform_triangular_ADD', 'biweight_triweight_MULT', 'tricube_gaussian_MULT', 'gaussianSimple_silverman_ADD'), 121 | 122 | function(x) FUN_kernels(x, W, 1.0)) 123 | 124 | dims = unlist(lapply(lap, function(x) sum(dim(x)) == sum(dim(W)))) 125 | 126 | testthat::expect_true(sum(dims) == length(lap)) 127 | }) 128 | 129 | 130 | # 'func_categorical_preds' function 131 | 132 | testthat::test_that("if TRUE the function builds dummy variables for factor predictors with less than 32 unique levels and converts to numeric factor predictors with more than 32 levels", { 133 | 134 | m = data.frame(a = as.factor(c(1:19, rep(20, 21))), b = 1:40, c = as.factor(1:40), d = 40:1) 135 | 136 | res = func_categorical_preds(m) 137 | 138 | testthat::expect_true(ncol(res) == 20 + 3) 139 | }) 140 | 141 | 142 | # 'func_categorical_preds' function [ multiple factor variables ] 143 | 144 | testthat::test_that("if TRUE the function builds dummy variables for factor predictors with less than 32 unique levels and converts to numeric factor predictors with more than 32 levels", { 145 | 146 | m = data.frame(a = as.factor(c(1:19, rep(20, 21))), s = as.factor(c(1:19, rep(20, 21))), k = as.factor(c(1:19, rep(20, 21))), b = 1:40, c = as.factor(1:40), d = 40:1) 147 | 148 | res = func_categorical_preds(m) 149 | 150 | testthat::expect_true(ncol(res) == 20 + 19 + 19 + 3) 151 | }) 152 | 153 | 154 | # switch.ops - function 155 | 156 | 157 | testthat::test_that("adding 3 matrices or data.frames of all 1's results in a a matrix of all 3's", { 158 | m1 = matrix(rep(1, 25), 5, 5) 159 | m2 = matrix(rep(1, 25), 5, 5) 160 | m3 = matrix(rep(1, 25), 5, 5) 161 | 162 | lst = list(m1, m2, m3) 163 | 164 | testthat::expect_equal(switch.ops(lst, MODE = 'ADD'), matrix(rep(3, 25), 5,5), check.attributes = FALSE) # check.attributes = F otherwise due to dimnames error 165 | }) 166 | 167 | 168 | testthat::test_that("multiplying 3 matrices or data.frames of all 2's results in a a matrix of all 8's", { 169 | m1 = matrix(rep(2, 25), 5, 5) 170 | m2 = matrix(rep(2, 25), 5, 5) 171 | m3 = matrix(rep(2, 25), 5, 5) 172 | 173 | lst = list(m1, m2, m3) 174 | 175 | testthat::expect_equal(switch.ops(lst, MODE = 'MULT'), matrix(rep(8, 25), 5,5), check.attributes = FALSE) # check.attributes = F otherwise due to dimnames error 176 | }) 177 | 178 | 179 | testthat::test_that("it throws an error if the sublist matrices are of unequal dimensions", { 180 | m1 = matrix(rep(2, 35), 5, 7) 181 | m2 = matrix(rep(2, 25), 5, 5) 182 | m3 = matrix(rep(2, 25), 5, 5) 183 | 184 | lst = list(m1, m2, m3) 185 | 186 | testthat::expect_error(switch.ops(lst, MODE = 'MULT')) # check.attributes = F otherwise due to dimnames error 187 | }) 188 | 189 | 190 | testthat::test_that("if LST is not a list it throws error", { 191 | m1 = matrix(rep(1, 25), 5, 5) 192 | 193 | testthat::expect_error( switch.ops(m1, MODE = 'MULT') ) 194 | }) 195 | 196 | 197 | testthat::test_that("if the MODE is an invalid string it throws an error", { 198 | 199 | m1 = matrix(rep(2, 25), 5, 5) 200 | m2 = matrix(rep(2, 25), 5, 5) 201 | m3 = matrix(rep(2, 25), 5, 5) 202 | 203 | lst = list(m1, m2, m3) 204 | 205 | testthat::expect_error( switch.ops(lst, MODE = 'invalid') ) 206 | }) 207 | 208 | 
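
# illustrative sketch : for matrices of equal dimensions, 'ADD' should agree with the base elementwise '+' operator and 'MULT' with '*'

testthat::test_that("switch.ops with MODE 'ADD' / 'MULT' matches the base elementwise operators", {

  m1 = matrix(1:25, 5, 5)
  m2 = matrix(25:1, 5, 5)

  lst = list(m1, m2)

  testthat::expect_equal(switch.ops(lst, MODE = 'ADD'), m1 + m2, check.attributes = FALSE)

  testthat::expect_equal(switch.ops(lst, MODE = 'MULT'), m1 * m2, check.attributes = FALSE)
})
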
#================================================================================================================================================================ 209 | -------------------------------------------------------------------------------- /R/dist_kernelknn.R: -------------------------------------------------------------------------------- 1 | 2 | #' kernel k-nearest-neighbors using a distance matrix 3 | #' 4 | #' @param DIST_mat a distance matrix (square matrix) having a \emph{diagonal} filled with either zero's (\emph{0}) or NA's (\emph{missing values}) 5 | #' @param TEST_indices a numeric vector specifying the indices of the test data in the distance matrix (row-wise or column-wise). If the parameter equals NULL then no test data is included in the distance matrix 6 | #' @param y a numeric vector (in classification the labels must be numeric from 1:Inf). It is assumed that if the \emph{TEST_indices} is not NULL then the length of \emph{y} equals to the rows of the train data \emph{( nrow(DIST_mat) - length(TEST_indices) )}, otherwise \emph{length(y) == nrow(DIST_mat)}. 7 | #' @param k an integer specifying the k-nearest-neighbors 8 | #' @param h the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0) 9 | #' @param weights_function there are various ways of specifying the kernel function. See the details section. 10 | #' @param regression a boolean (TRUE,FALSE) specifying if regression or classification should be performed 11 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 12 | #' @param extrema if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal) 13 | #' @param Levels a numeric vector. In case of classification the unique levels of the response variable are necessary 14 | #' @param minimize either TRUE or FALSE. If TRUE then lower values will be considered as relevant for the k-nearest search, otherwise higher values. 15 | #' @return a vector (if regression is TRUE), or a data frame with class probabilities (if regression is FALSE) 16 | #' @author Lampros Mouselimis 17 | #' @details 18 | #' This function takes a distance matrix (square matrix where the diagonal is filled with \emph{0} or \emph{NA}) as input. If the \emph{TEST_indices} parameter is NULL then the predictions for the train data will be returned, whereas if the \emph{TEST_indices} parameter is not NULL then the predictions for the test data will be returned. 19 | #' There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 
3rd option : a user defined kernel function 20 | #' @export 21 | #' @importFrom stats dist 22 | #' @examples 23 | #' 24 | #' data(Boston) 25 | #' 26 | #' X = Boston[, -ncol(Boston)] 27 | #' y = Boston[, ncol(Boston)] 28 | #' 29 | #' dist_obj = dist(X) 30 | #' 31 | #' dist_mat = as.matrix(dist_obj) 32 | #' 33 | #' out = distMat.KernelKnn(dist_mat, TEST_indices = NULL, y, k = 5, regression = TRUE) 34 | #' 35 | 36 | 37 | distMat.KernelKnn = function(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = F, threads = 1, extrema = F, Levels = NULL, minimize = T) { 38 | 39 | if (!is.matrix(DIST_mat)) stop("the 'DIST_mat' parameter should be of type matrix") 40 | if (nrow(DIST_mat) != ncol(DIST_mat)) stop("the input 'DIST_mat' should be a square matrix with number of rows equal to number of columns") 41 | DIAG = diag(DIST_mat) 42 | nas = all(is.na(DIAG)) 43 | if (nas) { 44 | diag(DIST_mat) = 0 } # set diagonal to 0.0 if equal to NA 45 | else { 46 | if (sum(DIAG) != 0) { 47 | stop("the diagonal of the distance matrix must be a vector of zeros or NA's") 48 | } 49 | } 50 | if (!is.null(TEST_indices)) { 51 | if (!inherits(TEST_indices, c("numeric", "integer"))) stop("the 'TEST_indices' parameter should be a numeric vector") 52 | if (max(TEST_indices) > nrow(DIST_mat)) stop('the maximum number of the TEST_indices is greater than the rows of the input distance matrix') 53 | tr_idx = 1:nrow(DIST_mat) 54 | tr_idx = tr_idx[-TEST_indices] 55 | if (!(min(TEST_indices) > max(tr_idx))) stop("The minimum index of the 'TEST_indices' parameter must be greater than the maximum index of the train data in 'DIST_mat'! Make sure that the 'TEST_indices' consist of the last indices of the 'DIST_mat' parameter!") 56 | } 57 | if (!is.numeric(k) || is.null(k) || (k >= nrow(DIST_mat)) || k < 1) stop('k must be of type integer, greater than 0 and less than nrow(DIST_mat)') 58 | if (abs(k - round(k)) > 0) { 59 | k = round(k) 60 | warning('k is float and will be rounded to : ', call.
= F, expr = k)} 61 | if (h == 0) stop('h can be any number except for 0') 62 | if (is.null(y)) stop('the response variable should be numeric') 63 | if (is.integer(y)) y = as.numeric(y) 64 | if (!is.numeric(y)) stop('in both regression and classification the response variable should be numeric or integer and in classification it should start from 1') 65 | if (!regression && is.null(Levels)) stop('In classification give the unique values of y in form of a vector') 66 | if (!regression && any(unique(y) < 1)) stop('the response variable values should begin from 1') 67 | if (!regression) { 68 | if (!all(Levels %in% unique(y))) stop("The specified 'Levels' must match the unique 'y' labels!") 69 | } 70 | if (any(is.na(DIST_mat)) || any(is.na(y))) stop('the DIST_mat or the response variable includes missing values') 71 | if (is.null(TEST_indices)) { 72 | if (length(y) != nrow(DIST_mat)) { 73 | stop('the size of the DIST_mat and y differ') 74 | } 75 | } 76 | if (extrema && k < 4) stop('k must be greater than 3 if extrema = TRUE') 77 | if (!inherits(minimize, "logical")) stop("the 'minimize' parameter should be either TRUE or FALSE") 78 | 79 | if (extrema) { 80 | 81 | k = k + 2 # add two values (for min-max) 82 | } 83 | 84 | index_train = DIST_MATRIX_knn(DIST_mat, TEST_indices, minimize, k, threads, F) # the last parameter is FALSE because it is only applicable to 'distMat.knn.index.dist' [ here I don't need two separate functions for train and test data, as the function returns only predictions and not indices ] 85 | 86 | if (extrema) { 87 | 88 | index_train$knn_idx = index_train$knn_idx[, -c(1,k)] # remove min, max (matrices already sorted) 89 | index_train$knn_dist = index_train$knn_dist[, -c(1,k)] # remove min, max (matrices already sorted) 90 | 91 | k = k - 2 # adjust k to previous value 92 | } 93 | 94 | out_train = matrix(y[index_train$knn_idx], ncol = k) 95 | 96 | if (!regression) { 97 | 98 | if (is.null(weights_function)) { 99 | 100 | out = func_tbl_dist(out_train, sort(Levels)) 101 | } 102 | else if (is.function(weights_function)) { 103 | 104 | W = FUNCTION_weights(index_train$knn_dist, weights_function) 105 | 106 | out = func_tbl(out_train, W, sort(Levels)) 107 | } 108 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 109 | 110 | W = FUN_kernels(weights_function, index_train$knn_dist, h) 111 | 112 | out = func_tbl(out_train, W, sort(Levels)) 113 | } 114 | else { 115 | 116 | stop('false input for the weights_function argument') 117 | } 118 | 119 | colnames(out) = paste0('class_', sort(Levels)) 120 | } 121 | 122 | else { 123 | 124 | if (is.null(weights_function)) { 125 | 126 | out = rowMeans(out_train) 127 | } 128 | else if (is.function(weights_function)) { 129 | 130 | W = FUNCTION_weights(index_train$knn_dist, weights_function) 131 | 132 | out = rowSums(out_train * W) 133 | } 134 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 135 | 136 | W = FUN_kernels(weights_function, index_train$knn_dist, h) 137 | 138 | out = rowSums(out_train * W) 139 | } 140 | else { 141 | 142 | stop('false input for the weights_function argument') 143 | } 144 | } 145 | 146 | return(out) 147 | } 148 | 149 | 150 | #================================================================================================================================================================================ 151 | -------------------------------------------------------------------------------- /vignettes/regression_using_the_housing_data.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Regression using the Housing data" 3 | author: "Lampros Mouselimis" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Regression using the Housing data} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | The following examples illustrate the functionality of the KernelKnn package for **regression** tasks. I'll make use of the *Housing* data set, 13 | 14 | ```{r, eval=T} 15 | 16 | data(Boston, package = 'KernelKnn') 17 | 18 | str(Boston) 19 | 20 | ``` 21 |
22 | 23 | 24 | When using an algorithm where the output depends on distance calculation (as is the case in k-nearest-neighbors) it is recommended to first scale the data, 25 | 26 | ```{r, eval=T} 27 | X = scale(Boston[, -ncol(Boston)]) 28 | y = Boston[, ncol(Boston)] 29 | 30 | # random split of data in train and test 31 | 32 | spl_train = sample(1:length(y), round(length(y) * 0.75)) 33 | spl_test = setdiff(1:length(y), spl_train) 34 | str(spl_train) 35 | str(spl_test) 36 | 37 | 38 | # evaluation metric 39 | 40 | mse = function (y_true, y_pred) { 41 | 42 | out = mean((y_true - y_pred)^2) 43 | 44 | out 45 | } 46 | 47 | ``` 48 | 49 | 50 | 51 | ## The KernelKnn function 52 | 53 | The KernelKnn function takes a number of arguments. To read details for each one of the arguments type ?KernelKnn::KernelKnn in the console. 54 | 55 | A simple k-nearest-neighbors can be run with weights_function = NULL (the parameter 'regression' should be set to TRUE for regression), 56 | 57 | ```{r, eval=T} 58 | 59 | library(KernelKnn) 60 | 61 | preds_TEST = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5, 62 | 63 | method = 'euclidean', weights_function = NULL, regression = T) 64 | str(preds_TEST) 65 | 66 | ``` 67 | 68 |
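The mse() helper defined above can be used for a quick (illustrative) hold-out error estimate of these predictions,

```{r, eval = F}

mse(y[spl_test], preds_TEST)

```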
69 | Using transf_categ_cols = TRUE, categorical features can be encoded either to dummy variables or to numeric features, depending on the number of unique values (here I convert the 'chas' and 'rad' features to factors to illustrate the *transf_categ_cols* parameter) 70 | 71 | ```{r, eval=T} 72 | 73 | 74 | apply(Boston, 2, function(x) length(unique(x))) 75 | 76 | 77 | tmp_bst = Boston 78 | tmp_bst$chas = as.factor(tmp_bst$chas) 79 | tmp_bst$rad = as.factor(tmp_bst$rad) 80 | 81 | preds_TEST = KernelKnn(tmp_bst[spl_train, -ncol(tmp_bst)], 82 | 83 | TEST_data = tmp_bst[spl_test, -ncol(tmp_bst)], 84 | 85 | y[spl_train], k = 5, method = 'euclidean', 86 | 87 | regression = T, transf_categ_cols = T) 88 | str(preds_TEST) 89 | 90 | ``` 91 |
92 | There are two ways to use a kernel in the KernelKnn function. The **first option** is to choose one of the existing kernels (*uniform*, *triangular*, *epanechnikov*, *biweight*, *triweight*, *tricube*, *gaussian*, *cosine*, *logistic*, *silverman*, *inverse*, *gaussianSimple*, *exponential*). Here, I use the *mahalanobis* metric (which takes advantage of the covariance matrix of the data, but it somewhat slows down training in comparison to the other distance metrics) and the *biweight* kernel, because they give optimal results (according to my *RandomSearchR* package), 93 | 94 | 95 | ```{r, eval=T} 96 | 97 | 98 | preds_TEST_biw = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5, 99 | 100 | method = 'mahalanobis', weights_function = 'biweight', 101 | 102 | regression = T, transf_categ_cols = F) 103 | str(preds_TEST_biw) 104 | 105 | ``` 106 |
107 | The **second option** is to give a self-defined kernel function. Here, I'll pick the density function of the normal distribution with mean = 0.0 and standard deviation = 1.0 (the data are scaled to have mean zero and unit variance), 108 | 109 | 110 | ```{r, eval=T} 111 | 112 | 113 | norm_kernel = function(W) { 114 | 115 | W = dnorm(W, mean = 0, sd = 1.0) 116 | 117 | W = W / rowSums(W) 118 | 119 | return(W) 120 | } 121 | 122 | 123 | preds_TEST_norm = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5, 124 | 125 | method = 'mahalanobis', weights_function = norm_kernel, 126 | 127 | regression = T, transf_categ_cols = F) 128 | str(preds_TEST_norm) 129 | 130 | ``` 131 |
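A quick sanity check (a sketch) that the self-defined kernel indeed returns row-normalized weights,

```{r, eval = F}

rowSums(norm_kernel(matrix(runif(25), 5, 5)))   # every row should sum to (approximately) 1.0

```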
132 | 133 | 134 | The computations can be sped up by using the parameter **threads** (multiple cores can be run in parallel). There is also the option to exclude **extrema** (minimum and maximum distances) during the calculation of the k-nearest-neighbor distances using extrema = TRUE. The *bandwidth* of the existing kernels can be tuned using the **h** parameter. 135 |
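For illustration, a sketch that combines these parameters (the values of *k*, *h* and *threads* below are arbitrary; removing the extrema requires a sufficiently large *k*, since the minimum and maximum neighbors are dropped),

```{r, eval = F}

preds_TEST_extr = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 9,

                            h = 0.5, method = 'euclidean', weights_function = 'triweight',

                            regression = T, threads = 4, extrema = T)

```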
136 | 137 | K-nearest-neighbor calculations in the KernelKnn function can be accomplished using the following distance metrics : *euclidean*, *manhattan*, *chebyshev*, *canberra*, *braycurtis*, *minkowski* (by default the order 'p' of the minkowski parameter equals k), *hamming*, *mahalanobis*, *pearson_correlation*, *simple_matching_coefficient*, *jaccard_coefficient* and *Rao_coefficient*. The last four are similarity measures and are appropriate for binary data [0,1]. 138 |
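As an illustration of the similarity measures, a sketch on hypothetical binary data (these methods expect each column to include both 0's and 1's),

```{r, eval = F}

set.seed(1)

X_bin = matrix(sample(0:1, 100 * 10, replace = T), 100, 10)    # hypothetical binary predictors

y_bin = rnorm(100)                                             # hypothetical numeric response

preds_bin = KernelKnn(X_bin[1:80, ], TEST_data = X_bin[81:100, ], y_bin[1:80], k = 5,

                      method = 'jaccard_coefficient', regression = T)

```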
139 | 140 | I employed my RandomSearchR package to find the optimal parameters for the KernelKnn function and the following two pairs of parameters give an optimal mean-squared-error, 141 |
142 |
143 | 144 | ```{r, eval = T, echo = F} 145 | 146 | knitr::kable(data.frame(k = c(9,3), method = c('mahalanobis', 'canberra'), kernel = c('triweight', 'cosine'))) 147 | ``` 148 | 149 | 150 | 151 | ## The KernelKnnCV function 152 | 153 | I'll use the *KernelKnnCV* function to calculate the mean-squared-error using 3-fold cross-validation for the previously mentioned parameter pairs, 154 | 155 | 156 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 157 | 158 | fit_cv_pair1 = KernelKnnCV(X, y, k = 9, folds = 3, method = 'mahalanobis', 159 | 160 | weights_function = 'triweight', regression = T, 161 | 162 | threads = 5, seed_num = 3) 163 | ``` 164 | 165 | ```{r, eval=T} 166 | str(fit_cv_pair1) 167 | ``` 168 | 169 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 170 | fit_cv_pair2 = KernelKnnCV(X, y, k = 3, folds = 3, method = 'canberra', 171 | 172 | weights_function = 'cosine', regression = T, 173 | 174 | threads = 5, seed_num = 3) 175 | ``` 176 | 177 | ```{r, eval=T} 178 | str(fit_cv_pair2) 179 | 180 | ``` 181 |
182 | 183 | 184 | Each cross-validated object returns a list of length 2 ( the first sublist includes the predictions for each fold whereas the second gives the indices of the folds) 185 | 186 | 187 | ```{r, eval=T} 188 | mse_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds), 189 | 190 | function(x) mse(y[fit_cv_pair1$folds[[x]]], 191 | 192 | fit_cv_pair1$preds[[x]]))) 193 | mse_pair1 194 | 195 | cat('mse for params_pair1 is :', mean(mse_pair1), '\n') 196 | 197 | mse_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds), 198 | 199 | function(x) mse(y[fit_cv_pair2$folds[[x]]], 200 | 201 | fit_cv_pair2$preds[[x]]))) 202 | mse_pair2 203 | 204 | cat('mse for params_pair2 is :', mean(mse_pair2), '\n') 205 | 206 | ``` 207 |
208 | 209 | 210 | ## Adding or multiplying kernels 211 | 212 | In the KernelKnn package there is also the option to **combine kernels** (adding or multiplying) from the existing ones. For instance, if I want to multiply the *tricube* with the *gaussian* kernel, then I'll give the following character string to the weights_function, *"tricube_gaussian_MULT"*. On the other hand, if I want to add the same kernels then the weights_function will be *"tricube_gaussian_ADD"*. I experimented with my RandomSearchR package combining the different kernels and the following two parameter settings gave optimal results, 213 | 214 |
215 | ```{r, eval = T, echo = F} 216 | 217 | knitr::kable(data.frame(k = c(19,18), method = c('mahalanobis', 'mahalanobis'), kernel = c('triangular_triweight_MULT', 'biweight_triweight_gaussian_MULT'))) 218 | ``` 219 | 220 |
221 | 222 | 223 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 224 | 225 | fit_cv_pair1 = KernelKnnCV(X, y, k = 19, folds = 3, method = 'mahalanobis', 226 | 227 | weights_function = 'triangular_triweight_MULT', 228 | 229 | regression = T, threads = 5, seed_num = 3) 230 | ``` 231 | 232 | ```{r, eval=T} 233 | str(fit_cv_pair1) 234 | ``` 235 | 236 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 237 | fit_cv_pair2 = KernelKnnCV(X, y, k = 18, folds = 3, method = 'mahalanobis', 238 | 239 | weights_function = 'biweight_triweight_gaussian_MULT', 240 | 241 | regression = T, threads = 5, seed_num = 3) 242 | ``` 243 | 244 | ```{r, eval=T} 245 | str(fit_cv_pair2) 246 | 247 | ``` 248 |
249 | 250 | ```{r, eval=T} 251 | mse_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds), 252 | 253 | function(x) mse(y[fit_cv_pair1$folds[[x]]], 254 | 255 | fit_cv_pair1$preds[[x]]))) 256 | mse_pair1 257 | 258 | cat('mse for params_pair1 is :', mean(mse_pair1), '\n') 259 | 260 | mse_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds), 261 | 262 | function(x) mse(y[fit_cv_pair2$folds[[x]]], 263 | 264 | fit_cv_pair2$preds[[x]]))) 265 | mse_pair2 266 | 267 | cat('mse for params_pair2 is :', mean(mse_pair2), '\n') 268 | 269 | ``` 270 |
271 | 272 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #============================================================================================================================================== 2 | 3 | #' this function normalizes the data 4 | #' 5 | #' @keywords internal 6 | 7 | normalized = function(x) { 8 | 9 | out = (x - min(x))/(max(x) - min(x)) 10 | 11 | out 12 | } 13 | 14 | 15 | 16 | #' this function returns the probabilities in case of classification 17 | #' 18 | #' @keywords internal 19 | 20 | func_tbl_dist = function(DF, Levels) { 21 | 22 | mat = matrix(rep(0, dim(DF)[1] * length(Levels)), ncol = length(Levels), nrow = dim(DF)[1]) 23 | 24 | for (i in 1:dim(DF)[1]) { 25 | 26 | tmp_tbl = prop.table(table(DF[i, ])) 27 | 28 | mat[i, as.numeric(names(tmp_tbl))] = tmp_tbl 29 | } 30 | 31 | mat 32 | } 33 | 34 | 35 | 36 | #' this function returns a table of probabilities for each label 37 | #' 38 | #' @keywords internal 39 | 40 | func_tbl = function(DF, W, labels) { 41 | 42 | tmp_W = matrix(rep(0, dim(DF)[1] * length(labels)), ncol = length(labels), nrow = dim(DF)[1]) 43 | 44 | for (i in 1:length(labels)) { 45 | 46 | tmp_W[, i] <- rowSums(W * (DF == labels[i])) 47 | } 48 | 49 | tmp_W 50 | } 51 | 52 | 53 | 54 | #' this function is used as a kernel-function-identifier [ takes the distances and a weights-kernel (in form of a function) and returns weights ] 55 | #' 56 | #' @keywords internal 57 | 58 | FUNCTION_weights = function(W_dist_matrix, weights_function, eps = 1.0e-6) { 59 | 60 | W_dist_matrix = t(apply(W_dist_matrix, 1, normalized)) 61 | 62 | W_dist_matrix = W_dist_matrix - eps 63 | 64 | W = do.call(weights_function, list(W_dist_matrix)) 65 | 66 | W <- W/rowSums(W) 67 | 68 | W 69 | } 70 | 71 | 72 | 73 | # secondary function used in 'FUN_kernels' function 74 | #' 75 | #' @keywords internal 76 | 77 | switch_secondary = function(kernel, W, h, eps = 1.0e-6) { 78 | 79 | W = t(apply(W, 1, normalized)) 80 | 81 | W = W - eps # add small value in case of 0's 82 | 83 | kernel <- match.arg(kernel, c('uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 84 | 85 | 'silverman', 'inverse', 'exponential'), FALSE) 86 | 87 | switch(kernel, 88 | 89 | uniform = {W = (1/2)* abs(W/h)}, 90 | 91 | triangular = {W = (1 - abs(W/h))}, 92 | 93 | epanechnikov = {W = (3/4) * (1 - (W / h) ^ 2)}, 94 | 95 | biweight = {W = (15/16) * ((1 - ((W / h) ^ 2)) ^ 2)}, 96 | 97 | triweight = {W = (35/32) * ((1 - ((W / h) ^ 2)) ^ 3)}, 98 | 99 | tricube = {W = (70/81) * ((1 - (abs(W) ^ 3)/(h ^ 3)) ^ 3)}, 100 | 101 | gaussian = {W = (1/sqrt(2*pi)) * exp((-1/2) * ((W ^ 2)/(2*(h ^ 2))))}, 102 | 103 | gaussianSimple = {W = exp(- (W / h) ^ 2)}, 104 | 105 | cosine = {W = (pi/4) * cos((pi * abs(W - 0.5))/(2*h))}, 106 | 107 | logistic = {W = (1/(exp(W/h) + 2 + exp(-W/h)))}, 108 | 109 | silverman = {W = 1/2 * exp(-abs(W/h)/sqrt(2)) * sin((abs(W)/(2*h)) + (pi/4))}, 110 | 111 | inverse = {W <- 1/abs(W/h)}, 112 | 113 | exponential = {W = exp(- abs(W / h))}, 114 | ) 115 | 116 | W <- W/rowSums(W) # normalize weights 117 | 118 | return(W) 119 | } 120 | 121 | 122 | #' Arithmetic operations on lists 123 | #' 124 | #' @references 125 | #' 126 | #' https://www.cs.toronto.edu/~duvenaud/cookbook/ 127 | #' 128 | #' https://raw.githubusercontent.com/duvenaud/phd-thesis/master/kernels.pdf 129 | #' 130 | #' @keywords internal 131 | 132 | 133 | 
switch.ops = function (LST, MODE = 'ADD') { 134 | 135 | if (!inherits(LST, "list")) stop("LST must be a list") 136 | 137 | if (!all(unlist(lapply(LST, function(x) inherits(x, c('data.frame', 'matrix')))))) stop('the sublist objects must be either matrices or data frames') 138 | 139 | r = all(unlist(lapply(LST, nrow)) == unlist(lapply(LST, nrow))[1]) 140 | 141 | c = all(unlist(lapply(LST, ncol)) == unlist(lapply(LST, ncol))[1]) 142 | 143 | if (!all(c(r, c))) stop("the dimensions of the included data.frames or matrices differ") 144 | 145 | if (MODE == 'ADD') { 146 | 147 | init_df = data.frame(matrix(rep(0, dim(LST[[1]])[1] * dim(LST[[1]])[2]), nrow = dim(LST[[1]])[1], ncol = dim(LST[[1]])[2]))} 148 | 149 | else if (MODE == 'MULT') { 150 | 151 | init_df = data.frame(matrix(rep(1, dim(LST[[1]])[1] * dim(LST[[1]])[2]), nrow = dim(LST[[1]])[1], ncol = dim(LST[[1]])[2])) 152 | } 153 | 154 | else { 155 | 156 | stop('invalid MODE type') 157 | } 158 | 159 | for (i in 1:length(LST)) { 160 | 161 | if (MODE == 'ADD') { 162 | 163 | init_df = init_df + LST[[i]]} 164 | 165 | if (MODE == 'MULT') { 166 | 167 | init_df = init_df * LST[[i]] 168 | } 169 | } 170 | 171 | colnames(init_df) = colnames(LST[[1]]) 172 | 173 | return(as.matrix(init_df)) 174 | } 175 | 176 | 177 | 178 | #' performs kernel smoothing using a bandwidth. Besides using a kernel there is also the option to combine kernels 179 | #' 180 | #' @keywords internal 181 | 182 | 183 | FUN_kernels = function(kernel, W, h) { 184 | 185 | spl = strsplit(kernel, '_')[[1]] 186 | 187 | s_op = spl[length(spl)] 188 | 189 | s_kerns = spl[-length(spl)] 190 | 191 | if (length(spl) > 1) { 192 | 193 | if (length(s_kerns) < 2) stop('invalid kernel combination') 194 | 195 | kernels = c('uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 196 | 197 | 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential') 198 | 199 | if (sum(s_kerns %in% kernels) != length(s_kerns)) stop('invalid kernel combination') 200 | 201 | lap = lapply(s_kerns, function(x) switch_secondary(x, W, h)) 202 | 203 | W = switch.ops(lap, MODE = s_op) 204 | 205 | W <- W/rowSums(W) 206 | } 207 | 208 | else { 209 | 210 | W = switch_secondary(kernel, W, h) 211 | } 212 | 213 | return(W) 214 | } 215 | 216 | 217 | 218 | #' OPTION to convert categorical features TO either numeric [ if levels more than 32] OR to dummy variables [ if levels less than 32 ] 219 | #' 220 | #' @keywords internal 221 | #' @importFrom stats model.matrix 222 | 223 | func_categorical_preds = function(prepr_categ) { 224 | 225 | less32 = sapply(prepr_categ, function(x) is.factor(x) && length(unique(x)) < 32) 226 | greater32 = sapply(prepr_categ, function(x) is.factor(x) && length(unique(x)) >= 32) 227 | 228 | if (sum(less32) == 1) { 229 | 230 | rem_predictors = names(which(less32)) 231 | out_L = model.matrix(~. - 1, data = data.frame(prepr_categ[, rem_predictors])) 232 | colnames(out_L) = paste0(rem_predictors, 1:dim(out_L)[2]) 233 | } 234 | 235 | if (sum(less32) > 1) { 236 | 237 | rem_predictors = names(which(less32)) 238 | out_L = model.matrix(~. 
- 1, data = prepr_categ[, rem_predictors]) 239 | colnames(out_L) = make.names(colnames(out_L)) 240 | } 241 | 242 | if (sum(greater32) > 0) { 243 | 244 | fact_predictors = names(which(greater32)) 245 | 246 | for (nams in fact_predictors) { 247 | 248 | prepr_categ[, nams] = as.numeric(prepr_categ[, nams]) 249 | } 250 | } 251 | 252 | if (sum(less32) > 0) { 253 | 254 | return(cbind(prepr_categ[, -which(colnames(prepr_categ) %in% rem_predictors)], out_L)) 255 | } 256 | 257 | else { 258 | 259 | return(prepr_categ) 260 | } 261 | } 262 | 263 | 264 | #' shuffle data 265 | #' 266 | #' this function shuffles the items of a vector 267 | #' @keywords internal 268 | 269 | func_shuffle = function(vec, times = 10) { 270 | 271 | for (i in 1:times) { 272 | 273 | out = sample(vec, length(vec)) 274 | } 275 | out 276 | } 277 | 278 | 279 | #' stratified folds (in classification) [ detailed information about class_folds in the FeatureSelection package ] 280 | #' 281 | #' this function creates stratified folds in binary and multiclass classification 282 | #' @keywords internal 283 | #' @importFrom utils combn 284 | 285 | 286 | class_folds = function(folds, RESP) { 287 | 288 | if (!is.factor(RESP)) { 289 | 290 | stop(simpleError("RESP must be a factor")) 291 | } 292 | 293 | clas = lapply(unique(RESP), function(x) which(RESP == x)) 294 | 295 | len = lapply(clas, function(x) length(x)) 296 | 297 | samp_vec = rep(1/folds, folds) 298 | 299 | prop = lapply(len, function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x]))) 300 | 301 | repl = unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y])))) 302 | 303 | spl = suppressWarnings(split(1:length(RESP), repl)) 304 | 305 | sort_names = paste0('fold_', 1:folds) 306 | 307 | spl = spl[sort_names] 308 | 309 | spl = lapply(spl, function(x) func_shuffle(x)) # the indices of the unique levels will be shuffled 310 | 311 | ind = t(combn(1:folds, 2)) 312 | 313 | ind1 = apply(ind, 1, function(x) length(intersect(spl[x[1]], spl[x[2]]))) 314 | 315 | if (sum(ind1) > 0) { 316 | 317 | stop(simpleError("there is an intersection between the resulted indexes of the folds")) 318 | 319 | } 320 | 321 | if (length(unlist(spl)) != length(RESP)) { 322 | 323 | stop(simpleError("the number of items in the folds are not equal with the response items")) 324 | } 325 | 326 | spl 327 | } 328 | 329 | 330 | #' create folds (in regression) [ detailed information about class_folds in the FeatureSelection package ] 331 | #' 332 | #' this function creates both stratified and non-stratified folds in regression 333 | #' @keywords internal 334 | 335 | 336 | regr_folds = function(folds, RESP) { 337 | 338 | if (is.factor(RESP)) { 339 | 340 | stop(simpleError("this function is meant for regression for classification use the 'class_folds' function")) 341 | } 342 | 343 | samp_vec = rep(1/folds, folds) 344 | 345 | sort_names = paste0('fold_', 1:folds) 346 | 347 | prop = lapply(length(RESP), function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x]))) 348 | 349 | repl = func_shuffle(unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y]))))) 350 | 351 | spl = suppressWarnings(split(1:length(RESP), repl)) 352 | 353 | spl = spl[sort_names] 354 | 355 | if (length(unlist(spl)) != length(RESP)) { 356 | 357 | stop(simpleError("the length of the splits are not equal with the length of the response")) 358 | } 359 | 360 | spl 361 | } 362 | 363 | 364 | 365 | 
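# Illustrative (commented-out) usage of 'switch.ops', which 'FUN_kernels' relies on when
# combining kernels, e.g. for a weights_function such as 'tricube_gaussian_MULT':
#
#   m1 = matrix(runif(9), nrow = 3); m2 = matrix(runif(9), nrow = 3)
#
#   switch.ops(list(m1, m2), MODE = 'ADD')     # element-wise sum of the two matrices
#   switch.ops(list(m1, m2), MODE = 'MULT')    # element-wise product of the two matrices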
#================================================================================================================================================================ 366 | -------------------------------------------------------------------------------- /vignettes/binary_classification_using_the_ionosphere_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "binary classification using the ionosphere data" 3 | author: "Lampros Mouselimis" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{binary classification using the ionosphere data} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | 13 | The following examples illustrate the functionality of the KernelKnn package for **classification** tasks. I'll make use of the *ionosphere* data set, 14 | 15 | 16 | 17 | ```{r, eval=T} 18 | 19 | data(ionosphere, package = 'KernelKnn') 20 | 21 | apply(ionosphere, 2, function(x) length(unique(x))) 22 | 23 | # the second column will be removed as it has a single unique value 24 | 25 | ionosphere = ionosphere[, -2] 26 | ``` 27 |
28 | 29 | 30 | When using an algorithm where the output depends on distance calculation (as is the case in k-nearest-neighbors) it is recommended to first scale the data, 31 | 32 | 33 | ```{r, eval=T} 34 | # it is recommended to scale the data 35 | 36 | X = scale(ionosphere[, -ncol(ionosphere)]) 37 | y = ionosphere[, ncol(ionosphere)] 38 | ``` 39 |
40 | 41 | **important note** : In classification, both functions *KernelKnn* and *KernelKnnCV* accept a numeric vector as a response variable (here y) and the unique values of the labels should begin from 1. This is important otherwise the internal functions do not work. Furthermore, both functions (by default) return predictions in form of probabilities, which can be converted to labels by using either a threshold (if binary classification) or the maximum value of each column (if multiclass classification). 42 | 43 | 44 | ```{r, eval=T} 45 | 46 | # labels should be numeric and begin from 1:Inf 47 | 48 | y = c(1:length(unique(y)))[ match(ionosphere$class, sort(unique(ionosphere$class))) ] 49 | 50 | # random split of data in train and test 51 | 52 | spl_train = sample(1:length(y), round(length(y) * 0.75)) 53 | spl_test = setdiff(1:length(y), spl_train) 54 | str(spl_train) 55 | str(spl_test) 56 | 57 | 58 | # evaluation metric 59 | 60 | acc = function (y_true, preds) { 61 | 62 | out = table(y_true, max.col(preds, ties.method = "random")) 63 | 64 | acc = sum(diag(out))/sum(out) 65 | 66 | acc 67 | } 68 | 69 | ``` 70 | 71 | 72 | ## The KernelKnn function 73 | 74 | The KernelKnn function takes a number of arguments. To read details for each one of the arguments type ?KernelKnn::KernelKnn in the console. 75 | 76 | A simple k-nearest-neighbors can be run with weights_function = NULL and the parameter 'regression' should be set to FALSE. In classification the *Levels* parameter takes the unique values of the response variable, 77 | 78 | ```{r, eval=T, warning = FALSE, message = FALSE} 79 | 80 | library(KernelKnn) 81 | 82 | preds_TEST = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5 , 83 | 84 | method = 'euclidean', weights_function = NULL, regression = F, 85 | 86 | Levels = unique(y)) 87 | head(preds_TEST) 88 | 89 | ``` 90 |
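As mentioned in the note above, these predictions are class probabilities. A minimal sketch (not evaluated) of converting them to hard labels,

```{r, eval = F}

# multiclass (or binary) : pick the most probable class per row

pred_labels = max.col(preds_TEST, ties.method = "random")

# binary alternative : threshold the probability of the second class

pred_labels_binary = ifelse(preds_TEST[, 2] > 0.5, 2, 1)
```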
91 | There are two ways to use a kernel in the KernelKnn function. The **first option** is to choose one of the existing kernels (*uniform*, *triangular*, *epanechnikov*, *biweight*, *triweight*, *tricube*, *gaussian*, *cosine*, *logistic*, *silverman*, *inverse*, *gaussianSimple*, *exponential*). Here, I use the *canberra* metric and the *tricube* kernel because they give optimal results (according to my RandomSearchR package), 92 | 93 | 94 | ```{r, eval=T} 95 | 96 | 97 | preds_TEST_tric = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10 , 98 | 99 | method = 'canberra', weights_function = 'tricube', regression = F, 100 | 101 | Levels = unique(y)) 102 | head(preds_TEST_tric) 103 | 104 | ``` 105 | 106 |
107 | The **second option** is to give a self-defined kernel function. Here, I'll pick the density function of the normal distribution with mean = 0.0 and standard deviation = 1.0 (the data are scaled to have mean zero and unit variance), 108 | 109 | 110 | ```{r, eval=T} 111 | 112 | 113 | norm_kernel = function(W) { 114 | 115 | W = dnorm(W, mean = 0, sd = 1.0) 116 | 117 | W = W / rowSums(W) 118 | 119 | return(W) 120 | } 121 | 122 | 123 | preds_TEST_norm = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10 , 124 | 125 | method = 'canberra', weights_function = norm_kernel, regression = F, 126 | 127 | Levels = unique(y)) 128 | head(preds_TEST_norm) 129 | 130 | ``` 131 |
132 | 133 | The computations can be sped up by using the parameter **threads** (multiple cores can be run in parallel). There is also the option to exclude **extrema** (minimum and maximum distances) during the calculation of the k-nearest-neighbor distances using extrema = TRUE. The *bandwidth* of the existing kernels can be tuned using the **h** parameter. 134 |
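For instance, the bandwidth can be varied for one of the existing kernels as follows (a minimal sketch, not evaluated; the h values are illustrative),

```{r, eval = F}

for (bw in c(0.5, 1.0, 2.0)) {

  pr = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10,
                 h = bw, method = 'canberra', weights_function = 'tricube',
                 regression = F, Levels = unique(y))

  cat('h =', bw, 'accuracy =', acc(y[spl_test], pr), '\n')
}
```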
135 | 136 | K-nearest-neighbor calculations in the KernelKnn function can be accomplished using the following distance metrics : *euclidean*, *manhattan*, *chebyshev*, *canberra*, *braycurtis*, *minkowski* (by default the order 'p' of the minkowski parameter equals k), *hamming*, *mahalanobis*, *pearson_correlation*, *simple_matching_coefficient*, *jaccard_coefficient* and *Rao_coefficient*. The last four are similarity measures and are appropriate for binary data [0,1]. 137 |
138 | 139 | I employed my RandomSearchR package to find the optimal parameters for the KernelKnn function and the following two pairs of parameters give an optimal accuracy, 140 |

141 | 142 | ```{r, eval = T, echo = F} 143 | 144 | knitr::kable(data.frame(k = c(10,9), method = c('canberra', 'canberra'), kernel = c('tricube', 'epanechnikov'))) 145 | ``` 146 | 147 | 148 | ## The KernelKnnCV function 149 | 150 | I'll use the *KernelKnnCV* function to calculate the accuracy using 5-fold cross-validation for the previously mentioned parameter pairs, 151 | 152 | 153 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 154 | 155 | fit_cv_pair1 = KernelKnnCV(X, y, k = 10 , folds = 5, method = 'canberra', 156 | 157 | weights_function = 'tricube', regression = F, 158 | 159 | Levels = unique(y), threads = 5, seed_num = 5) 160 | ``` 161 | 162 | ```{r, eval=T} 163 | str(fit_cv_pair1) 164 | ``` 165 | 166 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 167 | fit_cv_pair2 = KernelKnnCV(X, y, k = 9 , folds = 5, method = 'canberra', 168 | 169 | weights_function = 'epanechnikov', regression = F, 170 | 171 | Levels = unique(y), threads = 5, seed_num = 5) 172 | ``` 173 | 174 | 175 | ```{r, eval=T} 176 | str(fit_cv_pair2) 177 | 178 | ``` 179 |
180 | 181 | 182 | Each cross-validated object returns a list of length 2 ( the first sublist includes the predictions for each fold whereas the second gives the indices of the folds) 183 | 184 | 185 | ```{r, eval=T} 186 | acc_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds), 187 | 188 | function(x) acc(y[fit_cv_pair1$folds[[x]]], 189 | 190 | fit_cv_pair1$preds[[x]]))) 191 | acc_pair1 192 | 193 | cat('accuracy for params_pair1 is :', mean(acc_pair1), '\n') 194 | 195 | acc_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds), 196 | 197 | function(x) acc(y[fit_cv_pair2$folds[[x]]], 198 | 199 | fit_cv_pair2$preds[[x]]))) 200 | acc_pair2 201 | 202 | cat('accuracy for params_pair2 is :', mean(acc_pair2), '\n') 203 | 204 | ``` 205 |
206 | 207 | 208 | ## Adding or multiplying kernels 209 | 210 | In the KernelKnn package there is also the option to **combine kernels** (adding or multiplying) from the existing ones. For instance, if I want to multiply the *tricube* with the *gaussian* kernel, then I'll give the following character string to the weights_function, *"tricube_gaussian_MULT"*. On the other hand, if I want to add the same kernels, then the weights_function will be *"tricube_gaussian_ADD"*. I experimented with my RandomSearchR package combining the different kernels and the following two parameter settings gave optimal results, 211 | 212 |
213 | 214 | ```{r, eval = T, echo = F} 215 | 216 | knitr::kable(data.frame(k = c(16,5), method = c('canberra', 'canberra'), kernel = c('biweight_triweight_gaussian_MULT', 'triangular_triweight_MULT'))) 217 | ``` 218 | 219 |
220 | 221 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 222 | 223 | fit_cv_pair1 = KernelKnnCV(X, y, k = 16, folds = 5, method = 'canberra', 224 | 225 | weights_function = 'biweight_triweight_gaussian_MULT', 226 | 227 | regression = F, Levels = unique(y), threads = 5, 228 | 229 | seed_num = 5) 230 | ``` 231 | 232 | ```{r, eval=T} 233 | str(fit_cv_pair1) 234 | ``` 235 | 236 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 237 | 238 | fit_cv_pair2 = KernelKnnCV(X, y, k = 5, folds = 5, method = 'canberra', 239 | 240 | weights_function = 'triangular_triweight_MULT', 241 | 242 | regression = F, Levels = unique(y), threads = 5, 243 | 244 | seed_num = 5) 245 | 246 | ``` 247 | 248 | ```{r, eval=T} 249 | str(fit_cv_pair2) 250 | 251 | ``` 252 |
253 | 254 | ```{r, eval=T} 255 | acc_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds), 256 | 257 | function(x) acc(y[fit_cv_pair1$folds[[x]]], 258 | 259 | fit_cv_pair1$preds[[x]]))) 260 | acc_pair1 261 | 262 | cat('accuracy for params_pair1 is :', mean(acc_pair1), '\n') 263 | 264 | acc_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds), 265 | 266 | function(x) acc(y[fit_cv_pair2$folds[[x]]], 267 | 268 | fit_cv_pair2$preds[[x]]))) 269 | acc_pair2 270 | 271 | cat('accuracy for params_pair2 is :', mean(acc_pair2), '\n') 272 | 273 | ``` 274 |
275 | 276 | -------------------------------------------------------------------------------- /R/kernelknn.R: -------------------------------------------------------------------------------- 1 | 2 | #' kernel k-nearest-neighbors 3 | #' 4 | #' This function utilizes kernel k nearest neighbors to predict new observations 5 | #' 6 | #' @param data a data frame or matrix 7 | #' @param TEST_data a data frame or matrix (it can be also NULL) 8 | #' @param y a numeric vector (in classification the labels must be numeric from 1:Inf) 9 | #' @param k an integer specifying the k-nearest-neighbors 10 | #' @param h the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0) 11 | #' @param method a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient' 12 | #' @param weights_function there are various ways of specifying the kernel function. See the details section. 13 | #' @param regression a boolean (TRUE,FALSE) specifying if regression or classification should be performed 14 | #' @param transf_categ_cols a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables 15 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 16 | #' @param extrema if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal) 17 | #' @param Levels a numeric vector. In case of classification the unique levels of the response variable are necessary 18 | #' @param p a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k' 19 | #' @return a vector (if regression is TRUE), or a data frame with class probabilities (if regression is FALSE) 20 | #' @author Lampros Mouselimis 21 | #' @details 22 | #' This function takes a number of arguments and it returns the predicted values. If TEST_data is NULL then the predictions for the train data will be returned, whereas if TEST_data is not NULL then the predictions for the TEST_data will be returned. 23 | #' There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 
3rd option : a user defined kernel function 24 | #' @export 25 | #' @examples 26 | #' 27 | #' data(Boston) 28 | #' 29 | #' X = Boston[, -ncol(Boston)] 30 | #' y = Boston[, ncol(Boston)] 31 | #' 32 | #' out = KernelKnn(X, TEST_data = NULL, y, k = 5, method = 'euclidean', regression = TRUE) 33 | #' 34 | 35 | 36 | KernelKnn = function(data, TEST_data = NULL, y, k = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL, p = k) { 37 | 38 | categorical_data_present = sapply(data, function(x) is.factor(x) || is.character(x)) 39 | 40 | if (sum(categorical_data_present) && !transf_categ_cols) stop('Categorical columns present in data. These should be either converted to numeric or the function should be run with transf_categ_cols = TRUE') 41 | if (!is.numeric(k) || is.null(k) || (k >= nrow(data)) || k < 1) stop('k must be of type integer, greater than 0 and less than nrow(train)') 42 | if (abs(k - round(k)) > 0) { 43 | k = round(k) 44 | warning('k is float and will be rounded to : ', call. = F, expr = k)} 45 | if (h == 0) stop('h can be any number except for 0') 46 | if (!is.character(method) || is.null(method) || !method %in% c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 47 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient')) 48 | stop("method must be of type character and one of 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 49 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'") 50 | if (is.null(y)) stop('the response variable should be numeric') 51 | if (is.integer(y)) y = as.numeric(y) 52 | if (!is.numeric(y)) stop('in both regression and classification the response variable should be numeric or integer and in classification it should start from 1') 53 | if (!regression && is.null(Levels)) stop('In classification give the unique values of y in form of a vector') 54 | if (!regression && any(unique(y) < 1)) stop('the response variable values should begin from 1') 55 | if (!regression) { 56 | if (!all(Levels %in% unique(y))) stop("The specified 'Levels' must match the unique 'y' labels!") 57 | } 58 | if (any(is.na(data)) || any(is.na(y))) stop('the data or the response variable includes missing values') 59 | if (!is.null(TEST_data) && any(is.na(TEST_data))) stop('the TEST_data includes missing values') 60 | if (length(y) != nrow(data)) stop('the size of the data and y differ') 61 | if (extrema && k < 4) stop('k must be greater than 3 if extrema = TRUE') 62 | if (method %in% c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient') && !sum(apply(data, 2, function(x) all(unique(x) %in% c(0,1)))) == ncol(data)) 63 | stop("methods : 'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' require the data to be in binary form e.g 0,1") 64 | if (!is.null(TEST_data) && method %in% c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient') && !sum(apply(TEST_data, 2, function(x) all(unique(x) %in% c(0,1)))) == ncol(TEST_data)) 65 | stop("methods : 'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' require the TEST_data to be in binary form e.g 0,1") 66 | 67 | if (!is.null(TEST_data) && ncol(data) != ncol(TEST_data)) stop('the number of columns in train and test data differ') 68 | 69 | 
#---------------------------------------------------------------------------------------------------- 70 | 71 | # check if any of the variables is categorical, if TRUE then convert categorical predictors to either dummy variables or numeric variables [ depending on the number of levels ] 72 | 73 | if (transf_categ_cols) { 74 | 75 | if (is.null(TEST_data)) { 76 | 77 | data = func_categorical_preds(data)} 78 | 79 | else { 80 | 81 | tmp_dat = func_categorical_preds(rbind(data, TEST_data)) 82 | 83 | data = tmp_dat[1:nrow(data), ] 84 | 85 | TEST_data = tmp_dat[(nrow(data) + 1):nrow(tmp_dat), ] 86 | } 87 | } 88 | #---------------------------------------------------------------------------------------------------- 89 | 90 | if (is.null(TEST_data)) { 91 | 92 | mat = matrix(, nrow = 0, ncol = 0) 93 | 94 | if (!is.matrix(data)) data = as.matrix(data) 95 | 96 | if (extrema) { 97 | 98 | k = k + 2 # add two values (for min-max) 99 | } 100 | 101 | index_train = knn_index_dist_rcpp(data, mat, k = k, method = method, threads = threads, p = p) 102 | 103 | if (extrema) { 104 | 105 | index_train$train_knn_idx = index_train$train_knn_idx[, -c(1,k)] # remove min, max (matrices already sorted) 106 | index_train$train_knn_dist = index_train$train_knn_dist[, -c(1,k)] # remove min, max (matrices already sorted) 107 | 108 | k = k - 2 # adjust k to previous value 109 | } 110 | 111 | out_train = matrix(y[index_train$train_knn_idx], nrow = nrow(data), ncol = k) 112 | 113 | if (!regression) { 114 | 115 | if (is.null(weights_function)) { 116 | 117 | out = func_tbl_dist(out_train, sort(Levels)) 118 | } 119 | else if (is.function(weights_function)) { 120 | 121 | W = FUNCTION_weights(index_train$train_knn_dist, weights_function) 122 | 123 | out = func_tbl(out_train, W, sort(Levels)) 124 | } 125 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 126 | 127 | W = FUN_kernels(weights_function, index_train$train_knn_dist, h) 128 | 129 | out = func_tbl(out_train, W, sort(Levels)) 130 | } 131 | else { 132 | stop('false input for the weights_function argument') 133 | } 134 | 135 | colnames(out) = paste0('class_', sort(Levels)) 136 | } 137 | 138 | else { 139 | 140 | if (is.null(weights_function)) { 141 | 142 | out = rowMeans(out_train) 143 | } 144 | else if (is.function(weights_function)) { 145 | 146 | W = FUNCTION_weights(index_train$train_knn_dist, weights_function) 147 | 148 | out = rowSums(out_train * W) 149 | } 150 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 151 | 152 | W = FUN_kernels(weights_function, index_train$train_knn_dist, h) 153 | 154 | out = rowSums(out_train * W) 155 | } 156 | else { 157 | 158 | stop('false input for the weights_function argument') 159 | } 160 | } 161 | 162 | return(out) 163 | } 164 | 165 | else { 166 | 167 | if (!is.matrix(data)) data = as.matrix(data) 168 | if (!is.matrix(TEST_data)) TEST_data = as.matrix(TEST_data) 169 | 170 | if (extrema) { 171 | 172 | k = k + 2 # add two values (for min-max) 173 | } 174 | 175 | index = knn_index_dist_rcpp(data, TEST_data, k = k, method = method, threads = threads, p = p) 176 | 177 | if (extrema) { 178 | 179 | index$test_knn_idx = index$test_knn_idx[, -c(1,k)] # remove min, max (matrices already sorted) 180 | index$test_knn_dist = index$test_knn_dist[, -c(1,k)] # remove min, max (matrices already sorted) 181 | 182 | k = k - 2 # adjust k to previous value 183 | } 184 | 185 | out_test = matrix(y[index$test_knn_idx], ncol = k) 186 | 187 | if (!regression) { 188 | 189 | if (is.null(weights_function)) { 190 | 
191 | out_te = func_tbl_dist(out_test, sort(Levels)) 192 | } 193 | else if (is.function(weights_function)) { 194 | 195 | W_te = FUNCTION_weights(index$test_knn_dist, weights_function) 196 | 197 | out_te = func_tbl(out_test, W_te, sort(Levels)) 198 | } 199 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 200 | 201 | W_te = FUN_kernels(weights_function, index$test_knn_dist, h) 202 | 203 | out_te = func_tbl(out_test, W_te, sort(Levels)) 204 | } 205 | else { 206 | 207 | stop('false input for the weights_function argument') 208 | } 209 | 210 | colnames(out_te) = paste0('class_', sort(Levels)) 211 | } 212 | 213 | else { 214 | 215 | if (is.null(weights_function)) { 216 | 217 | out_te = rowMeans(out_test) 218 | } 219 | else if (is.function(weights_function)) { 220 | 221 | W_te = FUNCTION_weights(index$test_knn_dist, weights_function) 222 | 223 | out_te = rowSums(out_test * W_te) 224 | } 225 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 226 | 227 | W_te = FUN_kernels(weights_function, index$test_knn_dist, h) 228 | 229 | out_te = rowSums(out_test * W_te) 230 | } 231 | else { 232 | stop('false input for the weights_function argument') 233 | } 234 | } 235 | 236 | return(out_te) 237 | } 238 | } 239 | 240 | 241 | #================================================================================================================================================================================ 242 | -------------------------------------------------------------------------------- /tests/testthat/test-knn_index_dist.R: -------------------------------------------------------------------------------- 1 | #============================================================================================================================================================== 2 | 3 | context("Knn index distance") 4 | 5 | #================= 6 | # Error handling 7 | #================= 8 | 9 | 10 | testthat::test_that("it returns an error if a factor variable is present in the data and the transf_categ_cols = FALSE", { 11 | 12 | tmp_dat = xtr 13 | 14 | tmp_dat$rad = as.factor(tmp_dat$rad) 15 | 16 | testthat::expect_error(knn.index.dist(tmp_dat, TEST_data = xte, k = 5, method = 'euclidean', transf_categ_cols = F, threads = 1)) 17 | }) 18 | 19 | testthat::test_that("it returns an error if a character variable is present in the data and the transf_categ_cols = FALSE", { 20 | 21 | tmp_dat = xtr 22 | 23 | tmp_dat$rad = as.character(tmp_dat$rad) 24 | 25 | testthat::expect_error(knn.index.dist(tmp_dat, TEST_data = xte, k = 5, method = 'euclidean', transf_categ_cols = F, threads = 1)) 26 | }) 27 | 28 | 29 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 30 | 31 | testthat::expect_error(knn.index.dist(xtr, TEST_data = NULL, k = 5, method = 'simple_matching_coefficient', transf_categ_cols = F, threads = 1)) 32 | }) 33 | 34 | 35 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 36 | 37 | testthat::expect_error(knn.index.dist(xtr, TEST_data = NULL, k = 5, method = 'jaccard_coefficient', transf_categ_cols = F, threads = 1)) 38 | }) 39 | 40 | 41 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 42 | 43 | testthat::expect_error(knn.index.dist(xtr, TEST_data = NULL, k = 5, 
method = 'Rao_coefficient', transf_categ_cols = F, threads = 1)) 44 | }) 45 | 46 | 47 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 48 | 49 | testthat::expect_error(knn.index.dist(xtr, TEST_data = xte, k = 5, method = 'simple_matching_coefficient', transf_categ_cols = F, threads = 1)) 50 | }) 51 | 52 | 53 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 54 | 55 | testthat::expect_error(knn.index.dist(xtr, TEST_data = xte, k = 5, method = 'jaccard_coefficient', transf_categ_cols = F, threads = 1)) 56 | }) 57 | 58 | 59 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 60 | 61 | testthat::expect_error(knn.index.dist(xtr, TEST_data = xte, k = 5, method = 'Rao_coefficient', transf_categ_cols = F, threads = 1)) 62 | }) 63 | 64 | 65 | testthat::test_that("it returns an error if k is NULL", { 66 | 67 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = NULL, method = 'euclidean', transf_categ_cols = F, threads = 1)) 68 | }) 69 | 70 | 71 | testthat::test_that("it returns an error if k is a character", { 72 | 73 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = 'invalid', method = 'euclidean', transf_categ_cols = F, threads = 1) ) 74 | }) 75 | 76 | 77 | testthat::test_that("it returns an error if k is greater or equal to the number of rows of the train data", { 78 | 79 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = nrow(xtr), method = 'euclidean', transf_categ_cols = F, threads = 1) ) 80 | }) 81 | 82 | 83 | testthat::test_that("it returns an error if k is less than 1", { 84 | 85 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = -1, method = 'euclidean', transf_categ_cols = F, threads = 1) ) 86 | }) 87 | 88 | 89 | testthat::test_that("it returns an error if the method is NULL", { 90 | 91 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = 5, method = NULL, transf_categ_cols = F, threads = 1) ) 92 | }) 93 | 94 | 95 | testthat::test_that("it returns an error if the method is not a character", { 96 | 97 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = 5 , method = 1, transf_categ_cols = F, threads = 1) ) 98 | }) 99 | 100 | 101 | testthat::test_that("it returns an error if the method is a character, but not one of the valid names", { 102 | 103 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = 5 , method = 'invalid', transf_categ_cols = F, threads = 1) ) 104 | }) 105 | 106 | 107 | testthat::test_that("it returns an error if missing values are present in the data", { 108 | 109 | tmp_dat = xtr 110 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 111 | 112 | testthat::expect_error( knn.index.dist(tmp_dat, TEST_data = xte, k = 5 , method = 'euclidean', transf_categ_cols = F, threads = 1) ) 113 | }) 114 | 115 | 116 | testthat::test_that("it returns an error if missing values are present in the TEST data", { 117 | 118 | tmp_dat = xte 119 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 120 | 121 | testthat::expect_error( knn.index.dist(xtr, TEST_data = tmp_dat, k = 5 , method = 'euclidean', transf_categ_cols = F, threads = 1) ) 122 | }) 123 | 124 | 125 | testthat::test_that("if the number of columns in train and test data differ it returns an error", 
{ 126 | 127 | tmp_xte = xte[, -ncol(xte)] 128 | 129 | testthat::expect_error( knn.index.dist(xtr, TEST_data = tmp_xte, k = 4 , method = 'euclidean', transf_categ_cols = F, threads = 1) ) 130 | }) 131 | 132 | 133 | # testing of knn.index.dist 134 | 135 | 136 | testthat::test_that("if transf_categ_cols = TRUE and TEST_data = NULL the knn.index.dist returns a list with number of rows in each sublist equal to the number of rows in train data", { 137 | 138 | tmp_xtr = xtr 139 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 140 | 141 | res = knn.index.dist(tmp_xtr, TEST_data = NULL, k = 5 , method = 'euclidean', transf_categ_cols = T, threads = 1) 142 | 143 | testthat::expect_true(length(res) == 2 && mean(unlist(lapply(res, nrow))) == nrow(xtr)) 144 | }) 145 | 146 | 147 | testthat::test_that("if transf_categ_cols = TRUE and TEST_data is NOT NULL the knn.index.dist returns a list with number of rows in each sublist equal to the number of rows in the TEST data", { 148 | 149 | tmp_xtr = xtr 150 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 151 | 152 | tmp_xte = xte 153 | tmp_xte$rad = as.factor(tmp_xte$rad) 154 | 155 | res = knn.index.dist(tmp_xtr, TEST_data = tmp_xte, k = 5 , method = 'euclidean', transf_categ_cols = T, threads = 1) 156 | 157 | testthat::expect_true(length(res) == 2 && mean(unlist(lapply(res, nrow))) == nrow(xte)) 158 | }) 159 | 160 | 161 | 162 | testthat::test_that("if the TEST data is NULL for all possible combinations [ when transf_categ_cols = T ] the knn.index.dist returns a list of length 2 with 163 | 164 | number of rows in each sublist equal to the number of rows in the train data", { 165 | 166 | tmp_xtr = xtr 167 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 168 | 169 | lst = list() 170 | 171 | for (k in 4:6) { 172 | 173 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 174 | 175 | tmp_res = knn.index.dist(tmp_xtr, TEST_data = NULL, k = k , method = metric, transf_categ_cols = T, threads = 1) 176 | 177 | lst = lappend(lst, length(tmp_res) == 2 && mean(unlist(lapply(tmp_res, nrow))) == nrow(xtr)) 178 | } 179 | } 180 | 181 | testthat::expect_true(all(unlist(lst))) 182 | }) 183 | 184 | 185 | 186 | testthat::test_that("if the TEST data is NULL for all possible combinations [ when transf_categ_cols = F ] the knn.index.dist returns a list of length 2 with 187 | 188 | number of rows in each sublist equal to the number of rows in the train data", { 189 | 190 | lst = list() 191 | 192 | for (k in 4:6) { 193 | 194 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 195 | 196 | tmp_res = knn.index.dist(xtr, TEST_data = NULL, k = k , method = metric, transf_categ_cols = F, threads = 1) 197 | 198 | lst = lappend(lst, length(tmp_res) == 2 && mean(unlist(lapply(tmp_res, nrow))) == nrow(xtr)) 199 | } 200 | } 201 | 202 | testthat::expect_true(all(unlist(lst))) 203 | }) 204 | 205 | 206 | 207 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ when transf_categ_cols = T ] the knn.index.dist returns a list of length 2 with 208 | 209 | number of rows in each sublist equal to the number of rows in the TEST data", { 210 | 211 | tmp_xtr = xtr 212 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 213 | 214 | tmp_xte = xte 215 | tmp_xte$rad = as.factor(tmp_xte$rad) 216 | 217 | lst = list() 218 | 219 | for (k in 4:6) { 220 | 221 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 222 | 223 | tmp_res = knn.index.dist(tmp_xtr, TEST_data = tmp_xte, k = k , method = metric, transf_categ_cols = T, threads = 1) 224 | 225 | lst = lappend(lst, length(tmp_res) == 2 && 
mean(unlist(lapply(tmp_res, nrow))) == nrow(tmp_xte)) 226 | } 227 | } 228 | 229 | testthat::expect_true(all(unlist(lst))) 230 | }) 231 | 232 | 233 | 234 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ when transf_categ_cols = F ] the knn.index.dist returns a list of length 2 with 235 | 236 | number of rows in each sublist equal to the number of rows in the TEST data", { 237 | 238 | lst = list() 239 | 240 | for (k in 4:6) { 241 | 242 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 243 | 244 | tmp_res = knn.index.dist(xtr, TEST_data = xte, k = k , method = metric, transf_categ_cols = F, threads = 1) 245 | 246 | lst = lappend(lst, length(tmp_res) == 2 && mean(unlist(lapply(tmp_res, nrow))) == nrow(xte)) 247 | } 248 | } 249 | 250 | testthat::expect_true(all(unlist(lst))) 251 | }) 252 | 253 | 254 | 255 | testthat::test_that("the similarity measures 'simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient' and 'pearson_correlation' return correct output 256 | 257 | in case of binary data when TEST data is not NULL", { 258 | 259 | 260 | dat = do.call(cbind, lapply(1:10, function(x) sample(0:1, 100, replace = T))) 261 | TES = do.call(cbind, lapply(1:10, function(x) sample(0:1, 50, replace = T))) 262 | 263 | lst = count = list() 264 | 265 | for (k in 4:6) { 266 | 267 | for (metric in c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient', 'pearson_correlation')) { 268 | 269 | tmp_lst = knn.index.dist(dat, TEST_data = TES, k = k , method = metric, transf_categ_cols = F, threads = 1) 270 | 271 | lst = lappend(lst, tmp_lst) 272 | 273 | count = lappend(count, ncol(tmp_lst$test_knn_idx) == k && ncol(tmp_lst$test_knn_dist) == k) 274 | } 275 | } 276 | 277 | res = unlist(lapply(lst, function(x) nrow(x$test_knn_idx) == nrow(TES) && nrow(x$test_knn_dist) == nrow(TES))) 278 | 279 | testthat::expect_true(all(res) && all(unlist(count))) 280 | }) 281 | 282 | 283 | testthat::test_that("the 'p' parameter when method is 'minkowski' returns the expected output", { 284 | 285 | k = 2 286 | res_wo = knn.index.dist(data = X, k = k , method = 'minkowski') # without specifying the 'p' parameter 287 | res_w = knn.index.dist(data = X, k = k , method = 'minkowski', p = k) # by specifying the 'p' parameter 288 | res_dif = knn.index.dist(data = X, k = k , method = 'minkowski', p = 1) # 'p' is set to 1 289 | 290 | is_identical = identical(res_wo, res_w) 291 | is_not_identical = identical(res_wo, res_dif) 292 | 293 | testthat::expect_true(is_identical & (!is_not_identical)) 294 | }) 295 | 296 | 297 | #============================================================================================================================================================== 298 | -------------------------------------------------------------------------------- /vignettes/image_classification_using_MNIST_CIFAR_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Image classification of the MNIST and CIFAR-10 data using KernelKnn and HOG (histogram of oriented gradients)" 3 | author: "Lampros Mouselimis" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Image classification of the MNIST and CIFAR-10 data using KernelKnn and HOG (histogram of oriented gradients)} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | 13 | In this vignette I'll illustrate how to increase the accuracy on the MNIST (to approx. 
98.4%) and CIFAR-10 data (to approx. 58.3%) using the KernelKnn package and HOG (histogram of oriented gradients). 14 |
15 | 16 | ### dependencies 17 | 18 | * The MNIST and CIFAR-10 data sets can be downloaded from Github using **system("wget https://raw.githubusercontent.com/mlampros/DataSets/master/mnist.zip")** and **system("wget https://raw.githubusercontent.com/mlampros/DataSets/master/cifar_10.zip")** 19 | * the **irlba** package, which is needed for comparison purposes, can be installed from CRAN directly 20 | * the *HOG_apply* function is part of the **OpenImageR** package, which can be installed from CRAN as well. 21 | 22 | 23 | ### MNIST data set 24 | 25 | The MNIST data set of handwritten digits consists of 70,000 examples in total, and each row of the matrix corresponds to a 28 x 28 image. The unique values of the response variable *y* range from 0 to 9. More information about the data can be found in the *DataSets* repository (the folder also includes an Rmarkdown file). 26 | 27 | 28 | ```{r, eval = F, echo = T, warning = F, message = F, cache = T} 29 | 30 | # using system('wget..') on a linux OS 31 | 32 | system("wget https://raw.githubusercontent.com/mlampros/DataSets/master/mnist.zip") 33 | 34 | mnist <- read.table(unz("mnist.zip", "mnist.csv"), nrows = 70000, header = T, 35 | 36 | quote = "\"", sep = ",") 37 | 38 | ``` 39 | 40 | 41 | ```{r, eval = F, cache = T} 42 | X = mnist[, -ncol(mnist)] 43 | dim(X) 44 | 45 | ## [1] 70000 784 46 | 47 | # the KernelKnn function requires that the labels are numeric and start from 1 : Inf 48 | 49 | y = mnist[, ncol(mnist)] + 1 50 | table(y) 51 | 52 | ## y 53 | ## 1 2 3 4 5 6 7 8 9 10 54 | ## 6903 7877 6990 7141 6824 6313 6876 7293 6825 6958 55 | 56 | ``` 57 |
58 | 59 | K nearest neighbors do not perform well in high dimensions due to the *curse of dimensionality* (k observations that are nearest to a given test observation x1 may be very far away from x1 in p-dimensional space when p is large [ An introduction to statistical learning, James/Witten/Hastie/Tibshirani, pages 108-109 ]), leading to a very poor k-nearest-neighbors fit. One option to overcome this problem would be to use truncated svd (irlba package) to reduce the dimensions of the data. A second option, which is appropriate in case of images, would be to use image descriptors. In this vignette, I'll compare those two approaches.

60 | 61 | ##### KernelKnnCV using truncated svd 62 | 63 | 64 | I experimented with different settings and the following parameters gave the best results, 65 |

66 | 67 | ```{r, eval = T, echo = F} 68 | 69 | knitr::kable(data.frame(irlba_singular_vectors = 40, k = 8, method = 'braycurtis', kernel = 'biweight_tricube_MULT'), align = 'l') 70 | ``` 71 |
72 | 73 | ```{r, eval = F, cache = T} 74 | 75 | library(irlba) 76 | 77 | svd_irlb = irlba(as.matrix(X), nv = 40, nu = 40, verbose = F) # irlba truncated svd 78 | 79 | new_x = as.matrix(X) %*% svd_irlb$v # new_x using the 40 right singular vectors 80 | 81 | ``` 82 |
83 | 84 | ```{r, eval = F, cache = T, warning = FALSE, message = FALSE, results = 'hide'} 85 | 86 | library(KernelKnn) 87 | 88 | fit = KernelKnnCV(as.matrix(new_x), y, k = 8, folds = 4, method = 'braycurtis', 89 | 90 | weights_function = 'biweight_tricube_MULT', regression = F, 91 | 92 | threads = 6, Levels = sort(unique(y))) 93 | 94 | 95 | # str(fit) 96 | 97 | 98 | # evaluation metric 99 | 100 | acc = function (y_true, preds) { 101 | 102 | out = table(y_true, max.col(preds, ties.method = "random")) 103 | 104 | acc = sum(diag(out))/sum(out) 105 | 106 | acc 107 | } 108 | 109 | ``` 110 |
111 | 112 | ```{r, eval = F, cache = F} 113 | 114 | acc_fit = unlist(lapply(1:length(fit$preds), 115 | 116 | function(x) acc(y[fit$folds[[x]]], 117 | 118 | fit$preds[[x]]))) 119 | acc_fit 120 | 121 | ## [1] 0.9742857 0.9749143 0.9761143 0.9741143 122 | 123 | cat('mean accuracy using cross-validation :', mean(acc_fit), '\n') 124 | 125 | ## mean accuracy using cross-validation : 0.9748571 126 | 127 | ``` 128 |
129 | 130 | Utilizing truncated svd, a 4-fold cross-validation KernelKnn model gives a 97.48% accuracy. 131 |

132 | 133 | ##### KernelKnnCV and HOG (histogram of oriented gradients) 134 | 135 | In this chunk of code, besides KernelKnnCV I'll also use HOG. The histogram of oriented gradients (HOG) is a feature descriptor used in computer vision and image processing for the purpose of object detection. The technique counts occurrences of gradient orientation in localized portions of an image. This method is similar to that of edge orientation histograms, scale-invariant feature transform descriptors, and shape contexts, but differs in that it is computed on a dense grid of uniformly spaced cells and uses overlapping local contrast normalization for improved accuracy (Wikipedia). 136 |
137 | 138 | ```{r, eval = F, cache = T} 139 | 140 | library(OpenImageR) 141 | 142 | hog = HOG_apply(X, cells = 6, orientations = 9, rows = 28, columns = 28, threads = 6) 143 | 144 | ## 145 | ## time to complete : 1.802997 secs 146 | 147 | dim(hog) 148 | 149 | ## [1] 70000 324 150 | 151 | ``` 152 |
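The 324 columns match the chosen settings : a 6 x 6 grid of cells with 9 orientation bins per cell gives 6 * 6 * 9 = 324 features per image.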
153 | 154 | ```{r, eval = F, cache = T, warning = FALSE, message = FALSE, results = 'hide'} 155 | 156 | fit_hog = KernelKnnCV(hog, y, k = 20, folds = 4, method = 'braycurtis', 157 | 158 | weights_function = 'biweight_tricube_MULT', regression = F, 159 | 160 | threads = 6, Levels = sort(unique(y))) 161 | 162 | 163 | #str(fit_hog) 164 | 165 | ``` 166 |
167 | 168 | ```{r, eval = F, cache = F} 169 | 170 | acc_fit_hog = unlist(lapply(1:length(fit_hog$preds), 171 | 172 | function(x) acc(y[fit_hog$folds[[x]]], 173 | 174 | fit_hog$preds[[x]]))) 175 | acc_fit_hog 176 | 177 | ## [1] 0.9833714 0.9840571 0.9846857 0.9838857 178 | 179 | cat('mean accuracy for hog-features using cross-validation :', mean(acc_fit_hog), '\n') 180 | 181 | ## mean accuracy for hog-features using cross-validation : 0.984 182 | 183 | ``` 184 |
185 | 186 | By changing from the simple svd-features to HOG-features the accuracy of a 4-fold cross-validation model increased from 97.48% to 98.4% (approx. 1% difference). 187 | 188 | 189 | 190 | ### CIFAR-10 data set 191 | 192 | 193 | CIFAR-10 is an established computer-vision dataset used for object recognition. The data I'll use in this example is a subset of the 80 million tiny images dataset and consists of 60,000 32x32 color images containing one of 10 object classes ( 6000 images per class ). Furthermore, the data were converted from RGB to gray, normalized and rounded to 2 decimal places (to reduce the storage size). More information about the data can be found in my *DataSets* repository (I included an Rmarkdown file). 194 |

195 | 196 | I'll build the kernel k-nearest-neighbors models in the same way I've done for the mnist data set and then I'll compare the results. 197 | 198 | 199 | ```{r, eval = F, echo = T, warning = F, message = F, cache = T} 200 | 201 | # using system('wget..') on a linux OS 202 | 203 | system("wget https://raw.githubusercontent.com/mlampros/DataSets/master/cifar_10.zip") 204 | 205 | cifar_10 <- read.table(unz("cifar_10.zip", "cifar_10.csv"), nrows = 60000, header = T, 206 | 207 | quote = "\"", sep = ",") 208 | 209 | ``` 210 | 211 | 212 | ##### KernelKnnCV using truncated svd 213 | 214 | 215 | ```{r, eval = F, cache = T} 216 | X = cifar_10[, -ncol(cifar_10)] 217 | dim(X) 218 | 219 | ## [1] 60000 1024 220 | 221 | # the KernelKnn function requires that the labels are numeric and start from 1 : Inf 222 | 223 | y = cifar_10[, ncol(cifar_10)] 224 | table(y) 225 | 226 | ## y 227 | ## 1 2 3 4 5 6 7 8 9 10 228 | ## 6000 6000 6000 6000 6000 6000 6000 6000 6000 6000 229 | 230 | ``` 231 |
232 | 233 | 234 | The parameter settings are similar to those for the MNIST data, 235 | 236 | ```{r, eval = T, echo = F} 237 | 238 | knitr::kable(data.frame(irlba_singular_vectors = 40, k = 8, method = 'braycurtis', 239 | 240 | kernel = 'biweight_tricube_MULT'), align = 'l') 241 | ``` 242 |
243 | 244 | ```{r, eval = F, cache = T} 245 | 246 | svd_irlb = irlba(as.matrix(X), nv = 40, nu = 40, verbose = F) # irlba truncated svd 247 | 248 | new_x = as.matrix(X) %*% svd_irlb$v # new_x using the 40 right singular vectors 249 | 250 | ``` 251 |
252 | 253 | ```{r, eval = F, cache = T, warning = FALSE, message = FALSE, results = 'hide'} 254 | 255 | fit = KernelKnnCV(as.matrix(new_x), y, k = 8, folds = 4, method = 'braycurtis', 256 | 257 | weights_function = 'biweight_tricube_MULT', regression = F, 258 | 259 | threads = 6, Levels = sort(unique(y))) 260 | 261 | 262 | # str(fit) 263 | 264 | ``` 265 |
266 | 267 | ```{r, eval = F, cache = F} 268 | 269 | acc_fit = unlist(lapply(1:length(fit$preds), 270 | 271 | function(x) acc(y[fit$folds[[x]]], 272 | 273 | fit$preds[[x]]))) 274 | acc_fit 275 | 276 | ## [1] 0.4080667 0.4097333 0.4040000 0.4102667 277 | 278 | cat('mean accuracy using cross-validation :', mean(acc_fit), '\n') 279 | 280 | ## mean accuracy using cross-validation : 0.4080167 281 | 282 | ``` 283 |
284 | 285 | The accuracy of a 4-fold cross-validation model using truncated svd is 40.8%. 286 | 287 | ##### KernelKnnCV using HOG (histogram of oriented gradients) 288 | 289 |
290 | Next, I'll run the KernelKnnCV using the HOG-descriptors, 291 |

292 | 293 | 294 | ```{r, eval = F, cache = T} 295 | 296 | hog = HOG_apply(X, cells = 6, orientations = 9, rows = 32, 297 | 298 | columns = 32, threads = 6) 299 | 300 | ## 301 | ## time to complete : 3.394621 secs 302 | 303 | dim(hog) 304 | 305 | ## [1] 60000 324 306 | 307 | ``` 308 |
309 | 310 | ```{r, eval = F, cache = T, warning = FALSE, message = FALSE, results = 'hide'} 311 | 312 | fit_hog = KernelKnnCV(hog, y, k = 20, folds = 4, method = 'braycurtis', 313 | 314 | weights_function = 'biweight_tricube_MULT', regression = F, 315 | 316 | threads = 6, Levels = sort(unique(y))) 317 | 318 | 319 | # str(fit_hog) 320 | 321 | ``` 322 |
323 | 324 | ```{r, eval = F, cache = F} 325 | 326 | acc_fit_hog = unlist(lapply(1:length(fit_hog$preds), 327 | 328 | function(x) acc(y[fit_hog$folds[[x]]], 329 | 330 | fit_hog$preds[[x]]))) 331 | acc_fit_hog 332 | 333 | ## [1] 0.5807333 0.5884000 0.5777333 0.5861333 334 | 335 | cat('mean accuracy for hog-features using cross-validation :', mean(acc_fit_hog), '\n') 336 | 337 | ## mean accuracy for hog-features using cross-validation : 0.58325 338 | 339 | ``` 340 |
341 | 342 | By using hog-descriptors in a 4-fold cross-validation model the accuracy in the cifar-10 data increases from 40.8% to 58.3% (approx. 17.5% difference). 343 | 344 | 345 | ```{r, eval = F, echo = F} 346 | 347 | # remove cache and mnist.zip once vignettes are built 348 | 349 | # unlink("image_classification_using_MNIST_CIFAR_data_cache", recursive = TRUE) # USE this chunk in case of 'eval = TRUE' 350 | # unlink("mnist.zip", recursive = TRUE) 351 | # unlink("cifar_10.zip", recursive = TRUE) 352 | ``` 353 | -------------------------------------------------------------------------------- /tests/testthat/test-distmat_kernelknn.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | context("kernel knn with distance matrix") 4 | 5 | #================= 6 | # Error handling 7 | #================= 8 | 9 | 10 | testthat::test_that("it returns an error if the input distance object is not of type matrix", { 11 | 12 | tmp_df = as.data.frame(DIST_mat) 13 | 14 | testthat::expect_error(distMat.KernelKnn(tmp_df, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 15 | }) 16 | 17 | 18 | testthat::test_that("it returns an error if the input distance matrix is not square", { 19 | 20 | testthat::expect_error(distMat.KernelKnn(DIST_mat[, -ncol(DIST_mat)], TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 21 | }) 22 | 23 | 24 | testthat::test_that("it returns an error if the diagonal of the distance matrix is other than 0's or NA's", { 25 | 26 | TMP_DIAG = DIST_mat 27 | 28 | diag(TMP_DIAG) = -1 29 | 30 | testthat::expect_error(distMat.KernelKnn(TMP_DIAG, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 31 | }) 32 | 33 | 34 | testthat::test_that("it returns an error if the TEST_indices parameter is not of type numeric or integer", { 35 | 36 | invalid_tst_idx = letters[1:100] 37 | 38 | testthat::expect_error(distMat.KernelKnn(DIST_mat, TEST_indices = invalid_tst_idx, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 39 | }) 40 | 41 | 42 | testthat::test_that("it returns an error if the maximum index of the TEST_indices parameter is greater than the rows of the distance matrix", { 43 | 44 | invalid_tst_idx = 1:(nrow(DIST_mat) + 10) 45 | 46 | invalid_tst_idx = (nrow(DIST_mat)-100):length(invalid_tst_idx) 47 | 48 | testthat::expect_error(distMat.KernelKnn(DIST_mat, TEST_indices = invalid_tst_idx, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 49 | }) 50 | 51 | testthat::test_that("it returns an error if k is NULL", { 52 | 53 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = NULL, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 54 | }) 55 | 56 | 57 | testthat::test_that("it returns an error if k is a character", { 58 | 59 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = "invalid", h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 60 | }) 61 | 62 | 63 | testthat::test_that("it returns an error if k is greater or equal to the number of rows of the distance matrix", { 64 | 65 | testthat::expect_error( 
distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = nrow(DIST_mat), h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 66 | }) 67 | 68 | 69 | testthat::test_that("it returns an error if k is less than 1", { 70 | 71 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = -1, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 72 | }) 73 | 74 | 75 | testthat::test_that("it returns a warning if k is a float", { 76 | 77 | testthat::expect_warning( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 1.5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 78 | }) 79 | 80 | 81 | testthat::test_that("it returns an error if h = 0", { 82 | 83 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 0.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 84 | }) 85 | 86 | 87 | testthat::test_that("it returns an error if y is NULL", { 88 | 89 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y = NULL, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 90 | }) 91 | 92 | 93 | testthat::test_that("it returns an error if y is not numeric", { 94 | 95 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y = list(y), k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 96 | }) 97 | 98 | 99 | testthat::test_that("it returns an error if regression = F and the Levels = NULL", { 100 | 101 | testthat::expect_error( distMat.KernelKnn(DIST_mat_class, TEST_indices = NULL, as.numeric(y1_class_ext), k = 5, h = 1.0, weights_function = NULL, regression = F, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 102 | }) 103 | 104 | 105 | testthat::test_that("it returns an error if regression = F and there are unique labels less than 1", { 106 | 107 | testthat::expect_error( distMat.KernelKnn(DIST_mat_class, TEST_indices = NULL, as.numeric(y1_class_ext) - 1, k = 5, h = 1.0, weights_function = NULL, regression = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class_ext)), minimize = T) ) 108 | }) 109 | 110 | 111 | testthat::test_that("it returns an error if missing values are present in the data", { 112 | 113 | tmp_dat = DIST_mat 114 | tmp_dat[2,1] = NA 115 | 116 | testthat::expect_error( distMat.KernelKnn(tmp_dat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 117 | }) 118 | 119 | 120 | testthat::test_that("it returns an error if missing values are present in the response variable", { 121 | 122 | tmp_dat = y 123 | tmp_dat[1] = NA 124 | 125 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, tmp_dat, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 126 | }) 127 | 128 | 129 | testthat::test_that("it returns an error if the length of y is not equal to the number of rows of the distance matrix", { 130 | 131 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y[1:100], k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 132 | }) 133 | 134 | 135 | testthat::test_that("it returns an error if k is less than 3 and 
extrema = TRUE", { 136 | 137 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 3, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = T, Levels = NULL, minimize = T) ) 138 | }) 139 | 140 | 141 | testthat::test_that("it returns an error if the minimize parameter is not a boolean", { 142 | 143 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = 'T') ) 144 | }) 145 | 146 | 147 | testthat::test_that("it returns error if the weights function is invalid for regression = T, if TEST_indices is NULL", { 148 | 149 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = list(), regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 150 | }) 151 | 152 | 153 | testthat::test_that("it returns error if the weights function is invalid for regression = F, if TEST_indices is NULL", { 154 | 155 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = matrix(,0,0), regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 156 | }) 157 | 158 | 159 | testthat::test_that("it returns error if the weights function is invalid for regression = F, TEST_indices is NOT NULL", { 160 | 161 | testthat::expect_error(distMat.KernelKnn(DIST_mat_class, TEST_indices = 1:100, as.numeric(y1_class_ext), k = 5, h = 1.0, weights_function = data.frame(matrix(,0,0)), regression = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class_ext)), minimize = T)) 162 | }) 163 | 164 | 165 | testthat::test_that("it returns error if the weights function is invalid for regression = T, TEST_indices is NOT NULL", { 166 | 167 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = 1:100, y, k = 5, h = 1.0, weights_function = as.factor(1:10), regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 168 | }) 169 | 170 | 171 | #===================== 172 | # testing of KernelKnn 173 | #===================== 174 | 175 | 176 | testthat::test_that("if TEST_indices = NULL, if regression = T the function returns the correct output", { 177 | 178 | res = distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 179 | 180 | testthat::expect_true(length(res) == nrow(DIST_mat)) 181 | }) 182 | 183 | 184 | testthat::test_that("if TEST_indices = NULL, if regression = T, if extrema = T, the function returns the correct output", { 185 | 186 | res = distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = T, Levels = NULL, minimize = T) 187 | 188 | testthat::expect_true(length(res) == nrow(DIST_mat)) 189 | }) 190 | 191 | 192 | testthat::test_that("if TEST_indices = NULL, if regression = T, if the diagonal of the distance matrix is filled with NA's the function returns the correct output", { 193 | 194 | tmp_dist = DIST_mat 195 | 196 | diag(tmp_dist) = NA 197 | 198 | res = distMat.KernelKnn(tmp_dist, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 199 | 200 | testthat::expect_true(length(res) == nrow(tmp_dist)) 201 | }) 202 | 203 | 204 | testthat::test_that("if TEST_indices is not NULL, if regression = T the function returns the correct 
output", { 205 | 206 | tmp_idx = (nrow(DIST_mat) - 100): nrow(DIST_mat) 207 | 208 | res = distMat.KernelKnn(DIST_mat, TEST_indices = tmp_idx, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 209 | 210 | testthat::expect_true(length(res) == length(tmp_idx)) 211 | }) 212 | 213 | 214 | testthat::test_that("using either a kernel or a user-defined-kernel-function (here in both cases a 'uniform' kernel), IF regression = T, returns the same result", { 215 | 216 | res_kernel = distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = 'uniform', regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 217 | 218 | uniform = function(W) { 219 | 220 | W = (1/2) * abs(W) 221 | 222 | W = W / rowSums(W) 223 | 224 | return(W) 225 | } 226 | 227 | res_kernel_function = distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = uniform, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 228 | 229 | df = data.frame(first_case = res_kernel, sec_case = res_kernel_function) 230 | 231 | df$difference_of_results = round(df$first_case - df$sec_case, 3) # difference of results 232 | 233 | testthat::expect_true(sum(df$difference_of_results) == 0.0) 234 | }) 235 | 236 | 237 | 238 | testthat::test_that("using either a kernel or a user-defined-kernel-function (here in both cases a 'uniform' kernel) returns the same result, IF regression = F, wenn TEST_indices is not NULL", { 239 | 240 | idxs = (nrow(DIST_mat_class) - 100): nrow(DIST_mat_class) 241 | 242 | res_kernel = distMat.KernelKnn(DIST_mat_class, TEST_indices = idxs, as.numeric(y1_class_ext), k = 5, h = 1.0, weights_function = 'uniform', regression = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class_ext)), minimize = T) 243 | 244 | uniform = function(W) { 245 | 246 | W = (1/2) * abs(W) 247 | 248 | W = W / rowSums(W) 249 | 250 | return(W) 251 | } 252 | 253 | res_kernel_function = distMat.KernelKnn(DIST_mat_class, TEST_indices = idxs, as.numeric(y1_class_ext), k = 5, h = 1.0, weights_function = uniform, regression = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class_ext)), minimize = T) 254 | 255 | df_col1 = data.frame(first_case = res_kernel[, 1], sec_case = res_kernel_function[, 1]) 256 | 257 | df_col2 = data.frame(first_case = res_kernel[, 2], sec_case = res_kernel_function[, 2]) 258 | 259 | df_col1$difference_of_results = round(df_col1$first_case - df_col1$sec_case, 3) # difference of results in 'col1' 260 | 261 | df_col2$difference_of_results = round(df_col2$first_case - df_col2$sec_case, 3) # difference of results in 'col2' 262 | 263 | testthat::expect_true(sum(df_col1$difference_of_results) == 0.0 && sum(df_col2$difference_of_results) == 0.0) 264 | }) 265 | 266 | -------------------------------------------------------------------------------- /src/distance_metrics.cpp: -------------------------------------------------------------------------------- 1 | # define ARMA_WARN_LEVEL 0 2 | # include 3 | // [[Rcpp::depends("RcppArmadillo")]] 4 | // [[Rcpp::plugins(openmp)]] 5 | 6 | 7 | #ifdef _OPENMP 8 | #include 9 | #endif 10 | 11 | 12 | 13 | // return data struct 14 | // 15 | 16 | struct return_data { 17 | 18 | arma::mat knn_idx; 19 | 20 | arma::mat knn_dist; 21 | }; 22 | 23 | 24 | 25 | // kernelknn-class 26 | // 27 | 28 | 29 | class kernelKnn { 30 | 31 | private: 32 | 33 | arma::mat knn_indices; 34 | 35 | arma::mat knn_distances; 36 | 37 | public: 38 | 39 | kernelKnn() { } 40 | 41 | 
42 | // inner loop [ no difference between threaded and non-threaded version for the 'inner_loop' method (parallelization) ] 43 | // 44 | 45 | arma::rowvec inner_loop(arma::mat& MATRIX_1st, arma::mat& MATRIX_2nd, int i, int ITERS, int k, std::string& method, arma::mat& cov_mat, double p, double eps = 1.0e-6) { 46 | 47 | 48 | arma::rowvec tmp_out = arma::zeros<arma::rowvec>(ITERS); 49 | 50 | 51 | for (int j = 0; j < ITERS; j++) { // http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html 52 | 53 | double tmp_idx; 54 | 55 | if (method == "euclidean") { 56 | 57 | tmp_idx = std::sqrt(arma::as_scalar(arma::accu(arma::square((MATRIX_1st.row(j) - MATRIX_2nd.row(i)))))); 58 | } 59 | 60 | else if (method == "manhattan") { 61 | 62 | tmp_idx = arma::as_scalar(arma::accu(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i))))); 63 | } 64 | 65 | else if (method == "chebyshev") { 66 | 67 | tmp_idx = arma::as_scalar(max(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i))))); 68 | } 69 | 70 | else if (method == "canberra") { 71 | 72 | tmp_idx = arma::as_scalar(arma::accu(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i)) + eps)/(arma::abs(MATRIX_1st.row(j)) + arma::abs(MATRIX_2nd.row(i)) + eps))); // added 1.0e-6 otherwise rstudio crashes 73 | } 74 | 75 | else if (method == "braycurtis") { 76 | 77 | tmp_idx = arma::as_scalar(arma::accu(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i))))/(arma::accu(arma::abs(MATRIX_1st.row(j))) + arma::accu(arma::abs(MATRIX_2nd.row(i))))); 78 | } 79 | 80 | else if (method == "pearson_correlation") { 81 | 82 | tmp_idx = arma::as_scalar(1.0 - arma::cor(MATRIX_1st.row(j), MATRIX_2nd.row(i))); 83 | } 84 | 85 | else if (method == "simple_matching_coefficient") { 86 | 87 | double a = eps; 88 | double d = eps; 89 | 90 | for (unsigned int t = 0; t < MATRIX_1st.row(j).n_elem; t++) { 91 | 92 | if (MATRIX_1st.row(j)(t) == 1 && MATRIX_2nd.row(i)(t) == 1) { 93 | 94 | a += 1.0;} 95 | 96 | if (MATRIX_1st.row(j)(t) == 0 && MATRIX_2nd.row(i)(t) == 0) { 97 | 98 | d += 1.0; 99 | } 100 | } 101 | 102 | tmp_idx = 1.0 - ((a + d) / MATRIX_1st.row(j).n_elem); 103 | } 104 | 105 | else if (method == "minkowski") { // by default the order of the minkowski parameter equals k 106 | 107 | tmp_idx = std::pow(arma::as_scalar(arma::accu(arma::pow(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i))), p))), 1.0/p); 108 | } 109 | 110 | else if (method == "hamming") { // for binary data 111 | 112 | tmp_idx = arma::as_scalar(accu(MATRIX_1st.row(j) != MATRIX_2nd.row(i))/(MATRIX_1st.row(j).n_elem * 1.0)); 113 | } 114 | 115 | else if (method == "mahalanobis") { // first create covariance matrix from data 116 | 117 | tmp_idx = arma::as_scalar(std::sqrt(arma::as_scalar(((MATRIX_1st.row(j) - MATRIX_2nd.row(i)) * cov_mat) * (MATRIX_1st.row(j) - MATRIX_2nd.row(i)).t()))); 118 | } 119 | 120 | else if (method == "jaccard_coefficient") { // for binary data 121 | 122 | double a = eps; 123 | double b = eps; 124 | double c = eps; 125 | 126 | for (unsigned int t = 0; t < MATRIX_1st.row(j).n_elem; t++) { 127 | 128 | if (MATRIX_1st.row(j)(t) == 1 && MATRIX_2nd.row(i)(t) == 1) { 129 | 130 | a += 1.0;} 131 | 132 | if (MATRIX_1st.row(j)(t) == 1 && MATRIX_2nd.row(i)(t) == 0) { 133 | 134 | b += 1.0;} 135 | 136 | if (MATRIX_1st.row(j)(t) == 0 && MATRIX_2nd.row(i)(t) == 1) { 137 | 138 | c += 1.0; 139 | } 140 | } 141 | 142 | tmp_idx = 1.0 - (a / (a + b + c)); 143 | } 144 | 145 | else if (method == "Rao_coefficient") { // for binary data 146 | 147 | double a = eps; 148 | 149 | for (unsigned int t = 0; t < MATRIX_1st.row(j).n_elem; t++) { 150 | 151 | if (MATRIX_1st.row(j)(t) == 1 && MATRIX_2nd.row(i)(t) == 1) { 152 | 153 | a += 1.0; 154 | } 155 | } 156 | 157 | tmp_idx = 1.0 - (a / MATRIX_1st.row(j).n_elem); 158 | } 159 | 160 | else { 161 | 162 | tmp_idx = 0; // default = 0; create exceptions in R, so that tmp_idx is never 0; 163 | } 164 | 165 | if ( tmp_idx != tmp_idx ) { // handling of NAs, if NaN then distance 1.0 [ NaN will compare false to everything, including itself ], http://stackoverflow.com/questions/11569337/using-an-if-statement-to-switch-nan-values-in-an-array-to-0-0] 166 | 167 | tmp_out(j) = 1.0; 168 | } 169 | 170 | else { 171 | 172 | tmp_out(j) = tmp_idx; 173 | } 174 | } 175 | 176 | return tmp_out; 177 | } 178 | 179 | 180 | // calculate the 'inverse' AND in case of exception the 'Moore-Penrose pseudo-inverse' of the covariance matrix FOR the 'mahalanobis' distance 181 | // https://github.com/mlampros/KernelKnn/issues/1 182 | // 183 | 184 | arma::mat INV_EXC(arma::mat cov_data) { 185 | 186 | arma::mat inv_tmp; 187 | 188 | try { 189 | 190 | inv_tmp = arma::inv(arma::cov(cov_data)); 191 | } 192 | 193 | catch(...) { 194 | 195 | Rcpp::warning("the input matrix seems singular. The Moore-Penrose pseudo-inverse of the covariance matrix will be calculated"); 196 | } 197 | 198 | if (inv_tmp.empty()) { 199 | 200 | inv_tmp = arma::pinv(arma::cov(cov_data)); 201 | } 202 | 203 | return inv_tmp; 204 | } 205 | 206 | 207 | // secondary function for the 'train_only' and 'test_only' methods [ due to ASAN errors ] 208 | // 209 | 210 | arma::field<arma::rowvec> inner_field_func(arma::mat& MATRIX, arma::mat& MATRIX1, int i, int ITERS, int k, std::string& method, arma::mat& cov_mat, double p, double eps = 1.0e-6) { 211 | 212 | arma::rowvec tmp_out = inner_loop(MATRIX, MATRIX1, i, ITERS, k, method, cov_mat, p, eps); 213 | 214 | arma::uvec index_out = arma::sort_index(tmp_out, "ascend"); 215 | 216 | arma::field<arma::rowvec> F(2,1); 217 | 218 | F(0,0) = tmp_out; 219 | 220 | F(1,0) = arma::conv_to< arma::rowvec >::from(index_out); 221 | 222 | return F; 223 | } 224 | 225 | 226 | // train-data-input-only 227 | // 228 | 229 | void train_only(arma::mat& MATRIX, int k, std::string& method, int threads, double p, double eps = 1.0e-6) { 230 | 231 | #ifdef _OPENMP 232 | omp_set_num_threads(threads); 233 | #endif 234 | 235 | unsigned int COLS = MATRIX.n_cols; 236 | 237 | arma::mat cov_mat(COLS, COLS); 238 | 239 | if (method == "mahalanobis") { 240 | 241 | cov_mat = INV_EXC(MATRIX); 242 | } 243 | 244 | int ITERS = MATRIX.n_rows; 245 | 246 | knn_indices.set_size(ITERS, k); // loop to calculate the distances for the TRAIN data [ MATRIX ] 247 | 248 | knn_distances.set_size(ITERS, k); 249 | 250 | int i,f; 251 | 252 | #ifdef _OPENMP 253 | #pragma omp parallel for schedule(static) shared(ITERS, p, eps, cov_mat, method, k, MATRIX) private(i, f) 254 | #endif 255 | for (i = 0; i < ITERS; i++) { 256 | 257 | arma::field<arma::rowvec> unl_field = inner_field_func(MATRIX, MATRIX, i, ITERS, k, method, cov_mat, p, eps); 258 | 259 | for (f = 1; f < k + 1; f++) { 260 | 261 | int IDX_subset_iter = unl_field(1,0)(f); 262 | 263 | #ifdef _OPENMP 264 | #pragma omp atomic write 265 | #endif 266 | knn_indices(i,f-1) = IDX_subset_iter + 1; // 'knn_indices', 'knn_distances' : class-members (not variables) are 'shared' by default when they aren't present in the clauses [ see comment https://stackoverflow.com/questions/5891641/data-members-in-an-openmp-loop ] 267 | 268 | #ifdef _OPENMP 269 | #pragma omp atomic write 270 | #endif 271 | knn_distances(i,f-1) = unl_field(0,0)(IDX_subset_iter); 272 | } 273 | } 274 | } 275 | 276 | 277 | // test-data-only 278 | // 279 | 280 | void test_only(arma::mat& MATRIX, arma::mat& TEST_DATA, int k, std::string& method, int threads, double p, double eps = 1.0e-6) { 281 | 282 | #ifdef _OPENMP 283 | omp_set_num_threads(threads); 284 | #endif 285 | 286 | unsigned int COLS = MATRIX.n_cols; 287 | 288 | arma::mat cov_mat(COLS, COLS); 289 | 290 | if (method == "mahalanobis") { 291 | 292 | cov_mat = INV_EXC(arma::join_vert(MATRIX, TEST_DATA)); 293 | } 294 | 295 | int ITERS_TEST = TEST_DATA.n_rows; 296 | 297 | int ITERS_TRAIN = MATRIX.n_rows; 298 | 299 | knn_indices.set_size(ITERS_TEST, k); 300 | 301 | knn_distances.set_size(ITERS_TEST, k); 302 | 303 | int i,f; 304 | 305 | #ifdef _OPENMP 306 | #pragma omp parallel for schedule(static) shared(ITERS_TEST, p, eps, cov_mat, threads, method, k, ITERS_TRAIN, TEST_DATA, MATRIX) private(i, f) 307 | #endif 308 | for (i = 0; i < ITERS_TEST; i++) { 309 | 310 | arma::field<arma::rowvec> unl_field = inner_field_func(MATRIX, TEST_DATA, i, ITERS_TRAIN, k, method, cov_mat, p, eps); 311 | 312 | for (f = 0; f < k; f++) { 313 | 314 | int IDX_subset_iter = unl_field(1,0)(f); 315 | 316 | #ifdef _OPENMP 317 | #pragma omp atomic write 318 | #endif 319 | knn_indices(i,f) = IDX_subset_iter + 1; 320 | 321 | #ifdef _OPENMP 322 | #pragma omp atomic write 323 | #endif 324 | knn_distances(i,f) = unl_field(0,0)(IDX_subset_iter); 325 | } 326 | } 327 | } 328 | 329 | 330 | // secondary function for the 'input_dist_mat' [ due to ASAN errors ] 331 | // 332 | 333 | arma::field<arma::rowvec> inner_dist_field(arma::mat& DIST_MAT, bool idx_openmp_flag, arma::uvec& test_idx, double min_sort, unsigned int i, const char *asc_des) { 334 | 335 | arma::rowvec tmp_row = DIST_MAT.row(i); // take each row IF test-indices NOT Null AND IF minimize = true THEN fill the test-indices with A MAXIMUM VALUE (so that sort_index does not pick one of the test-indices) IF minimize = false THEN fill the test-indices with a MINIMUM VALUE 336 | 337 | if (idx_openmp_flag) { 338 | 339 | tmp_row(test_idx).fill(min_sort); // another option would be to use erase for the 'test_idx' (remove the 'test_idx' from 'tmp_row') so that the for-loop can run faster, however I won't be able to calculate the sorted indices correctly (especially in case of cross-validation) 340 | } 341 | 342 | arma::uvec index_out = arma::sort_index(tmp_row, asc_des); 343 | 344 | arma::field<arma::rowvec> F(2,1); 345 | 346 | F(0,0) = tmp_row; 347 | 348 | F(1,0) = arma::conv_to< arma::rowvec >::from(index_out); 349 | 350 | return F; 351 | } 352 | 353 | 354 | // input data is a distance matrix 355 | // 356 | 357 | void input_dist_mat(arma::mat DIST_MAT_input, Rcpp::Nullable<Rcpp::NumericVector> TEST_IDX = R_NilValue, bool is_min = true, int k = 5, int threads = 1) { 358 | 359 | #ifdef _OPENMP 360 | omp_set_num_threads(threads); 361 | #endif 362 | 363 | arma::mat& DIST_MAT = DIST_MAT_input; // copy initial data (by reference), because I'll modify it 364 | 365 | arma::uvec test_idx; 366 | 367 | bool idx_openmp_flag = false; 368 | 369 | if (TEST_IDX.isNotNull()) { 370 | 371 | idx_openmp_flag = true; 372 | 373 | test_idx = Rcpp::as<arma::uvec>(TEST_IDX); 374 | 375 | test_idx = test_idx - 1; // adjust the indices to c++ indexing (which begins from 0) 376 | 377 | DIST_MAT = DIST_MAT.rows(test_idx); // overwrite the DIST_MAT using the test-indices (in case that TEST_IDX not NULL) 378 | } 379 | 380 | unsigned int ROWS = DIST_MAT.n_rows; 381 | 382 | const char *asc_des = is_min ? "ascend" : "descend"; 383 | 384 | double min_sort = arma::datum::inf; 385 | 386 | if (idx_openmp_flag) { 387 | 388 | min_sort = is_min ? arma::datum::inf : (-arma::datum::inf); // in case of minimization of distance (and TEST_IDX not NULL) assign the max. value to train-data (and the opposite if maximization). That way I "indirectly" omit the TEST_IDX data from sorted-distance-calculation 389 | } 390 | 391 | int start_idx = idx_openmp_flag ? 0 : 1; 392 | 393 | int end_idx = idx_openmp_flag ? k : (k + 1); 394 | 395 | knn_indices.set_size(ROWS, k); 396 | 397 | knn_distances.set_size(ROWS, k); 398 | 399 | unsigned int i; 400 | 401 | #ifdef _OPENMP 402 | #pragma omp parallel for schedule(static) shared(ROWS, DIST_MAT, idx_openmp_flag, min_sort, test_idx, asc_des, start_idx, end_idx) private(i) 403 | #endif 404 | for (i = 0; i < ROWS; i++) { 405 | 406 | arma::field<arma::rowvec> unl_field = inner_dist_field(DIST_MAT, idx_openmp_flag, test_idx, min_sort, i, asc_des); 407 | 408 | for (int f = start_idx; f < end_idx; f++) { 409 | 410 | int IDX_subset_iter = unl_field(1,0)(f); 411 | 412 | int app_idx = idx_openmp_flag ? f : f-1; 413 | 414 | #ifdef _OPENMP 415 | #pragma omp atomic write 416 | #endif 417 | knn_indices(i,app_idx) = IDX_subset_iter + 1; 418 | 419 | #ifdef _OPENMP 420 | #pragma omp atomic write 421 | #endif 422 | knn_distances(i,app_idx) = unl_field(0,0)(IDX_subset_iter); 423 | } 424 | } 425 | } 426 | 427 | 428 | // return data for either train or test 429 | // 430 | 431 | return_data return_train_test() { 432 | 433 | return { knn_indices, knn_distances }; 434 | } 435 | 436 | 437 | ~kernelKnn() { } 438 | }; 439 | 440 | 441 | 442 | 443 | //------------------------------------------Rcpp::export 444 | 445 | 446 | // kernel-knn (input : raw data) 447 | //------------------------------ 448 | 449 | // [[Rcpp::export]] 450 | Rcpp::List knn_index_dist_rcpp(arma::mat& MATRIX, arma::mat& TEST_DATA, int k, std::string& method, int threads, double p, double eps = 1.0e-6) { 451 | 452 | kernelKnn kn; 453 | 454 | std::string name_idx; 455 | 456 | std::string name_dist; 457 | 458 | if (TEST_DATA.is_empty()) { 459 | 460 | kn.train_only( MATRIX, k, method, threads, p, eps ); 461 | 462 | name_idx = "train_knn_idx"; 463 | 464 | name_dist = "train_knn_dist"; 465 | } 466 | 467 | if (!TEST_DATA.is_empty()) { 468 | 469 | kn.test_only( MATRIX, TEST_DATA, k, method, threads, p, eps ); 470 | 471 | name_idx = "test_knn_idx"; 472 | 473 | name_dist = "test_knn_dist"; 474 | } 475 | 476 | return_data dat = kn.return_train_test(); 477 | 478 | return Rcpp::List::create(Rcpp::Named(name_idx) = dat.knn_idx, Rcpp::Named(name_dist) = dat.knn_dist); 479 | } 480 | 481 | 482 | 483 | 484 | // kernel-knn (input : distance matrix) 485 | //------------------------------------- 486 | 487 | // [[Rcpp::export]] 488 | Rcpp::List DIST_MATRIX_knn(arma::mat& DIST_MAT, Rcpp::Nullable<Rcpp::NumericVector> TEST_IDX = R_NilValue, bool is_min = true, int k = 5, int threads = 1, bool rcpp_list_names = false) { 489 | 490 | kernelKnn dist_knn; 491 | 492 | dist_knn.input_dist_mat(DIST_MAT, TEST_IDX, is_min, k, threads); 493 | 494 | return_data dat = dist_knn.return_train_test(); 495 | 496 | std::string name_idx, name_dist; 497 | 498 | if (!rcpp_list_names) { // used in 'distMat.KernelKnn' 499 | 500 | name_idx = "knn_idx"; 501 | name_dist = "knn_dist";} 502 | 503 | else { 504 | 505 | if (TEST_IDX.isNull()) { // used in 'distMat.knn.index.dist' 506 | 507 | name_idx = "train_knn_idx"; 508 | name_dist = "train_knn_dist";} 509 | 510 | else { 511 | 512 | name_idx = "test_knn_idx"; 513
| name_dist = "test_knn_dist"; 514 | } 515 | } 516 | 517 | return Rcpp::List::create(Rcpp::Named(name_idx) = dat.knn_idx, Rcpp::Named(name_dist) = dat.knn_dist); 518 | } 519 | 520 | 521 | 522 | -------------------------------------------------------------------------------- /tests/testthat/test-kernelknn.R: -------------------------------------------------------------------------------- 1 | #============================================================================================================================================================== 2 | 3 | context("Kernel knn") 4 | 5 | 6 | #================= 7 | # Error handling 8 | #================= 9 | 10 | 11 | testthat::test_that("it returns an error if a factor variable is present in the data and the transf_categ_cols = FALSE", { 12 | 13 | tmp_dat = xtr 14 | 15 | tmp_dat$rad = as.factor(tmp_dat$rad) 16 | 17 | testthat::expect_error(KernelKnn(tmp_dat, TEST_data = xte, y1, k = 5, 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 18 | }) 19 | 20 | testthat::test_that("it returns an error if a character variable is present in the data and the transf_categ_cols = FALSE", { 21 | 22 | tmp_dat = xtr 23 | 24 | tmp_dat$rad = as.character(tmp_dat$rad) 25 | 26 | testthat::expect_error(KernelKnn(tmp_dat, TEST_data = xte, y1, k = 5, 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 27 | }) 28 | 29 | 30 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 31 | 32 | testthat::expect_error(KernelKnn(xtr, TEST_data = NULL, y1, k = 5, 1.0, method = 'simple_matching_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 33 | }) 34 | 35 | 36 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 37 | 38 | testthat::expect_error(KernelKnn(xtr, TEST_data = NULL, y1, k = 5, 1.0, method = 'jaccard_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 39 | }) 40 | 41 | 42 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 43 | 44 | testthat::expect_error(KernelKnn(xtr, TEST_data = NULL, y1, k = 5, 1.0, method = 'Rao_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 45 | }) 46 | 47 | 48 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 49 | 50 | testthat::expect_error(KernelKnn(xtr, TEST_data = xte, y1, k = 5, 1.0, method = 'simple_matching_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 51 | }) 52 | 53 | 54 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 55 | 56 | testthat::expect_error(KernelKnn(xtr, TEST_data = xte, y1, k = 5, 1.0, method = 'jaccard_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 57 | }) 58 | 59 | 60 | 
testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 61 | 62 | testthat::expect_error(KernelKnn(xtr, TEST_data = xte, y1, k = 5, 1.0, method = 'Rao_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 63 | }) 64 | 65 | 66 | testthat::test_that("it returns an error if k is NULL", { 67 | 68 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = NULL, 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 69 | }) 70 | 71 | 72 | testthat::test_that("it returns an error if k is a character", { 73 | 74 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 'invalid', 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 75 | }) 76 | 77 | 78 | testthat::test_that("it returns an error if k is greater or equal to the number of rows of the train data", { 79 | 80 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = nrow(xtr) , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 81 | }) 82 | 83 | 84 | testthat::test_that("it returns an error if k is less than 1", { 85 | 86 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = -1 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 87 | }) 88 | 89 | 90 | testthat::test_that("it returns a warning if k is a float", { 91 | 92 | testthat::expect_warning( KernelKnn(xtr, TEST_data = xte, y1, k = 1.5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 93 | }) 94 | 95 | 96 | testthat::test_that("it returns an error if h = 0", { 97 | 98 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 4 , 0.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 99 | }) 100 | 101 | testthat::test_that("it returns an error if the method is NULL", { 102 | 103 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = NULL, weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 104 | }) 105 | 106 | 107 | testthat::test_that("it returns an error if the method is not a character", { 108 | 109 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = 1, weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 110 | }) 111 | 112 | 113 | testthat::test_that("it returns an error if the method is a character, but not one of the valid names", { 114 | 115 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = 'invalid', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 116 | }) 117 | 118 | 119 | testthat::test_that("it returns an error if y is NULL", { 120 | 121 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, NULL, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 122 | }) 123 | 124 | 125 | testthat::test_that("it returns an error if y is not 
numeric", { 126 | 127 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, list(y1), k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 128 | }) 129 | 130 | 131 | testthat::test_that("it returns an error if regression = F and the Levels = NULL", { 132 | 133 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, as.numeric(y1_class), k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 134 | }) 135 | 136 | 137 | testthat::test_that("it returns an error if regression = F and there are unique labels less than 1", { 138 | 139 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, as.numeric(y1_class) - 1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class))) ) 140 | }) 141 | 142 | 143 | testthat::test_that("it returns an error if missing values are present in the data", { 144 | 145 | tmp_dat = xtr 146 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 147 | 148 | testthat::expect_error( KernelKnn(tmp_dat, TEST_data = xte, y1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 149 | }) 150 | 151 | 152 | testthat::test_that("it returns an error if missing values are present in the response variable", { 153 | 154 | tmp_dat = y1 155 | tmp_dat[sample(1:length(tmp_dat), 10)] = NA 156 | 157 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, tmp_dat, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 158 | }) 159 | 160 | 161 | testthat::test_that("it returns an error if missing values are present in the TEST data", { 162 | 163 | tmp_dat = xte 164 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 165 | 166 | testthat::expect_error( KernelKnn(xtr, TEST_data = tmp_dat, y1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 167 | }) 168 | 169 | 170 | testthat::test_that("it returns an error if the length of y is not equal to the number of rows of the train data", { 171 | 172 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1[1:(length(y1)-10)], k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 173 | }) 174 | 175 | 176 | testthat::test_that("it returns an error if k is less than 3 and extrema = TRUE", { 177 | 178 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 3 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = T, Levels = NULL) ) 179 | }) 180 | 181 | 182 | testthat::test_that("if the number of columns in train and test data differ it returns an error", { 183 | 184 | tmp_xte = xte[, -ncol(xte)] 185 | 186 | testthat::expect_error( KernelKnn(xtr, TEST_data = tmp_xte, y1, k = 4 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 187 | }) 188 | 189 | 190 | testthat::test_that("it returns error if the weights function is invalid for regression = T, if TEST_data is NULL", { 191 | 192 | testthat::expect_error(KernelKnn(xtr, TEST_data = NULL, y1, k = 5 , h = 1.0, 
method = 'euclidean', weights_function = list(), regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 193 | }) 194 | 195 | 196 | testthat::test_that("it returns error if the weights function is invalid for regression = F, if TEST_data is NULL", { 197 | 198 | testthat::expect_error(KernelKnn(xtr_class, TEST_data = NULL, as.numeric(y1_class), k = 5 , h = 1.0, method = 'euclidean', weights_function = matrix(,0,0), regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = unique(y1_class))) 199 | }) 200 | 201 | 202 | testthat::test_that("it returns error if the weights function is invalid for regression = F, TEST_data is NOT NULL", { 203 | 204 | testthat::expect_error(KernelKnn(xtr_class, TEST_data = xte_class, as.numeric(y1_class), k = 5 , h = 1.0, method = 'euclidean', weights_function = data.frame(matrix(,0,0)), regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = unique(y1_class))) 205 | }) 206 | 207 | 208 | testthat::test_that("it returns error if the weights function is invalid for regression = T, TEST_data is NOT NULL", { 209 | 210 | testthat::expect_error(KernelKnn(xtr, TEST_data = xte, y1, k = 5 , h = 1.0, method = 'euclidean', weights_function = as.factor(1:10), regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 211 | }) 212 | 213 | 214 | testthat::test_that("it returns a warning if the input matrix seems singular in case of the 'mahalanobis' distance", { 215 | 216 | tmp_x = singular_mat[, -ncol(singular_mat)] 217 | 218 | tmp_y = as.numeric(singular_mat[, ncol(singular_mat)]) 219 | 220 | testthat::expect_warning(KernelKnn(tmp_x, TEST_data = NULL, tmp_y, k = 5 , h = 1.0, method = 'mahalanobis', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = unique(tmp_y))) 221 | }) 222 | 223 | 224 | # testing of KernelKnn 225 | 226 | 227 | testthat::test_that("if transf_categ_cols = TRUE and TEST_data = NULL the KernelKnn returns output equal to the number of rows in the train data", { 228 | 229 | tmp_xtr = xtr 230 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 231 | 232 | res = KernelKnn(tmp_xtr, TEST_data = NULL, y1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = T, threads = 1, extrema = F, Levels = NULL) 233 | 234 | testthat::expect_true(length(res) == nrow(xtr)) 235 | }) 236 | 237 | 238 | testthat::test_that("if transf_categ_cols = TRUE and TEST_data is NOT NULL the KernelKnn returns output equal to the number of rows in the TEST data", { 239 | 240 | tmp_xtr = xtr 241 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 242 | 243 | tmp_xte = xte 244 | tmp_xte$rad = as.factor(tmp_xte$rad) 245 | 246 | res = KernelKnn(tmp_xtr, TEST_data = tmp_xte, y1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = T, threads = 1, extrema = F, Levels = NULL) 247 | 248 | testthat::expect_true(length(res) == nrow(tmp_xte)) 249 | }) 250 | 251 | 252 | 253 | testthat::test_that("using either a kernel or a user-defined-kernel-function (here in both cases a 'uniform' kernel) returns the same result, when both train and test sets are used", { 254 | 255 | res_kernel = KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = 'euclidean', weights_function = 'uniform', regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) 256 | 257 | uniform = function(W) { 258 | 259 | W = (1/2) * abs(W) 260 | 261 | W = W / rowSums(W) 262 | 263 | return(W) 264 | } 265 | 266 | res_kernel_function = 
KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = 'euclidean', weights_function = uniform, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) 267 | 268 | df = data.frame(first_case = res_kernel, sec_case = res_kernel_function) 269 | 270 | df$difference_of_results = round(df$first_case - df$sec_case, 3) # difference of results 271 | 272 | testthat::expect_true(sum(df$difference_of_results) == 0.0) 273 | }) 274 | 275 | 276 | testthat::test_that("using either a kernel or a user-defined-kernel-function (here in both cases a 'uniform' kernel) returns the same result, when ONLY train data is used", { 277 | 278 | res_kernel = KernelKnn(xtr, TEST_data = NULL, y1, k = 5 , 1.0, method = 'euclidean', weights_function = 'uniform', regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) 279 | 280 | uniform = function(W) { 281 | 282 | W = (1/2) * abs(W) 283 | 284 | W = W / rowSums(W) 285 | 286 | return(W) 287 | } 288 | 289 | res_kernel_function = KernelKnn(xtr, TEST_data = NULL, y1, k = 5 , 1.0, method = 'euclidean', weights_function = uniform, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) 290 | 291 | df = data.frame(first_case = res_kernel, sec_case = res_kernel_function) 292 | 293 | df$difference_of_results = round(df$first_case - df$sec_case, 3) # difference of results 294 | 295 | testthat::expect_true(sum(df$difference_of_results) == 0.0) 296 | }) 297 | 298 | 299 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is a character, when regression = T, when transf_categ_cols = T] the length of the output 300 | 301 | matches the number of rows of the input", { 302 | 303 | tmp_xtr = xtr 304 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 305 | 306 | lst = count = list() 307 | 308 | for (k in 4:6) { 309 | 310 | for (h in c(0.1, 0.5, 1.0)) { 311 | 312 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 313 | 314 | for (w_func in c('uniform', 'triangular', 'epanechnikov')) { 315 | 316 | for (extr in c(T,F)) { 317 | 318 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = NULL, y1, k = k , h = h, method = metric, weights_function = w_func, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 319 | 320 | count = lappend(count, 1) 321 | } 322 | } 323 | } 324 | } 325 | } 326 | 327 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xtr) && ncol(do.call(cbind, lst)) == length(unlist(count))) 328 | }) 329 | 330 | 331 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is a function, when regression = T, when transf_categ_cols = T] the length of the output 332 | 333 | matches the number of rows of the input", { 334 | 335 | tmp_xtr = xtr 336 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 337 | 338 | epanechnikov = function(W) { 339 | 340 | W = (3/4) * (1 - W ^ 2) 341 | 342 | W = W / rowSums(W) 343 | 344 | return(W) 345 | } 346 | 347 | lst = count = list() 348 | 349 | for (k in 4:6) { 350 | 351 | for (h in c(0.1, 0.5, 1.0)) { 352 | 353 | for (metric in c('canberra', 'braycurtis', 'minkowski')) { 354 | 355 | for (extr in c(T,F)) { 356 | 357 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = NULL, y1, k = k , h = h, method = metric, weights_function = epanechnikov, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 358 | 359 | count = lappend(count, 1) 360 | } 361 | } 362 | } 363 | } 364 | 365 | testthat::expect_true(nrow(do.call(cbind, lst)) ==
nrow(tmp_xtr) && ncol(do.call(cbind, lst)) == length(unlist(count))) 366 | }) 367 | 368 | 369 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is a character, when regression = F, when transf_categ_cols = F] the length of the output 370 | 371 | matches the number of rows of the input", { 372 | 373 | lst = count = list() 374 | 375 | for (k in 4:6) { 376 | 377 | for (h in c(0.1, 0.5, 1.0)) { 378 | 379 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 380 | 381 | for (w_func in c('uniform', 'triangular', 'epanechnikov')) { 382 | 383 | for (extr in c(T,F)) { 384 | 385 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = NULL, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = w_func, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 386 | 387 | count = lappend(count, 1) 388 | } 389 | } 390 | } 391 | } 392 | } 393 | 394 | NCOL = mean(unlist(lapply(lst, ncol))) 395 | NROW = mean(unlist(lapply(lst, nrow))) 396 | 397 | testthat::expect_true(NROW == nrow(xtr_class) && NCOL == length(unique(y1_class))) 398 | }) 399 | 400 | 401 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is a function, when regression = F, when transf_categ_cols = F] the length of the output 402 | 403 | matches the number of rows of the input", { 404 | 405 | logistic = function(W) { 406 | 407 | W = (1/(exp(W) + 2 + exp(-W))) 408 | 409 | W = W / rowSums(W) 410 | 411 | return(W) 412 | } 413 | 414 | lst = count = list() 415 | 416 | for (k in 4:6) { 417 | 418 | for (h in c(0.1, 0.5, 1.0)) { 419 | 420 | for (metric in c('canberra', 'braycurtis', 'minkowski')) { 421 | 422 | for (extr in c(T,F)) { 423 | 424 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = NULL, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = logistic, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 425 | 426 | count = lappend(count, 1) 427 | } 428 | } 429 | } 430 | } 431 | 432 | NCOL = mean(unlist(lapply(lst, ncol))) 433 | NROW = mean(unlist(lapply(lst, nrow))) 434 | 435 | testthat::expect_true(NROW == nrow(xtr_class) && NCOL == length(unique(y1_class))) 436 | }) 437 | 438 | 439 | 440 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ WHEN the weights_function is a character, when regression = T, when transf_categ_cols = T] the length of the output 441 | 442 | matches the number of rows of the input", { 443 | 444 | tmp_xtr = xtr 445 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 446 | 447 | tmp_xte = xte 448 | tmp_xte$rad = as.factor(tmp_xte$rad) 449 | 450 | lst = count = list() 451 | 452 | for (k in 4:6) { 453 | 454 | for (h in c(0.1, 0.5, 1.0)) { 455 | 456 | for (metric in c('canberra', 'braycurtis', 'minkowski')) { 457 | 458 | for (w_func in c('uniform', 'triangular', 'epanechnikov')) { 459 | 460 | for (extr in c(T,F)) { 461 | 462 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = tmp_xte, y1, k = k , h = h, method = metric, weights_function = w_func, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 463 | 464 | count = lappend(count, 1) 465 | } 466 | } 467 | } 468 | } 469 | } 470 | 471 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xte) && ncol(do.call(cbind, lst)) == length(unlist(count))) 472 | }) 473 | 474 | 475 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ WHEN
the weights_function is a function, when regression = T, when transf_categ_cols = T] the length of the output 476 | 477 | matches the number of rows of the input", { 478 | 479 | tmp_xtr = xtr 480 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 481 | 482 | tmp_xte = xte 483 | tmp_xte$rad = as.factor(tmp_xte$rad) 484 | 485 | epanechnikov = function(W) { 486 | 487 | W = (3/4) * (1 - W ^ 2) 488 | 489 | W = W / rowSums(W) 490 | 491 | return(W) 492 | } 493 | 494 | lst = count = list() 495 | 496 | for (k in 4:6) { 497 | 498 | for (h in c(0.1, 0.5, 1.0)) { 499 | 500 | for (metric in c('canberra', 'hamming', 'mahalanobis')) { 501 | 502 | for (extr in c(T,F)) { 503 | 504 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = tmp_xte, y1, k = k , h = h, method = metric, weights_function = epanechnikov, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 505 | 506 | count = lappend(count, 1) 507 | } 508 | } 509 | } 510 | } 511 | 512 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xte) && ncol(do.call(cbind, lst)) == length(unlist(count))) 513 | }) 514 | 515 | 516 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ WHEN the weights_function is a character, when regression = F, when transf_categ_cols = F] the length of the output 517 | 518 | matches the number of rows of the input", { 519 | 520 | lst = count = list() 521 | 522 | for (k in 4:6) { 523 | 524 | for (h in c(0.1, 0.5, 1.0)) { 525 | 526 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 527 | 528 | for (w_func in c('uniform', 'triangular', 'epanechnikov')) { 529 | 530 | for (extr in c(T,F)) { 531 | 532 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = xte_class, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = w_func, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 533 | 534 | count = lappend(count, 1) 535 | } 536 | } 537 | } 538 | } 539 | } 540 | 541 | NCOL = mean(unlist(lapply(lst, ncol))) 542 | NROW = mean(unlist(lapply(lst, nrow))) 543 | 544 | testthat::expect_true(NROW == nrow(xte_class) && NCOL == length(unique(y1_class))) 545 | }) 546 | 547 | 548 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ WHEN the weights_function is a function, when regression = F, when transf_categ_cols = F] the length of the output 549 | 550 | matches the number of rows of the input", { 551 | 552 | logistic = function(W) { 553 | 554 | W = (1/(exp(W) + 2 + exp(-W))) 555 | 556 | W = W / rowSums(W) 557 | 558 | return(W) 559 | } 560 | 561 | lst = count = list() 562 | 563 | for (k in 4:6) { 564 | 565 | for (h in c(0.1, 0.5, 1.0)) { 566 | 567 | for (metric in c('canberra', 'braycurtis', 'minkowski')) { 568 | 569 | for (extr in c(T,F)) { 570 | 571 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = xte_class, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = logistic, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 572 | 573 | count = lappend(count, 1) 574 | } 575 | } 576 | } 577 | } 578 | 579 | NCOL = mean(unlist(lapply(lst, ncol))) 580 | NROW = mean(unlist(lapply(lst, nrow))) 581 | 582 | testthat::expect_true(NROW == nrow(xte_class) && NCOL == length(unique(y1_class))) 583 | }) 584 | 585 | 586 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is NULL, when regression = T, when transf_categ_cols = T] the length of the output 587 | 588 |
matches the number of rows of the input", { 589 | 590 | tmp_xtr = xtr 591 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 592 | 593 | lst = count = list() 594 | 595 | for (k in 4:6) { 596 | 597 | for (h in c(0.1, 0.5, 1.0)) { 598 | 599 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 600 | 601 | for (extr in c(T,F)) { 602 | 603 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = NULL, y1, k = k , h = h, method = metric, weights_function = NULL, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 604 | 605 | count = lappend(count, 1) 606 | } 607 | } 608 | } 609 | } 610 | 611 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xtr) && ncol(do.call(cbind, lst)) == length(unlist(count))) 612 | }) 613 | 614 | 615 | 616 | testthat::test_that("if the TEST data is NULL for all posible combinations [ WHEN the weights_function is NULL, when regression = F, when transf_categ_cols = F] the length of the output 617 | 618 | matches the number of rows of the input", { 619 | 620 | lst = count = list() 621 | 622 | for (k in 4:6) { 623 | 624 | for (h in c(0.1, 0.5, 1.0)) { 625 | 626 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 627 | 628 | for (extr in c(T,F)) { 629 | 630 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = NULL, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 631 | 632 | count = lappend(count, 1) 633 | } 634 | } 635 | } 636 | } 637 | 638 | NCOL = mean(unlist(lapply(lst, ncol))) 639 | NROW = mean(unlist(lapply(lst, nrow))) 640 | 641 | testthat::expect_true(NROW == nrow(xtr_class) && NCOL == length(unique(y1_class))) 642 | }) 643 | 644 | 645 | 646 | testthat::test_that("if the TEST data is NULL for all posible combinations [ WHEN the weights_function is NULL, when regression = T, when transf_categ_cols = T] the length of the output 647 | 648 | matches the number of rows of the input", { 649 | 650 | tmp_xtr = xtr 651 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 652 | 653 | tmp_xte = xte 654 | tmp_xte$rad = as.factor(tmp_xte$rad) 655 | 656 | lst = count = list() 657 | 658 | for (k in 4:6) { 659 | 660 | for (h in c(0.1, 0.5, 1.0)) { 661 | 662 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 663 | 664 | for (extr in c(T,F)) { 665 | 666 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = tmp_xte, y1, k = k , h = h, method = metric, weights_function = NULL, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 667 | 668 | count = lappend(count, 1) 669 | } 670 | } 671 | } 672 | } 673 | 674 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xte) && ncol(do.call(cbind, lst)) == length(unlist(count))) 675 | }) 676 | 677 | 678 | 679 | testthat::test_that("if the TEST data is NULL for all posible combinations [ WHEN the weights_function is NULL, when regression = F, when transf_categ_cols = F] the length of the output 680 | 681 | matches the number of rows of the input", { 682 | 683 | lst = count = list() 684 | 685 | for (k in 4:6) { 686 | 687 | for (h in c(0.1, 0.5, 1.0)) { 688 | 689 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 690 | 691 | for (extr in c(T,F)) { 692 | 693 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = xte_class, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 694 | 695 | count = 
lappend(count, 1) 696 | } 697 | } 698 | } 699 | } 700 | 701 | NCOL = mean(unlist(lapply(lst, ncol))) 702 | NROW = mean(unlist(lapply(lst, nrow))) 703 | 704 | testthat::expect_true(NROW == nrow(xte_class) && NCOL == length(unique(y1_class))) 705 | }) 706 | 707 | 708 | 709 | 710 | testthat::test_that("the similarity measures 'simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient' and 'pearson_correlation' return correct output 711 | 712 | in case of binary data", { 713 | 714 | 715 | dat = do.call(cbind, lapply(1:10, function(x) sample(0:1, 100, replace = T))) 716 | TES = do.call(cbind, lapply(1:10, function(x) sample(0:1, 50, replace = T))) 717 | y = sample(1:2, 100, replace = T) 718 | 719 | lst = count = list() 720 | 721 | for (k in 4:6) { 722 | 723 | for (h in c(0.1, 0.5, 1.0)) { 724 | 725 | for (metric in c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient', 'pearson_correlation')) { 726 | 727 | for (extr in c(T,F)) { 728 | 729 | lst = lappend(lst, KernelKnn(dat, TEST_data = TES, y, k = k , h = h, method = metric, weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = unique(y))) 730 | 731 | count = lappend(count, 1) 732 | } 733 | } 734 | } 735 | } 736 | 737 | NCOL = mean(unlist(lapply(lst, ncol))) 738 | NROW = mean(unlist(lapply(lst, nrow))) 739 | 740 | testthat::expect_true(NROW == nrow(TES) && NCOL == length(unique(y))) 741 | }) 742 | 743 | 744 | 745 | testthat::test_that("the 'p' parameter when method is 'minkowski' returns the expected output", { 746 | 747 | k = 2 748 | res_wo = KernelKnn(data = X, y = y, k = k , h = 1.0, method = 'minkowski', regression = T, threads = 1) # without specifying the 'p' parameter 749 | res_w = KernelKnn(data = X, y = y, k = k , h = 1.0, method = 'minkowski', regression = T, threads = 1, p = k) # by specifying the 'p' parameter 750 | res_dif = KernelKnn(data = X, y = y, k = k , h = 1.0, method = 'minkowski', regression = T, threads = 1, p = 1) # 'p' is set to 1 751 | 752 | is_identical = identical(res_wo, res_w) 753 | is_not_identical = identical(res_wo, res_dif) 754 | 755 | testthat::expect_true(is_identical & (!is_not_identical)) 756 | }) 757 | 758 | 759 | #============================================================================================================================================================== 760 | --------------------------------------------------------------------------------