├── .github ├── .gitignore ├── ISSUE_TEMPLATE │ ├── config.yml │ └── issue_template.md ├── workflows │ ├── issue.yml │ ├── pkgdown.yaml │ ├── stale-actions.yml │ ├── R-CMD-check.yaml │ └── docker-image.yml └── dependabot.yml ├── .gitignore ├── data ├── Boston.rda └── ionosphere.rda ├── LICENSE ├── R ├── package.R ├── RcppExports.R ├── dist_knn_index_dist.R ├── knn_index_dist.R ├── kernelknnCV.R ├── dist_kernelknn.R ├── utils.R └── kernelknn.R ├── tests ├── testthat.R └── testthat │ ├── helper-function_for_tests.R │ ├── test-distance_metrics.R │ ├── test-dist_knn_index_dist.R │ ├── test-kernelknn_cross_valid.R │ ├── test-utils.R │ ├── test-knn_index_dist.R │ ├── test-distmat_kernelknn.R │ └── test-kernelknn.R ├── _pkgdown.yml ├── src ├── Makevars ├── Makevars.win ├── init.c ├── RcppExports.cpp └── distance_metrics.cpp ├── man ├── normalized.Rd ├── func_shuffle.Rd ├── func_tbl.Rd ├── func_tbl_dist.Rd ├── class_folds.Rd ├── regr_folds.Rd ├── FUN_kernels.Rd ├── switch.ops.Rd ├── func_categorical_preds.Rd ├── FUNCTION_weights.Rd ├── Boston.Rd ├── distMat.knn.index.dist.Rd ├── knn.index.dist.Rd ├── ionosphere.Rd ├── KernelKnn.Rd ├── distMat.KernelKnn.Rd └── KernelKnnCV.Rd ├── NAMESPACE ├── inst └── CITATION ├── DESCRIPTION ├── Dockerfile ├── README.md ├── NEWS.md └── vignettes ├── regression_using_the_housing_data.Rmd ├── binary_classification_using_the_ionosphere_data.Rmd └── image_classification_using_MNIST_CIFAR_data.Rmd /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/ 2 | .Rbuildignore 3 | .Rhistory 4 | docs 5 | -------------------------------------------------------------------------------- /data/Boston.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/KernelKnn/HEAD/data/Boston.rda -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2016 2 | COPYRIGHT HOLDER: Mouselimis Lampros 3 | -------------------------------------------------------------------------------- /data/ionosphere.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/KernelKnn/HEAD/data/ionosphere.rda -------------------------------------------------------------------------------- /R/package.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib KernelKnn, .registration = TRUE 2 | #' @importFrom Rcpp evalCpp 3 | NULL -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(KernelKnn) 3 | 4 | test_check("KernelKnn") 5 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://mlampros.github.io/KernelKnn/ 2 | 3 | template: 4 | bootstrap: 5 5 | bootswatch: cerulean 6 | 7 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = 
-I../inst/include -I. $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD -DARMA_USE_CURRENT 2 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = -I../inst/include -I. $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD -DARMA_USE_CURRENT 2 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # For more info see: https://docs.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 2 | 3 | blank_issues_enabled: true 4 | -------------------------------------------------------------------------------- /man/normalized.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{normalized} 4 | \alias{normalized} 5 | \title{this function normalizes the data} 6 | \usage{ 7 | normalized(x) 8 | } 9 | \description{ 10 | this function normalizes the data 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/func_shuffle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_shuffle} 4 | \alias{func_shuffle} 5 | \title{shuffle data} 6 | \usage{ 7 | func_shuffle(vec, times = 10) 8 | } 9 | \description{ 10 | this function shuffles the items of a vector 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/func_tbl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_tbl} 4 | \alias{func_tbl} 5 | \title{this function returns a table of probabilities for each label} 6 | \usage{ 7 | func_tbl(DF, W, labels) 8 | } 9 | \description{ 10 | this function returns a table of probabilities for each label 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/func_tbl_dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_tbl_dist} 4 | \alias{func_tbl_dist} 5 | \title{this function returns the probabilities in case of classification} 6 | \usage{ 7 | func_tbl_dist(DF, Levels) 8 | } 9 | \description{ 10 | this function returns the probabilities in case of classification 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(KernelKnn) 4 | export(KernelKnnCV) 5 | export(distMat.KernelKnn) 6 | export(distMat.knn.index.dist) 7 | export(knn.index.dist) 8 | importFrom(Rcpp,evalCpp) 9 | importFrom(stats,dist) 10 | importFrom(stats,model.matrix) 11 | 
importFrom(utils,combn) 12 | importFrom(utils,setTxtProgressBar) 13 | importFrom(utils,txtProgressBar) 14 | useDynLib(KernelKnn, .registration = TRUE) 15 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("Please cite the package in your publications:") 2 | 3 | year <- sub("-.*", "", meta$Date) 4 | note <- sprintf("R package version %s", meta$Version) 5 | 6 | bibentry( 7 | bibtype = "Manual", 8 | title = "{KernelKnn}: Kernel k Nearest Neighbors", 9 | author = person("Lampros", "Mouselimis"), 10 | year = year, 11 | note = note, 12 | url = "https://CRAN.R-project.org/package=KernelKnn" 13 | ) 14 | 15 | 16 | -------------------------------------------------------------------------------- /man/class_folds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{class_folds} 4 | \alias{class_folds} 5 | \title{stratified folds (in classification) [ detailed information about class_folds in the FeatureSelection package ]} 6 | \usage{ 7 | class_folds(folds, RESP) 8 | } 9 | \description{ 10 | this function creates stratified folds in binary and multiclass classification 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/regr_folds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{regr_folds} 4 | \alias{regr_folds} 5 | \title{create folds (in regression) [ detailed information about class_folds in the FeatureSelection package ]} 6 | \usage{ 7 | regr_folds(folds, RESP) 8 | } 9 | \description{ 10 | this function creates both stratified and non-stratified folds in regression 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/FUN_kernels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{FUN_kernels} 4 | \alias{FUN_kernels} 5 | \title{performs kernel smoothing using a bandwidth. Besides using a kernel there is also the option to combine kernels} 6 | \usage{ 7 | FUN_kernels(kernel, W, h) 8 | } 9 | \description{ 10 | performs kernel smoothing using a bandwidth. 
Besides using a kernel there is also the option to combine kernels 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/switch.ops.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{switch.ops} 4 | \alias{switch.ops} 5 | \title{Arithmetic operations on lists} 6 | \usage{ 7 | switch.ops(LST, MODE = "ADD") 8 | } 9 | \description{ 10 | Arithmetic operations on lists 11 | } 12 | \references{ 13 | https://www.cs.toronto.edu/~duvenaud/cookbook/ 14 | 15 | https://raw.githubusercontent.com/duvenaud/phd-thesis/master/kernels.pdf 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/func_categorical_preds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_categorical_preds} 4 | \alias{func_categorical_preds} 5 | \title{OPTION to convert categorical features TO either numeric [ if levels more than 32] OR to dummy variables [ if levels less than 32 ]} 6 | \usage{ 7 | func_categorical_preds(prepr_categ) 8 | } 9 | \description{ 10 | OPTION to convert categorical features TO either numeric [ if levels more than 32] OR to dummy variables [ if levels less than 32 ] 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | knn_index_dist_rcpp <- function(MATRIX, TEST_DATA, k, method, threads, p, eps = 1.0e-6) { 5 | .Call(`_KernelKnn_knn_index_dist_rcpp`, MATRIX, TEST_DATA, k, method, threads, p, eps) 6 | } 7 | 8 | DIST_MATRIX_knn <- function(DIST_MAT, TEST_IDX = NULL, is_min = TRUE, k = 5L, threads = 1L, rcpp_list_names = FALSE) { 9 | .Call(`_KernelKnn_DIST_MATRIX_knn`, DIST_MAT, TEST_IDX, is_min, k, threads, rcpp_list_names) 10 | } 11 | 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report or feature request 3 | about: Describe a bug you've encountered or make a case for a new feature 4 | --- 5 | 6 | Please briefly describe your problem and what output you expect. 
If you have a question, please state it as clearly as possible. 7 | 8 | Please include a minimal reproducible example 9 | 10 | Please give a brief description of the problem 11 | 12 | Please add your Operating System (e.g., Windows10, Macintosh, Linux) and the R version that you use (e.g., 4.0.2) 13 | -------------------------------------------------------------------------------- /man/FUNCTION_weights.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{FUNCTION_weights} 4 | \alias{FUNCTION_weights} 5 | \title{this function is used as a kernel-function-identifier [ takes the distances and a weights-kernel (in form of a function) and returns weights ]} 6 | \usage{ 7 | FUNCTION_weights(W_dist_matrix, weights_function, eps = 1e-06) 8 | } 9 | \description{ 10 | this function is used as a kernel-function-identifier [ takes the distances and a weights-kernel (in form of a function) and returns weights ] 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /.github/workflows/issue.yml: -------------------------------------------------------------------------------- 1 | # For more info see: https://github.com/Renato66/auto-label 2 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret 3 | 4 | name: Labeling new issue 5 | on: 6 | issues: 7 | types: ['opened'] 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: Renato66/auto-label@v3 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | ignore-comments: true 16 | labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help","how can i"]}' 17 | labels-not-allowed: '["documentation","duplicate","good first issue","help wanted","invalid"]' 18 | default-labels: '["triage"]' 19 | -------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | #include <R.h> 2 | #include <Rinternals.h> 3 | #include <stdlib.h> // for NULL 4 | #include <R_ext/Rdynload.h> 5 | 6 | /* FIXME: 7 | Check these declarations against the C/Fortran source code.
8 | */ 9 | 10 | /* .Call calls */ 11 | extern SEXP _KernelKnn_DIST_MATRIX_knn(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 12 | extern SEXP _KernelKnn_knn_index_dist_rcpp(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 13 | 14 | static const R_CallMethodDef CallEntries[] = { 15 | {"_KernelKnn_DIST_MATRIX_knn", (DL_FUNC) &_KernelKnn_DIST_MATRIX_knn, 6}, 16 | {"_KernelKnn_knn_index_dist_rcpp", (DL_FUNC) &_KernelKnn_knn_index_dist_rcpp, 7}, 17 | {NULL, NULL, 0} 18 | }; 19 | 20 | void R_init_KernelKnn(DllInfo *dll) 21 | { 22 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 23 | R_useDynamicSymbols(dll, FALSE); 24 | } 25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Monitor Docker dependencies 4 | - package-ecosystem: "docker" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | open-pull-requests-limit: 3 9 | reviewers: 10 | - "mlampros" 11 | assignees: 12 | - "mlampros" 13 | labels: 14 | - "dependencies" 15 | - "docker" 16 | commit-message: 17 | prefix: "docker" 18 | include: "scope" 19 | 20 | # Monitor GitHub Actions (if you have any workflows) 21 | - package-ecosystem: "github-actions" 22 | directory: "/" 23 | schedule: 24 | interval: "monthly" 25 | open-pull-requests-limit: 5 26 | reviewers: 27 | - "mlampros" 28 | assignees: 29 | - "mlampros" 30 | labels: 31 | - "dependencies" 32 | - "github-actions" 33 | commit-message: 34 | prefix: "ci" 35 | include: "scope" 36 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: KernelKnn 2 | Type: Package 3 | Title: Kernel k Nearest Neighbors 4 | Version: 1.1.6 5 | Date: 2025-09-14 6 | Authors@R: c( person(given = "Lampros", family = "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-8024-1546")), person(given = "Matthew", family = "Parks", email = "parksmatthewm@gmail.com", role = "ctb", comment = "Github Contributor") ) 7 | BugReports: https://github.com/mlampros/KernelKnn/issues 8 | URL: https://github.com/mlampros/KernelKnn, https://mlampros.github.io/KernelKnn/ 9 | Description: Extends the simple k-nearest neighbors algorithm by incorporating numerous kernel functions and a variety of distance metrics. The package takes advantage of 'RcppArmadillo' to speed up the calculation of distances between observations. 
10 | License: MIT + file LICENSE 11 | LazyData: TRUE 12 | Encoding: UTF-8 13 | Depends: R(>= 2.10.0) 14 | Imports: Rcpp (>= 0.12.5) 15 | LinkingTo: Rcpp, RcppArmadillo 16 | Suggests: 17 | testthat, 18 | covr, 19 | knitr, 20 | rmarkdown 21 | VignetteBuilder: knitr 22 | RoxygenNote: 7.3.2 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/rstudio:devel 2 | LABEL maintainer='Lampros Mouselimis' 3 | 4 | RUN export DEBIAN_FRONTEND=noninteractive; apt-get -y update && \ 5 | apt-get install -y git-core pandoc libssl-dev libcurl4-openssl-dev && \ 6 | apt-get install -y sudo && \ 7 | apt-get install -y libarmadillo-dev && \ 8 | apt-get install -y libblas-dev && \ 9 | apt-get install -y liblapack-dev && \ 10 | apt-get install -y libarpack++2-dev && \ 11 | apt-get install -y gfortran && \ 12 | apt-get install -y libxml2-dev && \ 13 | apt-get install -y libssh2-1-dev && \ 14 | apt-get install -y zlib1g-dev && \ 15 | R -e "install.packages('devtools', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \ 16 | R -e "install.packages(c( 'Rcpp', 'RcppArmadillo', 'testthat', 'covr', 'knitr', 'rmarkdown', 'remotes' ), repos = 'https://cloud.r-project.org/' )" && \ 17 | apt-get autoremove -y && \ 18 | apt-get clean && \ 19 | rm -rf /var/lib/apt/lists/* 20 | 21 | ADD http://www.random.org/strings/?num=10&len=8&digits=on&upperalpha=on&loweralpha=on&unique=on&format=plain&rnd=new uuid 22 | ARG BUILD_DATE 23 | 24 | RUN echo "$BUILD_DATE" 25 | RUN R -e "remotes::install_github('mlampros/KernelKnn', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" 26 | 27 | ENV USER=rstudio 28 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | permissions: 24 | contents: write 25 | steps: 26 | - uses: actions/checkout@v5 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | with: 36 | extra-packages: any::pkgdown, local::. 
37 | needs: website 38 | 39 | - name: Build site 40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 41 | shell: Rscript {0} 42 | 43 | - name: Deploy to GitHub pages 🚀 44 | if: github.event_name != 'pull_request' 45 | uses: JamesIves/github-pages-deploy-action@v4.7.3 46 | with: 47 | clean: false 48 | branch: gh-pages 49 | folder: docs 50 | -------------------------------------------------------------------------------- /.github/workflows/stale-actions.yml: -------------------------------------------------------------------------------- 1 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret 2 | 3 | name: "Mark or close stale issues and PRs" 4 | 5 | on: 6 | schedule: 7 | - cron: "00 * * * *" 8 | 9 | jobs: 10 | stale: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/stale@v10 14 | with: 15 | repo-token: ${{ secrets.GITHUB_TOKEN }} 16 | days-before-stale: 12 17 | days-before-close: 7 18 | stale-issue-message: "This is Robo-lampros because the Human-lampros is lazy. This issue has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs. Feel free to re-open a closed issue and the Human-lampros will respond." 19 | stale-pr-message: "This is Robo-lampros because the Human-lampros is lazy. This PR has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs." 20 | close-issue-message: "This issue was automatically closed because of being stale. Feel free to re-open a closed issue and the Human-lampros will respond." 21 | close-pr-message: "This PR was automatically closed because of being stale." 22 | stale-pr-label: "stale" 23 | stale-issue-label: "stale" 24 | exempt-issue-labels: "bug,enhancement,pinned,security,pending,work_in_progress" 25 | exempt-pr-labels: "bug,enhancement,pinned,security,pending,work_in_progress" 26 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macos-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | R_KEEP_PKG_SOURCE: yes 31 | 32 | steps: 33 | - uses: actions/checkout@v5 34 | 35 | - uses: r-lib/actions/setup-pandoc@v2 36 | 37 | - uses: r-lib/actions/setup-r@v2 38 | with: 39 | r-version: ${{ matrix.config.r }} 40 | http-user-agent: ${{ matrix.config.http-user-agent }} 41 | use-public-rspm: true 42 | 43 | - uses: r-lib/actions/setup-r-dependencies@v2 44 | with: 45 | extra-packages: any::rcmdcheck 46 | needs: check 47 | 48 | - uses: r-lib/actions/check-r-package@v2 49 | with: 50 | upload-snapshots: true 51 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 52 | -------------------------------------------------------------------------------- /tests/testthat/helper-function_for_tests.R: -------------------------------------------------------------------------------- 1 | #=================================================================================================================================== 2 | 3 | # REGRESSION 4 | #----------- 5 | 6 | data(Boston, package = 'KernelKnn') 7 | 8 | X = Boston[, -dim(Boston)[2]] 9 | xtr = X[1:350, ] 10 | xte = X[351:nrow(X), ] 11 | y1 = Boston[1:350, dim(Boston)[2]] 12 | y = Boston[, dim(Boston)[2]] 13 | 14 | 15 | # CLASSIFICATION 16 | #--------------- 17 | 18 | data(ionosphere, package = 'KernelKnn') 19 | 20 | singular_mat = ionosphere 21 | ionosphere1 = ionosphere[, -2] 22 | ionosphere = ionosphere[, -c(2, ncol(ionosphere))] # remove second column which has a single unique value 23 | xtr_class = ionosphere[1:200, ] 24 | xte_class = ionosphere[201:nrow(ionosphere), ] 25 | 26 | X_class = ionosphere1[, -dim(ionosphere1)[2]] 27 | xtr_class = X_class[1:200, ] 28 | xte_class = X_class[201:nrow(X_class), ] 29 | y1_class = ionosphere1[1:200, dim(ionosphere1)[2]] 30 | y1_class_ext = ionosphere1[, dim(ionosphere1)[2]] 31 | 32 | 33 | # DISTANCE MATRIX (REGRESSION) 34 | 35 | DIST_obj = stats::dist(X, method = "euclidean") 36 | DIST_mat = as.matrix(DIST_obj) 37 | 38 | DIST_obj_class = stats::dist(ionosphere, method = "euclidean") 39 | DIST_mat_class = as.matrix(DIST_obj_class) 40 | 41 | 42 | # utility function ( lappend ) 43 | #----------------------------- 44 | 45 | lappend <- function(lst, ...){ # lappend() function to append vector to list 46 | 47 | lst <- c(lst, list(...)) 48 | 49 | return(lst) 50 | } 51 | 52 | #=================================================================================================================================== -------------------------------------------------------------------------------- /man/Boston.Rd: -------------------------------------------------------------------------------- 1 | \name{Boston} 2 | \alias{Boston} 3 | \docType{data} 4 | \title{ 5 | Boston Housing Data (Regression) 6 | } 7 | \description{ 8 | housing values in suburbs of Boston 9 | } 10 | \usage{data(Boston)} 11 | \format{ 12 | A data frame with 506 Instances and 
14 attributes (including the class attribute, "medv") 13 | \describe{ 14 | \item{\code{crim}}{per capita crime rate by town} 15 | \item{\code{zn}}{proportion of residential land zoned for lots over 25,000 sq.ft.} 16 | \item{\code{indus}}{proportion of non-retail business acres per town} 17 | \item{\code{chas}}{Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)} 18 | \item{\code{nox}}{nitric oxides concentration (parts per 10 million)} 19 | \item{\code{rm}}{average number of rooms per dwelling} 20 | \item{\code{age}}{proportion of owner-occupied units built prior to 1940} 21 | \item{\code{dis}}{weighted distances to five Boston employment centres} 22 | \item{\code{rad}}{index of accessibility to radial highways} 23 | \item{\code{tax}}{full-value property-tax rate per $10,000} 24 | \item{\code{ptratio}}{pupil-teacher ratio by town} 25 | \item{\code{black}}{1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town} 26 | \item{\code{lstat}}{percentage of lower status of the population} 27 | \item{\code{medv}}{Median value of owner-occupied homes in $1000's} 28 | } 29 | } 30 | \source{ 31 | This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University. 32 | 33 | Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. 34 | } 35 | \references{ 36 | https://archive.ics.uci.edu/ml/datasets/Housing 37 | } 38 | \examples{ 39 | 40 | data(Boston) 41 | 42 | X = Boston[, -ncol(Boston)] 43 | 44 | y = Boston[, ncol(Boston)] 45 | } 46 | \keyword{datasets} 47 | -------------------------------------------------------------------------------- /man/distMat.knn.index.dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dist_knn_index_dist.R 3 | \name{distMat.knn.index.dist} 4 | \alias{distMat.knn.index.dist} 5 | \title{indices and distances of k-nearest-neighbors using a distance matrix} 6 | \usage{ 7 | distMat.knn.index.dist( 8 | DIST_mat, 9 | TEST_indices = NULL, 10 | k = 5, 11 | threads = 1, 12 | minimize = T 13 | ) 14 | } 15 | \arguments{ 16 | \item{DIST_mat}{a distance matrix (square matrix) having a diagonal filled with either zero's (\emph{0}) or NA's (\emph{missing values})} 17 | 18 | \item{TEST_indices}{a numeric vector specifying the indices of the test data in the distance matrix (row-wise or column-wise). If the parameter equals NULL then no test data is included in the distance matrix} 19 | 20 | \item{k}{an integer specifying the k-nearest-neighbors} 21 | 22 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 23 | 24 | \item{minimize}{either TRUE or FALSE. If TRUE then lower values will be considered as relevant for the k-nearest search, otherwise higher values.} 25 | } 26 | \value{ 27 | a list of length 2. The first sublist returns the indices and the second the distances of the k nearest neighbors for each observation. 28 | If TEST_indices is NULL the number of rows of each sublist equals the number of rows in the DIST_mat data. If TEST_indices is not NULL the number of rows of each sublist equals the length of the input TEST_indices. 29 | } 30 | \description{ 31 | indices and distances of k-nearest-neighbors using a distance matrix 32 | } 33 | \details{ 34 | This function takes a number of arguments and it returns the indices and distances of the k-nearest-neighbors for each observation.
If TEST_indices is NULL then the indices-distances for the DIST_mat will be returned, whereas if TEST_indices is not NULL then the indices-distances for the test data only will be returned. 35 | } 36 | \examples{ 37 | 38 | data(Boston) 39 | 40 | X = Boston[, -ncol(Boston)] 41 | 42 | dist_obj = dist(X) 43 | 44 | dist_mat = as.matrix(dist_obj) 45 | 46 | out = distMat.knn.index.dist(dist_mat, TEST_indices = NULL, k = 5) 47 | 48 | } 49 | \author{ 50 | Lampros Mouselimis 51 | } 52 | -------------------------------------------------------------------------------- /man/knn.index.dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/knn_index_dist.R 3 | \name{knn.index.dist} 4 | \alias{knn.index.dist} 5 | \title{indices and distances of k-nearest-neighbors} 6 | \usage{ 7 | knn.index.dist( 8 | data, 9 | TEST_data = NULL, 10 | k = 5, 11 | method = "euclidean", 12 | transf_categ_cols = F, 13 | threads = 1, 14 | p = k 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{a data.frame or matrix} 19 | 20 | \item{TEST_data}{a data.frame or matrix (it can be also NULL)} 21 | 22 | \item{k}{an integer specifying the k-nearest-neighbors} 23 | 24 | \item{method}{a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'} 25 | 26 | \item{transf_categ_cols}{a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables} 27 | 28 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 29 | 30 | \item{p}{a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k'} 31 | } 32 | \value{ 33 | a list of length 2. The first sublist returns the indices and the second the distances of the k nearest neighbors for each observation. 34 | If TEST_data is NULL the number of rows of each sublist equals the number of rows in the train data. If TEST_data is not NULL the number of rows of each sublist equals the number of rows in the TEST data. 35 | } 36 | \description{ 37 | This function returns the k nearest indices and distances of each observation 38 | } 39 | \details{ 40 | This function takes a number of arguments and it returns the indices and distances of the k-nearest-neighbors for each observation. If TEST_data is NULL then the indices-distances for the train data will be returned, whereas if TEST_data is not NULL then the indices-distances for the TEST_data will be returned.
41 | } 42 | \examples{ 43 | 44 | data(Boston) 45 | 46 | X = Boston[, -ncol(Boston)] 47 | 48 | out = knn.index.dist(X, TEST_data = NULL, k = 4, method = 'euclidean', threads = 1) 49 | 50 | } 51 | \author{ 52 | Lampros Mouselimis 53 | } 54 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include <RcppArmadillo.h> 5 | #include <Rcpp.h> 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // knn_index_dist_rcpp 15 | Rcpp::List knn_index_dist_rcpp(arma::mat& MATRIX, arma::mat& TEST_DATA, int k, std::string& method, int threads, double p, double eps); 16 | RcppExport SEXP _KernelKnn_knn_index_dist_rcpp(SEXP MATRIXSEXP, SEXP TEST_DATASEXP, SEXP kSEXP, SEXP methodSEXP, SEXP threadsSEXP, SEXP pSEXP, SEXP epsSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< arma::mat& >::type MATRIX(MATRIXSEXP); 21 | Rcpp::traits::input_parameter< arma::mat& >::type TEST_DATA(TEST_DATASEXP); 22 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 23 | Rcpp::traits::input_parameter< std::string& >::type method(methodSEXP); 24 | Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); 25 | Rcpp::traits::input_parameter< double >::type p(pSEXP); 26 | Rcpp::traits::input_parameter< double >::type eps(epsSEXP); 27 | rcpp_result_gen = Rcpp::wrap(knn_index_dist_rcpp(MATRIX, TEST_DATA, k, method, threads, p, eps)); 28 | return rcpp_result_gen; 29 | END_RCPP 30 | } 31 | // DIST_MATRIX_knn 32 | Rcpp::List DIST_MATRIX_knn(arma::mat& DIST_MAT, Rcpp::Nullable<Rcpp::NumericVector> TEST_IDX, bool is_min, int k, int threads, bool rcpp_list_names); 33 | RcppExport SEXP _KernelKnn_DIST_MATRIX_knn(SEXP DIST_MATSEXP, SEXP TEST_IDXSEXP, SEXP is_minSEXP, SEXP kSEXP, SEXP threadsSEXP, SEXP rcpp_list_namesSEXP) { 34 | BEGIN_RCPP 35 | Rcpp::RObject rcpp_result_gen; 36 | Rcpp::RNGScope rcpp_rngScope_gen; 37 | Rcpp::traits::input_parameter< arma::mat& >::type DIST_MAT(DIST_MATSEXP); 38 | Rcpp::traits::input_parameter< Rcpp::Nullable<Rcpp::NumericVector> >::type TEST_IDX(TEST_IDXSEXP); 39 | Rcpp::traits::input_parameter< bool >::type is_min(is_minSEXP); 40 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 41 | Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); 42 | Rcpp::traits::input_parameter< bool >::type rcpp_list_names(rcpp_list_namesSEXP); 43 | rcpp_result_gen = Rcpp::wrap(DIST_MATRIX_knn(DIST_MAT, TEST_IDX, is_min, k, threads, rcpp_list_names)); 44 | return rcpp_result_gen; 45 | END_RCPP 46 | } 47 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | 2 | #.......................................................................................................... 3 | # build, push and cache the docker image.
I have to adjust the following in case of a different repository: 4 | # - I have to add the 'BUILD_DATE' arg in the Dockerfile 5 | # - I have to create a DOCKER_PASSWORD (use the docker token) in the 'Settings' tab of the repository 6 | # - This github action also updates the dockerhub readme file 7 | # References: 8 | # - https://github.com/mlampros/IceSat2R/blob/master/.github/workflows/docker_image.yml 9 | # - https://github.com/orgs/community/discussions/25768#discussioncomment-3249184 10 | #.......................................................................................................... 11 | 12 | on: 13 | push: 14 | branches: [main, master] 15 | pull_request: 16 | branches: [main, master] 17 | 18 | name: docker_img 19 | 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | - name: Check Out Repo 26 | uses: actions/checkout@v5 27 | 28 | - id: string 29 | uses: ASzc/change-string-case-action@v6 30 | with: 31 | string: ${{ github.event.repository.name }} 32 | 33 | - name: Login to Docker Hub 34 | uses: docker/login-action@v3 35 | with: 36 | username: ${{ github.repository_owner }} 37 | password: ${{ secrets.DOCKER_PASSWORD }} 38 | 39 | - name: Set up Docker Buildx 40 | id: buildx 41 | uses: docker/setup-buildx-action@v3 42 | 43 | - name: Build and push 44 | uses: docker/build-push-action@v6 45 | with: 46 | context: ./ 47 | build-args: BUILD_DATE="$(date -u +'%Y-%m-%dT%H:%M:%SZ')" 48 | file: ./Dockerfile 49 | builder: ${{ steps.buildx.outputs.name }} 50 | push: true 51 | tags: ${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:rstudiodev 52 | cache-from: type=registry,ref=${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:buildcache 53 | cache-to: type=registry,ref=${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }}:buildcache,mode=max 54 | 55 | - name: Update Docker Hub Description 56 | uses: peter-evans/dockerhub-description@v4 57 | with: 58 | username: ${{ github.repository_owner }} 59 | password: ${{ secrets.DOCKER_PASSWORD }} 60 | repository: ${{ github.repository_owner }}/${{ steps.string.outputs.lowercase }} 61 | short-description: ${{ github.event.repository.description }} 62 | readme-filepath: ./README.md 63 | -------------------------------------------------------------------------------- /man/ionosphere.Rd: -------------------------------------------------------------------------------- 1 | \name{ionosphere} 2 | \alias{ionosphere} 3 | \docType{data} 4 | \title{ 5 | Johns Hopkins University Ionosphere database (binary classification) 6 | } 7 | \description{ 8 | This radar data was collected by a system in Goose Bay, Labrador. 9 | This system consists of a phased array of 16 high-frequency 10 | antennas with a total transmitted power on the order of 6.4 kilowatts. See the paper for more details. The targets were free 11 | electrons in the ionosphere. "Good" radar returns are those showing evidence of some type of structure in the ionosphere. 12 | "Bad" returns are those that do not; their signals pass through the ionosphere. Received signals were processed using an autocorrelation 13 | function whose arguments are the time of a pulse and the pulse number. There were 17 pulse numbers for the Goose Bay system. 14 | Instances in this database are described by 2 attributes per pulse number, corresponding to the complex values returned by the 15 | function resulting from the complex electromagnetic signal.
16 | } 17 | \usage{data(ionosphere)} 18 | \format{ 19 | A data frame with 351 Instances and 35 attributes (including the class attribute, "class") 20 | } 21 | \details{ 22 | Sigillito, V. G., Wing, S. P., Hutton, L. V., Baker, K. B. (1989). Classification of radar returns from the ionosphere using neural networks. 23 | Johns Hopkins APL Technical Digest, 10, 262-266. 24 | 25 | They investigated using backprop and the perceptron training algorithm on this database. Using the first 200 instances for training, which 26 | were carefully split almost 50 percent positive and 50 percent negative, they found that a "linear" perceptron attained 90.7 percent, 27 | a "non-linear" perceptron attained 92 percent, and backprop an average of over 96 percent accuracy on the remaining 150 test instances, 28 | consisting of 123 "good" and only 24 "bad" instances. (There was a counting error or some mistake somewhere; there are a total of 351 rather 29 | than 350 instances in this domain.) Accuracy on "good" instances was much higher than for "bad" instances. Backprop was tested with several 30 | different numbers of hidden units (in [0,15]) and incremental results were also reported (corresponding to how well the different variants of 31 | backprop did after a periodic number of epochs). David Aha (aha@ics.uci.edu) briefly investigated this database. He found that nearest neighbor 32 | attains an accuracy of 92.1 percent, that Ross Quinlan's C4 algorithm attains 94.0 percent (no windowing), and that IB3 (Aha & Kibler, IJCAI-1989) 33 | attained 96.7 percent (parameter settings: 70 percent and 80 percent for acceptance and dropping respectively). 34 | } 35 | \source{ 36 | Donor: Vince Sigillito (vgs@aplcen.apl.jhu.edu) 37 | 38 | Date: 1989 39 | 40 | Source: Space Physics Group 41 | 42 | Applied Physics Laboratory 43 | 44 | Johns Hopkins University 45 | 46 | Johns Hopkins Road 47 | 48 | Laurel, MD 20723 49 | } 50 | \references{ 51 | https://archive.ics.uci.edu/ml/datasets/Ionosphere 52 | } 53 | \examples{ 54 | 55 | data(ionosphere) 56 | 57 | X = ionosphere[, -ncol(ionosphere)] 58 | 59 | y = ionosphere[, ncol(ionosphere)] 60 | } 61 | \keyword{datasets} 62 | -------------------------------------------------------------------------------- /man/KernelKnn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kernelknn.R 3 | \name{KernelKnn} 4 | \alias{KernelKnn} 5 | \title{kernel k-nearest-neighbors} 6 | \usage{ 7 | KernelKnn( 8 | data, 9 | TEST_data = NULL, 10 | y, 11 | k = 5, 12 | h = 1, 13 | method = "euclidean", 14 | weights_function = NULL, 15 | regression = F, 16 | transf_categ_cols = F, 17 | threads = 1, 18 | extrema = F, 19 | Levels = NULL, 20 | p = k 21 | ) 22 | } 23 | \arguments{ 24 | \item{data}{a data frame or matrix} 25 | 26 | \item{TEST_data}{a data frame or matrix (it can be also NULL)} 27 | 28 | \item{y}{a numeric vector (in classification the labels must be numeric from 1:Inf)} 29 | 30 | \item{k}{an integer specifying the k-nearest-neighbors} 31 | 32 | \item{h}{the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0)} 33 | 34 | \item{method}{a string specifying the method. 
Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'} 35 | 36 | \item{weights_function}{there are various ways of specifying the kernel function. See the details section.} 37 | 38 | \item{regression}{a boolean (TRUE,FALSE) specifying if regression or classification should be performed} 39 | 40 | \item{transf_categ_cols}{a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables} 41 | 42 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 43 | 44 | \item{extrema}{if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal)} 45 | 46 | \item{Levels}{a numeric vector. In case of classification the unique levels of the response variable are necessary} 47 | 48 | \item{p}{a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k'} 49 | } 50 | \value{ 51 | a vector (if regression is TRUE), or a data frame with class probabilities (if regression is FALSE) 52 | } 53 | \description{ 54 | This function utilizes kernel k nearest neighbors to predict new observations 55 | } 56 | \details{ 57 | This function takes a number of arguments and it returns the predicted values. If TEST_data is NULL then the predictions for the train data will be returned, whereas if TEST_data is not NULL then the predictions for the TEST_data will be returned. 58 | There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 3rd option : a user defined kernel function 59 | } 60 | \examples{ 61 | 62 | data(Boston) 63 | 64 | X = Boston[, -ncol(Boston)] 65 | y = Boston[, ncol(Boston)] 66 | 67 | out = KernelKnn(X, TEST_data = NULL, y, k = 5, method = 'euclidean', regression = TRUE) 68 | 69 | } 70 | \author{ 71 | Lampros Mouselimis 72 | } 73 | -------------------------------------------------------------------------------- /R/dist_knn_index_dist.R: -------------------------------------------------------------------------------- 1 | 2 | #' indices and distances of k-nearest-neighbors using a distance matrix 3 | #' 4 | #' @param DIST_mat a distance matrix (square matrix) having a diagonal filled with either zero's (\emph{0}) or NA's (\emph{missing values}) 5 | #' @param TEST_indices a numeric vector specifying the indices of the test data in the distance matrix (row-wise or column-wise). If the parameter equals NULL then no test data is included in the distance matrix 6 | #' @param k an integer specifying the k-nearest-neighbors 7 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 8 | #' @param minimize either TRUE or FALSE. 
If TRUE then lower values will be considered as relevant for the k-nearest search, otherwise higher values. 9 | #' @return a list of length 2. The first sublist returns the indices and the second the distances of the k nearest neighbors for each observation. 10 | #' If TEST_indices is NULL the number of rows of each sublist equals the number of rows in the DIST_mat data. If TEST_indices is not NULL the number of rows of each sublist equals the length of the input TEST_indices. 11 | #' @author Lampros Mouselimis 12 | #' @details 13 | #' This function takes a number of arguments and it returns the indices and distances of the k-nearest-neighbors for each observation. If TEST_indices is NULL then the indices-distances for the DIST_mat will be returned, whereas if TEST_indices is not NULL then the indices-distances for the test data only will be returned. 14 | #' @export 15 | #' @examples 16 | #' 17 | #' data(Boston) 18 | #' 19 | #' X = Boston[, -ncol(Boston)] 20 | #' 21 | #' dist_obj = dist(X) 22 | #' 23 | #' dist_mat = as.matrix(dist_obj) 24 | #' 25 | #' out = distMat.knn.index.dist(dist_mat, TEST_indices = NULL, k = 5) 26 | #' 27 | 28 | 29 | distMat.knn.index.dist = function(DIST_mat, TEST_indices = NULL, k = 5, threads = 1, minimize = T) { 30 | 31 | if (!is.matrix(DIST_mat)) stop("the 'DIST_mat' parameter should be of type matrix") 32 | if (nrow(DIST_mat) != ncol(DIST_mat)) stop("the input 'DIST_mat' should be a square matrix with number of rows equal to number of columns") 33 | DIAG = diag(DIST_mat) 34 | nas = all(is.na(DIAG)) 35 | if (nas) { 36 | diag(DIST_mat) = 0 } # set diagonal to 0.0 if equal to NA 37 | else { 38 | if (sum(DIAG) != 0) { 39 | stop("the diagonal of the distance matrix must be a vector of zeros or NA's") 40 | } 41 | } 42 | if (!is.null(TEST_indices)) { 43 | if (!inherits(TEST_indices, c("numeric", "integer"))) stop("the 'TEST_indices' parameter should be a numeric vector") 44 | if (max(TEST_indices) > nrow(DIST_mat)) stop('the maximum number of the TEST_indices is greater than the rows of the input distance matrix') 45 | tr_idx = 1:nrow(DIST_mat) 46 | tr_idx = tr_idx[-TEST_indices] 47 | if (!(min(TEST_indices) > max(tr_idx))) stop("The minimum index of the 'TEST_indices' parameter must be greater than the maximum index of the 'DIST_mat' train data! Make sure that the 'TEST_indices' consist of the last indices of the 'DIST_mat' parameter!") 48 | } 49 | if (!is.numeric(k) || is.null(k) || (k >= nrow(DIST_mat)) || k < 1) stop('k must be of type integer, greater than 0 and less than nrow(DIST_mat)') 50 | if (abs(k - round(k)) > 0) { 51 | k = round(k) 52 | warning('k is float and will be rounded to : ', call.
= F, expr = k)} 53 | if (any(is.na(DIST_mat))) stop('the DIST_mat includes missing values') 54 | if (!inherits(minimize, "logical")) stop("the 'minimize' parameter should be either TRUE or FALSE") 55 | 56 | res = DIST_MATRIX_knn(DIST_mat, TEST_indices, minimize, k, threads, T) 57 | 58 | return(res) 59 | } 60 | 61 | 62 | -------------------------------------------------------------------------------- /man/distMat.KernelKnn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dist_kernelknn.R 3 | \name{distMat.KernelKnn} 4 | \alias{distMat.KernelKnn} 5 | \title{kernel k-nearest-neighbors using a distance matrix} 6 | \usage{ 7 | distMat.KernelKnn( 8 | DIST_mat, 9 | TEST_indices = NULL, 10 | y, 11 | k = 5, 12 | h = 1, 13 | weights_function = NULL, 14 | regression = F, 15 | threads = 1, 16 | extrema = F, 17 | Levels = NULL, 18 | minimize = T 19 | ) 20 | } 21 | \arguments{ 22 | \item{DIST_mat}{a distance matrix (square matrix) having a \emph{diagonal} filled with either zero's (\emph{0}) or NA's (\emph{missing values})} 23 | 24 | \item{TEST_indices}{a numeric vector specifying the indices of the test data in the distance matrix (row-wise or column-wise). If the parameter equals NULL then no test data is included in the distance matrix} 25 | 26 | \item{y}{a numeric vector (in classification the labels must be numeric from 1:Inf). It is assumed that if the \emph{TEST_indices} is not NULL then the length of \emph{y} equals to the rows of the train data \emph{( nrow(DIST_mat) - length(TEST_indices) )}, otherwise \emph{length(y) == nrow(DIST_mat)}.} 27 | 28 | \item{k}{an integer specifying the k-nearest-neighbors} 29 | 30 | \item{h}{the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0)} 31 | 32 | \item{weights_function}{there are various ways of specifying the kernel function. See the details section.} 33 | 34 | \item{regression}{a boolean (TRUE,FALSE) specifying if regression or classification should be performed} 35 | 36 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 37 | 38 | \item{extrema}{if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal)} 39 | 40 | \item{Levels}{a numeric vector. In case of classification the unique levels of the response variable are necessary} 41 | 42 | \item{minimize}{either TRUE or FALSE. If TRUE then lower values will be considered as relevant for the k-nearest search, otherwise higher values.} 43 | } 44 | \value{ 45 | a vector (if regression is TRUE), or a data frame with class probabilities (if regression is FALSE) 46 | } 47 | \description{ 48 | kernel k-nearest-neighbors using a distance matrix 49 | } 50 | \details{ 51 | This function takes a distance matrix (square matrix where the diagonal is filled with \emph{0} or \emph{NA}) as input. If the \emph{TEST_indices} parameter is NULL then the predictions for the train data will be returned, whereas if the \emph{TEST_indices} parameter is not NULL then the predictions for the test data will be returned. 52 | There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. 
The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 3rd option : a user defined kernel function 53 | } 54 | \examples{ 55 | 56 | data(Boston) 57 | 58 | X = Boston[, -ncol(Boston)] 59 | y = Boston[, ncol(Boston)] 60 | 61 | dist_obj = dist(X) 62 | 63 | dist_mat = as.matrix(dist_obj) 64 | 65 | out = distMat.KernelKnn(dist_mat, TEST_indices = NULL, y, k = 5, regression = TRUE) 66 | 67 | } 68 | \author{ 69 | Lampros Mouselimis 70 | } 71 | -------------------------------------------------------------------------------- /man/KernelKnnCV.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kernelknnCV.R 3 | \name{KernelKnnCV} 4 | \alias{KernelKnnCV} 5 | \title{kernel-k-nearest-neighbors using cross-validation} 6 | \usage{ 7 | KernelKnnCV( 8 | data, 9 | y, 10 | k = 5, 11 | folds = 5, 12 | h = 1, 13 | method = "euclidean", 14 | weights_function = NULL, 15 | regression = F, 16 | transf_categ_cols = F, 17 | threads = 1, 18 | extrema = F, 19 | Levels = NULL, 20 | seed_num = 1, 21 | p = k 22 | ) 23 | } 24 | \arguments{ 25 | \item{data}{a data frame or matrix} 26 | 27 | \item{y}{a numeric vector (in classification the labels must be numeric from 1:Inf)} 28 | 29 | \item{k}{an integer specifying the k-nearest-neighbors} 30 | 31 | \item{folds}{the number of cross validation folds (must be greater than 1)} 32 | 33 | \item{h}{the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0)} 34 | 35 | \item{method}{a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'} 36 | 37 | \item{weights_function}{there are various ways of specifying the kernel function. See the details section.} 38 | 39 | \item{regression}{a boolean (TRUE,FALSE) specifying if regression or classification should be performed} 40 | 41 | \item{transf_categ_cols}{a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables} 42 | 43 | \item{threads}{the number of cores to be used in parallel (openmp will be employed)} 44 | 45 | \item{extrema}{if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal)} 46 | 47 | \item{Levels}{a numeric vector. In case of classification the unique levels of the response variable are necessary} 48 | 49 | \item{seed_num}{a numeric value specifying the seed of the random number generator} 50 | 51 | \item{p}{a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k'} 52 | } 53 | \value{ 54 | a list of length 2. The first sublist is a list of predictions (the length of the list equals the number of the folds). The second sublist is a list with the indices for each fold. 
55 | } 56 | \description{ 57 | This function performs kernel k nearest neighbors regression and classification using cross validation 58 | } 59 | \details{ 60 | This function takes a number of arguments (including the number of cross-validation-folds) and it returns predicted values and indices for each fold. 61 | There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 3rd option : a user defined kernel function 62 | } 63 | \examples{ 64 | 65 | \dontrun{ 66 | data(ionosphere) 67 | 68 | X = ionosphere[, -c(2, ncol(ionosphere))] 69 | y = as.numeric(ionosphere[, ncol(ionosphere)]) 70 | 71 | out = KernelKnnCV(X, y, k = 5, folds = 3, regression = FALSE, Levels = unique(y)) 72 | } 73 | } 74 | \author{ 75 | Lampros Mouselimis 76 | } 77 | -------------------------------------------------------------------------------- /tests/testthat/test-distance_metrics.R: -------------------------------------------------------------------------------- 1 | #============================================================================================================================================================== 2 | 3 | context('Distance metrics rcpp') 4 | 5 | # TRAIN data 6 | 7 | testthat::test_that("in case that the TEST data is an empty matrix (is.null) : the result for each metric is a list, the length of the output is 2 [meaning 2 sublists] and the nrows in total for the two 8 | 9 | sublists [ indices and distances ] are 2 times the TRAIN data", { 10 | 11 | test_dat = matrix(, nrow = 0, ncol = 0) 12 | 13 | lap_regr = lapply(c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 14 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'), function(x) knn_index_dist_rcpp(as.matrix(xtr), test_dat, 5, x, 1, 1, eps = 1.0e-6)) 15 | 16 | lap_class = lapply(c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 17 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'), function(x) knn_index_dist_rcpp(as.matrix(xtr_class), test_dat, 5, x, 1, 1, eps = 1.0e-6)) 18 | 19 | out1 = sum(unlist(lapply(lap_regr, is.list))) == length(lap_regr) && sum(unlist(lapply(lap_class, is.list))) == length(lap_class) # sublists are lists 20 | 21 | out2 = mean(unlist(lapply(lap_regr, length))) == 2 && mean(unlist(lapply(lap_class, length))) == 2 # length of sublists are 2 22 | 23 | out3 = mean(unlist(lapply(lap_regr, function(x) nrow(x$train_knn_dist) + nrow(x$train_knn_idx)))) == nrow(xtr) * 2 && 24 | 25 | mean(unlist(lapply(lap_class, function(x) nrow(x$train_knn_dist) + nrow(x$train_knn_idx)))) == nrow(xtr_class) * 2 # output nrow for idx and distance is nrow(train_data) * 2 [ as I sum both sublists ] 26 | 27 | testthat::expect_true(sum(c(out1, out2, out3)) == 3) 28 | }) 29 | 30 | 31 | 32 | # TEST data 33 | 34 | testthat::test_that("in case that the TEST data is not an 
empty matrix : the result for each metric is a list, the length of the output is 2 [meaning 2 sublists] and the nrows in total for the two 35 | 36 | sublists [ indices and distances ] are 2 times the TEST data", { 37 | 38 | test_dat = matrix(, nrow = 0, ncol = 0) 39 | 40 | lap_regr = lapply(c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 41 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'), function(x) knn_index_dist_rcpp(as.matrix(xtr), as.matrix(xte), 5, x, 1, 1, eps = 1.0e-6)) 42 | 43 | lap_class = lapply(c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 44 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'), function(x) knn_index_dist_rcpp(as.matrix(xtr_class), as.matrix(xte_class), 5, x, 1, 1, eps = 1.0e-6)) 45 | 46 | out1 = sum(unlist(lapply(lap_regr, is.list))) == length(lap_regr) && sum(unlist(lapply(lap_class, is.list))) == length(lap_class) # sublists are lists 47 | 48 | out2 = mean(unlist(lapply(lap_regr, length))) == 2 && mean(unlist(lapply(lap_class, length))) == 2 # length of sublists are 2 49 | 50 | out3 = mean(unlist(lapply(lap_regr, function(x) nrow(x$test_knn_dist) + nrow(x$test_knn_idx)))) == nrow(xte) * 2 && 51 | 52 | mean(unlist(lapply(lap_class, function(x) nrow(x$test_knn_dist) + nrow(x$test_knn_idx)))) == nrow(xte_class) * 2 # output nrow for idx and distance is nrow(train_data) * 2 [ as I sum both sublists ] 53 | 54 | testthat::expect_true(sum(c(out1, out2, out3)) == 3) 55 | }) 56 | 57 | 58 | #============================================================================================================================================================== 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![R-CMD-check](https://github.com/mlampros/KernelKnn/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/mlampros/KernelKnn/actions/workflows/R-CMD-check.yaml) 3 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/KernelKnn)](http://cran.r-project.org/package=KernelKnn) 4 | [![Downloads](http://cranlogs.r-pkg.org/badges/grand-total/KernelKnn?color=blue)](http://www.r-pkg.org/pkg/KernelKnn) 5 | [![](https://img.shields.io/docker/automated/mlampros/kernelknn.svg)](https://hub.docker.com/r/mlampros/kernelknn) 6 | [![status](https://tinyverse.netlify.app/badge/KernelKnn)](https://CRAN.R-project.org/package=KernelKnn) 7 | 8 | 9 | ## KernelKnn 10 |
11 | 12 | The KernelKnn package extends the simple k-nearest neighbors algorithm by incorporating numerous kernel functions and a variety of distance metrics. The package takes advantage of 'RcppArmadillo' to speed up the calculation of distances between observations. More details on the functionality of KernelKnn can be found in the [blog-post](http://mlampros.github.io/2016/07/10/KernelKnn/) and in the package Vignettes ( *scroll down for information on how to use the* **docker image** ). 13 |

14 | 15 | To install the package from CRAN use, 16 | 17 | ```R 18 | 19 | install.packages("KernelKnn") 20 | 21 | 22 | ``` 23 |
24 | 25 | or download the latest version from Github using the *pak* package, 26 | 27 |
28 | 29 | ```R 30 | 31 | pak::pak('mlampros/KernelKnn') 32 | 33 | ``` 34 |
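Once installed, a minimal usage sketch looks as follows (kernel k-nearest-neighbors regression on the *Boston* data that ships with the package; see the Vignettes and the function documentation for the full set of arguments),

```R

library(KernelKnn)

data(Boston, package = 'KernelKnn')

X = scale(Boston[, -ncol(Boston)])     # scale the predictors (the output depends on distances)
y = Boston[, ncol(Boston)]             # numeric response

idx_test = 401:nrow(X)                 # hold out some of the observations

preds = KernelKnn(X[-idx_test, ], TEST_data = X[idx_test, ], y[-idx_test], k = 5,

                  method = 'euclidean', weights_function = 'biweight', regression = TRUE)

```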
35 | 36 | Use the following link to report bugs/issues, 37 |

38 | 39 | [https://github.com/mlampros/KernelKnn/issues](https://github.com/mlampros/KernelKnn/issues) 40 | 41 | 42 |
43 | 44 | **UPDATE 29-11-2019** 45 | 46 |
47 | 48 | **Docker images** of the *KernelKnn* package are available to download from my [dockerhub](https://hub.docker.com/r/mlampros/kernelknn) account. The images come with *Rstudio* and the *R-development* version (latest) installed. The whole process was tested on Ubuntu 18.04. To **pull** & **run** the image do the following, 49 | 50 |
51 | 52 | ```R 53 | 54 | docker pull mlampros/kernelknn:rstudiodev 55 | 56 | docker run -d --name rstudio_dev -e USER=rstudio -e PASSWORD=give_here_your_password --rm -p 8787:8787 mlampros/kernelknn:rstudiodev 57 | 58 | ``` 59 | 60 |
61 | 62 | The user can also **bind** a home directory / folder to the image to use its files by specifying the **-v** option, 63 | 64 |
65 | 66 | ```R 67 | 68 | docker run -d --name rstudio_dev -e USER=rstudio -e PASSWORD=give_here_your_password --rm -p 8787:8787 -v /home/YOUR_DIR:/home/rstudio/YOUR_DIR mlampros/kernelknn:rstudiodev 69 | 70 | 71 | ``` 72 | 73 |
74 | 75 | In the latter case you might first have to grant write permissions for the **YOUR_DIR** directory (this is not always necessary) using, 76 | 77 |
78 | 79 | ```R 80 | 81 | chmod -R 777 /home/YOUR_DIR 82 | 83 | 84 | ``` 85 | 86 |
86 | 87 | 88 | The **USER** defaults to *rstudio* but you have to set a **PASSWORD** of your preference (see [https://rocker-project.org](https://rocker-project.org/) for more information). 89 | 90 |
91 | 92 | Open your web-browser and, depending on where the docker image was *built / run*, give, 93 | 94 |
95 | 96 | **1st. Option** on your personal computer, 97 | 98 |
99 | 100 | ```R 101 | http://0.0.0.0:8787 102 | 103 | ``` 104 | 105 |
106 | 107 | **2nd. Option** on a cloud instance, 108 | 109 |
110 | 111 | ```R 112 | http://Public DNS:8787 113 | 114 | ``` 115 | 116 |
116 | 117 | 118 | to access the Rstudio console, where you can enter your username and password. 119 | 120 |
121 | 122 | ### **Citation:** 123 | 124 | If you use the **KernelKnn** R package in your paper or research, please cite [https://CRAN.R-project.org/package=KernelKnn/citation.html](https://CRAN.R-project.org/package=KernelKnn/citation.html): 125 | 126 |
127 | 128 | ```R 129 | @Manual{, 130 | title = {{KernelKnn}: Kernel k Nearest Neighbors}, 131 | author = {Lampros Mouselimis}, 132 | year = {2025}, 133 | note = {R package version 1.1.6}, 134 | url = {https://CRAN.R-project.org/package=KernelKnn}, 135 | } 136 | ``` 137 | 138 |
139 | 140 | -------------------------------------------------------------------------------- /tests/testthat/test-dist_knn_index_dist.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | context("kernel knn list of distances and indices using a distance matrix") 4 | 5 | 6 | #================= 7 | # Error handling 8 | #================= 9 | 10 | 11 | testthat::test_that("it returns an error if the input distance object is not of type matrix", { 12 | 13 | tmp_df = as.data.frame(DIST_mat) 14 | 15 | testthat::expect_error(distMat.knn.index.dist(tmp_df, TEST_indices = NULL, k = 5, threads = 1, minimize = T)) 16 | }) 17 | 18 | 19 | testthat::test_that("it returns an error if the input distance matrix is not square", { 20 | 21 | testthat::expect_error(distMat.knn.index.dist(DIST_mat[, -ncol(DIST_mat)], TEST_indices = NULL, k = 5, threads = 1, minimize = T)) 22 | }) 23 | 24 | 25 | testthat::test_that("it returns an error if the diagonal of the distance matrix is other than 0's or NA's", { 26 | 27 | TMP_DIAG = DIST_mat 28 | 29 | diag(TMP_DIAG) = -1 30 | 31 | testthat::expect_error(distMat.knn.index.dist(TMP_DIAG, TEST_indices = NULL, k = 5, threads = 1, minimize = T)) 32 | }) 33 | 34 | 35 | testthat::test_that("it returns an error if the TEST_indices parameter is not of type numeric or integer", { 36 | 37 | invalid_tst_idx = letters[1:100] 38 | 39 | testthat::expect_error(distMat.knn.index.dist(DIST_mat, TEST_indices = invalid_tst_idx, k = 5, threads = 1, minimize = T)) 40 | }) 41 | 42 | 43 | testthat::test_that("it returns an error if the maximum index of the TEST_indices parameter is greater than the rows of the distance matrix", { 44 | 45 | invalid_tst_idx = 1:(nrow(DIST_mat) + 10) 46 | 47 | invalid_tst_idx = (nrow(DIST_mat)-100):length(invalid_tst_idx) 48 | 49 | testthat::expect_error(distMat.knn.index.dist(DIST_mat, TEST_indices = invalid_tst_idx, k = 5, threads = 1, minimize = T)) 50 | }) 51 | 52 | testthat::test_that("it returns an error if k is NULL", { 53 | 54 | testthat::expect_error(distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = NULL, threads = 1, minimize = T)) 55 | }) 56 | 57 | 58 | testthat::test_that("it returns an error if k is a character", { 59 | 60 | testthat::expect_error( distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = "invalid", threads = 1, minimize = T) ) 61 | }) 62 | 63 | 64 | testthat::test_that("it returns an error if k is greater or equal to the number of rows of the distance matrix", { 65 | 66 | testthat::expect_error( distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = nrow(DIST_mat), threads = 1, minimize = T) ) 67 | }) 68 | 69 | 70 | testthat::test_that("it returns an error if k is less than 1", { 71 | 72 | testthat::expect_error(distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = -1, threads = 1, minimize = T) ) 73 | }) 74 | 75 | 76 | testthat::test_that("it returns a warning if k is a float", { 77 | 78 | testthat::expect_warning( distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = 1.5, threads = 1, minimize = T) ) 79 | }) 80 | 81 | 82 | testthat::test_that("it returns an error if the minimize parameter is not a boolean", { 83 | 84 | testthat::expect_error( distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = 5, threads = 1, minimize = 'T') ) 85 | }) 86 | 87 | 88 | 89 | # testing of distMat.knn.index.dist 90 | #---------------------------------- 91 | 92 | 93 | testthat::test_that("it returns the correct output if TEST_indices is NULL", { 94 | 95 | res = 
distMat.knn.index.dist(DIST_mat, TEST_indices = NULL, k = 5, threads = 1, minimize = T) 96 | 97 | testthat::expect_true( inherits(res, "list") && length(res) == 2 && mean(unlist(lapply(res, nrow))) == nrow(DIST_mat)) 98 | }) 99 | 100 | 101 | testthat::test_that("it returns the correct output if TEST_indices is not NULL", { 102 | 103 | idxs = (nrow(DIST_mat) - 100): nrow(DIST_mat) 104 | 105 | res = distMat.knn.index.dist(DIST_mat, TEST_indices = idxs, k = 5, threads = 1, minimize = T) 106 | 107 | testthat::expect_true( inherits(res, "list") && length(res) == 2 && mean(unlist(lapply(res, nrow))) == length(idxs) ) 108 | }) 109 | 110 | 111 | testthat::test_that("it returns the correct output if TEST_indices is NULL and the main diagonal is NA's", { 112 | 113 | tmp_dist = DIST_mat 114 | 115 | diag(tmp_dist) = NA 116 | 117 | res = distMat.knn.index.dist(tmp_dist, TEST_indices = NULL, k = 5, threads = 1, minimize = T) 118 | 119 | testthat::expect_true( inherits(res, "list") && length(res) == 2 && mean(unlist(lapply(res, nrow))) == nrow(tmp_dist)) 120 | }) 121 | 122 | 123 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | ## KernelKnn 1.1.7 3 | 4 | * I removed the `SystemRequirements` from the DESCRIPTION file. 5 | 6 | 7 | ## KernelKnn 1.1.6 8 | 9 | * I updated the `Makevars` and `Makevars.win` files by adding `-DARMA_USE_CURRENT` (see issue: https://github.com/RcppCore/RcppArmadillo/issues/476) 10 | * I removed the `-mthreads` compilation option from the "Makevars.win" file 11 | * I removed the "CXX_STD = CXX11" from the "Makevars" files, and the "[[Rcpp::plugins(cpp11)]]" from the "distance_metrics.cpp" file due to the following NOTE from CRAN, "NOTE Specified C++11: please drop specification unless essential" (see also: https://www.tidyverse.org/blog/2023/03/cran-checks-compiled-code/#note-regarding-systemrequirements-c11) 12 | 13 | 14 | ## KernelKnn 1.1.5 15 | 16 | * I added the order *'p'* of the "minkowski" method as a new parameter to the *'KernelKnn()'*, *'KernelKnnCV()'* and *'knn.index.dist()'* functions. 
It defaults to 'k' (see https://github.com/mlampros/KernelKnn/issues/9) 17 | * I added test cases for the *'KernelKnn()'* and *'knn.index.dist()'* functions 18 | 19 | 20 | ## KernelKnn 1.1.4 21 | 22 | * The pull request 7 fixed a bug in the checking of the Levels argument (see https://github.com/mlampros/KernelKnn/pull/7) 23 | * I fixed an omission of the column names in case of classification in the *KernelKnn()* and *distMat.KernelKnn()* functions (see https://github.com/mlampros/KernelKnn/issues/8) 24 | 25 | 26 | ## KernelKnn 1.1.3 27 | 28 | * I updated the References section of the *switch.ops()* function in the *utils.R* file which explains how the combination of the kernels works 29 | * I added an error case in all functions that make use of the 'Levels' parameter if the 'Levels' do not match the unique 'y' labels 30 | * I removed the *distMat.KernelKnnCV()* function (and the *tests/test-dist_kernelknnCV.R* file) because based on the current implementation of the *distMat.KernelKnn()* function the *TEST_indices* parameter *must* consist of the *last indices* of the input *DIST_mat* distance matrix and this is not the case if we run cross-validation (see [issue 5](https://github.com/mlampros/KernelKnn/issues/5)) 31 | 32 | 33 | ## KernelKnn 1.1.2 34 | 35 | * I've fixed an error in the *CITATION* file 36 | 37 | 38 | ## KernelKnn 1.1.1 39 | 40 | * I've added the *CITATION* file in the *inst* directory 41 | 42 | 43 | ## KernelKnn 1.1.0 44 | 45 | * I fixed the *"failure: the condition has length > 1"* CRAN error which appeared mainly due to the misuse of the base *class()* function in multiple code snippets in the package (for more info on this matter see: https://developer.r-project.org/Blog/public/2019/11/09/when-you-think-class.-think-again/index.html) 46 | 47 | 48 | ## KernelKnn 1.0.9 49 | 50 | I added a test case to check equality of the results between *KernelKnnCV* and *distMat.KernelKnnCV* functions 51 | 52 | 53 | ## KernelKnn 1.0.8 54 | 55 | I added the *DARMA_64BIT_WORD* flag in the Makevars file to allow the package to process big datasets 56 | 57 | 58 | ## KernelKnn 1.0.7 59 | 60 | I modified the *input_dist_mat* function of the *distance_metrics.cpp* file due to a bug. 61 | I modified the *distMat.KernelKnn* function so that it does not return an error if the number of rows of the *DIST_mat* distance matrix is not equal to the length of *y* (added comments in the function documentation). 62 | 63 | 64 | ## KernelKnn 1.0.6 65 | 66 | In this version the following functions/parameters were added: 67 | 68 | * *seed_num* : parameter in *KernelKnnCV* and *distMat.KernelKnnCV* cross-validation functions, which specifies the seed of R's random number generator 69 | * *distMat.KernelKnn* : this function performs kernel k-nearest-neighbor search by using a *distance matrix* as input 70 | * *distMat.knn.index.dist* : this function returns the indices and distances for k-nearest neighbors using a distance matrix 71 | * *distMat.KernelKnnCV* : this function performs cross-validated kernel k-nearest-neighbor search using a distance matrix as input 72 | 73 | I also modified the *OpenMP* clauses of the .cpp file to address the ASAN errors. 74 | 75 | 76 | ## KernelKnn 1.0.5 77 | 78 | I removed *OpenImageR* and *irlba* as package dependencies.
I also added an *init.c* file in the *src* folder due to a change in CRAN submissions for compiled code [ *references* : http://stackoverflow.com/questions/42313373/r-cmd-check-note-found-no-calls-to-r-registerroutines-r-usedynamicsymbols, https://github.com/RcppCore/Rcpp/issues/636 ] 79 | 80 | 81 | ## KernelKnn 1.0.4 82 | 83 | I added a try-catch Rcpp function to make the calculation of singular covariance matrices possible, as suggested in https://github.com/mlampros/KernelKnn/issues/1 84 | 85 | 86 | ## KernelKnn 1.0.3 87 | 88 | Reimplementation of the Rcpp function due to ASAN-memory-errors 89 | 90 | 91 | ## KernelKnn 1.0.2 92 | 93 | I updated the Description file with a URL and a BugReports web-address. 94 | 95 | 96 | ## KernelKnn 1.0.1 97 | 98 | Currently, software platforms like OSX do not support openMP, thus I've made openMP optional for all cpp functions. 99 | 100 | 101 | ## KernelKnn 1.0.0 102 | 103 | -------------------------------------------------------------------------------- /tests/testthat/test-kernelknn_cross_valid.R: -------------------------------------------------------------------------------- 1 | 2 | context("Cross validate kernelknn") 3 | 4 | 5 | # shuffle data 6 | 7 | testthat::test_that("shuffle data takes a vector as input and returns a vector as output", { 8 | 9 | y = c(1:50) 10 | 11 | testthat::expect_true(is.vector(func_shuffle(y, times = 10))) 12 | }) 13 | 14 | testthat::test_that("the length of the input vector equals the length of the output vector", { 15 | 16 | y = c(1:50) 17 | 18 | output = func_shuffle(y, times = 10) 19 | 20 | testthat::expect_true(length(y) == length(output)) 21 | }) 22 | 23 | 24 | # classification folds 25 | 26 | testthat::test_that("throws an error if the RESP is not a factor", { 27 | 28 | y = c(1:10) 29 | 30 | testthat::expect_error(class_folds(5, y), "RESP must be a factor") 31 | }) 32 | 33 | 34 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", { 35 | 36 | y = as.factor(sample(1:5, 100, replace = T)) 37 | 38 | testthat::expect_length(class_folds(5, y), 5) 39 | }) 40 | 41 | # regression folds 42 | 43 | testthat::test_that("throws an error if the RESP is a factor (regr_folds is meant for regression)", { 44 | 45 | y = as.factor(c(1:50)) 46 | 47 | testthat::expect_error(regr_folds(5, y), "this function is meant for regression for classification use the 'class_folds' function") 48 | }) 49 | 50 | 51 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", { 52 | 53 | y = sample(1:5, 100, replace = T) 54 | 55 | testthat::expect_length(regr_folds(5, y), 5) 56 | }) 57 | 58 | 59 | # KernelKnnCV function 60 | 61 | testthat::test_that("it returns an error if y is NULL", { 62 | 63 | testthat::expect_error( KernelKnnCV(X, y = NULL, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 64 | }) 65 | 66 | testthat::test_that("it returns an error if y is not numeric", { 67 | 68 | testthat::expect_error( KernelKnnCV(X, y = list(y), k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 69 | }) 70 | 71 | 72 | testthat::test_that("it returns an error if missing values are present in the data", { 73 | 74 | tmp_dat = X 75 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 76 | 77 | testthat::expect_error( KernelKnnCV(tmp_dat, y = y, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 78 | }) 79 | 80 | 81 | testthat::test_that("it returns an error if missing values are present in the
response variable", { 82 | 83 | tmp_dat = y 84 | tmp_dat[sample(1:length(tmp_dat), 10)] = NA 85 | 86 | testthat::expect_error( KernelKnnCV(X, y = tmp_dat, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 87 | }) 88 | 89 | 90 | testthat::test_that("it returns an error if the length of y is not equal to the number of rows of the train data", { 91 | 92 | testthat::expect_error( KernelKnnCV(X, y[1:(length(y)-10)], k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 93 | }) 94 | 95 | 96 | testthat::test_that("it returns an error if regression = F and there are unique labels less than 1", { 97 | 98 | testthat::expect_error( KernelKnnCV(X_class, as.numeric(y1_class_ext) - 1, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = F) ) 99 | }) 100 | 101 | 102 | testthat::test_that("it returns an error if folds < 2", { 103 | 104 | testthat::expect_error( KernelKnnCV(X_class, as.numeric(y1_class_ext), k = 5, folds = 1, h = 1.0, method = 'euclidean', weights_function = NULL, regression = F, Levels = unique(y1_class_ext)) ) 105 | }) 106 | 107 | 108 | testthat::test_that("it returns an error if regression is not TRUE or FALSE", { 109 | 110 | testthat::expect_error( KernelKnnCV(X_class, as.numeric(y1_class_ext), k = 5, folds = 3, h = 1.0, method = 'euclidean', weights_function = NULL, regression = 'F', Levels = unique(y1_class_ext)) ) 111 | }) 112 | 113 | 114 | testthat::test_that("it returns an error if each fold has less than 5 observations", { 115 | 116 | testthat::expect_error( KernelKnnCV(X, y = y, k = 5, folds = 100, h = 1.0, method = 'euclidean', weights_function = NULL, regression = T) ) 117 | }) 118 | 119 | 120 | # test KernelKnnCV functionality 121 | 122 | testthat::test_that("it returns a list of length 2 where the length of the unlisted sublists equal the length of the train data, if REGRESSION = TRUE", { 123 | 124 | res = KernelKnnCV(X, y, k = 5, folds = 3, method = 'euclidean', weights_function = NULL, regression = T, Levels = NULL) 125 | 126 | lap_pr = sum(unlist(lapply(res$preds, length))) 127 | 128 | lap_idx = sum(unlist(lapply(res$folds, length))) 129 | 130 | testthat::expect_true( lap_pr == nrow(X) && lap_idx == nrow(X) ) 131 | }) 132 | 133 | 134 | testthat::test_that("it returns a list of length 2 where the length of the unlisted sublists equal the length of the train data, if REGRESSION = FALSE", { 135 | 136 | res = KernelKnnCV(X_class, as.numeric(y1_class_ext), k = 5, folds = 3, method = 'euclidean', weights_function = NULL, regression = F, Levels = as.numeric(unique(y1_class_ext))) 137 | 138 | lap_pr = sum(unlist(lapply(res$preds, nrow))) 139 | 140 | lap_idx = sum(unlist(lapply(res$folds, length))) 141 | 142 | testthat::expect_true( lap_pr == nrow(X_class) && lap_idx == nrow(X_class) ) 143 | }) 144 | 145 | -------------------------------------------------------------------------------- /R/knn_index_dist.R: -------------------------------------------------------------------------------- 1 | 2 | #' indices and distances of k-nearest-neighbors 3 | #' 4 | #' This function returns the k nearest indices and distances of each observation 5 | #' 6 | #' @param data a data.frame or matrix 7 | #' @param TEST_data a data.frame or matrix (it can be also NULL) 8 | #' @param k an integer specifying the k-nearest-neighbors 9 | #' @param method a string specifying the method. 
Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient' 10 | #' @param transf_categ_cols a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables 11 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 12 | #' @param p a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k' 13 | #' @return a list of length 2. The first sublist returns the indices and the second the distances of the k nearest neighbors for each observation. 14 | #' If TEST_data is NULL the number of rows of each sublist equals the number of rows in the train data. If TEST_data is not NULL the number of rows of each sublist equals the number of rows in the TEST data. 15 | #' @author Lampros Mouselimis 16 | #' @details 17 | #' This function takes a number of arguments and it returns the indices and distances of the k-nearest-neighbors for each observation. If TEST_data is NULL then the indices-distances for the train data will be returned, whereas if TEST_data is not NULL then the indices-distances for the TEST_data will be returned. 18 | #' @export 19 | #' @examples 20 | #' 21 | #' data(Boston) 22 | #' 23 | #' X = Boston[, -ncol(Boston)] 24 | #' 25 | #' out = knn.index.dist(X, TEST_data = NULL, k = 4, method = 'euclidean', threads = 1) 26 | #' 27 | 28 | 29 | knn.index.dist = function(data, TEST_data = NULL, k = 5, method = 'euclidean', transf_categ_cols = F, threads = 1, p = k) { 30 | 31 | categorical_data_present = sapply(data, function(x) is.factor(x) || is.character(x)) 32 | 33 | if (sum(categorical_data_present) && !transf_categ_cols) stop('Categorical columns present in data. 
These should be either converted to numeric or the function should be run with transf_categ_cols = TRUE') 34 | if (!is.numeric(k) || is.null(k) || (k >= nrow(data)) || k < 1) stop('k must be of type integer, greater than 0 and less than nrow(train)') 35 | if (!is.character(method) || is.null(method) || !method %in% c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 36 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient')) 37 | stop("method must be of type character and one of 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 38 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'") 39 | if (any(is.na(data))) stop('the data or the response variable includes missing values') 40 | if (!is.null(TEST_data) && any(is.na(TEST_data))) stop('the TEST_data includes missing values') 41 | if (method %in% c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient') && !all(unlist(lapply(1:ncol(data), function(x) sum(c(0,1) %in% unique(data[, x])) == 2)))) 42 | stop("methods : 'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' require the data to be in binary form e.g 0,1") 43 | if (!is.null(TEST_data) && method %in% c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient') && !all(unlist(lapply(1:ncol(TEST_data), function(x) sum(c(0,1) %in% unique(TEST_data[, x])) == 2)))) 44 | stop("methods : 'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' require the TEST_data to be in binary form e.g 0,1") 45 | 46 | if (!is.null(TEST_data) && ncol(data) != ncol(TEST_data)) stop('the number of columns in train and test data differ') 47 | 48 | #---------------------------------------------------------------------------------------------------- 49 | 50 | # check if any of the variables is categorical, if TRUE then convert categorical predictors to either dummy variables or numeric variables [ depending on the number of levels ] 51 | 52 | if (transf_categ_cols) { 53 | 54 | if (is.null(TEST_data)) { 55 | 56 | data = func_categorical_preds(data)} 57 | 58 | else { 59 | 60 | tmp_dat = func_categorical_preds(rbind(data, TEST_data)) 61 | 62 | data = tmp_dat[1:nrow(data), ] 63 | 64 | TEST_data = tmp_dat[(nrow(data) + 1):nrow(tmp_dat), ] 65 | } 66 | } 67 | #---------------------------------------------------------------------------------------------------- 68 | 69 | if (is.null(TEST_data)) { 70 | 71 | mat = matrix(, nrow = 0, ncol = 0) 72 | 73 | if (!is.matrix(data)) data = as.matrix(data) 74 | 75 | res = knn_index_dist_rcpp(data, mat, k = k, method = method, threads = threads, p = p) 76 | } 77 | 78 | else { 79 | 80 | if (!is.matrix(data)) data = as.matrix(data) 81 | if (!is.matrix(TEST_data)) TEST_data = as.matrix(TEST_data) 82 | 83 | res = knn_index_dist_rcpp(data, TEST_data, k = k, method = method, threads = threads, p = p) 84 | } 85 | 86 | return(res) 87 | } 88 | 89 | 90 | -------------------------------------------------------------------------------- /R/kernelknnCV.R: -------------------------------------------------------------------------------- 1 | 2 | #' kernel-k-nearest-neighbors using cross-validation 3 | #' 4 | #' This function performs kernel k nearest neighbors regression and classification using cross validation 5 | #' 6 | #' @param data a data frame or matrix 7 | #' @param y a numeric vector (in classification the labels must be numeric 
from 1:Inf) 8 | #' @param k an integer specifying the k-nearest-neighbors 9 | #' @param folds the number of cross validation folds (must be greater than 1) 10 | #' @param h the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0) 11 | #' @param method a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient' 12 | #' @param weights_function there are various ways of specifying the kernel function. See the details section. 13 | #' @param regression a boolean (TRUE,FALSE) specifying if regression or classification should be performed 14 | #' @param transf_categ_cols a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables 15 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 16 | #' @param extrema if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal) 17 | #' @param Levels a numeric vector. In case of classification the unique levels of the response variable are necessary 18 | #' @param seed_num a numeric value specifying the seed of the random number generator 19 | #' @param p a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k' 20 | #' @return a list of length 2. The first sublist is a list of predictions (the length of the list equals the number of the folds). The second sublist is a list with the indices for each fold. 21 | #' @author Lampros Mouselimis 22 | #' @details 23 | #' This function takes a number of arguments (including the number of cross-validation-folds) and it returns predicted values and indices for each fold. 24 | #' There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 
3rd option : a user defined kernel function 25 | #' @export 26 | #' @importFrom utils txtProgressBar 27 | #' @importFrom utils setTxtProgressBar 28 | #' @examples 29 | #' 30 | #' \dontrun{ 31 | #' data(ionosphere) 32 | #' 33 | #' X = ionosphere[, -c(2, ncol(ionosphere))] 34 | #' y = as.numeric(ionosphere[, ncol(ionosphere)]) 35 | #' 36 | #' out = KernelKnnCV(X, y, k = 5, folds = 3, regression = FALSE, Levels = unique(y)) 37 | #' } 38 | 39 | 40 | KernelKnnCV = function(data, y, k = 5, folds = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL, seed_num = 1, p = k) { 41 | 42 | if (length(y) != nrow(data)) stop('the size of the data and y differ') 43 | if (!is.logical(regression)) stop('the regression argument should be either TRUE or FALSE') 44 | if (any(is.na(data)) || any(is.na(y))) stop('the data or the response variable includes missing values') 45 | if (is.null(y)) stop('the response variable should be numeric') 46 | if (is.integer(y)) y = as.numeric(y) 47 | if (!is.numeric(y)) stop('in both regression and classification the response variable should be numeric or integer and in classification it should start from 1') 48 | if (!regression && any(unique(y) < 1)) stop('the response variable values should begin from 1') 49 | if (folds < 2) stop('the number of folds should be at least 2') 50 | 51 | start = Sys.time() 52 | 53 | if (regression) { 54 | 55 | set.seed(seed_num) 56 | n_folds = regr_folds(folds, y)} 57 | 58 | else { 59 | 60 | set.seed(seed_num) 61 | n_folds = class_folds(folds, as.factor(y)) 62 | } 63 | 64 | if (!all(unlist(lapply(n_folds, length)) > 5)) stop('Each fold has less than 5 observations. Consider decreasing the number of folds or increasing the size of the data.') 65 | 66 | tmp_fit = list() 67 | 68 | cat('\n') ; cat('cross-validation starts ..', '\n') 69 | 70 | pb <- txtProgressBar(min = 0, max = folds, style = 3); cat('\n') 71 | 72 | for (i in 1:folds) { 73 | 74 | tmp_fit[[i]] = KernelKnn(data = data[unlist(n_folds[-i]), ], 75 | TEST_data = data[unlist(n_folds[i]), ], 76 | y = y[unlist(n_folds[-i])], 77 | k = k, 78 | h = h, 79 | method = method, 80 | weights_function = weights_function, 81 | regression = regression, 82 | transf_categ_cols = transf_categ_cols, 83 | threads = threads, 84 | extrema = extrema, 85 | Levels = Levels, 86 | p = p) 87 | 88 | setTxtProgressBar(pb, i) 89 | } 90 | 91 | close(pb); cat('\n') 92 | 93 | end = Sys.time() 94 | 95 | t = end - start 96 | 97 | cat('time to complete :', t, attributes(t)$units, '\n'); cat('\n'); 98 | 99 | return(list(preds = tmp_fit, folds = n_folds)) 100 | } 101 | 102 | 103 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | #================================================================================================================================================================ 2 | 3 | context("Utils testing") 4 | 5 | 6 | # 'normalized' function 7 | 8 | testthat::test_that("normalize data takes a vector as input and returns a vector as output", { 9 | 10 | y = c(1:50) 11 | 12 | testthat::expect_true(is.vector(normalized(y))) 13 | }) 14 | 15 | testthat::test_that("the length of the input vector equals the length of the output vector", { 16 | 17 | y = c(1:50) 18 | 19 | output = normalized(y) 20 | 21 | testthat::expect_true(length(y) == length(output)) 22 | }) 23 | 24 | 25 | # 'func_tbl_dist' 
function 26 | 27 | testthat::test_that("the output of the func_tbl_dist function has : same columns as the number of the unique levels, same rows as the initial matrix and all rows sum to 1.0", { 28 | 29 | unq_levels = 1:3 30 | 31 | tmp = matrix(sample(unq_levels, 40, replace = T), 8, 5) 32 | 33 | res = func_tbl_dist(tmp, unq_levels) 34 | 35 | testthat::expect_true(ncol(res) == length(unq_levels) && nrow(res) == nrow(tmp) && mean(rowSums(res)) == 1.0) 36 | }) 37 | 38 | 39 | # 'func_tbl' function 40 | 41 | testthat::test_that("the output of the func_tbl function has : same columns as the number of the unique levels, same rows as the initial matrix and all rows sum to 1.0", { 42 | 43 | unq_levels = 1:3 44 | 45 | W = matrix(rep(0.2, 40), 8, 5) 46 | 47 | tmp = matrix(sample(unq_levels, 40, replace = T), 8, 5) 48 | 49 | res = func_tbl(tmp, W, unq_levels) 50 | 51 | testthat::expect_true(ncol(res) == length(unq_levels) && nrow(res) == nrow(tmp) && mean(rowSums(res)) == 1.0) 52 | }) 53 | 54 | 55 | # 'FUNCTION_weights' function 56 | 57 | testthat::test_that("the output of the FUNCTION_weights is the multiplication of a distance matrix with a kernel function. It returns a matrix with the same dimensions 58 | 59 | as the distance matrix and the rows sum to 1.0", { 60 | 61 | func = function(x) { 0.25 * x } 62 | 63 | tmp = matrix(sample(1:10, 16, replace = T), 4, 4) 64 | 65 | res = FUNCTION_weights(tmp, func, eps = 0.0) 66 | 67 | testthat::expect_true(ncol(res) == ncol(tmp) && nrow(res) == nrow(tmp) && mean(rowSums(res)) == 1.0) 68 | }) 69 | 70 | 71 | 72 | # 'switch_secondary' function 73 | 74 | testthat::test_that("the output of the switch_secondary is the multiplication of a distance matrix with a known kernel such as 'uniform', 'triangular' etc. 75 | 76 | It returns a non-empty matrix with the same dimensions as the distance matrix", { 77 | 78 | W = matrix(1:25, 5, 5) 79 | 80 | lap = lapply(c('uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'), 81 | 82 | function(x) switch_secondary(x, W, 1.0)) 83 | 84 | dims = unlist(lapply(lap, function(x) sum(dim(x)) == sum(dim(W)))) 85 | 86 | testthat::expect_true(sum(dims) == length(lap)) 87 | }) 88 | 89 | 90 | # switch.ops function 91 | 92 | testthat::test_that("returns an error if the sublists are not matrices or data frames", { 93 | 94 | tmp = list(vec1 = 1:10, vec2 = 11:20) 95 | 96 | testthat::expect_error( switch.ops(tmp, MODE = 'ADD') ) 97 | }) 98 | 99 | 100 | # 'FUN_kernels' function 101 | 102 | testthat::test_that("returns an error if the combination of the kernels is incorrect", { 103 | 104 | testthat::expect_error( FUN_kernels('uniform_triangular', matrix(runif(100), 10, 10), h = 1.0) ) 105 | }) 106 | 107 | testthat::test_that("returns an error if the combination of the kernels includes an unknown kernel", { 108 | 109 | testthat::expect_error( FUN_kernels('uniform_invalid_ADD', matrix(runif(100), 10, 10), h = 1.0) ) 110 | }) 111 | 112 | testthat::test_that("the output of the FUN_kernels is the multiplication of a distance matrix with a known kernel such as 'uniform', 'triangular' etc. In addition here 113 | 114 | ADD and MULTiply different kernels. 
It returns a non-empty matrix with the same dimensions as the distance matrix", { 115 | 116 | W = matrix(1:25, 5, 5) 117 | 118 | lap = lapply(c('uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 119 | 120 | 'exponential', 'uniform_triangular_ADD', 'biweight_triweight_MULT', 'tricube_gaussian_MULT', 'gaussianSimple_silverman_ADD'), 121 | 122 | function(x) FUN_kernels(x, W, 1.0)) 123 | 124 | dims = unlist(lapply(lap, function(x) sum(dim(x)) == sum(dim(W)))) 125 | 126 | testthat::expect_true(sum(dims) == length(lap)) 127 | }) 128 | 129 | 130 | # 'func_categorical_preds' function 131 | 132 | testthat::test_that("if TRUE the function builds dummy variables for factor predictors with less than 32 unique levels and converts to numeric factor predictors with more than 32 levels", { 133 | 134 | m = data.frame(a = as.factor(c(1:19, rep(20, 21))), b = 1:40, c = as.factor(1:40), d = 40:1) 135 | 136 | res = func_categorical_preds(m) 137 | 138 | testthat::expect_true(ncol(res) == 20 + 3) 139 | }) 140 | 141 | 142 | # 'func_categorical_preds' function [ multiple factor variables ] 143 | 144 | testthat::test_that("if TRUE the function builds dummy variables for factor predictors with less than 32 unique levels and converts to numeric factor predictors with more than 32 levels", { 145 | 146 | m = data.frame(a = as.factor(c(1:19, rep(20, 21))), s = as.factor(c(1:19, rep(20, 21))), k = as.factor(c(1:19, rep(20, 21))), b = 1:40, c = as.factor(1:40), d = 40:1) 147 | 148 | res = func_categorical_preds(m) 149 | 150 | testthat::expect_true(ncol(res) == 20 + 19 + 19 + 3) 151 | }) 152 | 153 | 154 | # switch.ops - function 155 | 156 | 157 | testthat::test_that("adding 3 matrices or data.frames of all 1's results in a a matrix of all 3's", { 158 | m1 = matrix(rep(1, 25), 5, 5) 159 | m2 = matrix(rep(1, 25), 5, 5) 160 | m3 = matrix(rep(1, 25), 5, 5) 161 | 162 | lst = list(m1, m2, m3) 163 | 164 | testthat::expect_equal(switch.ops(lst, MODE = 'ADD'), matrix(rep(3, 25), 5,5), check.attributes = FALSE) # check.attributes = F otherwise due to dimnames error 165 | }) 166 | 167 | 168 | testthat::test_that("multiplying 3 matrices or data.frames of all 2's results in a a matrix of all 8's", { 169 | m1 = matrix(rep(2, 25), 5, 5) 170 | m2 = matrix(rep(2, 25), 5, 5) 171 | m3 = matrix(rep(2, 25), 5, 5) 172 | 173 | lst = list(m1, m2, m3) 174 | 175 | testthat::expect_equal(switch.ops(lst, MODE = 'MULT'), matrix(rep(8, 25), 5,5), check.attributes = FALSE) # check.attributes = F otherwise due to dimnames error 176 | }) 177 | 178 | 179 | testthat::test_that("it throws an error if the sublist matrices are of unequal dimensions", { 180 | m1 = matrix(rep(2, 35), 5, 7) 181 | m2 = matrix(rep(2, 25), 5, 5) 182 | m3 = matrix(rep(2, 25), 5, 5) 183 | 184 | lst = list(m1, m2, m3) 185 | 186 | testthat::expect_error(switch.ops(lst, MODE = 'MULT')) # check.attributes = F otherwise due to dimnames error 187 | }) 188 | 189 | 190 | testthat::test_that("if LST is not a list it throws error", { 191 | m1 = matrix(rep(1, 25), 5, 5) 192 | 193 | testthat::expect_error( switch.ops(m1, MODE = 'MULT') ) 194 | }) 195 | 196 | 197 | testthat::test_that("if the MODE is an invalid string it throws an error", { 198 | 199 | m1 = matrix(rep(2, 25), 5, 5) 200 | m2 = matrix(rep(2, 25), 5, 5) 201 | m3 = matrix(rep(2, 25), 5, 5) 202 | 203 | lst = list(m1, m2, m3) 204 | 205 | testthat::expect_error( switch.ops(lst, MODE = 'invalid') ) 206 | }) 207 | 208 | 
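
# illustrative sketch : for matrices of equal dimensions, 'ADD' should agree with the base elementwise '+' operator and 'MULT' with '*'

testthat::test_that("switch.ops with MODE 'ADD' / 'MULT' matches the base elementwise operators", {

  m1 = matrix(1:25, 5, 5)
  m2 = matrix(25:1, 5, 5)

  lst = list(m1, m2)

  testthat::expect_equal(switch.ops(lst, MODE = 'ADD'), m1 + m2, check.attributes = FALSE)

  testthat::expect_equal(switch.ops(lst, MODE = 'MULT'), m1 * m2, check.attributes = FALSE)
})
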
#================================================================================================================================================================ 209 | -------------------------------------------------------------------------------- /R/dist_kernelknn.R: -------------------------------------------------------------------------------- 1 | 2 | #' kernel k-nearest-neighbors using a distance matrix 3 | #' 4 | #' @param DIST_mat a distance matrix (square matrix) having a \emph{diagonal} filled with either zero's (\emph{0}) or NA's (\emph{missing values}) 5 | #' @param TEST_indices a numeric vector specifying the indices of the test data in the distance matrix (row-wise or column-wise). If the parameter equals NULL then no test data is included in the distance matrix 6 | #' @param y a numeric vector (in classification the labels must be numeric from 1:Inf). It is assumed that if the \emph{TEST_indices} is not NULL then the length of \emph{y} equals to the rows of the train data \emph{( nrow(DIST_mat) - length(TEST_indices) )}, otherwise \emph{length(y) == nrow(DIST_mat)}. 7 | #' @param k an integer specifying the k-nearest-neighbors 8 | #' @param h the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0) 9 | #' @param weights_function there are various ways of specifying the kernel function. See the details section. 10 | #' @param regression a boolean (TRUE,FALSE) specifying if regression or classification should be performed 11 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 12 | #' @param extrema if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal) 13 | #' @param Levels a numeric vector. In case of classification the unique levels of the response variable are necessary 14 | #' @param minimize either TRUE or FALSE. If TRUE then lower values will be considered as relevant for the k-nearest search, otherwise higher values. 15 | #' @return a vector (if regression is TRUE), or a data frame with class probabilities (if regression is FALSE) 16 | #' @author Lampros Mouselimis 17 | #' @details 18 | #' This function takes a distance matrix (square matrix where the diagonal is filled with \emph{0} or \emph{NA}) as input. If the \emph{TEST_indices} parameter is NULL then the predictions for the train data will be returned, whereas if the \emph{TEST_indices} parameter is not NULL then the predictions for the test data will be returned. 19 | #' There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 
3rd option : a user defined kernel function 20 | #' @export 21 | #' @importFrom stats dist 22 | #' @examples 23 | #' 24 | #' data(Boston) 25 | #' 26 | #' X = Boston[, -ncol(Boston)] 27 | #' y = Boston[, ncol(Boston)] 28 | #' 29 | #' dist_obj = dist(X) 30 | #' 31 | #' dist_mat = as.matrix(dist_obj) 32 | #' 33 | #' out = distMat.KernelKnn(dist_mat, TEST_indices = NULL, y, k = 5, regression = TRUE) 34 | #' 35 | 36 | 37 | distMat.KernelKnn = function(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = F, threads = 1, extrema = F, Levels = NULL, minimize = T) { 38 | 39 | if (!is.matrix(DIST_mat)) stop("the 'DIST_mat' parameter should be of type matrix") 40 | if (nrow(DIST_mat) != ncol(DIST_mat)) stop("the input 'DIST_mat' should be a square matrix with number of rows equal to number of columns") 41 | DIAG = diag(DIST_mat) 42 | nas = all(is.na(DIAG)) 43 | if (nas) { 44 | diag(DIST_mat) = 0 } # set diagonal to 0.0 if equal to NA 45 | else { 46 | if (sum(DIAG) != 0) { 47 | stop("the diagonal of the distance matrix must be a vector of zeros or NA's") 48 | } 49 | } 50 | if (!is.null(TEST_indices)) { 51 | if (!inherits(TEST_indices, c("numeric", "integer"))) stop("the 'TEST_indices' parameter should be a numeric vector") 52 | if (max(TEST_indices) > nrow(DIST_mat)) stop('the maximum number of the TEST_indices is greater than the rows of the input distance matrix') 53 | tr_idx = 1:nrow(DIST_mat) 54 | tr_idx = tr_idx[-TEST_indices] 55 | if (!(min(TEST_indices) > max(tr_idx))) stop("The minimum index of the 'TEST_indices' parameter must be greater than the maximum index of the train data in 'DIST_mat'! Make sure that the 'TEST_indices' consist of the last indices of the 'DIST_mat' parameter!") 56 | } 57 | if (!is.numeric(k) || is.null(k) || (k >= nrow(DIST_mat)) || k < 1) stop('k must be of type integer, greater than 0 and less than nrow(DIST_mat)') 58 | if (abs(k - round(k)) > 0) { 59 | k = round(k) 60 | warning('k is float and will be rounded to : ', call.
= F, expr = k)} 61 | if (h == 0) stop('h can be any number except for 0') 62 | if (is.null(y)) stop('the response variable should be numeric') 63 | if (is.integer(y)) y = as.numeric(y) 64 | if (!is.numeric(y)) stop('in both regression and classification the response variable should be numeric or integer and in classification it should start from 1') 65 | if (!regression && is.null(Levels)) stop('In classification give the unique values of y in form of a vector') 66 | if (!regression && any(unique(y) < 1)) stop('the response variable values should begin from 1') 67 | if (!regression) { 68 | if (!all(Levels %in% unique(y))) stop("The specified 'Levels' must match the unique 'y' labels!") 69 | } 70 | if (any(is.na(DIST_mat)) || any(is.na(y))) stop('the DIST_mat or the response variable includes missing values') 71 | if (is.null(TEST_indices)) { 72 | if (length(y) != nrow(DIST_mat)) { 73 | stop('the size of the DIST_mat and y differ') 74 | } 75 | } 76 | if (extrema && k < 4) stop('k must be greater than 3 if extrema = TRUE') 77 | if (!inherits(minimize, "logical")) stop("the 'minimize' parameter should be either TRUE or FALSE") 78 | 79 | if (extrema) { 80 | 81 | k = k + 2 # add two values (for min-max) 82 | } 83 | 84 | index_train = DIST_MATRIX_knn(DIST_mat, TEST_indices, minimize, k, threads, F) # the last parameter is FALSE because it is only applicable to 'distMat.knn.index.dist' [ here I don't need two separate functions for train and test data, as the function returns only predictions and not indices ] 85 | 86 | if (extrema) { 87 | 88 | index_train$knn_idx = index_train$knn_idx[, -c(1,k)] # remove min, max (matrices already sorted) 89 | index_train$knn_dist = index_train$knn_dist[, -c(1,k)] # remove min, max (matrices already sorted) 90 | 91 | k = k - 2 # adjust k to previous value 92 | } 93 | 94 | out_train = matrix(y[index_train$knn_idx], ncol = k) 95 | 96 | if (!regression) { 97 | 98 | if (is.null(weights_function)) { 99 | 100 | out = func_tbl_dist(out_train, sort(Levels)) 101 | } 102 | else if (is.function(weights_function)) { 103 | 104 | W = FUNCTION_weights(index_train$knn_dist, weights_function) 105 | 106 | out = func_tbl(out_train, W, sort(Levels)) 107 | } 108 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 109 | 110 | W = FUN_kernels(weights_function, index_train$knn_dist, h) 111 | 112 | out = func_tbl(out_train, W, sort(Levels)) 113 | } 114 | else { 115 | 116 | stop('false input for the weights_function argument') 117 | } 118 | 119 | colnames(out) = paste0('class_', sort(Levels)) 120 | } 121 | 122 | else { 123 | 124 | if (is.null(weights_function)) { 125 | 126 | out = rowMeans(out_train) 127 | } 128 | else if (is.function(weights_function)) { 129 | 130 | W = FUNCTION_weights(index_train$knn_dist, weights_function) 131 | 132 | out = rowSums(out_train * W) 133 | } 134 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 135 | 136 | W = FUN_kernels(weights_function, index_train$knn_dist, h) 137 | 138 | out = rowSums(out_train * W) 139 | } 140 | else { 141 | 142 | stop('false input for the weights_function argument') 143 | } 144 | } 145 | 146 | return(out) 147 | } 148 | 149 | 150 | #================================================================================================================================================================================ 151 | -------------------------------------------------------------------------------- /vignettes/regression_using_the_housing_data.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Regression using the Housing data" 3 | author: "Lampros Mouselimis" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Regression using the Housing data} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | The following examples illustrate the functionality of the KernelKnn package for **regression** tasks. I'll make use of the *Housing* data set, 13 | 14 | ```{r, eval=T} 15 | 16 | data(Boston, package = 'KernelKnn') 17 | 18 | str(Boston) 19 | 20 | ``` 21 |
22 | 23 | 24 | When using an algorithm where the output depends on distance calculation (as is the case in k-nearest-neighbors) it is recommended to first scale the data, 25 | 26 | ```{r, eval=T} 27 | X = scale(Boston[, -ncol(Boston)]) 28 | y = Boston[, ncol(Boston)] 29 | 30 | # random split of data in train and test 31 | 32 | spl_train = sample(1:length(y), round(length(y) * 0.75)) 33 | spl_test = setdiff(1:length(y), spl_train) 34 | str(spl_train) 35 | str(spl_test) 36 | 37 | 38 | # evaluation metric 39 | 40 | mse = function (y_true, y_pred) { 41 | 42 | out = mean((y_true - y_pred)^2) 43 | 44 | out 45 | } 46 | 47 | ``` 48 | 49 | 50 | 51 | ## The KernelKnn function 52 | 53 | The KernelKnn function takes a number of arguments. To read details for each one of the arguments type ?KernelKnn::KernelKnn in the console. 54 | 55 | A simple k-nearest-neighbors can be run with weights_function = NULL (the parameter 'regression' should be set to TRUE for regression), 56 | 57 | ```{r, eval=T} 58 | 59 | library(KernelKnn) 60 | 61 | preds_TEST = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5, 62 | 63 | method = 'euclidean', weights_function = NULL, regression = T) 64 | str(preds_TEST) 65 | 66 | ``` 67 | 68 |
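The mse() helper defined above can be used for a quick (illustrative) hold-out error estimate of these predictions,

```{r, eval = F}

mse(y[spl_test], preds_TEST)

```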
69 | Using transf_categ_cols = TRUE, categorical features can be encoded either to dummy variables or to numeric features, depending on the number of unique values (here I convert the 'chas' and 'rad' features to factors to illustrate the *transf_categ_cols* parameter) 70 | 71 | ```{r, eval=T} 72 | 73 | 74 | apply(Boston, 2, function(x) length(unique(x))) 75 | 76 | 77 | tmp_bst = Boston 78 | tmp_bst$chas = as.factor(tmp_bst$chas) 79 | tmp_bst$rad = as.factor(tmp_bst$rad) 80 | 81 | preds_TEST = KernelKnn(tmp_bst[spl_train, -ncol(tmp_bst)], 82 | 83 | TEST_data = tmp_bst[spl_test, -ncol(tmp_bst)], 84 | 85 | y[spl_train], k = 5, method = 'euclidean', 86 | 87 | regression = T, transf_categ_cols = T) 88 | str(preds_TEST) 89 | 90 | ``` 91 |
92 | There are two ways to use a kernel in the KernelKnn function. The **first option** is to choose one of the existing kernels (*uniform*, *triangular*, *epanechnikov*, *biweight*, *triweight*, *tricube*, *gaussian*, *cosine*, *logistic*, *silverman*, *inverse*, *gaussianSimple*, *exponential*). Here, I use the *mahalanobis* metric (which takes advantage of the covariance matrix of the data, but it somewhat slows down training in comparison to the other distance metrics) and the *biweight* kernel, because they give optimal results (according to my *RandomSearchR* package), 93 | 94 | 95 | ```{r, eval=T} 96 | 97 | 98 | preds_TEST_biw = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5, 99 | 100 | method = 'mahalanobis', weights_function = 'biweight', 101 | 102 | regression = T, transf_categ_cols = F) 103 | str(preds_TEST_biw) 104 | 105 | ``` 106 |
107 | The **second option** is to give a self-defined kernel function. Here, I'll pick the density function of the normal distribution with mean = 0.0 and standard deviation = 1.0 (the data are scaled to have mean zero and unit variance), 108 | 109 | 110 | ```{r, eval=T} 111 | 112 | 113 | norm_kernel = function(W) { 114 | 115 | W = dnorm(W, mean = 0, sd = 1.0) 116 | 117 | W = W / rowSums(W) 118 | 119 | return(W) 120 | } 121 | 122 | 123 | preds_TEST_norm = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5, 124 | 125 | method = 'mahalanobis', weights_function = norm_kernel, 126 | 127 | regression = T, transf_categ_cols = F) 128 | str(preds_TEST_norm) 129 | 130 | ``` 131 |
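A quick sanity check (a sketch) that the self-defined kernel indeed returns row-normalized weights,

```{r, eval = F}

rowSums(norm_kernel(matrix(runif(25), 5, 5)))   # every row should sum to (approximately) 1.0

```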
132 | 133 | 134 | The computations can be sped up by using the parameter **threads** (multiple cores can be run in parallel). There is also the option to exclude **extrema** (minimum and maximum distances) during the calculation of the k-nearest-neighbor distances using extrema = TRUE. The *bandwidth* of the existing kernels can be tuned using the **h** parameter. 135 |
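For illustration, a sketch that combines these parameters (the values of *k*, *h* and *threads* below are arbitrary; removing the extrema requires a sufficiently large *k*, since the minimum and maximum neighbors are dropped),

```{r, eval = F}

preds_TEST_extr = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 9,

                            h = 0.5, method = 'euclidean', weights_function = 'triweight',

                            regression = T, threads = 4, extrema = T)

```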
136 | 137 | K-nearest-neighbor calculations in the KernelKnn function can be accomplished using the following distance metrics : *euclidean*, *manhattan*, *chebyshev*, *canberra*, *braycurtis*, *minkowski* (by default the order 'p' of the minkowski parameter equals k), *hamming*, *mahalanobis*, *pearson_correlation*, *simple_matching_coefficient*, *jaccard_coefficient* and *Rao_coefficient*. The last four are similarity measures and are appropriate for binary data [0,1]. 138 |
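As an illustration of the similarity measures, a sketch on hypothetical binary data (these methods expect each column to include both 0's and 1's),

```{r, eval = F}

set.seed(1)

X_bin = matrix(sample(0:1, 100 * 10, replace = T), 100, 10)    # hypothetical binary predictors

y_bin = rnorm(100)                                             # hypothetical numeric response

preds_bin = KernelKnn(X_bin[1:80, ], TEST_data = X_bin[81:100, ], y_bin[1:80], k = 5,

                      method = 'jaccard_coefficient', regression = T)

```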
139 | 140 | I employed my RandomSearchR package to find the optimal parameters for the KernelKnn function and the following two pairs of parameters give an optimal mean-squared-error, 141 |
142 |
143 | 144 | ```{r, eval = T, echo = F} 145 | 146 | knitr::kable(data.frame(k = c(9,3), method = c('mahalanobis', 'canberra'), kernel = c('triweight', 'cosine'))) 147 | ``` 148 | 149 | 150 | 151 | ## The KernelKnnCV function 152 | 153 | I'll use the *KernelKnnCV* function to calculate the mean-squared-error using 3-fold cross-validation for the previously mentioned parameter pairs, 154 | 155 | 156 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 157 | 158 | fit_cv_pair1 = KernelKnnCV(X, y, k = 9, folds = 3, method = 'mahalanobis', 159 | 160 | weights_function = 'triweight', regression = T, 161 | 162 | threads = 5, seed_num = 3) 163 | ``` 164 | 165 | ```{r, eval=T} 166 | str(fit_cv_pair1) 167 | ``` 168 | 169 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 170 | fit_cv_pair2 = KernelKnnCV(X, y, k = 3, folds = 3, method = 'canberra', 171 | 172 | weights_function = 'cosine', regression = T, 173 | 174 | threads = 5, seed_num = 3) 175 | ``` 176 | 177 | ```{r, eval=T} 178 | str(fit_cv_pair2) 179 | 180 | ``` 181 |
182 | 183 | 184 | Each cross-validated object returns a list of length 2 ( the first sublist includes the predictions for each fold whereas the second gives the indices of the folds) 185 | 186 | 187 | ```{r, eval=T} 188 | mse_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds), 189 | 190 | function(x) mse(y[fit_cv_pair1$folds[[x]]], 191 | 192 | fit_cv_pair1$preds[[x]]))) 193 | mse_pair1 194 | 195 | cat('mse for params_pair1 is :', mean(mse_pair1), '\n') 196 | 197 | mse_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds), 198 | 199 | function(x) mse(y[fit_cv_pair2$folds[[x]]], 200 | 201 | fit_cv_pair2$preds[[x]]))) 202 | mse_pair2 203 | 204 | cat('mse for params_pair2 is :', mean(mse_pair2), '\n') 205 | 206 | ``` 207 |
208 | 209 | 210 | ## Adding or multiplying kernels 211 | 212 | In the KernelKnn package there is also the option to **combine kernels** (adding or multiplying) from the existing ones. For instance, if I want to multiply the *tricube* with the *gaussian* kernel, then I'll give the following character string to the weights_function, *"tricube_gaussian_MULT"*. On the other hand, if I want to add the same kernels then the weights_function will be *"tricube_gaussian_ADD"*. I experimented with my RandomSearchR package combining the different kernels and the following two parameter settings gave optimal results, 213 | 214 |
215 | ```{r, eval = T, echo = F} 216 | 217 | knitr::kable(data.frame(k = c(19,18), method = c('mahalanobis', 'mahalanobis'), kernel = c('triangular_triweight_MULT', 'biweight_triweight_gaussian_MULT'))) 218 | ``` 219 | 220 |
221 | 222 | 223 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 224 | 225 | fit_cv_pair1 = KernelKnnCV(X, y, k = 19, folds = 3, method = 'mahalanobis', 226 | 227 | weights_function = 'triangular_triweight_MULT', 228 | 229 | regression = T, threads = 5, seed_num = 3) 230 | ``` 231 | 232 | ```{r, eval=T} 233 | str(fit_cv_pair1) 234 | ``` 235 | 236 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 237 | fit_cv_pair2 = KernelKnnCV(X, y, k = 18, folds = 3, method = 'mahalanobis', 238 | 239 | weights_function = 'biweight_triweight_gaussian_MULT', 240 | 241 | regression = T, threads = 5, seed_num = 3) 242 | ``` 243 | 244 | ```{r, eval=T} 245 | str(fit_cv_pair2) 246 | 247 | ``` 248 |
249 | 250 | ```{r, eval=T} 251 | mse_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds), 252 | 253 | function(x) mse(y[fit_cv_pair1$folds[[x]]], 254 | 255 | fit_cv_pair1$preds[[x]]))) 256 | mse_pair1 257 | 258 | cat('mse for params_pair1 is :', mean(mse_pair1), '\n') 259 | 260 | mse_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds), 261 | 262 | function(x) mse(y[fit_cv_pair2$folds[[x]]], 263 | 264 | fit_cv_pair2$preds[[x]]))) 265 | mse_pair2 266 | 267 | cat('mse for params_pair2 is :', mean(mse_pair2), '\n') 268 | 269 | ``` 270 |
271 | 272 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #============================================================================================================================================== 2 | 3 | #' this function normalizes the data 4 | #' 5 | #' @keywords internal 6 | 7 | normalized = function(x) { 8 | 9 | out = (x - min(x))/(max(x) - min(x)) 10 | 11 | out 12 | } 13 | 14 | 15 | 16 | #' this function returns the probabilities in case of classification 17 | #' 18 | #' @keywords internal 19 | 20 | func_tbl_dist = function(DF, Levels) { 21 | 22 | mat = matrix(rep(0, dim(DF)[1] * length(Levels)), ncol = length(Levels), nrow = dim(DF)[1]) 23 | 24 | for (i in 1:dim(DF)[1]) { 25 | 26 | tmp_tbl = prop.table(table(DF[i, ])) 27 | 28 | mat[i, as.numeric(names(tmp_tbl))] = tmp_tbl 29 | } 30 | 31 | mat 32 | } 33 | 34 | 35 | 36 | #' this function returns a table of probabilities for each label 37 | #' 38 | #' @keywords internal 39 | 40 | func_tbl = function(DF, W, labels) { 41 | 42 | tmp_W = matrix(rep(0, dim(DF)[1] * length(labels)), ncol = length(labels), nrow = dim(DF)[1]) 43 | 44 | for (i in 1:length(labels)) { 45 | 46 | tmp_W[, i] <- rowSums(W * (DF == labels[i])) 47 | } 48 | 49 | tmp_W 50 | } 51 | 52 | 53 | 54 | #' this function is used as a kernel-function-identifier [ takes the distances and a weights-kernel (in form of a function) and returns weights ] 55 | #' 56 | #' @keywords internal 57 | 58 | FUNCTION_weights = function(W_dist_matrix, weights_function, eps = 1.0e-6) { 59 | 60 | W_dist_matrix = t(apply(W_dist_matrix, 1, normalized)) 61 | 62 | W_dist_matrix = W_dist_matrix - eps 63 | 64 | W = do.call(weights_function, list(W_dist_matrix)) 65 | 66 | W <- W/rowSums(W) 67 | 68 | W 69 | } 70 | 71 | 72 | 73 | # secondary function used in 'FUN_kernels' function 74 | #' 75 | #' @keywords internal 76 | 77 | switch_secondary = function(kernel, W, h, eps = 1.0e-6) { 78 | 79 | W = t(apply(W, 1, normalized)) 80 | 81 | W = W - eps # add small value in case of 0's 82 | 83 | kernel <- match.arg(kernel, c('uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 84 | 85 | 'silverman', 'inverse', 'exponential'), FALSE) 86 | 87 | switch(kernel, 88 | 89 | uniform = {W = (1/2)* abs(W/h)}, 90 | 91 | triangular = {W = (1 - abs(W/h))}, 92 | 93 | epanechnikov = {W = (3/4) * (1 - (W / h) ^ 2)}, 94 | 95 | biweight = {W = (15/16) * ((1 - ((W / h) ^ 2)) ^ 2)}, 96 | 97 | triweight = {W = (35/32) * ((1 - ((W / h) ^ 2)) ^ 3)}, 98 | 99 | tricube = {W = (70/81) * ((1 - (abs(W) ^ 3)/(h ^ 3)) ^ 3)}, 100 | 101 | gaussian = {W = (1/sqrt(2*pi)) * exp((-1/2) * ((W ^ 2)/(2*(h ^ 2))))}, 102 | 103 | gaussianSimple = {W = exp(- (W / h) ^ 2)}, 104 | 105 | cosine = {W = (pi/4) * cos((pi * abs(W - 0.5))/(2*h))}, 106 | 107 | logistic = {W = (1/(exp(W/h) + 2 + exp(-W/h)))}, 108 | 109 | silverman = {W = 1/2 * exp(-abs(W/h)/sqrt(2)) * sin((abs(W)/(2*h)) + (pi/4))}, 110 | 111 | inverse = {W <- 1/abs(W/h)}, 112 | 113 | exponential = {W = exp(- abs(W / h))}, 114 | ) 115 | 116 | W <- W/rowSums(W) # normalize weights 117 | 118 | return(W) 119 | } 120 | 121 | 122 | #' Arithmetic operations on lists 123 | #' 124 | #' @references 125 | #' 126 | #' https://www.cs.toronto.edu/~duvenaud/cookbook/ 127 | #' 128 | #' https://raw.githubusercontent.com/duvenaud/phd-thesis/master/kernels.pdf 129 | #' 130 | #' @keywords internal 131 | 132 | 133 | 
switch.ops = function (LST, MODE = 'ADD') { 134 | 135 | if (!inherits(LST, "list")) stop("LST must be a list") 136 | 137 | if (!all(unlist(lapply(LST, function(x) inherits(x, c('data.frame', 'matrix')))))) stop('the sublist objects must be either matrices or data frames') 138 | 139 | r = all(unlist(lapply(LST, nrow)) == unlist(lapply(LST, nrow))[1]) 140 | 141 | c = all(unlist(lapply(LST, ncol)) == unlist(lapply(LST, ncol))[1]) 142 | 143 | if (!all(c(r, c))) stop("the dimensions of the included data.frames or matrices differ") 144 | 145 | if (MODE == 'ADD') { 146 | 147 | init_df = data.frame(matrix(rep(0, dim(LST[[1]])[1] * dim(LST[[1]])[2]), nrow = dim(LST[[1]])[1], ncol = dim(LST[[1]])[2]))} 148 | 149 | else if (MODE == 'MULT') { 150 | 151 | init_df = data.frame(matrix(rep(1, dim(LST[[1]])[1] * dim(LST[[1]])[2]), nrow = dim(LST[[1]])[1], ncol = dim(LST[[1]])[2])) 152 | } 153 | 154 | else { 155 | 156 | stop('invalid MODE type') 157 | } 158 | 159 | for (i in 1:length(LST)) { 160 | 161 | if (MODE == 'ADD') { 162 | 163 | init_df = init_df + LST[[i]]} 164 | 165 | if (MODE == 'MULT') { 166 | 167 | init_df = init_df * LST[[i]] 168 | } 169 | } 170 | 171 | colnames(init_df) = colnames(LST[[1]]) 172 | 173 | return(as.matrix(init_df)) 174 | } 175 | 176 | 177 | 178 | #' performs kernel smoothing using a bandwidth. Besides using a kernel there is also the option to combine kernels 179 | #' 180 | #' @keywords internal 181 | 182 | 183 | FUN_kernels = function(kernel, W, h) { 184 | 185 | spl = strsplit(kernel, '_')[[1]] 186 | 187 | s_op = spl[length(spl)] 188 | 189 | s_kerns = spl[-length(spl)] 190 | 191 | if (length(spl) > 1) { 192 | 193 | if (length(s_kerns) < 2) stop('invalid kernel combination') 194 | 195 | kernels = c('uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 196 | 197 | 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential') 198 | 199 | if (sum(s_kerns %in% kernels) != length(s_kerns)) stop('invalid kernel combination') 200 | 201 | lap = lapply(s_kerns, function(x) switch_secondary(x, W, h)) 202 | 203 | W = switch.ops(lap, MODE = s_op) 204 | 205 | W <- W/rowSums(W) 206 | } 207 | 208 | else { 209 | 210 | W = switch_secondary(kernel, W, h) 211 | } 212 | 213 | return(W) 214 | } 215 | 216 | 217 | 218 | #' OPTION to convert categorical features TO either numeric [ if levels more than 32] OR to dummy variables [ if levels less than 32 ] 219 | #' 220 | #' @keywords internal 221 | #' @importFrom stats model.matrix 222 | 223 | func_categorical_preds = function(prepr_categ) { 224 | 225 | less32 = sapply(prepr_categ, function(x) is.factor(x) && length(unique(x)) < 32) 226 | greater32 = sapply(prepr_categ, function(x) is.factor(x) && length(unique(x)) >= 32) 227 | 228 | if (sum(less32) == 1) { 229 | 230 | rem_predictors = names(which(less32)) 231 | out_L = model.matrix(~. - 1, data = data.frame(prepr_categ[, rem_predictors])) 232 | colnames(out_L) = paste0(rem_predictors, 1:dim(out_L)[2]) 233 | } 234 | 235 | if (sum(less32) > 1) { 236 | 237 | rem_predictors = names(which(less32)) 238 | out_L = model.matrix(~. 
- 1, data = prepr_categ[, rem_predictors]) 239 | colnames(out_L) = make.names(colnames(out_L)) 240 | } 241 | 242 | if (sum(greater32) > 0) { 243 | 244 | fact_predictors = names(which(greater32)) 245 | 246 | for (nams in fact_predictors) { 247 | 248 | prepr_categ[, nams] = as.numeric(prepr_categ[, nams]) 249 | } 250 | } 251 | 252 | if (sum(less32) > 0) { 253 | 254 | return(cbind(prepr_categ[, -which(colnames(prepr_categ) %in% rem_predictors)], out_L)) 255 | } 256 | 257 | else { 258 | 259 | return(prepr_categ) 260 | } 261 | } 262 | 263 | 264 | #' shuffle data 265 | #' 266 | #' this function shuffles the items of a vector 267 | #' @keywords internal 268 | 269 | func_shuffle = function(vec, times = 10) { 270 | 271 | for (i in 1:times) { 272 | 273 | out = sample(vec, length(vec)) 274 | } 275 | out 276 | } 277 | 278 | 279 | #' stratified folds (in classification) [ detailed information about class_folds in the FeatureSelection package ] 280 | #' 281 | #' this function creates stratified folds in binary and multiclass classification 282 | #' @keywords internal 283 | #' @importFrom utils combn 284 | 285 | 286 | class_folds = function(folds, RESP) { 287 | 288 | if (!is.factor(RESP)) { 289 | 290 | stop(simpleError("RESP must be a factor")) 291 | } 292 | 293 | clas = lapply(unique(RESP), function(x) which(RESP == x)) 294 | 295 | len = lapply(clas, function(x) length(x)) 296 | 297 | samp_vec = rep(1/folds, folds) 298 | 299 | prop = lapply(len, function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x]))) 300 | 301 | repl = unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y])))) 302 | 303 | spl = suppressWarnings(split(1:length(RESP), repl)) 304 | 305 | sort_names = paste0('fold_', 1:folds) 306 | 307 | spl = spl[sort_names] 308 | 309 | spl = lapply(spl, function(x) func_shuffle(x)) # the indices of the unique levels will be shuffled 310 | 311 | ind = t(combn(1:folds, 2)) 312 | 313 | ind1 = apply(ind, 1, function(x) length(intersect(spl[x[1]], spl[x[2]]))) 314 | 315 | if (sum(ind1) > 0) { 316 | 317 | stop(simpleError("there is an intersection between the resulted indexes of the folds")) 318 | 319 | } 320 | 321 | if (length(unlist(spl)) != length(RESP)) { 322 | 323 | stop(simpleError("the number of items in the folds are not equal with the response items")) 324 | } 325 | 326 | spl 327 | } 328 | 329 | 330 | #' create folds (in regression) [ detailed information about class_folds in the FeatureSelection package ] 331 | #' 332 | #' this function creates both stratified and non-stratified folds in regression 333 | #' @keywords internal 334 | 335 | 336 | regr_folds = function(folds, RESP) { 337 | 338 | if (is.factor(RESP)) { 339 | 340 | stop(simpleError("this function is meant for regression for classification use the 'class_folds' function")) 341 | } 342 | 343 | samp_vec = rep(1/folds, folds) 344 | 345 | sort_names = paste0('fold_', 1:folds) 346 | 347 | prop = lapply(length(RESP), function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x]))) 348 | 349 | repl = func_shuffle(unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y]))))) 350 | 351 | spl = suppressWarnings(split(1:length(RESP), repl)) 352 | 353 | spl = spl[sort_names] 354 | 355 | if (length(unlist(spl)) != length(RESP)) { 356 | 357 | stop(simpleError("the length of the splits are not equal with the length of the response")) 358 | } 359 | 360 | spl 361 | } 362 | 363 | 364 | 365 | 
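# Illustrative (commented-out) usage of 'switch.ops', which 'FUN_kernels' relies on when
# combining kernels, e.g. for a weights_function such as 'tricube_gaussian_MULT':
#
#   m1 = matrix(runif(9), nrow = 3); m2 = matrix(runif(9), nrow = 3)
#
#   switch.ops(list(m1, m2), MODE = 'ADD')     # element-wise sum of the two matrices
#   switch.ops(list(m1, m2), MODE = 'MULT')    # element-wise product of the two matrices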
#================================================================================================================================================================ 366 | -------------------------------------------------------------------------------- /vignettes/binary_classification_using_the_ionosphere_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "binary classification using the ionosphere data" 3 | author: "Lampros Mouselimis" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{binary classification using the ionosphere data} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | 13 | The following examples illustrate the functionality of the KernelKnn package for **classification** tasks. I'll make use of the *ionosphere* data set, 14 | 15 | 16 | 17 | ```{r, eval=T} 18 | 19 | data(ionosphere, package = 'KernelKnn') 20 | 21 | apply(ionosphere, 2, function(x) length(unique(x))) 22 | 23 | # the second column will be removed as it has a single unique value 24 | 25 | ionosphere = ionosphere[, -2] 26 | ``` 27 |
28 | 29 | 30 | When using an algorithm where the output depends on distance calculation (as is the case in k-nearest-neighbors) it is recommended to first scale the data, 31 | 32 | 33 | ```{r, eval=T} 34 | # it is recommended to scale the data 35 | 36 | X = scale(ionosphere[, -ncol(ionosphere)]) 37 | y = ionosphere[, ncol(ionosphere)] 38 | ``` 39 |
40 | 41 | **important note** : In classification, both functions *KernelKnn* and *KernelKnnCV* accept a numeric vector as a response variable (here y) and the unique values of the labels should begin from 1. This is important otherwise the internal functions do not work. Furthermore, both functions (by default) return predictions in form of probabilities, which can be converted to labels by using either a threshold (if binary classification) or the maximum value of each column (if multiclass classification). 42 | 43 | 44 | ```{r, eval=T} 45 | 46 | # labels should be numeric and begin from 1:Inf 47 | 48 | y = c(1:length(unique(y)))[ match(ionosphere$class, sort(unique(ionosphere$class))) ] 49 | 50 | # random split of data in train and test 51 | 52 | spl_train = sample(1:length(y), round(length(y) * 0.75)) 53 | spl_test = setdiff(1:length(y), spl_train) 54 | str(spl_train) 55 | str(spl_test) 56 | 57 | 58 | # evaluation metric 59 | 60 | acc = function (y_true, preds) { 61 | 62 | out = table(y_true, max.col(preds, ties.method = "random")) 63 | 64 | acc = sum(diag(out))/sum(out) 65 | 66 | acc 67 | } 68 | 69 | ``` 70 | 71 | 72 | ## The KernelKnn function 73 | 74 | The KernelKnn function takes a number of arguments. To read details for each one of the arguments type ?KernelKnn::KernelKnn in the console. 75 | 76 | A simple k-nearest-neighbors can be run with weights_function = NULL and the parameter 'regression' should be set to FALSE. In classification the *Levels* parameter takes the unique values of the response variable, 77 | 78 | ```{r, eval=T, warning = FALSE, message = FALSE} 79 | 80 | library(KernelKnn) 81 | 82 | preds_TEST = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5 , 83 | 84 | method = 'euclidean', weights_function = NULL, regression = F, 85 | 86 | Levels = unique(y)) 87 | head(preds_TEST) 88 | 89 | ``` 90 |
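As mentioned in the note above, these predictions are class probabilities. A minimal sketch (not evaluated) of converting them to hard labels,

```{r, eval = F}

# multiclass (or binary) : pick the most probable class per row

pred_labels = max.col(preds_TEST, ties.method = "random")

# binary alternative : threshold the probability of the second class

pred_labels_binary = ifelse(preds_TEST[, 2] > 0.5, 2, 1)
```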
91 | There are two ways to use a kernel in the KernelKnn function. The **first option** is to choose one of the existing kernels (*uniform*, *triangular*, *epanechnikov*, *biweight*, *triweight*, *tricube*, *gaussian*, *cosine*, *logistic*, *silverman*, *inverse*, *gaussianSimple*, *exponential*). Here, I use the *canberra* metric and the *tricube* kernel because they give optimal results (according to my RandomSearchR package), 92 | 93 | 94 | ```{r, eval=T} 95 | 96 | 97 | preds_TEST_tric = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10 , 98 | 99 | method = 'canberra', weights_function = 'tricube', regression = F, 100 | 101 | Levels = unique(y)) 102 | head(preds_TEST_tric) 103 | 104 | ``` 105 | 106 |
107 | The **second option** is to give a self-defined kernel function. Here, I'll pick the density function of the normal distribution with mean = 0.0 and standard deviation = 1.0 (the data are scaled to have mean zero and unit variance), 108 | 109 | 110 | ```{r, eval=T} 111 | 112 | 113 | norm_kernel = function(W) { 114 | 115 | W = dnorm(W, mean = 0, sd = 1.0) 116 | 117 | W = W / rowSums(W) 118 | 119 | return(W) 120 | } 121 | 122 | 123 | preds_TEST_norm = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10 , 124 | 125 | method = 'canberra', weights_function = norm_kernel, regression = F, 126 | 127 | Levels = unique(y)) 128 | head(preds_TEST_norm) 129 | 130 | ``` 131 |
132 | 133 | The computations can be sped up by using the parameter **threads** (multiple cores can be run in parallel). There is also the option to exclude **extrema** (minimum and maximum distances) during the calculation of the k-nearest-neighbor distances using extrema = TRUE. The *bandwidth* of the existing kernels can be tuned using the **h** parameter. 134 |
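For instance, the bandwidth can be varied for one of the existing kernels as follows (a minimal sketch, not evaluated; the h values are illustrative),

```{r, eval = F}

for (bw in c(0.5, 1.0, 2.0)) {

  pr = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10,
                 h = bw, method = 'canberra', weights_function = 'tricube',
                 regression = F, Levels = unique(y))

  cat('h =', bw, 'accuracy =', acc(y[spl_test], pr), '\n')
}
```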
135 | 136 | K-nearest-neighbor calculations in the KernelKnn function can be accomplished using the following distance metrics : *euclidean*, *manhattan*, *chebyshev*, *canberra*, *braycurtis*, *minkowski* (by default the order 'p' of the minkowski parameter equals k), *hamming*, *mahalanobis*, *pearson_correlation*, *simple_matching_coefficient*, *jaccard_coefficient* and *Rao_coefficient*. The last four are similarity measures and are appropriate for binary data [0,1]. 137 |
138 | 139 | I employed my RandomSearchR package to find the optimal parameters for the KernelKnn function and the following two pairs of parameters give an optimal accuracy, 140 |

141 | 142 | ```{r, eval = T, echo = F} 143 | 144 | knitr::kable(data.frame(k = c(10,9), method = c('canberra', 'canberra'), kernel = c('tricube', 'epanechnikov'))) 145 | ``` 146 | 147 | 148 | ## The KernelKnnCV function 149 | 150 | I'll use the *KernelKnnCV* function to calculate the accuracy using 5-fold cross-validation for the previously mentioned parameter pairs, 151 | 152 | 153 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 154 | 155 | fit_cv_pair1 = KernelKnnCV(X, y, k = 10 , folds = 5, method = 'canberra', 156 | 157 | weights_function = 'tricube', regression = F, 158 | 159 | Levels = unique(y), threads = 5, seed_num = 5) 160 | ``` 161 | 162 | ```{r, eval=T} 163 | str(fit_cv_pair1) 164 | ``` 165 | 166 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 167 | fit_cv_pair2 = KernelKnnCV(X, y, k = 9 , folds = 5, method = 'canberra', 168 | 169 | weights_function = 'epanechnikov', regression = F, 170 | 171 | Levels = unique(y), threads = 5, seed_num = 5) 172 | ``` 173 | 174 | 175 | ```{r, eval=T} 176 | str(fit_cv_pair2) 177 | 178 | ``` 179 |
180 | 181 | 182 | Each cross-validated object returns a list of length 2 ( the first sublist includes the predictions for each fold whereas the second gives the indices of the folds) 183 | 184 | 185 | ```{r, eval=T} 186 | acc_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds), 187 | 188 | function(x) acc(y[fit_cv_pair1$folds[[x]]], 189 | 190 | fit_cv_pair1$preds[[x]]))) 191 | acc_pair1 192 | 193 | cat('accuracy for params_pair1 is :', mean(acc_pair1), '\n') 194 | 195 | acc_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds), 196 | 197 | function(x) acc(y[fit_cv_pair2$folds[[x]]], 198 | 199 | fit_cv_pair2$preds[[x]]))) 200 | acc_pair2 201 | 202 | cat('accuracy for params_pair2 is :', mean(acc_pair2), '\n') 203 | 204 | ``` 205 |
206 | 207 | 208 | ## Adding or multiplying kernels 209 | 210 | In the KernelKnn package there is also the option to **combine kernels** (adding or multiplying) from the existing ones. For instance, if I want to multiply the *tricube* with the *gaussian* kernel, then I'll give the following character string to the weights_function, *"tricube_gaussian_MULT"*. On the other hand, if I want to add the same kernels, then the weights_function will be *"tricube_gaussian_ADD"*. I experimented with my RandomSearchR package combining the different kernels and the following two parameter settings gave optimal results, 211 | 212 |
213 | 214 | ```{r, eval = T, echo = F} 215 | 216 | knitr::kable(data.frame(k = c(16,5), method = c('canberra', 'canberra'), kernel = c('biweight_triweight_gaussian_MULT', 'triangular_triweight_MULT'))) 217 | ``` 218 | 219 |
220 | 221 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 222 | 223 | fit_cv_pair1 = KernelKnnCV(X, y, k = 16, folds = 5, method = 'canberra', 224 | 225 | weights_function = 'biweight_triweight_gaussian_MULT', 226 | 227 | regression = F, Levels = unique(y), threads = 5, 228 | 229 | seed_num = 5) 230 | ``` 231 | 232 | ```{r, eval=T} 233 | str(fit_cv_pair1) 234 | ``` 235 | 236 | ```{r, eval=T, warning = FALSE, message = FALSE, results = 'hide'} 237 | 238 | fit_cv_pair2 = KernelKnnCV(X, y, k = 5, folds = 5, method = 'canberra', 239 | 240 | weights_function = 'triangular_triweight_MULT', 241 | 242 | regression = F, Levels = unique(y), threads = 5, 243 | 244 | seed_num = 5) 245 | 246 | ``` 247 | 248 | ```{r, eval=T} 249 | str(fit_cv_pair2) 250 | 251 | ``` 252 |
253 | 254 | ```{r, eval=T} 255 | acc_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds), 256 | 257 | function(x) acc(y[fit_cv_pair1$folds[[x]]], 258 | 259 | fit_cv_pair1$preds[[x]]))) 260 | acc_pair1 261 | 262 | cat('accuracy for params_pair1 is :', mean(acc_pair1), '\n') 263 | 264 | acc_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds), 265 | 266 | function(x) acc(y[fit_cv_pair2$folds[[x]]], 267 | 268 | fit_cv_pair2$preds[[x]]))) 269 | acc_pair2 270 | 271 | cat('accuracy for params_pair2 is :', mean(acc_pair2), '\n') 272 | 273 | ``` 274 |
275 | 276 | -------------------------------------------------------------------------------- /R/kernelknn.R: -------------------------------------------------------------------------------- 1 | 2 | #' kernel k-nearest-neighbors 3 | #' 4 | #' This function utilizes kernel k nearest neighbors to predict new observations 5 | #' 6 | #' @param data a data frame or matrix 7 | #' @param TEST_data a data frame or matrix (it can be also NULL) 8 | #' @param y a numeric vector (in classification the labels must be numeric from 1:Inf) 9 | #' @param k an integer specifying the k-nearest-neighbors 10 | #' @param h the bandwidth (applicable if the weights_function is not NULL, defaults to 1.0) 11 | #' @param method a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient' 12 | #' @param weights_function there are various ways of specifying the kernel function. See the details section. 13 | #' @param regression a boolean (TRUE,FALSE) specifying if regression or classification should be performed 14 | #' @param transf_categ_cols a boolean (TRUE, FALSE) specifying if the categorical columns should be converted to numeric or to dummy variables 15 | #' @param threads the number of cores to be used in parallel (openmp will be employed) 16 | #' @param extrema if TRUE then the minimum and maximum values from the k-nearest-neighbors will be removed (can be thought as outlier removal) 17 | #' @param Levels a numeric vector. In case of classification the unique levels of the response variable are necessary 18 | #' @param p a numeric value specifying the 'minkowski' order, i.e. if 'method' is set to 'minkowski'. This parameter defaults to 'k' 19 | #' @return a vector (if regression is TRUE), or a data frame with class probabilities (if regression is FALSE) 20 | #' @author Lampros Mouselimis 21 | #' @details 22 | #' This function takes a number of arguments and it returns the predicted values. If TEST_data is NULL then the predictions for the train data will be returned, whereas if TEST_data is not NULL then the predictions for the TEST_data will be returned. 23 | #' There are three possible ways to specify the weights function, 1st option : if the weights_function is NULL then a simple k-nearest-neighbor is performed. 2nd option : the weights_function is one of 'uniform', 'triangular', 'epanechnikov', 'biweight', 'triweight', 'tricube', 'gaussian', 'cosine', 'logistic', 'gaussianSimple', 'silverman', 'inverse', 'exponential'. The 2nd option can be extended by combining kernels from the existing ones (adding or multiplying). For instance, I can multiply the tricube with the gaussian kernel by giving 'tricube_gaussian_MULT' or I can add the previously mentioned kernels by giving 'tricube_gaussian_ADD'. 
3rd option : a user defined kernel function 24 | #' @export 25 | #' @examples 26 | #' 27 | #' data(Boston) 28 | #' 29 | #' X = Boston[, -ncol(Boston)] 30 | #' y = Boston[, ncol(Boston)] 31 | #' 32 | #' out = KernelKnn(X, TEST_data = NULL, y, k = 5, method = 'euclidean', regression = TRUE) 33 | #' 34 | 35 | 36 | KernelKnn = function(data, TEST_data = NULL, y, k = 5, h = 1.0, method = 'euclidean', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL, p = k) { 37 | 38 | categorical_data_present = sapply(data, function(x) is.factor(x) || is.character(x)) 39 | 40 | if (sum(categorical_data_present) && !transf_categ_cols) stop('Categorical columns present in data. These should be either converted to numeric or the function should be run with transf_categ_cols = TRUE') 41 | if (!is.numeric(k) || is.null(k) || (k >= nrow(data)) || k < 1) stop('k must be of type integer, greater than 0 and less than nrow(train)') 42 | if (abs(k - round(k)) > 0) { 43 | k = round(k) 44 | warning('k is float and will be rounded to : ', call. = F, expr = k)} 45 | if (h == 0) stop('h can be any number except for 0') 46 | if (!is.character(method) || is.null(method) || !method %in% c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 47 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient')) 48 | stop("method must be of type character and one of 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 49 | 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'") 50 | if (is.null(y)) stop('the response variable should be numeric') 51 | if (is.integer(y)) y = as.numeric(y) 52 | if (!is.numeric(y)) stop('in both regression and classification the response variable should be numeric or integer and in classification it should start from 1') 53 | if (!regression && is.null(Levels)) stop('In classification give the unique values of y in form of a vector') 54 | if (!regression && any(unique(y) < 1)) stop('the response variable values should begin from 1') 55 | if (!regression) { 56 | if (!all(Levels %in% unique(y))) stop("The specified 'Levels' must match the unique 'y' labels!") 57 | } 58 | if (any(is.na(data)) || any(is.na(y))) stop('the data or the response variable includes missing values') 59 | if (!is.null(TEST_data) && any(is.na(TEST_data))) stop('the TEST_data includes missing values') 60 | if (length(y) != nrow(data)) stop('the size of the data and y differ') 61 | if (extrema && k < 4) stop('k must be greater than 3 if extrema = TRUE') 62 | if (method %in% c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient') && !sum(apply(data, 2, function(x) all(unique(x) %in% c(0,1)))) == ncol(data)) 63 | stop("methods : 'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' require the data to be in binary form e.g 0,1") 64 | if (!is.null(TEST_data) && method %in% c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient') && !sum(apply(TEST_data, 2, function(x) all(unique(x) %in% c(0,1)))) == ncol(TEST_data)) 65 | stop("methods : 'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' require the TEST_data to be in binary form e.g 0,1") 66 | 67 | if (!is.null(TEST_data) && ncol(data) != ncol(TEST_data)) stop('the number of columns in train and test data differ') 68 | 69 | 
#---------------------------------------------------------------------------------------------------- 70 | 71 | # check if any of the variables is categorical, if TRUE then convert categorical predictors to either dummy variables or numeric variables [ depending on the number of levels ] 72 | 73 | if (transf_categ_cols) { 74 | 75 | if (is.null(TEST_data)) { 76 | 77 | data = func_categorical_preds(data)} 78 | 79 | else { 80 | 81 | tmp_dat = func_categorical_preds(rbind(data, TEST_data)) 82 | 83 | data = tmp_dat[1:nrow(data), ] 84 | 85 | TEST_data = tmp_dat[(nrow(data) + 1):nrow(tmp_dat), ] 86 | } 87 | } 88 | #---------------------------------------------------------------------------------------------------- 89 | 90 | if (is.null(TEST_data)) { 91 | 92 | mat = matrix(, nrow = 0, ncol = 0) 93 | 94 | if (!is.matrix(data)) data = as.matrix(data) 95 | 96 | if (extrema) { 97 | 98 | k = k + 2 # add two values (for min-max) 99 | } 100 | 101 | index_train = knn_index_dist_rcpp(data, mat, k = k, method = method, threads = threads, p = p) 102 | 103 | if (extrema) { 104 | 105 | index_train$train_knn_idx = index_train$train_knn_idx[, -c(1,k)] # remove min, max (matrices already sorted) 106 | index_train$train_knn_dist = index_train$train_knn_dist[, -c(1,k)] # remove min, max (matrices already sorted) 107 | 108 | k = k - 2 # adjust k to previous value 109 | } 110 | 111 | out_train = matrix(y[index_train$train_knn_idx], nrow = nrow(data), ncol = k) 112 | 113 | if (!regression) { 114 | 115 | if (is.null(weights_function)) { 116 | 117 | out = func_tbl_dist(out_train, sort(Levels)) 118 | } 119 | else if (is.function(weights_function)) { 120 | 121 | W = FUNCTION_weights(index_train$train_knn_dist, weights_function) 122 | 123 | out = func_tbl(out_train, W, sort(Levels)) 124 | } 125 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 126 | 127 | W = FUN_kernels(weights_function, index_train$train_knn_dist, h) 128 | 129 | out = func_tbl(out_train, W, sort(Levels)) 130 | } 131 | else { 132 | stop('false input for the weights_function argument') 133 | } 134 | 135 | colnames(out) = paste0('class_', sort(Levels)) 136 | } 137 | 138 | else { 139 | 140 | if (is.null(weights_function)) { 141 | 142 | out = rowMeans(out_train) 143 | } 144 | else if (is.function(weights_function)) { 145 | 146 | W = FUNCTION_weights(index_train$train_knn_dist, weights_function) 147 | 148 | out = rowSums(out_train * W) 149 | } 150 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 151 | 152 | W = FUN_kernels(weights_function, index_train$train_knn_dist, h) 153 | 154 | out = rowSums(out_train * W) 155 | } 156 | else { 157 | 158 | stop('false input for the weights_function argument') 159 | } 160 | } 161 | 162 | return(out) 163 | } 164 | 165 | else { 166 | 167 | if (!is.matrix(data)) data = as.matrix(data) 168 | if (!is.matrix(TEST_data)) TEST_data = as.matrix(TEST_data) 169 | 170 | if (extrema) { 171 | 172 | k = k + 2 # add two values (for min-max) 173 | } 174 | 175 | index = knn_index_dist_rcpp(data, TEST_data, k = k, method = method, threads = threads, p = p) 176 | 177 | if (extrema) { 178 | 179 | index$test_knn_idx = index$test_knn_idx[, -c(1,k)] # remove min, max (matrices already sorted) 180 | index$test_knn_dist = index$test_knn_dist[, -c(1,k)] # remove min, max (matrices already sorted) 181 | 182 | k = k - 2 # adjust k to previous value 183 | } 184 | 185 | out_test = matrix(y[index$test_knn_idx], ncol = k) 186 | 187 | if (!regression) { 188 | 189 | if (is.null(weights_function)) { 190 | 
191 | out_te = func_tbl_dist(out_test, sort(Levels)) 192 | } 193 | else if (is.function(weights_function)) { 194 | 195 | W_te = FUNCTION_weights(index$test_knn_dist, weights_function) 196 | 197 | out_te = func_tbl(out_test, W_te, sort(Levels)) 198 | } 199 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 200 | 201 | W_te = FUN_kernels(weights_function, index$test_knn_dist, h) 202 | 203 | out_te = func_tbl(out_test, W_te, sort(Levels)) 204 | } 205 | else { 206 | 207 | stop('false input for the weights_function argument') 208 | } 209 | 210 | colnames(out_te) = paste0('class_', sort(Levels)) 211 | } 212 | 213 | else { 214 | 215 | if (is.null(weights_function)) { 216 | 217 | out_te = rowMeans(out_test) 218 | } 219 | else if (is.function(weights_function)) { 220 | 221 | W_te = FUNCTION_weights(index$test_knn_dist, weights_function) 222 | 223 | out_te = rowSums(out_test * W_te) 224 | } 225 | else if (is.character(weights_function) && nchar(weights_function) > 1) { 226 | 227 | W_te = FUN_kernels(weights_function, index$test_knn_dist, h) 228 | 229 | out_te = rowSums(out_test * W_te) 230 | } 231 | else { 232 | stop('false input for the weights_function argument') 233 | } 234 | } 235 | 236 | return(out_te) 237 | } 238 | } 239 | 240 | 241 | #================================================================================================================================================================================ 242 | -------------------------------------------------------------------------------- /tests/testthat/test-knn_index_dist.R: -------------------------------------------------------------------------------- 1 | #============================================================================================================================================================== 2 | 3 | context("Knn index distance") 4 | 5 | #================= 6 | # Error handling 7 | #================= 8 | 9 | 10 | testthat::test_that("it returns an error if a factor variable is present in the data and the transf_categ_cols = FALSE", { 11 | 12 | tmp_dat = xtr 13 | 14 | tmp_dat$rad = as.factor(tmp_dat$rad) 15 | 16 | testthat::expect_error(knn.index.dist(tmp_dat, TEST_data = xte, k = 5, method = 'euclidean', transf_categ_cols = F, threads = 1)) 17 | }) 18 | 19 | testthat::test_that("it returns an error if a character variable is present in the data and the transf_categ_cols = FALSE", { 20 | 21 | tmp_dat = xtr 22 | 23 | tmp_dat$rad = as.character(tmp_dat$rad) 24 | 25 | testthat::expect_error(knn.index.dist(tmp_dat, TEST_data = xte, k = 5, method = 'euclidean', transf_categ_cols = F, threads = 1)) 26 | }) 27 | 28 | 29 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 30 | 31 | testthat::expect_error(knn.index.dist(xtr, TEST_data = NULL, k = 5, method = 'simple_matching_coefficient', transf_categ_cols = F, threads = 1)) 32 | }) 33 | 34 | 35 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 36 | 37 | testthat::expect_error(knn.index.dist(xtr, TEST_data = NULL, k = 5, method = 'jaccard_coefficient', transf_categ_cols = F, threads = 1)) 38 | }) 39 | 40 | 41 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 42 | 43 | testthat::expect_error(knn.index.dist(xtr, TEST_data = NULL, k = 5, 
method = 'Rao_coefficient', transf_categ_cols = F, threads = 1)) 44 | }) 45 | 46 | 47 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 48 | 49 | testthat::expect_error(knn.index.dist(xtr, TEST_data = xte, k = 5, method = 'simple_matching_coefficient', transf_categ_cols = F, threads = 1)) 50 | }) 51 | 52 | 53 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 54 | 55 | testthat::expect_error(knn.index.dist(xtr, TEST_data = xte, k = 5, method = 'jaccard_coefficient', transf_categ_cols = F, threads = 1)) 56 | }) 57 | 58 | 59 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 60 | 61 | testthat::expect_error(knn.index.dist(xtr, TEST_data = xte, k = 5, method = 'Rao_coefficient', transf_categ_cols = F, threads = 1)) 62 | }) 63 | 64 | 65 | testthat::test_that("it returns an error if k is NULL", { 66 | 67 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = NULL, method = 'euclidean', transf_categ_cols = F, threads = 1)) 68 | }) 69 | 70 | 71 | testthat::test_that("it returns an error if k is a character", { 72 | 73 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = 'invalid', method = 'euclidean', transf_categ_cols = F, threads = 1) ) 74 | }) 75 | 76 | 77 | testthat::test_that("it returns an error if k is greater or equal to the number of rows of the train data", { 78 | 79 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = nrow(xtr), method = 'euclidean', transf_categ_cols = F, threads = 1) ) 80 | }) 81 | 82 | 83 | testthat::test_that("it returns an error if k is less than 1", { 84 | 85 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = -1, method = 'euclidean', transf_categ_cols = F, threads = 1) ) 86 | }) 87 | 88 | 89 | testthat::test_that("it returns an error if the method is NULL", { 90 | 91 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = 5, method = NULL, transf_categ_cols = F, threads = 1) ) 92 | }) 93 | 94 | 95 | testthat::test_that("it returns an error if the method is not a character", { 96 | 97 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = 5 , method = 1, transf_categ_cols = F, threads = 1) ) 98 | }) 99 | 100 | 101 | testthat::test_that("it returns an error if the method is a character, but not one of the valid names", { 102 | 103 | testthat::expect_error( knn.index.dist(xtr, TEST_data = xte, k = 5 , method = 'invalid', transf_categ_cols = F, threads = 1) ) 104 | }) 105 | 106 | 107 | testthat::test_that("it returns an error if missing values are present in the data", { 108 | 109 | tmp_dat = xtr 110 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 111 | 112 | testthat::expect_error( knn.index.dist(tmp_dat, TEST_data = xte, k = 5 , method = 'euclidean', transf_categ_cols = F, threads = 1) ) 113 | }) 114 | 115 | 116 | testthat::test_that("it returns an error if missing values are present in the TEST data", { 117 | 118 | tmp_dat = xte 119 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 120 | 121 | testthat::expect_error( knn.index.dist(xtr, TEST_data = tmp_dat, k = 5 , method = 'euclidean', transf_categ_cols = F, threads = 1) ) 122 | }) 123 | 124 | 125 | testthat::test_that("if the number of columns in train and test data differ it returns an error", 
{ 126 | 127 | tmp_xte = xte[, -ncol(xte)] 128 | 129 | testthat::expect_error( knn.index.dist(xtr, TEST_data = tmp_xte, k = 4 , method = 'euclidean', transf_categ_cols = F, threads = 1) ) 130 | }) 131 | 132 | 133 | # testing of knn.index.dist 134 | 135 | 136 | testthat::test_that("if transf_categ_cols = TRUE and TEST_data = NULL the knn.index.dist returns a list with number of rows in each sublist equal to the number of rows in train data", { 137 | 138 | tmp_xtr = xtr 139 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 140 | 141 | res = knn.index.dist(tmp_xtr, TEST_data = NULL, k = 5 , method = 'euclidean', transf_categ_cols = T, threads = 1) 142 | 143 | testthat::expect_true(length(res) == 2 && mean(unlist(lapply(res, nrow))) == nrow(xtr)) 144 | }) 145 | 146 | 147 | testthat::test_that("if transf_categ_cols = TRUE and TEST_data is NOT NULL the knn.index.dist returns a list with number of rows in each sublist equal to the number of rows in the TEST data", { 148 | 149 | tmp_xtr = xtr 150 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 151 | 152 | tmp_xte = xte 153 | tmp_xte$rad = as.factor(tmp_xte$rad) 154 | 155 | res = knn.index.dist(tmp_xtr, TEST_data = tmp_xte, k = 5 , method = 'euclidean', transf_categ_cols = T, threads = 1) 156 | 157 | testthat::expect_true(length(res) == 2 && mean(unlist(lapply(res, nrow))) == nrow(xte)) 158 | }) 159 | 160 | 161 | 162 | testthat::test_that("if the TEST data is NULL for all possible combinations [ when transf_categ_cols = T ] the knn.index.dist returns a list of length 2 with 163 | 164 | number of rows in each sublist equal to the number of rows in the train data", { 165 | 166 | tmp_xtr = xtr 167 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 168 | 169 | lst = list() 170 | 171 | for (k in 4:6) { 172 | 173 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 174 | 175 | tmp_res = knn.index.dist(tmp_xtr, TEST_data = NULL, k = k , method = metric, transf_categ_cols = T, threads = 1) 176 | 177 | lst = lappend(lst, length(tmp_res) == 2 && mean(unlist(lapply(tmp_res, nrow))) == nrow(xtr)) 178 | } 179 | } 180 | 181 | testthat::expect_true(all(unlist(lst))) 182 | }) 183 | 184 | 185 | 186 | testthat::test_that("if the TEST data is NULL for all possible combinations [ when transf_categ_cols = F ] the knn.index.dist returns a list of length 2 with 187 | 188 | number of rows in each sublist equal to the number of rows in the train data", { 189 | 190 | lst = list() 191 | 192 | for (k in 4:6) { 193 | 194 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 195 | 196 | tmp_res = knn.index.dist(xtr, TEST_data = NULL, k = k , method = metric, transf_categ_cols = F, threads = 1) 197 | 198 | lst = lappend(lst, length(tmp_res) == 2 && mean(unlist(lapply(tmp_res, nrow))) == nrow(xtr)) 199 | } 200 | } 201 | 202 | testthat::expect_true(all(unlist(lst))) 203 | }) 204 | 205 | 206 | 207 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ when transf_categ_cols = T ] the knn.index.dist returns a list of length 2 with 208 | 209 | number of rows in each sublist equal to the number of rows in the TEST data", { 210 | 211 | tmp_xtr = xtr 212 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 213 | 214 | tmp_xte = xte 215 | tmp_xte$rad = as.factor(tmp_xte$rad) 216 | 217 | lst = list() 218 | 219 | for (k in 4:6) { 220 | 221 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 222 | 223 | tmp_res = knn.index.dist(tmp_xtr, TEST_data = tmp_xte, k = k , method = metric, transf_categ_cols = T, threads = 1) 224 | 225 | lst = lappend(lst, length(tmp_res) == 2 && 
mean(unlist(lapply(tmp_res, nrow))) == nrow(tmp_xte)) 226 | } 227 | } 228 | 229 | testthat::expect_true(all(unlist(lst))) 230 | }) 231 | 232 | 233 | 234 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ when transf_categ_cols = F ] the knn.index.dist returns a list of length 2 with 235 | 236 | number of rows in each sublist equal to the number of rows in the TEST data", { 237 | 238 | lst = list() 239 | 240 | for (k in 4:6) { 241 | 242 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 243 | 244 | tmp_res = knn.index.dist(xtr, TEST_data = xte, k = k , method = metric, transf_categ_cols = F, threads = 1) 245 | 246 | lst = lappend(lst, length(tmp_res) == 2 && mean(unlist(lapply(tmp_res, nrow))) == nrow(xte)) 247 | } 248 | } 249 | 250 | testthat::expect_true(all(unlist(lst))) 251 | }) 252 | 253 | 254 | 255 | testthat::test_that("the similarity measures 'simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient' and 'pearson_correlation' return correct output 256 | 257 | in case of binary data when TEST data is not NULL", { 258 | 259 | 260 | dat = do.call(cbind, lapply(1:10, function(x) sample(0:1, 100, replace = T))) 261 | TES = do.call(cbind, lapply(1:10, function(x) sample(0:1, 50, replace = T))) 262 | 263 | lst = count = list() 264 | 265 | for (k in 4:6) { 266 | 267 | for (metric in c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient', 'pearson_correlation')) { 268 | 269 | tmp_lst = knn.index.dist(dat, TEST_data = TES, k = k , method = metric, transf_categ_cols = F, threads = 1) 270 | 271 | lst = lappend(lst, tmp_lst) 272 | 273 | count = lappend(count, ncol(tmp_lst$test_knn_idx) == k && ncol(tmp_lst$test_knn_dist) == k) 274 | } 275 | } 276 | 277 | res = unlist(lapply(lst, function(x) nrow(x$test_knn_idx) == nrow(TES) && nrow(x$test_knn_dist) == nrow(TES))) 278 | 279 | testthat::expect_true(all(res) && all(unlist(count))) 280 | }) 281 | 282 | 283 | testthat::test_that("the 'p' parameter when method is 'minkowski' returns the expected output", { 284 | 285 | k = 2 286 | res_wo = knn.index.dist(data = X, k = k , method = 'minkowski') # without specifying the 'p' parameter 287 | res_w = knn.index.dist(data = X, k = k , method = 'minkowski', p = k) # by specifying the 'p' parameter 288 | res_dif = knn.index.dist(data = X, k = k , method = 'minkowski', p = 1) # 'p' is set to 1 289 | 290 | is_identical = identical(res_wo, res_w) 291 | is_not_identical = identical(res_wo, res_dif) 292 | 293 | testthat::expect_true(is_identical & (!is_not_identical)) 294 | }) 295 | 296 | 297 | #============================================================================================================================================================== 298 | -------------------------------------------------------------------------------- /vignettes/image_classification_using_MNIST_CIFAR_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Image classification of the MNIST and CIFAR-10 data using KernelKnn and HOG (histogram of oriented gradients)" 3 | author: "Lampros Mouselimis" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Image classification of the MNIST and CIFAR-10 data using KernelKnn and HOG (histogram of oriented gradients)} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | 13 | In this vignette I'll illustrate how to increase the accuracy on the MNIST (to approx. 
98.4%) and CIFAR-10 data (to approx. 58.3%) using the KernelKnn package and HOG (histogram of oriented gradients). 14 |
15 | 16 | ### dependencies 17 | 18 | * The MNIST and CIFAR-10 data sets can be downloaded from Github using **system("wget https://raw.githubusercontent.com/mlampros/DataSets/master/mnist.zip")** and **system("wget https://raw.githubusercontent.com/mlampros/DataSets/master/cifar_10.zip")** 19 | * the **irlba** package, which is needed for comparison purposes, can be installed from CRAN directly 20 | * the *HOG_apply* function is part of the **OpenImageR** package, which can be installed from CRAN as well. 21 | 22 | 23 | ### MNIST data set 24 | 25 | The MNIST data set of handwritten digits consists of 70,000 examples in total, and each row of the matrix corresponds to a 28 x 28 image. The unique values of the response variable *y* range from 0 to 9. More information about the data can be found in the *DataSets* repository (the folder also includes an Rmarkdown file). 26 | 27 | 28 | ```{r, eval = F, echo = T, warning = F, message = F, cache = T} 29 | 30 | # using system('wget..') on a linux OS 31 | 32 | system("wget https://raw.githubusercontent.com/mlampros/DataSets/master/mnist.zip") 33 | 34 | mnist <- read.table(unz("mnist.zip", "mnist.csv"), nrows = 70000, header = T, 35 | 36 | quote = "\"", sep = ",") 37 | 38 | ``` 39 | 40 | 41 | ```{r, eval = F, cache = T} 42 | X = mnist[, -ncol(mnist)] 43 | dim(X) 44 | 45 | ## [1] 70000 784 46 | 47 | # the KernelKnn function requires that the labels are numeric and start from 1 : Inf 48 | 49 | y = mnist[, ncol(mnist)] + 1 50 | table(y) 51 | 52 | ## y 53 | ## 1 2 3 4 5 6 7 8 9 10 54 | ## 6903 7877 6990 7141 6824 6313 6876 7293 6825 6958 55 | 56 | ``` 57 |
58 | 59 | K nearest neighbors do not perform well in high dimensions due to the *curse of dimensionality* (k observations that are nearest to a given test observation x1 may be very far away from x1 in p-dimensional space when p is large [ An introduction to statistical learning, James/Witten/Hastie/Tibshirani, pages 108-109 ]), leading to a very poor k-nearest-neighbors fit. One option to overcome this problem would be to use truncated svd (irlba package) to reduce the dimensions of the data. A second option, which is appropriate in case of images, would be to use image descriptors. In this vignette, I'll compare those two approaches.

60 | 61 | ##### KernelKnnCV using truncated svd 62 | 63 | 64 | I experimented with different settings and the following parameters gave the best results, 65 |

66 | 67 | ```{r, eval = T, echo = F} 68 | 69 | knitr::kable(data.frame(irlba_singular_vectors = 40, k = 8, method = 'braycurtis', kernel = 'biweight_tricube_MULT'), align = 'l') 70 | ``` 71 |
72 | 73 | ```{r, eval = F, cache = T} 74 | 75 | library(irlba) 76 | 77 | svd_irlb = irlba(as.matrix(X), nv = 40, nu = 40, verbose = F) # irlba truncated svd 78 | 79 | new_x = as.matrix(X) %*% svd_irlb$v # new_x using the 40 right singular vectors 80 | 81 | ``` 82 |
83 | 84 | ```{r, eval = F, cache = T, warning = FALSE, message = FALSE, results = 'hide'} 85 | 86 | library(KernelKnn) 87 | 88 | fit = KernelKnnCV(as.matrix(new_x), y, k = 8, folds = 4, method = 'braycurtis', 89 | 90 | weights_function = 'biweight_tricube_MULT', regression = F, 91 | 92 | threads = 6, Levels = sort(unique(y))) 93 | 94 | 95 | # str(fit) 96 | 97 | 98 | # evaluation metric 99 | 100 | acc = function (y_true, preds) { 101 | 102 | out = table(y_true, max.col(preds, ties.method = "random")) 103 | 104 | acc = sum(diag(out))/sum(out) 105 | 106 | acc 107 | } 108 | 109 | ``` 110 |
111 | 112 | ```{r, eval = F, cache = F} 113 | 114 | acc_fit = unlist(lapply(1:length(fit$preds), 115 | 116 | function(x) acc(y[fit$folds[[x]]], 117 | 118 | fit$preds[[x]]))) 119 | acc_fit 120 | 121 | ## [1] 0.9742857 0.9749143 0.9761143 0.9741143 122 | 123 | cat('mean accuracy using cross-validation :', mean(acc_fit), '\n') 124 | 125 | ## mean accuracy using cross-validation : 0.9748571 126 | 127 | ``` 128 |
129 | 130 | Utilizing truncated svd, a 4-fold cross-validation KernelKnn model gives a 97.48% accuracy. 131 |

132 | 133 | ##### KernelKnnCV and HOG (histogram of oriented gradients) 134 | 135 | In this chunk of code, besides KernelKnnCV I'll also use HOG. The histogram of oriented gradients (HOG) is a feature descriptor used in computer vision and image processing for the purpose of object detection. The technique counts occurrences of gradient orientation in localized portions of an image. This method is similar to that of edge orientation histograms, scale-invariant feature transform descriptors, and shape contexts, but differs in that it is computed on a dense grid of uniformly spaced cells and uses overlapping local contrast normalization for improved accuracy (Wikipedia). 136 |
137 | 138 | ```{r, eval = F, cache = T} 139 | 140 | library(OpenImageR) 141 | 142 | hog = HOG_apply(X, cells = 6, orientations = 9, rows = 28, columns = 28, threads = 6) 143 | 144 | ## 145 | ## time to complete : 1.802997 secs 146 | 147 | dim(hog) 148 | 149 | ## [1] 70000 324 150 | 151 | ``` 152 |
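The 324 columns match the chosen settings : a 6 x 6 grid of cells with 9 orientation bins per cell gives 6 * 6 * 9 = 324 features per image.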
153 | 154 | ```{r, eval = F, cache = T, warning = FALSE, message = FALSE, results = 'hide'} 155 | 156 | fit_hog = KernelKnnCV(hog, y, k = 20, folds = 4, method = 'braycurtis', 157 | 158 | weights_function = 'biweight_tricube_MULT', regression = F, 159 | 160 | threads = 6, Levels = sort(unique(y))) 161 | 162 | 163 | #str(fit_hog) 164 | 165 | ``` 166 |
167 | 168 | ```{r, eval = F, cache = F} 169 | 170 | acc_fit_hog = unlist(lapply(1:length(fit_hog$preds), 171 | 172 | function(x) acc(y[fit_hog$folds[[x]]], 173 | 174 | fit_hog$preds[[x]]))) 175 | acc_fit_hog 176 | 177 | ## [1] 0.9833714 0.9840571 0.9846857 0.9838857 178 | 179 | cat('mean accuracy for hog-features using cross-validation :', mean(acc_fit_hog), '\n') 180 | 181 | ## mean accuracy for hog-features using cross-validation : 0.984 182 | 183 | ``` 184 |
185 | 186 | By changing from the simple svd-features to HOG-features the accuracy of a 4-fold cross-validation model increased from 97.48% to 98.4% (approx. 1% difference). 187 | 188 | 189 | 190 | ### CIFAR-10 data set 191 | 192 | 193 | CIFAR-10 is an established computer-vision dataset used for object recognition. The data I'll use in this example is a subset of the 80 million tiny images dataset and consists of 60,000 32x32 color images containing one of 10 object classes ( 6000 images per class ). Furthermore, the data were converted from RGB to gray, normalized and rounded to 2 decimal places (to reduce the storage size). More information about the data can be found in my *DataSets* repository (I included an Rmarkdown file). 194 |

195 | 196 | I'll build the kernel k-nearest-neighbors models in the same way I've done for the mnist data set and then I'll compare the results. 197 | 198 | 199 | ```{r, eval = F, echo = T, warning = F, message = F, cache = T} 200 | 201 | # using system('wget..') on a linux OS 202 | 203 | system("wget https://raw.githubusercontent.com/mlampros/DataSets/master/cifar_10.zip") 204 | 205 | cifar_10 <- read.table(unz("cifar_10.zip", "cifar_10.csv"), nrows = 60000, header = T, 206 | 207 | quote = "\"", sep = ",") 208 | 209 | ``` 210 | 211 | 212 | ##### KernelKnnCV using truncated svd 213 | 214 | 215 | ```{r, eval = F, cache = T} 216 | X = cifar_10[, -ncol(cifar_10)] 217 | dim(X) 218 | 219 | ## [1] 60000 1024 220 | 221 | # the KernelKnn function requires that the labels are numeric and start from 1 : Inf 222 | 223 | y = cifar_10[, ncol(cifar_10)] 224 | table(y) 225 | 226 | ## y 227 | ## 1 2 3 4 5 6 7 8 9 10 228 | ## 6000 6000 6000 6000 6000 6000 6000 6000 6000 6000 229 | 230 | ``` 231 |
232 | 233 | 234 | The parameter settings are similar to those for the MNIST data, 235 | 236 | ```{r, eval = T, echo = F} 237 | 238 | knitr::kable(data.frame(irlba_singular_vectors = 40, k = 8, method = 'braycurtis', 239 | 240 | kernel = 'biweight_tricube_MULT'), align = 'l') 241 | ``` 242 |
243 | 244 | ```{r, eval = F, cache = T} 245 | 246 | svd_irlb = irlba(as.matrix(X), nv = 40, nu = 40, verbose = F) # irlba truncated svd 247 | 248 | new_x = as.matrix(X) %*% svd_irlb$v # new_x using the 40 right singular vectors 249 | 250 | ``` 251 |
252 | 253 | ```{r, eval = F, cache = T, warning = FALSE, message = FALSE, results = 'hide'} 254 | 255 | fit = KernelKnnCV(as.matrix(new_x), y, k = 8, folds = 4, method = 'braycurtis', 256 | 257 | weights_function = 'biweight_tricube_MULT', regression = F, 258 | 259 | threads = 6, Levels = sort(unique(y))) 260 | 261 | 262 | # str(fit) 263 | 264 | ``` 265 |
266 | 267 | ```{r, eval = F, cache = F} 268 | 269 | acc_fit = unlist(lapply(1:length(fit$preds), 270 | 271 | function(x) acc(y[fit$folds[[x]]], 272 | 273 | fit$preds[[x]]))) 274 | acc_fit 275 | 276 | ## [1] 0.4080667 0.4097333 0.4040000 0.4102667 277 | 278 | cat('mean accuracy using cross-validation :', mean(acc_fit), '\n') 279 | 280 | ## mean accuracy using cross-validation : 0.4080167 281 | 282 | ``` 283 |
284 | 285 | The accuracy of a 4-fold cross-validation model using truncated svd is 40.8%. 286 | 287 | ##### KernelKnnCV using HOG (histogram of oriented gradients) 288 | 289 |
290 | Next, I'll run the KernelKnnCV using the HOG-descriptors, 291 |

292 | 293 | 294 | ```{r, eval = F, cache = T} 295 | 296 | hog = HOG_apply(X, cells = 6, orientations = 9, rows = 32, 297 | 298 | columns = 32, threads = 6) 299 | 300 | ## 301 | ## time to complete : 3.394621 secs 302 | 303 | dim(hog) 304 | 305 | ## [1] 60000 324 306 | 307 | ``` 308 |
309 | 310 | ```{r, eval = F, cache = T, warning = FALSE, message = FALSE, results = 'hide'} 311 | 312 | fit_hog = KernelKnnCV(hog, y, k = 20, folds = 4, method = 'braycurtis', 313 | 314 | weights_function = 'biweight_tricube_MULT', regression = F, 315 | 316 | threads = 6, Levels = sort(unique(y))) 317 | 318 | 319 | # str(fit_hog) 320 | 321 | ``` 322 |
323 | 324 | ```{r, eval = F, cache = F} 325 | 326 | acc_fit_hog = unlist(lapply(1:length(fit_hog$preds), 327 | 328 | function(x) acc(y[fit_hog$folds[[x]]], 329 | 330 | fit_hog$preds[[x]]))) 331 | acc_fit_hog 332 | 333 | ## [1] 0.5807333 0.5884000 0.5777333 0.5861333 334 | 335 | cat('mean accuracy for hog-features using cross-validation :', mean(acc_fit_hog), '\n') 336 | 337 | ## mean accuracy for hog-features using cross-validation : 0.58325 338 | 339 | ``` 340 |
341 | 342 | By using hog-descriptors in a 4-fold cross-validation model the accuracy in the cifar-10 data increases from 40.8% to 58.3% (approx. 17.5% difference). 343 | 344 | 345 | ```{r, eval = F, echo = F} 346 | 347 | # remove cache and mnist.zip once vignettes are built 348 | 349 | # unlink("image_classification_using_MNIST_CIFAR_data_cache", recursive = TRUE) # USE this chunk in case of 'eval = TRUE' 350 | # unlink("mnist.zip", recursive = TRUE) 351 | # unlink("cifar_10.zip", recursive = TRUE) 352 | ``` 353 | -------------------------------------------------------------------------------- /tests/testthat/test-distmat_kernelknn.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | context("kernel knn with distance matrix") 4 | 5 | #================= 6 | # Error handling 7 | #================= 8 | 9 | 10 | testthat::test_that("it returns an error if the input distance object is not of type matrix", { 11 | 12 | tmp_df = as.data.frame(DIST_mat) 13 | 14 | testthat::expect_error(distMat.KernelKnn(tmp_df, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 15 | }) 16 | 17 | 18 | testthat::test_that("it returns an error if the input distance matrix is not square", { 19 | 20 | testthat::expect_error(distMat.KernelKnn(DIST_mat[, -ncol(DIST_mat)], TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 21 | }) 22 | 23 | 24 | testthat::test_that("it returns an error if the diagonal of the distance matrix is other than 0's or NA's", { 25 | 26 | TMP_DIAG = DIST_mat 27 | 28 | diag(TMP_DIAG) = -1 29 | 30 | testthat::expect_error(distMat.KernelKnn(TMP_DIAG, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 31 | }) 32 | 33 | 34 | testthat::test_that("it returns an error if the TEST_indices parameter is not of type numeric or integer", { 35 | 36 | invalid_tst_idx = letters[1:100] 37 | 38 | testthat::expect_error(distMat.KernelKnn(DIST_mat, TEST_indices = invalid_tst_idx, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 39 | }) 40 | 41 | 42 | testthat::test_that("it returns an error if the maximum index of the TEST_indices parameter is greater than the rows of the distance matrix", { 43 | 44 | invalid_tst_idx = 1:(nrow(DIST_mat) + 10) 45 | 46 | invalid_tst_idx = (nrow(DIST_mat)-100):length(invalid_tst_idx) 47 | 48 | testthat::expect_error(distMat.KernelKnn(DIST_mat, TEST_indices = invalid_tst_idx, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T)) 49 | }) 50 | 51 | testthat::test_that("it returns an error if k is NULL", { 52 | 53 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = NULL, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 54 | }) 55 | 56 | 57 | testthat::test_that("it returns an error if k is a character", { 58 | 59 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = "invalid", h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 60 | }) 61 | 62 | 63 | testthat::test_that("it returns an error if k is greater or equal to the number of rows of the distance matrix", { 64 | 65 | testthat::expect_error( 
distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = nrow(DIST_mat), h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 66 | }) 67 | 68 | 69 | testthat::test_that("it returns an error if k is less than 1", { 70 | 71 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = -1, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 72 | }) 73 | 74 | 75 | testthat::test_that("it returns a warning if k is a float", { 76 | 77 | testthat::expect_warning( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 1.5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 78 | }) 79 | 80 | 81 | testthat::test_that("it returns an error if h = 0", { 82 | 83 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 0.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 84 | }) 85 | 86 | 87 | testthat::test_that("it returns an error if y is NULL", { 88 | 89 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y = NULL, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 90 | }) 91 | 92 | 93 | testthat::test_that("it returns an error if y is not numeric", { 94 | 95 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y = list(y), k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 96 | }) 97 | 98 | 99 | testthat::test_that("it returns an error if regression = F and the Levels = NULL", { 100 | 101 | testthat::expect_error( distMat.KernelKnn(DIST_mat_class, TEST_indices = NULL, as.numeric(y1_class_ext), k = 5, h = 1.0, weights_function = NULL, regression = F, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 102 | }) 103 | 104 | 105 | testthat::test_that("it returns an error if regression = F and there are unique labels less than 1", { 106 | 107 | testthat::expect_error( distMat.KernelKnn(DIST_mat_class, TEST_indices = NULL, as.numeric(y1_class_ext) - 1, k = 5, h = 1.0, weights_function = NULL, regression = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class_ext)), minimize = T) ) 108 | }) 109 | 110 | 111 | testthat::test_that("it returns an error if missing values are present in the data", { 112 | 113 | tmp_dat = DIST_mat 114 | tmp_dat[2,1] = NA 115 | 116 | testthat::expect_error( distMat.KernelKnn(tmp_dat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 117 | }) 118 | 119 | 120 | testthat::test_that("it returns an error if missing values are present in the response variable", { 121 | 122 | tmp_dat = y 123 | tmp_dat[1] = NA 124 | 125 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, tmp_dat, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 126 | }) 127 | 128 | 129 | testthat::test_that("it returns an error if the length of y is not equal to the number of rows of the distance matrix", { 130 | 131 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y[1:100], k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 132 | }) 133 | 134 | 135 | testthat::test_that("it returns an error if k is less than 3 and 
extrema = TRUE", { 136 | 137 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 3, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = T, Levels = NULL, minimize = T) ) 138 | }) 139 | 140 | 141 | testthat::test_that("it returns an error if the minimize parameter is not a boolean", { 142 | 143 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = 'T') ) 144 | }) 145 | 146 | 147 | testthat::test_that("it returns error if the weights function is invalid for regression = T, if TEST_indices is NULL", { 148 | 149 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = list(), regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 150 | }) 151 | 152 | 153 | testthat::test_that("it returns error if the weights function is invalid for regression = F, if TEST_indices is NULL", { 154 | 155 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = matrix(,0,0), regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 156 | }) 157 | 158 | 159 | testthat::test_that("it returns error if the weights function is invalid for regression = F, TEST_indices is NOT NULL", { 160 | 161 | testthat::expect_error(distMat.KernelKnn(DIST_mat_class, TEST_indices = 1:100, as.numeric(y1_class_ext), k = 5, h = 1.0, weights_function = data.frame(matrix(,0,0)), regression = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class_ext)), minimize = T)) 162 | }) 163 | 164 | 165 | testthat::test_that("it returns error if the weights function is invalid for regression = T, TEST_indices is NOT NULL", { 166 | 167 | testthat::expect_error( distMat.KernelKnn(DIST_mat, TEST_indices = 1:100, y, k = 5, h = 1.0, weights_function = as.factor(1:10), regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) ) 168 | }) 169 | 170 | 171 | #===================== 172 | # testing of KernelKnn 173 | #===================== 174 | 175 | 176 | testthat::test_that("if TEST_indices = NULL, if regression = T the function returns the correct output", { 177 | 178 | res = distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 179 | 180 | testthat::expect_true(length(res) == nrow(DIST_mat)) 181 | }) 182 | 183 | 184 | testthat::test_that("if TEST_indices = NULL, if regression = T, if extrema = T, the function returns the correct output", { 185 | 186 | res = distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = T, Levels = NULL, minimize = T) 187 | 188 | testthat::expect_true(length(res) == nrow(DIST_mat)) 189 | }) 190 | 191 | 192 | testthat::test_that("if TEST_indices = NULL, if regression = T, if the diagonal of the distance matrix is filled with NA's the function returns the correct output", { 193 | 194 | tmp_dist = DIST_mat 195 | 196 | diag(tmp_dist) = NA 197 | 198 | res = distMat.KernelKnn(tmp_dist, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 199 | 200 | testthat::expect_true(length(res) == nrow(tmp_dist)) 201 | }) 202 | 203 | 204 | testthat::test_that("if TEST_indices is not NULL, if regression = T the function returns the correct 
output", { 205 | 206 | tmp_idx = (nrow(DIST_mat) - 100): nrow(DIST_mat) 207 | 208 | res = distMat.KernelKnn(DIST_mat, TEST_indices = tmp_idx, y, k = 5, h = 1.0, weights_function = NULL, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 209 | 210 | testthat::expect_true(length(res) == length(tmp_idx)) 211 | }) 212 | 213 | 214 | testthat::test_that("using either a kernel or a user-defined-kernel-function (here in both cases a 'uniform' kernel), IF regression = T, returns the same result", { 215 | 216 | res_kernel = distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = 'uniform', regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 217 | 218 | uniform = function(W) { 219 | 220 | W = (1/2) * abs(W) 221 | 222 | W = W / rowSums(W) 223 | 224 | return(W) 225 | } 226 | 227 | res_kernel_function = distMat.KernelKnn(DIST_mat, TEST_indices = NULL, y, k = 5, h = 1.0, weights_function = uniform, regression = T, threads = 1, extrema = F, Levels = NULL, minimize = T) 228 | 229 | df = data.frame(first_case = res_kernel, sec_case = res_kernel_function) 230 | 231 | df$difference_of_results = round(df$first_case - df$sec_case, 3) # difference of results 232 | 233 | testthat::expect_true(sum(df$difference_of_results) == 0.0) 234 | }) 235 | 236 | 237 | 238 | testthat::test_that("using either a kernel or a user-defined-kernel-function (here in both cases a 'uniform' kernel) returns the same result, IF regression = F, wenn TEST_indices is not NULL", { 239 | 240 | idxs = (nrow(DIST_mat_class) - 100): nrow(DIST_mat_class) 241 | 242 | res_kernel = distMat.KernelKnn(DIST_mat_class, TEST_indices = idxs, as.numeric(y1_class_ext), k = 5, h = 1.0, weights_function = 'uniform', regression = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class_ext)), minimize = T) 243 | 244 | uniform = function(W) { 245 | 246 | W = (1/2) * abs(W) 247 | 248 | W = W / rowSums(W) 249 | 250 | return(W) 251 | } 252 | 253 | res_kernel_function = distMat.KernelKnn(DIST_mat_class, TEST_indices = idxs, as.numeric(y1_class_ext), k = 5, h = 1.0, weights_function = uniform, regression = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class_ext)), minimize = T) 254 | 255 | df_col1 = data.frame(first_case = res_kernel[, 1], sec_case = res_kernel_function[, 1]) 256 | 257 | df_col2 = data.frame(first_case = res_kernel[, 2], sec_case = res_kernel_function[, 2]) 258 | 259 | df_col1$difference_of_results = round(df_col1$first_case - df_col1$sec_case, 3) # difference of results in 'col1' 260 | 261 | df_col2$difference_of_results = round(df_col2$first_case - df_col2$sec_case, 3) # difference of results in 'col2' 262 | 263 | testthat::expect_true(sum(df_col1$difference_of_results) == 0.0 && sum(df_col2$difference_of_results) == 0.0) 264 | }) 265 | 266 | -------------------------------------------------------------------------------- /src/distance_metrics.cpp: -------------------------------------------------------------------------------- 1 | # define ARMA_WARN_LEVEL 0 2 | # include 3 | // [[Rcpp::depends("RcppArmadillo")]] 4 | // [[Rcpp::plugins(openmp)]] 5 | 6 | 7 | #ifdef _OPENMP 8 | #include 9 | #endif 10 | 11 | 12 | 13 | // return data struct 14 | // 15 | 16 | struct return_data { 17 | 18 | arma::mat knn_idx; 19 | 20 | arma::mat knn_dist; 21 | }; 22 | 23 | 24 | 25 | // kernelknn-class 26 | // 27 | 28 | 29 | class kernelKnn { 30 | 31 | private: 32 | 33 | arma::mat knn_indices; 34 | 35 | arma::mat knn_distances; 36 | 37 | public: 38 | 39 | kernelKnn() { } 40 | 41 | 
42 | // inner loop [ no difference between threaded and non-threaded version for the 'inner_loop' method (parallelization) ] 43 | // 44 | 45 | arma::rowvec inner_loop(arma::mat& MATRIX_1st, arma::mat& MATRIX_2nd, int i, int ITERS, int k, std::string& method, arma::mat& cov_mat, double p, double eps = 1.0e-6) { 46 | 47 | 48 | arma::rowvec tmp_out = arma::zeros<arma::rowvec>(ITERS); 49 | 50 | 51 | for (int j = 0; j < ITERS; j++) { // http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html 52 | 53 | double tmp_idx; 54 | 55 | if (method == "euclidean") { 56 | 57 | tmp_idx = std::sqrt(arma::as_scalar(arma::accu(arma::square((MATRIX_1st.row(j) - MATRIX_2nd.row(i)))))); 58 | } 59 | 60 | else if (method == "manhattan") { 61 | 62 | tmp_idx = arma::as_scalar(arma::accu(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i))))); 63 | } 64 | 65 | else if (method == "chebyshev") { 66 | 67 | tmp_idx = arma::as_scalar(max(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i))))); 68 | } 69 | 70 | else if (method == "canberra") { 71 | 72 | tmp_idx = arma::as_scalar(arma::accu(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i)) + eps)/(arma::abs(MATRIX_1st.row(j)) + arma::abs(MATRIX_2nd.row(i)) + eps))); // added 1.0e-6 otherwise rstudio crashes 73 | } 74 | 75 | else if (method == "braycurtis") { 76 | 77 | tmp_idx = arma::as_scalar(arma::accu(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i))))/(arma::accu(arma::abs(MATRIX_1st.row(j))) + arma::accu(arma::abs(MATRIX_2nd.row(i))))); 78 | } 79 | 80 | else if (method == "pearson_correlation") { 81 | 82 | tmp_idx = arma::as_scalar(1.0 - arma::cor(MATRIX_1st.row(j), MATRIX_2nd.row(i))); 83 | } 84 | 85 | else if (method == "simple_matching_coefficient") { 86 | 87 | double a = eps; 88 | double d = eps; 89 | 90 | for (unsigned int t = 0; t < MATRIX_1st.row(j).n_elem; t++) { 91 | 92 | if (MATRIX_1st.row(j)(t) == 1 && MATRIX_2nd.row(i)(t) == 1) { 93 | 94 | a += 1.0;} 95 | 96 | if (MATRIX_1st.row(j)(t) == 0 && MATRIX_2nd.row(i)(t) == 0) { 97 | 98 | d += 1.0; 99 | } 100 | } 101 | 102 | tmp_idx = 1.0 - ((a + d) / MATRIX_1st.row(j).n_elem); 103 | } 104 | 105 | else if (method == "minkowski") { // by default the order of the minkowski parameter equals k 106 | 107 | tmp_idx = std::pow(arma::as_scalar(arma::accu(arma::pow(arma::abs((MATRIX_1st.row(j) - MATRIX_2nd.row(i))), p))), 1.0/p); 108 | } 109 | 110 | else if (method == "hamming") { // for binary data 111 | 112 | tmp_idx = arma::as_scalar(accu(MATRIX_1st.row(j) != MATRIX_2nd.row(i))/(MATRIX_1st.row(j).n_elem * 1.0)); 113 | } 114 | 115 | else if (method == "mahalanobis") { // first create covariance matrix from data 116 | 117 | tmp_idx = arma::as_scalar(std::sqrt(arma::as_scalar(((MATRIX_1st.row(j) - MATRIX_2nd.row(i)) * cov_mat) * (MATRIX_1st.row(j) - MATRIX_2nd.row(i)).t()))); 118 | } 119 | 120 | else if (method == "jaccard_coefficient") { // for binary data 121 | 122 | double a = eps; 123 | double b = eps; 124 | double c = eps; 125 | 126 | for (unsigned int t = 0; t < MATRIX_1st.row(j).n_elem; t++) { 127 | 128 | if (MATRIX_1st.row(j)(t) == 1 && MATRIX_2nd.row(i)(t) == 1) { 129 | 130 | a += 1.0;} 131 | 132 | if (MATRIX_1st.row(j)(t) == 1 && MATRIX_2nd.row(i)(t) == 0) { 133 | 134 | b += 1.0;} 135 | 136 | if (MATRIX_1st.row(j)(t) == 0 && MATRIX_2nd.row(i)(t) == 1) { 137 | 138 | c += 1.0; 139 | } 140 | } 141 | 142 | tmp_idx = 1.0 - (a / (a + b + c)); 143 | } 144 | 145 | else if (method == "Rao_coefficient") { // for binary data 146 | 147 | double a = eps; 148 | 149 | for (unsigned int t = 0; t < MATRIX_1st.row(j).n_elem; t++) { 150 | 151 | if (MATRIX_1st.row(j)(t) == 1 && MATRIX_2nd.row(i)(t) == 1) { 152 | 153 | a += 1.0; 154 | } 155 | } 156 | 157 | tmp_idx = 1.0 - (a / MATRIX_1st.row(j).n_elem); 158 | } 159 | 160 | else { 161 | 162 | tmp_idx = 0; // default = 0; create exceptions in R, so that tmp_idx is never 0; 163 | } 164 | 165 | if ( tmp_idx != tmp_idx ) { // handling of NAs, if NaN then distance 1.0 [ NaN will compare false to everything, including itself ], http://stackoverflow.com/questions/11569337/using-an-if-statement-to-switch-nan-values-in-an-array-to-0-0] 166 | 167 | tmp_out(j) = 1.0; 168 | } 169 | 170 | else { 171 | 172 | tmp_out(j) = tmp_idx; 173 | } 174 | } 175 | 176 | return tmp_out; 177 | } 178 | 179 | 180 | // calculate the 'inverse' AND in case of exception the 'Moore-Penrose pseudo-inverse' of the covariance matrix FOR the 'mahalanobis' distance 181 | // https://github.com/mlampros/KernelKnn/issues/1 182 | // 183 | 184 | arma::mat INV_EXC(arma::mat cov_data) { 185 | 186 | arma::mat inv_tmp; 187 | 188 | try { 189 | 190 | inv_tmp = arma::inv(arma::cov(cov_data)); 191 | } 192 | 193 | catch(...) { 194 | 195 | Rcpp::warning("the input matrix seems singular. The Moore-Penrose pseudo-inverse of the covariance matrix will be calculated"); 196 | } 197 | 198 | if (inv_tmp.empty()) { 199 | 200 | inv_tmp = arma::pinv(arma::cov(cov_data)); 201 | } 202 | 203 | return inv_tmp; 204 | } 205 | 206 | 207 | // secondary function for the 'train_only' and 'test_only' methods [ due to ASAN errors ] 208 | // 209 | 210 | arma::field<arma::rowvec> inner_field_func(arma::mat& MATRIX, arma::mat& MATRIX1, int i, int ITERS, int k, std::string& method, arma::mat& cov_mat, double p, double eps = 1.0e-6) { 211 | 212 | arma::rowvec tmp_out = inner_loop(MATRIX, MATRIX1, i, ITERS, k, method, cov_mat, p, eps); 213 | 214 | arma::uvec index_out = arma::sort_index(tmp_out, "ascend"); 215 | 216 | arma::field<arma::rowvec> F(2,1); 217 | 218 | F(0,0) = tmp_out; 219 | 220 | F(1,0) = arma::conv_to< arma::rowvec >::from(index_out); 221 | 222 | return F; 223 | } 224 | 225 | 226 | // train-data-input-only 227 | // 228 | 229 | void train_only(arma::mat& MATRIX, int k, std::string& method, int threads, double p, double eps = 1.0e-6) { 230 | 231 | #ifdef _OPENMP 232 | omp_set_num_threads(threads); 233 | #endif 234 | 235 | unsigned int COLS = MATRIX.n_cols; 236 | 237 | arma::mat cov_mat(COLS, COLS); 238 | 239 | if (method == "mahalanobis") { 240 | 241 | cov_mat = INV_EXC(MATRIX); 242 | } 243 | 244 | int ITERS = MATRIX.n_rows; 245 | 246 | knn_indices.set_size(ITERS, k); // loop to calculate the distances for the TRAIN data [ MATRIX ] 247 | 248 | knn_distances.set_size(ITERS, k); 249 | 250 | int i,f; 251 | 252 | #ifdef _OPENMP 253 | #pragma omp parallel for schedule(static) shared(ITERS, p, eps, cov_mat, method, k, MATRIX) private(i, f) 254 | #endif 255 | for (i = 0; i < ITERS; i++) { 256 | 257 | arma::field<arma::rowvec> unl_field = inner_field_func(MATRIX, MATRIX, i, ITERS, k, method, cov_mat, p, eps); 258 | 259 | for (f = 1; f < k + 1; f++) { 260 | 261 | int IDX_subset_iter = unl_field(1,0)(f); 262 | 263 | #ifdef _OPENMP 264 | #pragma omp atomic write 265 | #endif 266 | knn_indices(i,f-1) = IDX_subset_iter + 1; // 'knn_indices', 'knn_distances' : class-members (not variables) are 'shared' by default when they aren't present in the clauses [ see comment https://stackoverflow.com/questions/5891641/data-members-in-an-openmp-loop ] 267 | 268 | #ifdef _OPENMP 269 | #pragma omp atomic write 270 | #endif 271 | knn_distances(i,f-1) = unl_field(0,0)(IDX_subset_iter); 272 | } 273 | } 274 | } 275 | 276 | 277 | // test-data-only 278 | // 279 | 280 | void test_only(arma::mat& MATRIX, arma::mat& TEST_DATA, int k, std::string& method, int threads, double p, double eps = 1.0e-6) { 281 | 282 | #ifdef _OPENMP 283 | omp_set_num_threads(threads); 284 | #endif 285 | 286 | unsigned int COLS = MATRIX.n_cols; 287 | 288 | arma::mat cov_mat(COLS, COLS); 289 | 290 | if (method == "mahalanobis") { 291 | 292 | cov_mat = INV_EXC(arma::join_vert(MATRIX, TEST_DATA)); 293 | } 294 | 295 | int ITERS_TEST = TEST_DATA.n_rows; 296 | 297 | int ITERS_TRAIN = MATRIX.n_rows; 298 | 299 | knn_indices.set_size(ITERS_TEST, k); 300 | 301 | knn_distances.set_size(ITERS_TEST, k); 302 | 303 | int i,f; 304 | 305 | #ifdef _OPENMP 306 | #pragma omp parallel for schedule(static) shared(ITERS_TEST, p, eps, cov_mat, threads, method, k, ITERS_TRAIN, TEST_DATA, MATRIX) private(i, f) 307 | #endif 308 | for (i = 0; i < ITERS_TEST; i++) { 309 | 310 | arma::field<arma::rowvec> unl_field = inner_field_func(MATRIX, TEST_DATA, i, ITERS_TRAIN, k, method, cov_mat, p, eps); 311 | 312 | for (f = 0; f < k; f++) { 313 | 314 | int IDX_subset_iter = unl_field(1,0)(f); 315 | 316 | #ifdef _OPENMP 317 | #pragma omp atomic write 318 | #endif 319 | knn_indices(i,f) = IDX_subset_iter + 1; 320 | 321 | #ifdef _OPENMP 322 | #pragma omp atomic write 323 | #endif 324 | knn_distances(i,f) = unl_field(0,0)(IDX_subset_iter); 325 | } 326 | } 327 | } 328 | 329 | 330 | // secondary function for the 'input_dist_mat' [ due to ASAN errors ] 331 | // 332 | 333 | arma::field<arma::rowvec> inner_dist_field(arma::mat& DIST_MAT, bool idx_openmp_flag, arma::uvec& test_idx, double min_sort, unsigned int i, const char *asc_des) { 334 | 335 | arma::rowvec tmp_row = DIST_MAT.row(i); // take each row IF test-indices NOT Null AND IF minimize = true THEN fill the test-indices with A MAXIMUM VALUE (so that sort_index does not pick one of the test-indices) IF minimize = false THEN fill the test-indices with a MINIMUM VALUE 336 | 337 | if (idx_openmp_flag) { 338 | 339 | tmp_row(test_idx).fill(min_sort); // another option would be to use erase for the 'test_idx' (remove the 'test_idx' from 'tmp_row') so that the for-loop can run faster, however I won't be able to calculate the sorted indices correctly (especially in case of cross-validation) 340 | } 341 | 342 | arma::uvec index_out = arma::sort_index(tmp_row, asc_des); 343 | 344 | arma::field<arma::rowvec> F(2,1); 345 | 346 | F(0,0) = tmp_row; 347 | 348 | F(1,0) = arma::conv_to< arma::rowvec >::from(index_out); 349 | 350 | return F; 351 | } 352 | 353 | 354 | // input data is a distance matrix 355 | // 356 | 357 | void input_dist_mat(arma::mat DIST_MAT_input, Rcpp::Nullable<Rcpp::NumericVector> TEST_IDX = R_NilValue, bool is_min = true, int k = 5, int threads = 1) { 358 | 359 | #ifdef _OPENMP 360 | omp_set_num_threads(threads); 361 | #endif 362 | 363 | arma::mat& DIST_MAT = DIST_MAT_input; // copy initial data (by reference), because I'll modify it 364 | 365 | arma::uvec test_idx; 366 | 367 | bool idx_openmp_flag = false; 368 | 369 | if (TEST_IDX.isNotNull()) { 370 | 371 | idx_openmp_flag = true; 372 | 373 | test_idx = Rcpp::as<arma::uvec>(TEST_IDX); 374 | 375 | test_idx = test_idx - 1; // adjust the indices to c++ indexing (which begins from 0) 376 | 377 | DIST_MAT = DIST_MAT.rows(test_idx); // overwrite the DIST_MAT using the test-indices (in case that TEST_IDX not NULL) 378 | } 379 | 380 | unsigned int ROWS = DIST_MAT.n_rows; 381 | 382 | const char *asc_des = is_min ? "ascend" : "descend"; 383 | 384 | double min_sort = arma::datum::inf; 385 | 386 | if (idx_openmp_flag) { 387 | 388 | min_sort = is_min ? arma::datum::inf : (-arma::datum::inf); // in case of minimization of distance (and TEST_IDX not NULL) assign the max. value to train-data (and the opposite if maximization). That way I "indirectly" omit the TEST_IDX data from sorted-distance-calculation 389 | } 390 | 391 | int start_idx = idx_openmp_flag ? 0 : 1; 392 | 393 | int end_idx = idx_openmp_flag ? k : (k + 1); 394 | 395 | knn_indices.set_size(ROWS, k); 396 | 397 | knn_distances.set_size(ROWS, k); 398 | 399 | unsigned int i; 400 | 401 | #ifdef _OPENMP 402 | #pragma omp parallel for schedule(static) shared(ROWS, DIST_MAT, idx_openmp_flag, min_sort, test_idx, asc_des, start_idx, end_idx) private(i) 403 | #endif 404 | for (i = 0; i < ROWS; i++) { 405 | 406 | arma::field<arma::rowvec> unl_field = inner_dist_field(DIST_MAT, idx_openmp_flag, test_idx, min_sort, i, asc_des); 407 | 408 | for (int f = start_idx; f < end_idx; f++) { 409 | 410 | int IDX_subset_iter = unl_field(1,0)(f); 411 | 412 | int app_idx = idx_openmp_flag ? f : f-1; 413 | 414 | #ifdef _OPENMP 415 | #pragma omp atomic write 416 | #endif 417 | knn_indices(i,app_idx) = IDX_subset_iter + 1; 418 | 419 | #ifdef _OPENMP 420 | #pragma omp atomic write 421 | #endif 422 | knn_distances(i,app_idx) = unl_field(0,0)(IDX_subset_iter); 423 | } 424 | } 425 | } 426 | 427 | 428 | // return data for either train or test 429 | // 430 | 431 | return_data return_train_test() { 432 | 433 | return { knn_indices, knn_distances }; 434 | } 435 | 436 | 437 | ~kernelKnn() { } 438 | }; 439 | 440 | 441 | 442 | 443 | //------------------------------------------Rcpp::export 444 | 445 | 446 | // kernel-knn (input : raw data) 447 | //------------------------------ 448 | 449 | // [[Rcpp::export]] 450 | Rcpp::List knn_index_dist_rcpp(arma::mat& MATRIX, arma::mat& TEST_DATA, int k, std::string& method, int threads, double p, double eps = 1.0e-6) { 451 | 452 | kernelKnn kn; 453 | 454 | std::string name_idx; 455 | 456 | std::string name_dist; 457 | 458 | if (TEST_DATA.is_empty()) { 459 | 460 | kn.train_only( MATRIX, k, method, threads, p, eps ); 461 | 462 | name_idx = "train_knn_idx"; 463 | 464 | name_dist = "train_knn_dist"; 465 | } 466 | 467 | if (!TEST_DATA.is_empty()) { 468 | 469 | kn.test_only( MATRIX, TEST_DATA, k, method, threads, p, eps ); 470 | 471 | name_idx = "test_knn_idx"; 472 | 473 | name_dist = "test_knn_dist"; 474 | } 475 | 476 | return_data dat = kn.return_train_test(); 477 | 478 | return Rcpp::List::create(Rcpp::Named(name_idx) = dat.knn_idx, Rcpp::Named(name_dist) = dat.knn_dist); 479 | } 480 | 481 | 482 | 483 | 484 | // kernel-knn (input : distance matrix) 485 | //------------------------------------- 486 | 487 | // [[Rcpp::export]] 488 | Rcpp::List DIST_MATRIX_knn(arma::mat& DIST_MAT, Rcpp::Nullable<Rcpp::NumericVector> TEST_IDX = R_NilValue, bool is_min = true, int k = 5, int threads = 1, bool rcpp_list_names = false) { 489 | 490 | kernelKnn dist_knn; 491 | 492 | dist_knn.input_dist_mat(DIST_MAT, TEST_IDX, is_min, k, threads); 493 | 494 | return_data dat = dist_knn.return_train_test(); 495 | 496 | std::string name_idx, name_dist; 497 | 498 | if (!rcpp_list_names) { // used in 'distMat.KernelKnn' 499 | 500 | name_idx = "knn_idx"; 501 | name_dist = "knn_dist";} 502 | 503 | else { 504 | 505 | if (TEST_IDX.isNull()) { // used in 'distMat.knn.index.dist' 506 | 507 | name_idx = "train_knn_idx"; 508 | name_dist = "train_knn_dist";} 509 | 510 | else { 511 | 512 | name_idx = "test_knn_idx"; 513
| name_dist = "test_knn_dist"; 514 | } 515 | } 516 | 517 | return Rcpp::List::create(Rcpp::Named(name_idx) = dat.knn_idx, Rcpp::Named(name_dist) = dat.knn_dist); 518 | } 519 | 520 | 521 | 522 | -------------------------------------------------------------------------------- /tests/testthat/test-kernelknn.R: -------------------------------------------------------------------------------- 1 | #============================================================================================================================================================== 2 | 3 | context("Kernel knn") 4 | 5 | 6 | #================= 7 | # Error handling 8 | #================= 9 | 10 | 11 | testthat::test_that("it returns an error if a factor variable is present in the data and the transf_categ_cols = FALSE", { 12 | 13 | tmp_dat = xtr 14 | 15 | tmp_dat$rad = as.factor(tmp_dat$rad) 16 | 17 | testthat::expect_error(KernelKnn(tmp_dat, TEST_data = xte, y1, k = 5, 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 18 | }) 19 | 20 | testthat::test_that("it returns an error if a character variable is present in the data and the transf_categ_cols = FALSE", { 21 | 22 | tmp_dat = xtr 23 | 24 | tmp_dat$rad = as.character(tmp_dat$rad) 25 | 26 | testthat::expect_error(KernelKnn(tmp_dat, TEST_data = xte, y1, k = 5, 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 27 | }) 28 | 29 | 30 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 31 | 32 | testthat::expect_error(KernelKnn(xtr, TEST_data = NULL, y1, k = 5, 1.0, method = 'simple_matching_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 33 | }) 34 | 35 | 36 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 37 | 38 | testthat::expect_error(KernelKnn(xtr, TEST_data = NULL, y1, k = 5, 1.0, method = 'jaccard_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 39 | }) 40 | 41 | 42 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is NULL", { 43 | 44 | testthat::expect_error(KernelKnn(xtr, TEST_data = NULL, y1, k = 5, 1.0, method = 'Rao_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 45 | }) 46 | 47 | 48 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 49 | 50 | testthat::expect_error(KernelKnn(xtr, TEST_data = xte, y1, k = 5, 1.0, method = 'simple_matching_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 51 | }) 52 | 53 | 54 | testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 55 | 56 | testthat::expect_error(KernelKnn(xtr, TEST_data = xte, y1, k = 5, 1.0, method = 'jaccard_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 57 | }) 58 | 59 | 60 | 
testthat::test_that("'simple_matching_coefficient', 'jaccard_coefficient' and 'Rao_coefficient' can work only with binary data, case TEST_data is not NULL", { 61 | 62 | testthat::expect_error(KernelKnn(xtr, TEST_data = xte, y1, k = 5, 1.0, method = 'Rao_coefficient', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 63 | }) 64 | 65 | 66 | testthat::test_that("it returns an error if k is NULL", { 67 | 68 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = NULL, 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 69 | }) 70 | 71 | 72 | testthat::test_that("it returns an error if k is a character", { 73 | 74 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 'invalid', 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 75 | }) 76 | 77 | 78 | testthat::test_that("it returns an error if k is greater or equal to the number of rows of the train data", { 79 | 80 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = nrow(xtr) , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 81 | }) 82 | 83 | 84 | testthat::test_that("it returns an error if k is less than 1", { 85 | 86 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = -1 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 87 | }) 88 | 89 | 90 | testthat::test_that("it returns a warning if k is a float", { 91 | 92 | testthat::expect_warning( KernelKnn(xtr, TEST_data = xte, y1, k = 1.5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 93 | }) 94 | 95 | 96 | testthat::test_that("it returns an error if h = 0", { 97 | 98 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 4 , 0.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 99 | }) 100 | 101 | testthat::test_that("it returns an error if the method is NULL", { 102 | 103 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = NULL, weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 104 | }) 105 | 106 | 107 | testthat::test_that("it returns an error if the method is not a character", { 108 | 109 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = 1, weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 110 | }) 111 | 112 | 113 | testthat::test_that("it returns an error if the method is a character, but not one of the valid names", { 114 | 115 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = 'invalid', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 116 | }) 117 | 118 | 119 | testthat::test_that("it returns an error if y is NULL", { 120 | 121 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, NULL, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 122 | }) 123 | 124 | 125 | testthat::test_that("it returns an error if y is not 
numeric", { 126 | 127 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, list(y1), k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 128 | }) 129 | 130 | 131 | testthat::test_that("it returns an error if regression = F and the Levels = NULL", { 132 | 133 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, as.numeric(y1_class), k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 134 | }) 135 | 136 | 137 | testthat::test_that("it returns an error if regression = F and there are unique labels less than 1", { 138 | 139 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, as.numeric(y1_class) - 1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = unique(as.numeric(y1_class))) ) 140 | }) 141 | 142 | 143 | testthat::test_that("it returns an error if missing values are present in the data", { 144 | 145 | tmp_dat = xtr 146 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 147 | 148 | testthat::expect_error( KernelKnn(tmp_dat, TEST_data = xte, y1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 149 | }) 150 | 151 | 152 | testthat::test_that("it returns an error if missing values are present in the response variable", { 153 | 154 | tmp_dat = y1 155 | tmp_dat[sample(1:length(tmp_dat), 10)] = NA 156 | 157 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, tmp_dat, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 158 | }) 159 | 160 | 161 | testthat::test_that("it returns an error if missing values are present in the TEST data", { 162 | 163 | tmp_dat = xte 164 | tmp_dat$crim[sample(1:length(tmp_dat$crim), 10)] = NA 165 | 166 | testthat::expect_error( KernelKnn(xtr, TEST_data = tmp_dat, y1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 167 | }) 168 | 169 | 170 | testthat::test_that("it returns an error if the length of y is not equal to the number of rows of the train data", { 171 | 172 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1[1:(length(y1)-10)], k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 173 | }) 174 | 175 | 176 | testthat::test_that("it returns an error if k is less than 3 and extrema = TRUE", { 177 | 178 | testthat::expect_error( KernelKnn(xtr, TEST_data = xte, y1, k = 3 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = T, Levels = NULL) ) 179 | }) 180 | 181 | 182 | testthat::test_that("if the number of columns in train and test data differ it returns an error", { 183 | 184 | tmp_xte = xte[, -ncol(xte)] 185 | 186 | testthat::expect_error( KernelKnn(xtr, TEST_data = tmp_xte, y1, k = 4 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) ) 187 | }) 188 | 189 | 190 | testthat::test_that("it returns error if the weights function is invalid for regression = T, if TEST_data is NULL", { 191 | 192 | testthat::expect_error(KernelKnn(xtr, TEST_data = NULL, y1, k = 5 , h = 1.0, 
method = 'euclidean', weights_function = list(), regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 193 | }) 194 | 195 | 196 | testthat::test_that("it returns error if the weights function is invalid for regression = F, if TEST_data is NULL", { 197 | 198 | testthat::expect_error(KernelKnn(xtr_class, TEST_data = NULL, as.numeric(y1_class), k = 5 , h = 1.0, method = 'euclidean', weights_function = matrix(,0,0), regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = unique(y1_class))) 199 | }) 200 | 201 | 202 | testthat::test_that("it returns error if the weights function is invalid for regression = F, TEST_data is NOT NULL", { 203 | 204 | testthat::expect_error(KernelKnn(xtr_class, TEST_data = xte_class, as.numeric(y1_class), k = 5 , h = 1.0, method = 'euclidean', weights_function = data.frame(matrix(,0,0)), regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = unique(y1_class))) 205 | }) 206 | 207 | 208 | testthat::test_that("it returns error if the weights function is invalid for regression = T, TEST_data is NOT NULL", { 209 | 210 | testthat::expect_error(KernelKnn(xtr, TEST_data = xte, y1, k = 5 , h = 1.0, method = 'euclidean', weights_function = as.factor(1:10), regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL)) 211 | }) 212 | 213 | 214 | testthat::test_that("it returns a warning if the input matrix seems singular in case of the 'mahalanobis' distance", { 215 | 216 | tmp_x = singular_mat[, -ncol(singular_mat)] 217 | 218 | tmp_y = as.numeric(singular_mat[, ncol(singular_mat)]) 219 | 220 | testthat::expect_warning(KernelKnn(tmp_x, TEST_data = NULL, tmp_y, k = 5 , h = 1.0, method = 'mahalanobis', weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = F, Levels = unique(tmp_y))) 221 | }) 222 | 223 | 224 | # testing of KernelKnn 225 | 226 | 227 | testthat::test_that("if transf_categ_cols = TRUE and TEST_data = NULL the KernelKnn returns output equal to the number of rows in the train data", { 228 | 229 | tmp_xtr = xtr 230 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 231 | 232 | res = KernelKnn(tmp_xtr, TEST_data = NULL, y1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = T, threads = 1, extrema = F, Levels = NULL) 233 | 234 | testthat::expect_true(length(res) == nrow(xtr)) 235 | }) 236 | 237 | 238 | testthat::test_that("if transf_categ_cols = TRUE and TEST_data is NOT NULL the KernelKnn returns output equal to the number of rows in the TEST data", { 239 | 240 | tmp_xtr = xtr 241 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 242 | 243 | tmp_xte = xte 244 | tmp_xte$rad = as.factor(tmp_xte$rad) 245 | 246 | res = KernelKnn(tmp_xtr, TEST_data = tmp_xte, y1, k = 5 , 1.0, method = 'euclidean', weights_function = NULL, regression = T, transf_categ_cols = T, threads = 1, extrema = F, Levels = NULL) 247 | 248 | testthat::expect_true(length(res) == nrow(tmp_xte)) 249 | }) 250 | 251 | 252 | 253 | testthat::test_that("using either a kernel or a user-defined-kernel-function (here in both cases a 'uniform' kernel) returns the same result, when both train and test sets are used", { 254 | 255 | res_kernel = KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = 'euclidean', weights_function = 'uniform', regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) 256 | 257 | uniform = function(W) { 258 | 259 | W = (1/2) * abs(W) 260 | 261 | W = W / rowSums(W) 262 | 263 | return(W) 264 | } 265 | 266 | res_kernel_function = 
KernelKnn(xtr, TEST_data = xte, y1, k = 5 , 1.0, method = 'euclidean', weights_function = uniform, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) 267 | 268 | df = data.frame(first_case = res_kernel, sec_case = res_kernel_function) 269 | 270 | df$difference_of_results = round(df$first_case - df$sec_case, 3) # difference of results 271 | 272 | testthat::expect_true(sum(df$difference_of_results) == 0.0) 273 | }) 274 | 275 | 276 | testthat::test_that("using either a kernel or a user-defined-kernel-function (here in both cases a 'uniform' kernel) returns the same result, when ONLY train data is used", { 277 | 278 | res_kernel = KernelKnn(xtr, TEST_data = NULL, y1, k = 5 , 1.0, method = 'euclidean', weights_function = 'uniform', regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) 279 | 280 | uniform = function(W) { 281 | 282 | W = (1/2) * abs(W) 283 | 284 | W = W / rowSums(W) 285 | 286 | return(W) 287 | } 288 | 289 | res_kernel_function = KernelKnn(xtr, TEST_data = NULL, y1, k = 5 , 1.0, method = 'euclidean', weights_function = uniform, regression = T, transf_categ_cols = F, threads = 1, extrema = F, Levels = NULL) 290 | 291 | df = data.frame(first_case = res_kernel, sec_case = res_kernel_function) 292 | 293 | df$difference_of_results = round(df$first_case - df$sec_case, 3) # difference of results 294 | 295 | testthat::expect_true(sum(df$difference_of_results) == 0.0) 296 | }) 297 | 298 | 299 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is a character, when regression = T, when transf_categ_cols = T] the length of the output 300 | 301 | matches the number of rows of the input", { 302 | 303 | tmp_xtr = xtr 304 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 305 | 306 | lst = count = list() 307 | 308 | for (k in 4:6) { 309 | 310 | for (h in c(0.1, 0.5, 1.0)) { 311 | 312 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 313 | 314 | for (w_func in c('uniform', 'triangular', 'epanechnikov')) { 315 | 316 | for (extr in c(T,F)) { 317 | 318 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = NULL, y1, k = k , h = h, method = metric, weights_function = w_func, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 319 | 320 | count = lappend(count, 1) 321 | } 322 | } 323 | } 324 | } 325 | } 326 | 327 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xtr) && ncol(do.call(cbind, lst)) == length(unlist(count))) 328 | }) 329 | 330 | 331 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is a function, when regression = T, when transf_categ_cols = T] the length of the output 332 | 333 | matches the number of rows of the input", { 334 | 335 | tmp_xtr = xtr 336 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 337 | 338 | epanechnikov = function(W) { 339 | 340 | W = (3/4) * (1 - W ^ 2) 341 | 342 | W = W / rowSums(W) 343 | 344 | return(W) 345 | } 346 | 347 | lst = count = list() 348 | 349 | for (k in 4:6) { 350 | 351 | for (h in c(0.1, 0.5, 1.0)) { 352 | 353 | for (metric in c('canberra', 'braycurtis', 'minkowski')) { 354 | 355 | for (extr in c(T,F)) { 356 | 357 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = NULL, y1, k = k , h = h, method = metric, weights_function = epanechnikov, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 358 | 359 | count = lappend(count, 1) 360 | } 361 | } 362 | } 363 | } 364 | 365 | testthat::expect_true(nrow(do.call(cbind, lst)) ==
nrow(tmp_xtr) && ncol(do.call(cbind, lst)) == length(unlist(count))) 366 | }) 367 | 368 | 369 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is a character, when regression = F, when transf_categ_cols = F] the length of the output 370 | 371 | matches the number of rows of the input", { 372 | 373 | lst = count = list() 374 | 375 | for (k in 4:6) { 376 | 377 | for (h in c(0.1, 0.5, 1.0)) { 378 | 379 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 380 | 381 | for (w_func in c('uniform', 'triangular', 'epanechnikov')) { 382 | 383 | for (extr in c(T,F)) { 384 | 385 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = NULL, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = w_func, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 386 | 387 | count = lappend(count, 1) 388 | } 389 | } 390 | } 391 | } 392 | } 393 | 394 | NCOL = mean(unlist(lapply(lst, ncol))) 395 | NROW = mean(unlist(lapply(lst, nrow))) 396 | 397 | testthat::expect_true(NROW == nrow(xtr_class) && NCOL == length(unique(y1_class))) 398 | }) 399 | 400 | 401 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is a function, when regression = F, when transf_categ_cols = F] the length of the output 402 | 403 | matches the number of rows of the input", { 404 | 405 | logistic = function(W) { 406 | 407 | W = (1/(exp(W) + 2 + exp(-W))) 408 | 409 | W = W / rowSums(W) 410 | 411 | return(W) 412 | } 413 | 414 | lst = count = list() 415 | 416 | for (k in 4:6) { 417 | 418 | for (h in c(0.1, 0.5, 1.0)) { 419 | 420 | for (metric in c('canberra', 'braycurtis', 'minkowski')) { 421 | 422 | for (extr in c(T,F)) { 423 | 424 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = NULL, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = logistic, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 425 | 426 | count = lappend(count, 1) 427 | } 428 | } 429 | } 430 | } 431 | 432 | NCOL = mean(unlist(lapply(lst, ncol))) 433 | NROW = mean(unlist(lapply(lst, nrow))) 434 | 435 | testthat::expect_true(NROW == nrow(xtr_class) && NCOL == length(unique(y1_class))) 436 | }) 437 | 438 | 439 | 440 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ WHEN the weights_function is a character, when regression = T, when transf_categ_cols = T] the length of the output 441 | 442 | matches the number of rows of the input", { 443 | 444 | tmp_xtr = xtr 445 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 446 | 447 | tmp_xte = xte 448 | tmp_xte$rad = as.factor(tmp_xte$rad) 449 | 450 | lst = count = list() 451 | 452 | for (k in 4:6) { 453 | 454 | for (h in c(0.1, 0.5, 1.0)) { 455 | 456 | for (metric in c('canberra', 'braycurtis', 'minkowski')) { 457 | 458 | for (w_func in c('uniform', 'triangular', 'epanechnikov')) { 459 | 460 | for (extr in c(T,F)) { 461 | 462 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = tmp_xte, y1, k = k , h = h, method = metric, weights_function = w_func, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 463 | 464 | count = lappend(count, 1) 465 | } 466 | } 467 | } 468 | } 469 | } 470 | 471 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xte) && ncol(do.call(cbind, lst)) == length(unlist(count))) 472 | }) 473 | 474 | 475 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ WHEN
the weights_function is a function, when regression = T, when transf_categ_cols = T] the length of the output 476 | 477 | matches the number of rows of the input", { 478 | 479 | tmp_xtr = xtr 480 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 481 | 482 | tmp_xte = xte 483 | tmp_xte$rad = as.factor(tmp_xte$rad) 484 | 485 | epanechnikov = function(W) { 486 | 487 | W = (3/4) * (1 - W ^ 2) 488 | 489 | W = W / rowSums(W) 490 | 491 | return(W) 492 | } 493 | 494 | lst = count = list() 495 | 496 | for (k in 4:6) { 497 | 498 | for (h in c(0.1, 0.5, 1.0)) { 499 | 500 | for (metric in c('canberra', 'hamming', 'mahalanobis')) { 501 | 502 | for (extr in c(T,F)) { 503 | 504 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = tmp_xte, y1, k = k , h = h, method = metric, weights_function = epanechnikov, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 505 | 506 | count = lappend(count, 1) 507 | } 508 | } 509 | } 510 | } 511 | 512 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xte) && ncol(do.call(cbind, lst)) == length(unlist(count))) 513 | }) 514 | 515 | 516 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ WHEN the weights_function is a character, when regression = F, when transf_categ_cols = F] the length of the output 517 | 518 | matches the number of rows of the input", { 519 | 520 | lst = count = list() 521 | 522 | for (k in 4:6) { 523 | 524 | for (h in c(0.1, 0.5, 1.0)) { 525 | 526 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 527 | 528 | for (w_func in c('uniform', 'triangular', 'epanechnikov')) { 529 | 530 | for (extr in c(T,F)) { 531 | 532 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = xte_class, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = w_func, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 533 | 534 | count = lappend(count, 1) 535 | } 536 | } 537 | } 538 | } 539 | } 540 | 541 | NCOL = mean(unlist(lapply(lst, ncol))) 542 | NROW = mean(unlist(lapply(lst, nrow))) 543 | 544 | testthat::expect_true(NROW == nrow(xte_class) && NCOL == length(unique(y1_class))) 545 | }) 546 | 547 | 548 | testthat::test_that("if the TEST data is NOT NULL for all possible combinations [ WHEN the weights_function is a function, when regression = F, when transf_categ_cols = F] the length of the output 549 | 550 | matches the number of rows of the input", { 551 | 552 | logistic = function(W) { 553 | 554 | W = (1/(exp(W) + 2 + exp(-W))) 555 | 556 | W = W / rowSums(W) 557 | 558 | return(W) 559 | } 560 | 561 | lst = count = list() 562 | 563 | for (k in 4:6) { 564 | 565 | for (h in c(0.1, 0.5, 1.0)) { 566 | 567 | for (metric in c('canberra', 'braycurtis', 'minkowski')) { 568 | 569 | for (extr in c(T,F)) { 570 | 571 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = xte_class, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = logistic, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 572 | 573 | count = lappend(count, 1) 574 | } 575 | } 576 | } 577 | } 578 | 579 | NCOL = mean(unlist(lapply(lst, ncol))) 580 | NROW = mean(unlist(lapply(lst, nrow))) 581 | 582 | testthat::expect_true(NROW == nrow(xte_class) && NCOL == length(unique(y1_class))) 583 | }) 584 | 585 | 586 | testthat::test_that("if the TEST data is NULL for all possible combinations [ WHEN the weights_function is NULL, when regression = T, when transf_categ_cols = T] the length of the output 587 | 588 |
matches the number of rows of the input", { 589 | 590 | tmp_xtr = xtr 591 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 592 | 593 | lst = count = list() 594 | 595 | for (k in 4:6) { 596 | 597 | for (h in c(0.1, 0.5, 1.0)) { 598 | 599 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 600 | 601 | for (extr in c(T,F)) { 602 | 603 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = NULL, y1, k = k , h = h, method = metric, weights_function = NULL, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 604 | 605 | count = lappend(count, 1) 606 | } 607 | } 608 | } 609 | } 610 | 611 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xtr) && ncol(do.call(cbind, lst)) == length(unlist(count))) 612 | }) 613 | 614 | 615 | 616 | testthat::test_that("if the TEST data is NULL for all posible combinations [ WHEN the weights_function is NULL, when regression = F, when transf_categ_cols = F] the length of the output 617 | 618 | matches the number of rows of the input", { 619 | 620 | lst = count = list() 621 | 622 | for (k in 4:6) { 623 | 624 | for (h in c(0.1, 0.5, 1.0)) { 625 | 626 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 627 | 628 | for (extr in c(T,F)) { 629 | 630 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = NULL, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 631 | 632 | count = lappend(count, 1) 633 | } 634 | } 635 | } 636 | } 637 | 638 | NCOL = mean(unlist(lapply(lst, ncol))) 639 | NROW = mean(unlist(lapply(lst, nrow))) 640 | 641 | testthat::expect_true(NROW == nrow(xtr_class) && NCOL == length(unique(y1_class))) 642 | }) 643 | 644 | 645 | 646 | testthat::test_that("if the TEST data is NULL for all posible combinations [ WHEN the weights_function is NULL, when regression = T, when transf_categ_cols = T] the length of the output 647 | 648 | matches the number of rows of the input", { 649 | 650 | tmp_xtr = xtr 651 | tmp_xtr$rad = as.factor(tmp_xtr$rad) 652 | 653 | tmp_xte = xte 654 | tmp_xte$rad = as.factor(tmp_xte$rad) 655 | 656 | lst = count = list() 657 | 658 | for (k in 4:6) { 659 | 660 | for (h in c(0.1, 0.5, 1.0)) { 661 | 662 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 663 | 664 | for (extr in c(T,F)) { 665 | 666 | lst = lappend(lst, KernelKnn(tmp_xtr, TEST_data = tmp_xte, y1, k = k , h = h, method = metric, weights_function = NULL, regression = T, transf_categ_cols = T, threads = 1, extrema = extr, Levels = NULL)) 667 | 668 | count = lappend(count, 1) 669 | } 670 | } 671 | } 672 | } 673 | 674 | testthat::expect_true(nrow(do.call(cbind, lst)) == nrow(tmp_xte) && ncol(do.call(cbind, lst)) == length(unlist(count))) 675 | }) 676 | 677 | 678 | 679 | testthat::test_that("if the TEST data is NULL for all posible combinations [ WHEN the weights_function is NULL, when regression = F, when transf_categ_cols = F] the length of the output 680 | 681 | matches the number of rows of the input", { 682 | 683 | lst = count = list() 684 | 685 | for (k in 4:6) { 686 | 687 | for (h in c(0.1, 0.5, 1.0)) { 688 | 689 | for (metric in c('euclidean', 'manhattan', 'chebyshev')) { 690 | 691 | for (extr in c(T,F)) { 692 | 693 | lst = lappend(lst, KernelKnn(xtr_class, TEST_data = xte_class, as.numeric(y1_class), k = k , h = h, method = metric, weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = as.numeric(unique(y1_class)))) 694 | 695 | count = 
lappend(count, 1) 696 | } 697 | } 698 | } 699 | } 700 | 701 | NCOL = mean(unlist(lapply(lst, ncol))) 702 | NROW = mean(unlist(lapply(lst, nrow))) 703 | 704 | testthat::expect_true(NROW == nrow(xte_class) && NCOL == length(unique(y1_class))) 705 | }) 706 | 707 | 708 | 709 | 710 | testthat::test_that("the similarity measures 'simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient' and 'pearson_correlation' return correct output 711 | 712 | in case of binary data", { 713 | 714 | 715 | dat = do.call(cbind, lapply(1:10, function(x) sample(0:1, 100, replace = T))) 716 | TES = do.call(cbind, lapply(1:10, function(x) sample(0:1, 50, replace = T))) 717 | y = sample(1:2, 100, replace = T) 718 | 719 | lst = count = list() 720 | 721 | for (k in 4:6) { 722 | 723 | for (h in c(0.1, 0.5, 1.0)) { 724 | 725 | for (metric in c('simple_matching_coefficient', 'jaccard_coefficient', 'Rao_coefficient', 'pearson_correlation')) { 726 | 727 | for (extr in c(T,F)) { 728 | 729 | lst = lappend(lst, KernelKnn(dat, TEST_data = TES, y, k = k , h = h, method = metric, weights_function = NULL, regression = F, transf_categ_cols = F, threads = 1, extrema = extr, Levels = unique(y))) 730 | 731 | count = lappend(count, 1) 732 | } 733 | } 734 | } 735 | } 736 | 737 | NCOL = mean(unlist(lapply(lst, ncol))) 738 | NROW = mean(unlist(lapply(lst, nrow))) 739 | 740 | testthat::expect_true(NROW == nrow(TES) && NCOL == length(unique(y))) 741 | }) 742 | 743 | 744 | 745 | testthat::test_that("the 'p' parameter when method is 'minkowski' returns the expected output", { 746 | 747 | k = 2 748 | res_wo = KernelKnn(data = X, y = y, k = k , h = 1.0, method = 'minkowski', regression = T, threads = 1) # without specifying the 'p' parameter 749 | res_w = KernelKnn(data = X, y = y, k = k , h = 1.0, method = 'minkowski', regression = T, threads = 1, p = k) # by specifying the 'p' parameter 750 | res_dif = KernelKnn(data = X, y = y, k = k , h = 1.0, method = 'minkowski', regression = T, threads = 1, p = 1) # 'p' is set to 1 751 | 752 | is_identical = identical(res_wo, res_w) 753 | is_not_identical = identical(res_wo, res_dif) 754 | 755 | testthat::expect_true(is_identical & (!is_not_identical)) 756 | }) 757 | 758 | 759 | #============================================================================================================================================================== 760 | --------------------------------------------------------------------------------