├── .Rbuildignore
├── .gitignore
├── LICENSE
├── data
│   └── breastcancer.rda
├── OneR.Rproj
├── man
│   ├── is.OneR.Rd
│   ├── print.OneR.Rd
│   ├── plot.OneR.Rd
│   ├── summary.OneR.Rd
│   ├── breastcancer.Rd
│   ├── maxlevels.Rd
│   ├── predict.OneR.Rd
│   ├── eval_model.Rd
│   ├── bin.Rd
│   ├── OneR.Rd
│   └── optbin.Rd
├── NAMESPACE
├── DESCRIPTION
├── R
│   ├── OneR_data.R
│   ├── OneR_internal.R
│   ├── OneR_main.R
│   └── OneR.R
├── README.md
├── vignettes
│   ├── OneR.R
│   ├── OneR.Rmd
│   └── OneR.html
└── NEWS

/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | inst/doc
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2016 - 2017
2 | COPYRIGHT HOLDER: Holger von Jouanne-Diedrich
--------------------------------------------------------------------------------
/data/breastcancer.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vonjd/OneR/HEAD/data/breastcancer.rda
--------------------------------------------------------------------------------
/OneR.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace,vignette
--------------------------------------------------------------------------------
/man/is.OneR.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/OneR.R
3 | \name{is.OneR}
4 | \alias{is.OneR}
5 | \title{Test OneR model objects}
6 | \usage{
7 | is.OneR(x)
8 | }
9 | \arguments{
10 | \item{x}{object to be tested.}
11 | }
12 | \value{
13 | a logical whether object is of class "OneR".
14 | }
15 | \description{
16 | Test if object is a OneR model.
17 | }
18 | \examples{
19 | model <- OneR(iris)
20 | is.OneR(model) # evaluates to TRUE
21 | }
22 | \references{
23 | \url{https://github.com/vonjd/OneR}
24 | }
25 | \author{
26 | Holger von Jouanne-Diedrich
27 | }
28 | \keyword{OneR}
29 | \keyword{model}
--------------------------------------------------------------------------------
/man/print.OneR.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/OneR.R
3 | \name{print.OneR}
4 | \alias{print.OneR}
5 | \title{Print OneR models}
6 | \usage{
7 | \method{print}{OneR}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{object of class \code{"OneR"}.}
11 |
12 | \item{...}{further arguments passed to or from other methods.}
13 | }
14 | \description{
15 | \code{print} method for class \code{OneR}.
16 | }
17 | \details{
18 | Prints the rules and the accuracy of an OneR model.
19 | }
20 | \examples{
21 | model <- OneR(iris)
22 | print(model)
23 | }
24 | \references{
25 | \url{https://github.com/vonjd/OneR}
26 | }
27 | \seealso{
28 | \code{\link{OneR}}
29 | }
30 | \author{
31 | Holger von Jouanne-Diedrich
32 | }
33 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(OneR,data.frame)
4 | S3method(OneR,default)
5 | S3method(OneR,formula)
6 | S3method(optbin,data.frame)
7 | S3method(optbin,default)
8 | S3method(optbin,formula)
9 | S3method(plot,OneR)
10 | S3method(predict,OneR)
11 | S3method(print,OneR)
12 | S3method(summary,OneR)
13 | export(OneR)
14 | export(bin)
15 | export(eval_model)
16 | export(is.OneR)
17 | export(maxlevels)
18 | export(optbin)
19 | importFrom(graphics,mosaicplot)
20 | importFrom(stats,addmargins)
21 | importFrom(stats,binom.test)
22 | importFrom(stats,binomial)
23 | importFrom(stats,chisq.test)
24 | importFrom(stats,coef)
25 | importFrom(stats,filter)
26 | importFrom(stats,glm)
27 | importFrom(stats,kmeans)
28 | importFrom(stats,model.frame)
29 | importFrom(stats,na.omit)
30 | importFrom(stats,quantile)
31 |
--------------------------------------------------------------------------------
/man/plot.OneR.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/OneR.R
3 | \name{plot.OneR}
4 | \alias{plot.OneR}
5 | \title{Plot Diagnostics for an OneR object}
6 | \usage{
7 | \method{plot}{OneR}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{object of class \code{"OneR"}.}
11 |
12 | \item{...}{further arguments passed to or from other methods.}
13 | }
14 | \description{
15 | Plots a mosaic plot for the feature attribute and the target of the OneR model.
16 | }
17 | \details{
18 | If more than 20 levels are present for either the feature attribute or the target the function stops with an error.
19 | }
20 | \examples{
21 | model <- OneR(iris)
22 | plot(model)
23 | }
24 | \references{
25 | \url{https://github.com/vonjd/OneR}
26 | }
27 | \seealso{
28 | \code{\link{OneR}}
29 | }
30 | \author{
31 | Holger von Jouanne-Diedrich
32 | }
33 | \keyword{diagnostics}
34 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: OneR
2 | Type: Package
3 | Title: One Rule Machine Learning Classification Algorithm with Enhancements
4 | Version: 2.2
5 | Date: 2017-05-05
6 | Author: Holger von Jouanne-Diedrich
7 | Maintainer: Holger von Jouanne-Diedrich
8 | Depends: R (>= 2.10)
9 | Description: Implements the One Rule (OneR) Machine Learning classification algorithm (Holte, R.C. (1993) <doi:10.1023/A:1022631118932>) with enhancements for sophisticated handling of numeric data and missing values together with extensive diagnostic functions. It is useful as a baseline for machine learning models and the rules are often helpful heuristics.
10 | License: MIT + file LICENSE 11 | URL: https://github.com/vonjd/OneR 12 | BugReports: https://github.com/vonjd/OneR/issues 13 | LazyData: TRUE 14 | RoxygenNote: 6.0.1 15 | Suggests: knitr, 16 | rmarkdown 17 | VignetteBuilder: knitr 18 | -------------------------------------------------------------------------------- /man/summary.OneR.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/OneR.R 3 | \name{summary.OneR} 4 | \alias{summary.OneR} 5 | \title{Summarize OneR models} 6 | \usage{ 7 | \method{summary}{OneR}(object, ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class \code{"OneR"}.} 11 | 12 | \item{...}{further arguments passed to or from other methods.} 13 | } 14 | \description{ 15 | \code{summary} method for class \code{OneR}. 16 | } 17 | \details{ 18 | Prints the rules of the OneR model, the accuracy, a contingency table of the feature attribute and the target and performs a chi-squared test on this table. 19 | 20 | In the contingency table the maximum values in each column are highlighted by adding a '*', thereby representing the rules of the OneR model. 21 | } 22 | \examples{ 23 | model <- OneR(iris) 24 | summary(model) 25 | } 26 | \references{ 27 | \url{https://github.com/vonjd/OneR} 28 | } 29 | \seealso{ 30 | \code{\link{OneR}} 31 | } 32 | \author{ 33 | Holger von Jouanne-Diedrich 34 | } 35 | \keyword{diagnostics} 36 | -------------------------------------------------------------------------------- /R/OneR_data.R: -------------------------------------------------------------------------------- 1 | #' Breast Cancer Wisconsin Original Data Set 2 | #' 3 | #' Dataset containing the original Wisconsin breast cancer data. 4 | #' 5 | #' \enumerate{ 6 | #' \item Clump Thickness: 1 - 10 7 | #' \item Uniformity of Cell Size: 1 - 10 8 | #' \item Uniformity of Cell Shape: 1 - 10 9 | #' \item Marginal Adhesion: 1 - 10 10 | #' \item Single Epithelial Cell Size: 1 - 10 11 | #' \item Bare Nuclei: 1 - 10 12 | #' \item Bland Chromatin: 1 - 10 13 | #' \item Normal Nucleoli: 1 - 10 14 | #' \item Mitoses: 1 - 10 15 | #' \item Class: benign, malignant 16 | #' } 17 | #' 18 | #' @name breastcancer 19 | #' @docType data 20 | #' @references The data were obtained from the UCI machine learning repository, see \url{https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)} 21 | #' @keywords data datasets Wisconsin breast cancer 22 | #' @usage data(breastcancer) 23 | #' @format A data frame with 699 instances and 10 attributes. The variables are as follows: 24 | #' @examples 25 | #' data(breastcancer) 26 | #' data <- optbin(breastcancer, method = "infogain") 27 | #' model <- OneR(data, verbose = TRUE) 28 | #' summary(model) 29 | #' plot(model) 30 | #' prediction <- predict(model, data) 31 | #' eval_model(prediction, data) 32 | NULL 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OneR 2 | This R package implements the One Rule (OneR) Machine Learning classification algorithm with enhancements for sophisticated handling of numeric data and missing values together with extensive diagnostic functions. It is useful as a baseline for machine learning models and the rules are often helpful heuristics. 
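
## Quick example

A minimal end-to-end run, taken from the package documentation, so the whole workflow is visible at a glance:

```r
library(OneR)
data <- optbin(iris)                # discretize numeric features with optimal cut points
model <- OneR(data, verbose = TRUE) # build the model on the best predictor
summary(model)                      # show the learned rules and diagnostics
prediction <- predict(model, data)
eval_model(prediction, data)        # confusion matrices, accuracy and p-value
```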
3 | 4 | ## Documentation 5 | 6 | This video gives a step-by-step introduction: [Quick Start Guide for the OneR package](https://www.youtube.com/watch?v=AGC0oRlXxgU) 7 | 8 | You can find the vignette and full documentation in the package and on CRAN: [OneR: One Rule Machine Learning Classification Algorithm with Enhancements](https://cran.r-project.org/package=OneR) 9 | 10 | ## Installation 11 | 12 | Install the latest stable version from [CRAN](https://cran.r-project.org/package=OneR): 13 | 14 | ```r 15 | install.packages("OneR") 16 | ``` 17 | 18 | Install the latest development version from GitHub: 19 | 20 | ```R 21 | install.packages("devtools") 22 | library(devtools) 23 | install_github("vonjd/OneR") 24 | ``` 25 | 26 | ## Contact 27 | 28 | I would love to hear about your experiences with the OneR package. Please drop me a note - you can reach me at my university account: [Holger K. von Jouanne-Diedrich](https://www.h-ab.de/nc/eng/about-aschaffenburg-university-of-applied-sciences/organisation/personal/?tx_fhapersonal_pi1%5BshowUid%5D=jouanne-diedrich) 29 | -------------------------------------------------------------------------------- /man/breastcancer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/OneR_data.R 3 | \docType{data} 4 | \name{breastcancer} 5 | \alias{breastcancer} 6 | \title{Breast Cancer Wisconsin Original Data Set} 7 | \format{A data frame with 699 instances and 10 attributes. The variables are as follows:} 8 | \usage{ 9 | data(breastcancer) 10 | } 11 | \description{ 12 | Dataset containing the original Wisconsin breast cancer data. 13 | } 14 | \details{ 15 | \enumerate{ 16 | \item Clump Thickness: 1 - 10 17 | \item Uniformity of Cell Size: 1 - 10 18 | \item Uniformity of Cell Shape: 1 - 10 19 | \item Marginal Adhesion: 1 - 10 20 | \item Single Epithelial Cell Size: 1 - 10 21 | \item Bare Nuclei: 1 - 10 22 | \item Bland Chromatin: 1 - 10 23 | \item Normal Nucleoli: 1 - 10 24 | \item Mitoses: 1 - 10 25 | \item Class: benign, malignant 26 | } 27 | } 28 | \examples{ 29 | data(breastcancer) 30 | data <- optbin(breastcancer, method = "infogain") 31 | model <- OneR(data, verbose = TRUE) 32 | summary(model) 33 | plot(model) 34 | prediction <- predict(model, data) 35 | eval_model(prediction, data) 36 | } 37 | \references{ 38 | The data were obtained from the UCI machine learning repository, see \url{https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)} 39 | } 40 | \keyword{Wisconsin} 41 | \keyword{breast} 42 | \keyword{cancer} 43 | \keyword{data} 44 | \keyword{datasets} 45 | -------------------------------------------------------------------------------- /man/maxlevels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/OneR.R 3 | \name{maxlevels} 4 | \alias{maxlevels} 5 | \title{Remove factors with too many levels} 6 | \usage{ 7 | maxlevels(data, maxlevels = 20, na.omit = TRUE) 8 | } 9 | \arguments{ 10 | \item{data}{data frame which contains the data.} 11 | 12 | \item{maxlevels}{number of maximum factor levels.} 13 | 14 | \item{na.omit}{logical value whether missing values should be treated as a level, defaults to omit missing values before counting.} 15 | } 16 | \value{ 17 | A data frame. 
18 | } 19 | \description{ 20 | Removes all columns of a data frame where a factor (or character string) has more than a maximum number of levels. 21 | } 22 | \details{ 23 | Often categories that have very many levels are not useful in modelling OneR rules because they result in too many rules and tend to overfit. 24 | Examples are IDs or names. 25 | 26 | Character strings are treated as factors although they keep their datatype. Numeric data is left untouched. 27 | If data contains unused factor levels (e.g. due to subsetting) these are ignored and a warning is given. 28 | } 29 | \examples{ 30 | df <- data.frame(numeric = c(1:26), alphabet = letters) 31 | str(df) 32 | str(maxlevels(df)) 33 | } 34 | \references{ 35 | \url{https://github.com/vonjd/OneR} 36 | } 37 | \seealso{ 38 | \code{\link{OneR}} 39 | } 40 | \author{ 41 | Holger von Jouanne-Diedrich 42 | } 43 | -------------------------------------------------------------------------------- /man/predict.OneR.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/OneR.R 3 | \name{predict.OneR} 4 | \alias{predict.OneR} 5 | \title{Predict method for OneR models} 6 | \usage{ 7 | \method{predict}{OneR}(object, newdata, type = c("class", "prob"), ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class \code{"OneR"}.} 11 | 12 | \item{newdata}{data frame in which to look for the feature variable with which to predict.} 13 | 14 | \item{type}{character string denoting the type of predicted value returned. Default \code{"class"} gives a named vector with the predicted classes, \code{"prob"} gives a matrix whose columns are the probability of the first, second, etc. class.} 15 | 16 | \item{...}{further arguments passed to or from other methods.} 17 | } 18 | \value{ 19 | The default is a factor with the predicted classes, if \code{"type = prob"} a matrix is returned whose columns are the probability of the first, second, etc. class. 20 | } 21 | \description{ 22 | Predict cases or probabilities based on OneR model object. 23 | } 24 | \details{ 25 | \code{newdata} can have the same format as used for building the model but must at least have the feature variable that is used in the OneR rules. 26 | If cases appear that were not present when building the model the predicted case is \code{UNSEEN} or \code{NA} when \code{"type = prob"}. 27 | } 28 | \examples{ 29 | model <- OneR(iris) 30 | prediction <- predict(model, iris[1:4]) 31 | eval_model(prediction, iris[5]) 32 | 33 | ## type prob 34 | predict(model, data.frame(Petal.Width = seq(0, 3, 0.5))) 35 | predict(model, data.frame(Petal.Width = seq(0, 3, 0.5)), type = "prob") 36 | } 37 | \references{ 38 | \url{https://github.com/vonjd/OneR} 39 | } 40 | \seealso{ 41 | \code{\link{OneR}} 42 | } 43 | \author{ 44 | Holger von Jouanne-Diedrich 45 | } 46 | -------------------------------------------------------------------------------- /man/eval_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/OneR.R 3 | \name{eval_model} 4 | \alias{eval_model} 5 | \title{Classification Evaluation function} 6 | \usage{ 7 | eval_model(prediction, actual, dimnames = c("Prediction", "Actual"), 8 | zero.print = "0") 9 | } 10 | \arguments{ 11 | \item{prediction}{vector which contains the predicted values.} 12 | 13 | \item{actual}{data frame which contains the actual data. 
When there is more than one column the last column is taken. A single vector is allowed too.}
14 |
15 | \item{dimnames}{character vector of printed dimnames for the confusion matrices.}
16 |
17 | \item{zero.print}{character specifying how zeros should be printed; for sparse confusion matrices, using "." can produce more readable results.}
18 | }
19 | \value{
20 | Invisibly returns a list with the number of correctly classified and total instances and a confusion matrix with the absolute numbers.
21 | }
22 | \description{
23 | Function for evaluating a OneR classification model. Prints confusion matrices with prediction vs. actual in absolute and relative numbers. Additionally it gives the accuracy, error rate as well as the error rate reduction versus the base rate accuracy together with a p-value.
24 | }
25 | \details{
26 | Error rate reduction versus the base rate accuracy is calculated by the following formula:\cr\cr
27 | \eqn{(Accuracy(Prediction) - Accuracy(Baserate)) / (1 - Accuracy(Baserate))},\cr\cr
28 | giving a number between 0 (no error reduction) and 1 (no error).\cr\cr
29 | In some borderline cases when the model is performing worse than the base rate negative numbers can result. This shows that something is seriously wrong with the model generating this prediction.\cr\cr
30 | The provided p-value gives the probability of obtaining a distribution of predictions like this (or even more unambiguous) under the assumption that the real accuracy is equal to or lower than the base rate accuracy.
31 | More technically it is derived from a one-sided binomial test with the alternative hypothesis that the prediction's accuracy is greater than the base rate accuracy.
32 | Loosely speaking a low p-value (< 0.05) signifies that the model really is able to give predictions that are better than the base rate.
33 | }
34 | \examples{
35 | data <- iris
36 | model <- OneR(data)
37 | summary(model)
38 | prediction <- predict(model, data)
39 | eval_model(prediction, data)
40 | }
41 | \references{
42 | \url{https://github.com/vonjd/OneR}
43 | }
44 | \author{
45 | Holger von Jouanne-Diedrich
46 | }
47 | \keyword{accuracy}
48 | \keyword{evaluation}
49 |
--------------------------------------------------------------------------------
/man/bin.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/OneR.R
3 | \name{bin}
4 | \alias{bin}
5 | \title{Binning function}
6 | \usage{
7 | bin(data, nbins = 5, labels = NULL, method = c("length", "content",
8 |   "clusters"), na.omit = TRUE)
9 | }
10 | \arguments{
11 | \item{data}{data frame or vector which contains the data.}
12 |
13 | \item{nbins}{number of bins (= levels).}
14 |
15 | \item{labels}{character vector of labels for the resulting category.}
16 |
17 | \item{method}{character string specifying the binning method, see 'Details'; can be abbreviated.}
18 |
19 | \item{na.omit}{logical value whether instances with missing values should be removed.}
20 | }
21 | \value{
22 | A data frame or vector.
23 | }
24 | \description{
25 | Discretizes all numerical data in a data frame into categorical bins of equal length or content or based on automatically determined clusters.
26 | }
27 | \details{
28 | Character strings and logical vectors are coerced into factors. Matrices are coerced into data frames. When called with a single vector only the respective factor (and not a data frame) is returned.
29 | Method \code{"length"} gives intervals of equal length, method \code{"content"} gives intervals of equal content (via quantiles).
30 | Method \code{"clusters"} determines \code{"nbins"} clusters via 1D kmeans with deterministic seeding of the initial cluster centres (Jenks natural breaks optimization).
31 |
32 | When \code{"na.omit = FALSE"} an additional level \code{"NA"} is added to each factor with missing values.
33 | }
34 | \examples{
35 | data <- iris
36 | str(data)
37 | str(bin(data))
38 | str(bin(data, nbins = 3))
39 | str(bin(data, nbins = 3, labels = c("small", "medium", "large")))
40 |
41 | ## Difference between methods "length" and "content"
42 | set.seed(1); table(bin(rnorm(900), nbins = 3))
43 | set.seed(1); table(bin(rnorm(900), nbins = 3, method = "content"))
44 |
45 | ## Method "clusters"
46 | intervals <- paste(levels(bin(faithful$waiting, nbins = 2, method = "cluster")), collapse = " ")
47 | hist(faithful$waiting, main = paste("Intervals:", intervals))
48 | abline(v = c(42.9, 67.5, 96.1), col = "blue")
49 |
50 | ## Missing values
51 | bin(c(1:10, NA), nbins = 2, na.omit = FALSE) # adds new level "NA"
52 | bin(c(1:10, NA), nbins = 2) # omits missing values by default (with warning)
53 | }
54 | \references{
55 | \url{https://github.com/vonjd/OneR}
56 | }
57 | \seealso{
58 | \code{\link{OneR}}, \code{\link{optbin}}
59 | }
60 | \author{
61 | Holger von Jouanne-Diedrich
62 | }
63 | \keyword{Jenks}
64 | \keyword{binning}
65 | \keyword{breaks}
66 | \keyword{clusters}
67 | \keyword{discretization}
68 | \keyword{discretize}
69 |
--------------------------------------------------------------------------------
/man/OneR.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/OneR_main.R
3 | \name{OneR}
4 | \alias{OneR}
5 | \alias{OneR.formula}
6 | \alias{OneR.data.frame}
7 | \title{One Rule function}
8 | \usage{
9 | OneR(x, ...)
10 |
11 | \method{OneR}{formula}(formula, data, ties.method = c("first", "chisq"),
12 |   verbose = FALSE, ...)
13 |
14 | \method{OneR}{data.frame}(x, ties.method = c("first", "chisq"),
15 |   verbose = FALSE, ...)
16 | }
17 | \arguments{
18 | \item{x}{data frame with the last column containing the target variable.}
19 |
20 | \item{...}{arguments passed to or from other methods.}
21 |
22 | \item{formula}{formula, additionally the argument \code{data} is needed.}
23 |
24 | \item{data}{data frame which contains the data, only needed when using the formula interface.}
25 |
26 | \item{ties.method}{character string specifying how ties are treated, see 'Details'; can be abbreviated.}
27 |
28 | \item{verbose}{if \code{TRUE} prints rank, names and predictive accuracy of the attributes in decreasing order (with \code{ties.method = "first"}).}
29 | }
30 | \value{
31 | Returns an object of class "OneR". Internally this is a list consisting of the function call with the specified arguments, the names of the target and feature variables,
32 | a list of the rules, the number of correctly classified and total instances and the contingency table of the best predictor vs. the target variable.
33 | }
34 | \description{
35 | Builds a model according to the One Rule (OneR) machine learning classification algorithm.
36 | }
37 | \details{
38 | All numerical data is automatically converted into five categorical bins of equal length. Instances with missing values are removed.
39 | This is done by internally calling the default version of \code{\link{bin}} before starting the OneR algorithm.
40 | To fine-tune this behaviour data preprocessing with the \code{\link{bin}} or \code{\link{optbin}} functions should be performed.
41 | If data contains unused factor levels (e.g. due to subsetting) these are ignored and a warning is given.
42 |
43 | When there is more than one attribute with best performance either the first (from left to right) is chosen (method \code{"first"}) or
44 | the one with the lowest p-value of a chi-squared test (method \code{"chisq"}).
45 | }
46 | \section{Methods (by class)}{
47 | \itemize{
48 | \item \code{formula}: method for formulas.
49 |
50 | \item \code{data.frame}: method for data frames.
51 | }}
52 |
53 | \examples{
54 | data <- optbin(iris)
55 | model <- OneR(data, verbose = TRUE)
56 | summary(model)
57 | plot(model)
58 | prediction <- predict(model, data)
59 | eval_model(prediction, data)
60 |
61 | ## The same with the formula interface:
62 | data <- optbin(iris)
63 | model <- OneR(Species ~., data = data, verbose = TRUE)
64 | summary(model)
65 | plot(model)
66 | prediction <- predict(model, data)
67 | eval_model(prediction, data)
68 | }
69 | \references{
70 | \url{https://github.com/vonjd/OneR}
71 | }
72 | \seealso{
73 | \code{\link{bin}}, \code{\link{optbin}}, \code{\link{eval_model}}, \code{\link{maxlevels}}
74 | }
75 | \author{
76 | Holger von Jouanne-Diedrich
77 | }
78 | \keyword{1R}
79 | \keyword{One}
80 | \keyword{OneR}
81 | \keyword{Rule}
82 |
--------------------------------------------------------------------------------
/vignettes/OneR.R:
--------------------------------------------------------------------------------
1 | ## ------------------------------------------------------------------------
2 | library(OneR)
3 |
4 | ## ------------------------------------------------------------------------
5 | data <- optbin(iris)
6 |
7 | ## ------------------------------------------------------------------------
8 | model <- OneR(data, verbose = TRUE)
9 |
10 | ## ------------------------------------------------------------------------
11 | summary(model)
12 |
13 | ## ---- fig.width=7.15, fig.height=5---------------------------------------
14 | plot(model)
15 |
16 | ## ------------------------------------------------------------------------
17 | prediction <- predict(model, data)
18 |
19 | ## ------------------------------------------------------------------------
20 | eval_model(prediction, data)
21 |
22 | ## ------------------------------------------------------------------------
23 | data(breastcancer)
24 | data <- breastcancer
25 |
26 | ## ------------------------------------------------------------------------
27 | set.seed(12) # for reproducibility
28 | random <- sample(1:nrow(data), 0.8 * nrow(data))
29 | data_train <- optbin(data[random, ], method = "infogain")
30 | data_test <- data[-random, ]
31 |
32 | ## ------------------------------------------------------------------------
33 | model_train <- OneR(data_train, verbose = TRUE)
34 |
35 | ## ------------------------------------------------------------------------
36 | summary(model_train)
37 |
38 | ## ---- fig.width=7.15, fig.height=5---------------------------------------
39 | plot(model_train)
40 |
41 | ## ------------------------------------------------------------------------
42 | prediction <- predict(model_train, data_test)
43 |
44 | ## ------------------------------------------------------------------------
45 | eval_model(prediction, data_test)
46 |
47 | ##
------------------------------------------------------------------------ 48 | data <- iris 49 | str(data) 50 | str(bin(data)) 51 | str(bin(data, nbins = 3)) 52 | str(bin(data, nbins = 3, labels = c("small", "medium", "large"))) 53 | 54 | ## ------------------------------------------------------------------------ 55 | set.seed(1); table(bin(rnorm(900), nbins = 3)) 56 | set.seed(1); table(bin(rnorm(900), nbins = 3, method = "content")) 57 | 58 | ## ---- fig.width=7.15, fig.height=5--------------------------------------- 59 | intervals <- paste(levels(bin(faithful$waiting, nbins = 2, method = "cluster")), collapse = " ") 60 | hist(faithful$waiting, main = paste("Intervals:", intervals)) 61 | abline(v = c(42.9, 67.5, 96.1), col = "blue") 62 | 63 | ## ------------------------------------------------------------------------ 64 | bin(c(1:10, NA), nbins = 2, na.omit = FALSE) # adds new level "NA" 65 | bin(c(1:10, NA), nbins = 2) 66 | 67 | ## ------------------------------------------------------------------------ 68 | df <- data.frame(numeric = c(1:26), alphabet = letters) 69 | str(df) 70 | str(maxlevels(df)) 71 | 72 | ## ------------------------------------------------------------------------ 73 | model <- OneR(iris) 74 | predict(model, data.frame(Petal.Width = seq(0, 3, 0.5))) 75 | 76 | ## ------------------------------------------------------------------------ 77 | predict(model, data.frame(Petal.Width = seq(0, 3, 0.5)), type = "prob") 78 | 79 | ## ---- eval=FALSE--------------------------------------------------------- 80 | # help(package = OneR) 81 | 82 | -------------------------------------------------------------------------------- /man/optbin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/OneR.R 3 | \name{optbin} 4 | \alias{optbin} 5 | \alias{optbin.formula} 6 | \alias{optbin.data.frame} 7 | \title{Optimal Binning function} 8 | \usage{ 9 | optbin(x, ...) 10 | 11 | \method{optbin}{formula}(formula, data, method = c("logreg", "infogain", 12 | "naive"), na.omit = TRUE, ...) 13 | 14 | \method{optbin}{data.frame}(x, method = c("logreg", "infogain", "naive"), 15 | na.omit = TRUE, ...) 16 | } 17 | \arguments{ 18 | \item{x}{data frame with the last column containing the target variable.} 19 | 20 | \item{...}{arguments passed to or from other methods.} 21 | 22 | \item{formula}{formula, additionally the argument \code{data} is needed.} 23 | 24 | \item{data}{data frame which contains the data, only needed when using the formula interface.} 25 | 26 | \item{method}{character string specifying the method for optimal binning, see 'Details'; can be abbreviated.} 27 | 28 | \item{na.omit}{logical value whether instances with missing values should be removed.} 29 | } 30 | \value{ 31 | A data frame with the target variable being in the last column. 32 | } 33 | \description{ 34 | Discretizes all numerical data in a data frame into categorical bins where the cut points are optimally aligned with the target categories, thereby a factor is returned. 35 | When building a OneR model this could result in fewer rules with enhanced accuracy. 36 | } 37 | \details{ 38 | The cutpoints are calculated by pairwise logistic regressions (method \code{"logreg"}), information gain (method \code{"infogain"}) or as the means of the expected values of the respective classes (\code{"naive"}). 
39 | The function is likely to give unsatisfactory results when the distributions of the respective classes are not (linearly) separable. Method \code{"naive"} should only be used when distributions are (approximately) normal,
40 | although in this case \code{"logreg"} should give comparable results, so it is the preferable (and therefore default) method.
41 |
42 | Method \code{"infogain"} is an entropy based method which calculates cut points based on information gain. The idea is that uncertainty is minimized by making the resulting bins as pure as possible. This method is the standard method of many decision tree algorithms.
43 |
44 | Character strings and logical vectors are coerced into factors. Matrices are coerced into data frames. If the target is numeric it is turned into a factor with the number of levels equal to the number of values. Additionally a warning is given.
45 |
46 | When \code{"na.omit = FALSE"} an additional level \code{"NA"} is added to each factor with missing values.
47 | If the target contains unused factor levels (e.g. due to subsetting) these are ignored and a warning is given.
48 | }
49 | \section{Methods (by class)}{
50 | \itemize{
51 | \item \code{formula}: method for formulas.
52 |
53 | \item \code{data.frame}: method for data frames.
54 | }}
55 |
56 | \examples{
57 | data <- iris # without optimal binning
58 | model <- OneR(data, verbose = TRUE)
59 | summary(model)
60 |
61 | data_opt <- optbin(iris) # with optimal binning
62 | model_opt <- OneR(data_opt, verbose = TRUE)
63 | summary(model_opt)
64 |
65 | ## The same with the formula interface:
66 | data_opt <- optbin(Species ~., data = iris)
67 | model_opt <- OneR(data_opt, verbose = TRUE)
68 | summary(model_opt)
69 |
70 | }
71 | \references{
72 | \url{https://github.com/vonjd/OneR}
73 | }
74 | \seealso{
75 | \code{\link{OneR}}, \code{\link{bin}}
76 | }
77 | \author{
78 | Holger von Jouanne-Diedrich
79 | }
80 | \keyword{binning}
81 | \keyword{discretization}
82 | \keyword{discretize}
83 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | OneR 2.2 (2017-05-05)
2 | =====================
3 |
4 | MAJOR IMPROVEMENTS
5 | - OneR: massive speedup of main OneR function (> 30 times faster).
6 | - OneR & optbin: standard S3 method interface added for formulas and data frames.
7 |
8 | MINOR IMPROVEMENTS
9 | - optbin: speedup of method "infogain".
10 | - Some minor corrections in documentation.
11 |
12 |
13 | OneR 2.1 (2016-10-24)
14 | =====================
15 |
16 | NEW FEATURES
17 | - eval_model: all instances of "prediction" and "actual" are now being printed in the confusion matrices. Two new arguments were added: "dimnames" for the printed dimnames of the confusion matrices and "zero.print" specifying how zeros should be printed; for sparse confusion matrices, using "." can produce more readable results. A new performance measure "error rate reduction versus the base rate accuracy" was added together with a p-value.
18 |
19 | MINOR IMPROVEMENTS
20 | - Some minor corrections in documentation.
21 | - Some streamlining and consolidation of code for better maintenance.
22 |
23 |
24 | OneR 2.0 (2016-08-12)
25 | =====================
26 |
27 | NEW FEATURES
28 | - Added a vignette.
29 | - breastcancer: Breast Cancer Wisconsin Original Data Set now included in the package.
30 | - predict: new type "prob" which gives a matrix whose columns are the probability of the first, second, etc. class.
31 | - optbin: new method "infogain" (information gain) which is an entropy based method to determine the cutpoints which make the resulting bins as pure as possible. 32 | - OneR, optbin, maxlevels: consistent handling of unused factor levels (e.g. due to subsetting) was added. These are dropped for analysis and a warning is given. 33 | 34 | MINOR IMPROVEMENTS 35 | - bin & optbin: in case of removing instances due to missing values the resulting warning gives the number of removed instances. 36 | - maxlevels: with data containing missing values an unhelpful warning was given. 37 | - predict: numerical values that are smaller or bigger than model limits are now transformed into (-Inf, min] or (max, Inf] respectively. 38 | - predict: output of type "class" is a factor now. 39 | - Some streamlining and consolidation of code for better maintenance. 40 | 41 | BUGFIXES 42 | - bin & optbin: in some borderline cases when the function addNA was used in preprocessing print.OneR stopped with an error. 43 | 44 | 45 | OneR 1.3 (2016-07-22) 46 | ===================== 47 | 48 | NEW FEATURES 49 | - bin: new method "clusters", which determines the bins according to automatically determined clusters in the data. 50 | - OneR: a new element "call" with the specified arguments of the actual function call was added to the internal class structure of OneR objects. 51 | - print & summary: the function call with the specified arguments which was used to build the model is printed first. 52 | 53 | MINOR IMPROVEMENTS 54 | - bin & optbin: in cases where there were missing values and already a factor level "NA" the functions gave an unhelpful warning. 55 | - eval_model: added warning when actual contains missing values. 56 | - eval_model: added "Confusion matrix" to printout for clarity. 57 | - Extension of and minor corrections in documentation 58 | - Some minor streamlining of code. 59 | 60 | BUGFIXES 61 | - predict: the combination of intervals and "NA"s caused an error. 62 | - bin: the method "content" stopped with an error in case of missing values. 63 | - optbin: the method "logreg" stopped in some borderline cases with missing values with an error. 64 | - optbin: some borderline cases could result in a "breaks are not unique" error. 65 | - OneR: in some borderline cases with very large datasets the numbering of printed ranks (verbose = TRUE) could be wrong due to rounding errors. 66 | 67 | 68 | OneR 1.2 (2016-06-20) 69 | ===================== 70 | 71 | Initial release on CRAN 72 | -------------------------------------------------------------------------------- /R/OneR_internal.R: -------------------------------------------------------------------------------- 1 | # Internal OneR functions 2 | 3 | # modified cut function for ensuring consistency of cut points and chosen cut points 4 | # http://stackoverflow.com/questions/37899503/inconsistent-behaviour-of-cut-different-intervals-with-same-number-and-same-d 5 | CUT <- function(x, breaks, ...) { 6 | if (length(breaks) == 1L) { 7 | nb <- as.integer(breaks + 1) 8 | dx <- diff(rx <- range(x, na.rm = TRUE)) 9 | if (dx == 0) { 10 | dx <- abs(rx[1L]) 11 | breaks <- seq.int(rx[1L] - dx/1000, rx[2L] + dx/1000, length.out = nb) 12 | } else { 13 | breaks <- seq.int(rx[1L], rx[2L], length.out = nb) 14 | breaks[c(1L, nb)] <- c(rx[1L] - dx/1000, rx[2L] + dx/1000) 15 | } 16 | } 17 | breaks.f <- c(breaks[1], as.numeric(formatC(0 + breaks[2:(length(breaks)-1)], digits = 3, width = 1L)), breaks[length(breaks)]) 18 | cut(x, breaks = unique(breaks.f), ...) 
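  # Note: the interior breaks were rounded to 3 significant digits (via formatC above)
  # so that the interval labels printed by cut() stay consistent with the actual cut
  # points; unique() guards against duplicate breaks introduced by this rounding.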
19 | } 20 | 21 | nerrors <- function(x) { 22 | sum(rowSums(x) - apply(x, 1, max)) 23 | } 24 | 25 | mode <- function(x) { 26 | names(sort(-table(x[ , ncol(x)])))[1] 27 | } 28 | 29 | ADDNA <- function(x) { 30 | if (is.factor(x) & !("NA" %in% levels(x))) x <- factor(x, levels = c(levels(x), "NA")) 31 | x[is.na(x)] <- "NA" 32 | x 33 | } 34 | 35 | add_range <- function(x, midpoints) { 36 | c(min(x, na.rm = TRUE) - 1/1000 * diff(range(x, na.rm = TRUE)), midpoints, max(x, na.rm = TRUE) + 1/1000 * diff(range(x, na.rm = TRUE))) 37 | } 38 | 39 | get_breaks <- function(x) { 40 | x <- x[x != "NA"] 41 | lower = as.numeric(sub("\\((.+),.*", "\\1", x)) 42 | upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", x)) 43 | breaks <- unique(na.omit(c(lower, upper))) 44 | breaks 45 | } 46 | 47 | #' @importFrom stats coef 48 | #' @importFrom stats glm 49 | #' @importFrom stats binomial 50 | logreg_midpoint <- function(data) { 51 | df <- data.frame(x = unlist(data), target = factor(rep(names(data), sapply(data, length)))) 52 | coefs <- suppressWarnings(coef(glm(target ~ x, data = df, family = binomial))) 53 | midpoint <- - coefs[1] / coefs[2] 54 | # test limits 55 | range <- sort(sapply(data, mean, na.rm = TRUE)) 56 | if (length(range) == 1) range <- c(range, range) 57 | if (is.na(midpoint)) return(mean(range, na.rm = TRUE)) 58 | if (midpoint < range[1]) return(range[1]) 59 | if (midpoint > range[2]) return(range[2]) 60 | # --- 61 | midpoint 62 | } 63 | 64 | entropy <- function(x) { 65 | freqs <- table(x) / length(x) 66 | - sum(freqs * log2(freqs)) 67 | } 68 | 69 | #' @importFrom stats na.omit 70 | infogain_midpoint <- function(data) { 71 | df <- data.frame(numvar = unlist(data), target = factor(rep(names(data), sapply(data, length)))) 72 | data <- na.omit(df[order(df[ , 1]), ]) 73 | numvar <- data$numvar; target <- data$target 74 | # determine midpoint candidates 75 | left_thresholds <- which(as.logical(diff(as.numeric(target)))) 76 | midpoints <- (numvar[left_thresholds] + numvar[(left_thresholds + 1)]) / 2 77 | # calculate average entropies for all midpoint candidates 78 | belows <- lapply(midpoints, function(x) as.character(data[numvar <= x, 2])) 79 | aboves <- lapply(midpoints, function(x) as.character(data[numvar > x, 2])) 80 | below_entropies <- sapply(belows, function(x) length(x)/length(target) * entropy(x)) 81 | above_entropies <- sapply(aboves, function(x) length(x)/length(target) * entropy(x)) 82 | # calculate entropies after split and choose lowest 83 | after_entropies <- below_entropies + above_entropies 84 | midpoints[which.min(after_entropies)] 85 | } 86 | 87 | #' @importFrom stats na.omit 88 | #' @importFrom stats filter 89 | optcut <- function(x, target, method) { 90 | orig <- x 91 | tmp <- na.omit(cbind(x, target)) 92 | x <- tmp[ , 1]; target <- tmp[ , 2] 93 | xs <- split(x, target) 94 | if (method == "naive") { 95 | midpoints <- sort(sapply(xs, mean, na.rm = TRUE)) 96 | # Cutpoints are the means of the expected values of the respective target levels. 
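    # filter(midpoints, c(1/2, 1/2)) yields the running mean of each pair of adjacent
    # (sorted) class means, i.e. the halfway points between consecutive classes;
    # na.omit() drops the boundary NA that filter() produces at the end of the series.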
97 |     breaks <- add_range(x, na.omit(filter(midpoints, c(1/2, 1/2))))
98 |   } else {
99 |     midpoints <- sapply(xs, mean, na.rm = TRUE)
100 |     nl <- xs[order(midpoints)]
101 |     pairs <- matrix(c(1:(length(nl) - 1), 2:length(nl)), ncol = 2, byrow = TRUE)
102 |     if (method == "logreg") {
103 |       midpoints <- apply(pairs, 1, function(x) logreg_midpoint(c(nl[x[1]], nl[x[2]])))
104 |     }
105 |     if (method == "infogain") {
106 |       midpoints <- apply(pairs, 1, function(x) infogain_midpoint(c(nl[x[1]], nl[x[2]])))
107 |     }
108 |     breaks <- add_range(x, na.omit(midpoints))
109 |   }
110 |   CUT(orig, breaks = unique(breaks))
111 | }
112 |
--------------------------------------------------------------------------------
/R/OneR_main.R:
--------------------------------------------------------------------------------
1 | # OneR main function
2 |
3 | #' One Rule function
4 | #'
5 | #' Builds a model according to the One Rule (OneR) machine learning classification algorithm.
6 | #' @param x data frame with the last column containing the target variable.
7 | #' @param formula formula, additionally the argument \code{data} is needed.
8 | #' @param data data frame which contains the data, only needed when using the formula interface.
9 | #' @param ties.method character string specifying how ties are treated, see 'Details'; can be abbreviated.
10 | #' @param verbose if \code{TRUE} prints rank, names and predictive accuracy of the attributes in decreasing order (with \code{ties.method = "first"}).
11 | #' @param ... arguments passed to or from other methods.
12 | #' @return Returns an object of class "OneR". Internally this is a list consisting of the function call with the specified arguments, the names of the target and feature variables,
13 | #' a list of the rules, the number of correctly classified and total instances and the contingency table of the best predictor vs. the target variable.
14 | #' @keywords 1R OneR One Rule
15 | #' @details All numerical data is automatically converted into five categorical bins of equal length. Instances with missing values are removed.
16 | #' This is done by internally calling the default version of \code{\link{bin}} before starting the OneR algorithm.
17 | #' To fine-tune this behaviour data preprocessing with the \code{\link{bin}} or \code{\link{optbin}} functions should be performed.
18 | #' If data contains unused factor levels (e.g. due to subsetting) these are ignored and a warning is given.
19 | #'
20 | #' When there is more than one attribute with best performance either the first (from left to right) is chosen (method \code{"first"}) or
21 | #' the one with the lowest p-value of a chi-squared test (method \code{"chisq"}).
22 | #' @author Holger von Jouanne-Diedrich
23 | #' @references \url{https://github.com/vonjd/OneR}
24 | #' @seealso \code{\link{bin}}, \code{\link{optbin}}, \code{\link{eval_model}}, \code{\link{maxlevels}}
25 | #' @examples
26 | #' data <- optbin(iris)
27 | #' model <- OneR(data, verbose = TRUE)
28 | #' summary(model)
29 | #' plot(model)
30 | #' prediction <- predict(model, data)
31 | #' eval_model(prediction, data)
32 | #'
33 | #' ## The same with the formula interface:
34 | #' data <- optbin(iris)
35 | #' model <- OneR(Species ~., data = data, verbose = TRUE)
36 | #' summary(model)
37 | #' plot(model)
38 | #' prediction <- predict(model, data)
39 | #' eval_model(prediction, data)
40 | #' @importFrom stats model.frame
41 | #' @importFrom stats chisq.test
42 | #' @export
43 | OneR <- function(x, ...)
UseMethod("OneR") 44 | 45 | #' @export 46 | OneR.default <- function(x, ...) { 47 | stop("data type not supported") 48 | } 49 | 50 | #' @export 51 | #' @describeIn OneR method for formulas. 52 | OneR.formula <- function(formula, data, ties.method = c("first", "chisq"), verbose = FALSE, ...) { 53 | call <- match.call() 54 | method <- match.arg(ties.method) 55 | mf <- model.frame(formula = formula, data = data, na.action = NULL) 56 | data <- mf[c(2:ncol(mf), 1)] 57 | OneR.data.frame(x = data, ties.method = ties.method, verbose = verbose, fcall = call) 58 | } 59 | 60 | #' @export 61 | #' @describeIn OneR method for data frames. 62 | OneR.data.frame <- function(x, ties.method = c("first", "chisq"), verbose = FALSE, ...) { 63 | if (!is.null(list(...)$fcall)) call <- list(...)$fcall 64 | else call <- match.call() 65 | method <- match.arg(ties.method) 66 | data <- x 67 | if (dim(data.frame(data))[2] < 2) stop("data must have at least two columns") 68 | data <- bin(data) 69 | if (nrow(data) == 0) stop("no data to analyse") 70 | # test if unused factor levels and drop them for analysis 71 | nlevels_orig <- sum(sapply(data, nlevels)) 72 | data <- droplevels(data) 73 | nlevels_new <- sum(sapply(data, nlevels)) 74 | if (nlevels_new < nlevels_orig) warning("data contains unused factor levels") 75 | # main routine for finding the best predictor(s) 76 | tables <- lapply(data[ , 1:(ncol(data)-1), drop = FALSE], table, data[ , ncol(data)]) 77 | errors <- sapply(tables, nerrors) 78 | perf <- nrow(data) - errors 79 | target <- names(data[ , ncol(data), drop = FALSE]) 80 | best <- which(perf == max(perf)) 81 | # method "chisq 82 | if (length(best) > 1) { 83 | if (method == "chisq") { 84 | features <- names(data[ , best, drop = FALSE]) 85 | p.values <- sapply(features, function(x) suppressWarnings(chisq.test(table(c(data[target], data[x])))$p.value)) 86 | p.values[is.na(p.values)] <- Inf 87 | if (all(p.values == Inf)) warning("chi-squared tests failed, first best attribute is chosen instead") 88 | best <- best[which.min(p.values)] 89 | } else best <- best[1] 90 | } 91 | # preparation and output of results 92 | groups <- split(data[ , ncol(data), drop = FALSE], data[ , best]) 93 | majority <- lapply(groups, mode) 94 | feature <- names(data[ , best, drop = FALSE]) 95 | cont_table <- table(c(data[target], data[feature])) 96 | output <- c(call = call, 97 | target = target, 98 | feature = feature, 99 | rules = list(majority), 100 | correct_instances = max(perf), 101 | total_instances = nrow(data), 102 | cont_table = list(cont_table)) 103 | class(output) <- "OneR" 104 | # print additional diagnostic information if wanted 105 | if (verbose == TRUE) { 106 | newbest <- which(which(perf == max(perf)) == best) 107 | accs <- round(100 * sort(perf, decreasing = TRUE) / nrow(data), 2) 108 | attr <- colnames(data[order(perf, decreasing = TRUE)]) 109 | M <- matrix(c(as.character(attr), paste0(accs, "%")), ncol = 2) 110 | rownames(M) <- rank((100 - sort(perf, decreasing = TRUE)), ties.method = "min") 111 | rownames(M)[newbest] <- paste0(rownames(M)[newbest], " *") 112 | colnames(M) <- c("Attribute", "Accuracy") 113 | cat("\n") 114 | print(M, quote = FALSE) 115 | cat("---\nChosen attribute due to accuracy\nand ties method (if applicable): '*'\n\n") 116 | } 117 | output 118 | } 119 | -------------------------------------------------------------------------------- /vignettes/OneR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "OneR - Establishing a New Baseline for Machine 
Learning Classification Models"
3 | author: "An R package by Holger K. von Jouanne-Diedrich"
4 | date: "`r Sys.Date()`"
5 | output: rmarkdown::html_vignette
6 | vignette: >
7 |   %\VignetteIndexEntry{OneR - Establishing a New Baseline for Machine Learning Classification Models}
8 |   %\VignetteEngine{knitr::rmarkdown}
9 |   %\VignetteEncoding{UTF-8}
10 | ---
11 |
12 | **Note:** You can find a step-by-step introduction on YouTube: [Quick Start Guide for the OneR package](https://www.youtube.com/watch?v=AGC0oRlXxgU)
13 |
14 | ## Introduction
15 |
16 | The following story is one of the most often told in the Data Science community: some time ago the military built a system whose aim was to distinguish military vehicles from civilian ones. They chose a neural network approach and trained the system with pictures of tanks, humvees and missile launchers on the one hand and normal cars, pickups and trucks on the other. After having reached a satisfactory accuracy they brought the system into the field (quite literally). It failed completely, performing no better than a coin toss. What had happened? No one knew, so they re-engineered the black box (no small feat in itself) and found that most of the military pics were taken at dusk or dawn and most civilian pics under brighter weather conditions. The neural net had learned the difference between light and dark!
17 |
18 | Although this might be an urban legend the fact that it is so often told tells us something:
19 |
20 | 1. Many of our Machine Learning models are so complex that we cannot understand them ourselves.
21 | 2. Because of 1. we cannot differentiate between the simpler aspects of a problem which can be tackled by simple models and the more sophisticated ones which need specialized treatment.
22 |
23 | The above is not only true for neural networks (and especially deep neural networks) but for most of the methods used today, especially Support Vector Machines and Random Forests and in general all kinds of ensemble based methods.
24 |
25 | In a word: we need a good baseline which builds “the best simple model”, one that strikes a balance between the best possible accuracy and a model that is still simple enough to understand: I have developed the OneR package for finding this sweet spot and thereby establishing a new baseline for classification models in Machine Learning (ML).
26 |
27 | This package fills a longstanding gap because so far only a JAVA based implementation was available ([RWeka package](https://cran.r-project.org/package=RWeka) as an interface for the [OneR JAVA class](http://weka.sourceforge.net/doc.dev/weka/classifiers/rules/OneR.html)). Additionally several enhancements have been made (see below).
28 |
29 | ## Design principles for the OneR package
30 |
31 | The following design principles were followed for programming the package:
32 |
33 | - Easy: the learning curve for new users should be minimal. Results should be obtained with ease and only minimal preprocessing and modeling steps should be necessary.
34 | - Versatile: all types of data, i.e. categorical and numeric, should be computable - as input variables as well as targets.
35 | - Fast: the running times of model trainings should be short.
36 | - Accurate: the accuracy of trained models should be good overall.
37 | - Robust: models should not be prone to overfitting; the reached accuracy on training data should be comparable to the accuracy of predictions from new, unseen cases.
38 | - Comprehensible: it should be easy to understand which rules the model has learned. Not only should the rules be easily comprehensible but they should serve as heuristics that are usable even without a computer.
39 | - Reproducible: because the used algorithms are strictly deterministic one will always get the same models on the same data. Many ML algorithms have stochastic components so that the data scientist will get a different model every time.
40 | - Intuitive: model diagnostics should be presented in the form of simple tables and plots.
41 | - Native R: the whole package is written in native R code. Thereby the source code can be easily checked and the whole package is very lean. Additionally the package has no dependencies at all other than base R itself.
42 |
43 | The package is based on the – as the name might reveal – one rule classification algorithm [Holte93]. Although the underlying method is simple enough (basically 1-level decision trees, you can find out more here: [OneR](http://www.saedsayad.com/oner.htm)) several enhancements have been made:
44 |
45 | - Discretization of numeric data: the OneR algorithm can only handle categorical data, so numeric data has to be discretized. The original OneR algorithm separates the respective values into ever smaller buckets until the best possible accuracy is reached. It can be argued that this is the definition of overfitting and contradicts the original spirit of OneR because tons of rules (one for every bucket) will result. One can of course introduce a new parameter “maximum bucket size” but finding the right value for this one doesn't come naturally either. Therefore I take a radically different approach: there are several methods for handling numeric data in the package (in the bin and the optbin functions); the most promising one is the (default) “logreg” method in the optbin function which gives only as many bins as there are target categories and which optimizes the cut points according to pairwise logistic regressions.
46 | - Missing values: in the original algorithm missing values were always handled as a separate level in the respective attribute. While missing values can sometimes reveal interesting patterns in other cases they are, well, just values that are missing. In the OneR package missing values can be handled as separate levels (level “NA”) or they can be omitted (the default).
47 | - Tie breaking: sometimes the OneR algorithm will find several attributes that provide rules which all give the same best accuracy. The original algorithm just took the first attribute. While this is implemented in the OneR function as the default too, a different method for tie breaking can be chosen: the contingency tables of all “best” rules are tested against each other with a Pearson's chi-squared test and the one with the smallest p-value is chosen (see the sketch below). The rationale is that this finds the attribute with the best signal-to-noise ratio.
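
The following minimal sketch illustrates the chi-squared tie-breaking idea outside of the package internals; the two "tied" attributes here are picked arbitrarily for demonstration:

```{r, eval=FALSE}
library(OneR)
data <- bin(iris)                         # OneR's default preprocessing
tied <- c("Sepal.Length", "Petal.Width")  # hypothetical attributes with equal accuracy
p.values <- sapply(tied, function(attribute)
  suppressWarnings(chisq.test(table(data[[attribute]], data$Species))$p.value))
tied[which.min(p.values)]                 # the attribute with the clearest signal wins
```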
48 |
49 | ## Getting started with a simple example
50 |
51 | You can also watch this video which goes through the following example step-by-step:
52 |
53 | [Quick Start Guide for the OneR package (Video)](https://www.youtube.com/watch?v=AGC0oRlXxgU)
54 |
55 | After installing from CRAN load package
56 |
57 | ```{r}
58 | library(OneR)
59 | ```
60 |
61 | Use the famous Iris dataset and determine optimal bins for numeric data
62 |
63 | ```{r}
64 | data <- optbin(iris)
65 | ```
66 |
67 | Build model with best predictor
68 |
69 | ```{r}
70 | model <- OneR(data, verbose = TRUE)
71 | ```
72 |
73 | Show learned rules and model diagnostics
74 |
75 | ```{r}
76 | summary(model)
77 | ```
78 |
79 | Plot model diagnostics
80 |
81 | ```{r, fig.width=7.15, fig.height=5}
82 | plot(model)
83 | ```
84 |
85 | Use model to predict data
86 |
87 | ```{r}
88 | prediction <- predict(model, data)
89 | ```
90 |
91 | Evaluate prediction statistics
92 |
93 | ```{r}
94 | eval_model(prediction, data)
95 | ```
96 |
97 | Please note that the very good accuracy of 96% is reached effortlessly.
98 |
99 | "Petal.Width" is identified as the attribute with the highest predictive value. The cut points of the intervals are found automatically (via the included optbin function). The results are three very simple, yet accurate, rules to predict the respective species.
100 |
101 | The nearly perfect separation of the areas in the diagnostic plot gives a good indication of the model’s ability to separate the different species.
102 |
103 | ## A more sophisticated real-world example
104 |
105 | The next example tries to find a model for the identification of breast cancer. The data were obtained from the UCI machine learning repository (see also the package documentation). According to this source the best out-of-sample performance was 95.9%, so let's see what we can achieve with the OneR package...
106 |
107 |
108 | ```{r}
109 | data(breastcancer)
110 | data <- breastcancer
111 | ```
112 |
113 | Divide training (80%) and test set (20%)
114 |
115 | ```{r}
116 | set.seed(12) # for reproducibility
117 | random <- sample(1:nrow(data), 0.8 * nrow(data))
118 | data_train <- optbin(data[random, ], method = "infogain")
119 | data_test <- data[-random, ]
120 | ```
121 |
122 | Train OneR model on training set
123 |
124 | ```{r}
125 | model_train <- OneR(data_train, verbose = TRUE)
126 | ```
127 |
128 | Show model and diagnostics
129 |
130 | ```{r}
131 | summary(model_train)
132 | ```
133 |
134 | Plot model diagnostics
135 |
136 | ```{r, fig.width=7.15, fig.height=5}
137 | plot(model_train)
138 | ```
139 |
140 | Use trained model to predict test set
141 |
142 | ```{r}
143 | prediction <- predict(model_train, data_test)
144 | ```
145 |
146 | Evaluate model performance on test set
147 |
148 | ```{r}
149 | eval_model(prediction, data_test)
150 | ```
151 |
152 | The best reported out-of-sample accuracy on this dataset was 95.9% and it was reached with considerable effort. The accuracy reached for the test set here is 94.3%! This is achieved with just one simple rule: when "Uniformity of Cell Size" is greater than 2 the examined tissue is malignant. The cut points of the intervals are again found automatically (via the included optbin function). The very good separation of the areas in the diagnostic plot gives a good indication of the model’s ability to differentiate between benign and malignant tissue.
## Included functions

### OneR

OneR is the main function of the package. It builds a model according to the One Rule machine learning algorithm for categorical data. All numerical data is automatically converted into five categorical bins of equal length. When `verbose = TRUE` it prints the predictive accuracy of all attributes in decreasing order.

### bin

bin discretizes all numerical data in a data frame into categorical bins of equal length or equal content, or based on automatically determined clusters.

Examples
```{r}
data <- iris
str(data)
str(bin(data))
str(bin(data, nbins = 3))
str(bin(data, nbins = 3, labels = c("small", "medium", "large")))
```

Difference between methods "length" and "content"

```{r}
set.seed(1); table(bin(rnorm(900), nbins = 3))
set.seed(1); table(bin(rnorm(900), nbins = 3, method = "content"))
```

Method "clusters"
```{r, fig.width=7.15, fig.height=5}
intervals <- paste(levels(bin(faithful$waiting, nbins = 2, method = "cluster")), collapse = " ")
hist(faithful$waiting, main = paste("Intervals:", intervals))
abline(v = c(42.9, 67.5, 96.1), col = "blue")
```

Handling of missing values

```{r}
bin(c(1:10, NA), nbins = 2, na.omit = FALSE) # adds new level "NA"
bin(c(1:10, NA), nbins = 2)
```

### optbin

optbin discretizes all numerical data in a data frame into categorical bins where the cut points are optimally aligned with the target categories; the binned columns are returned as factors. When building a OneR model this could result in fewer rules with enhanced accuracy. The cut points are calculated by pairwise logistic regressions (method "logreg") or as the means of the expected values of the respective classes ("naive"). The function is likely to give unsatisfactory results when the distributions of the respective classes are not (linearly) separable. Method "naive" should only be used when distributions are (approximately) normal, although in this case "logreg" should give comparable results, which makes "logreg" the preferable (and therefore default) method.

Method "infogain" is an entropy based method which calculates cut points based on information gain. The idea is that uncertainty is minimized by making the resulting bins as pure as possible. This method is the standard method of many decision tree algorithms.

### maxlevels

maxlevels removes all columns of a data frame where a factor (or character string) has more than a maximum number of levels. Often categories that have very many levels are not useful in modelling OneR rules because they result in too many rules and tend to overfit. Examples are IDs or names.

```{r}
df <- data.frame(numeric = c(1:26), alphabet = letters)
str(df)
str(maxlevels(df))
```

### predict

predict is an S3 method for predicting cases or probabilities based on OneR model objects. The second argument "newdata" can have the same format as used for building the model but must at least contain the feature variable that is used in the OneR rules. The default output is a factor with the predicted classes; cases that were not present when building the model are predicted as "UNSEEN".

```{r}
model <- OneR(iris)
predict(model, data.frame(Petal.Width = seq(0, 3, 0.5)))
```

If `type = "prob"` a matrix is returned whose columns are the probabilities of the first, second, etc. class.

```{r}
predict(model, data.frame(Petal.Width = seq(0, 3, 0.5)), type = "prob")
```

### eval_model

eval_model is a simple function for evaluating a OneR classification model. It prints confusion matrices with prediction vs. actual in absolute and relative numbers. Additionally it gives the accuracy and error rate, as well as the error rate reduction versus the base rate accuracy, together with a p-value. The second argument "actual" is a data frame which contains the actual data in the last column. A single vector is allowed too.

For the details please consult the available help entries.

## Help overview

From within R:

```{r, eval=FALSE}
help(package = OneR)
```

...or as a pdf here: [OneR.pdf](https://cran.r-project.org/package=OneR/OneR.pdf)

Issues can be posted here: https://github.com/vonjd/OneR/issues

The latest version of the package (and the full source code) can be found here: https://github.com/vonjd/OneR

## Sources

[Holte93] R. Holte: Very Simple Classification Rules Perform Well on Most Commonly Used Datasets, 1993. Available online here: https://link.springer.com/article/10.1023/A:1022631118932

## Contact

I would love to hear about your experiences with the OneR package. Please drop me a note - you can reach me at my university account: [Holger K. von Jouanne-Diedrich](https://www.h-ab.de/nc/eng/about-aschaffenburg-university-of-applied-sciences/organisation/personal/?tx_fhapersonal_pi1%5BshowUid%5D=jouanne-diedrich)

## License

This package is under [MIT License](https://cran.r-project.org/package=OneR/LICENSE).
--------------------------------------------------------------------------------
/R/OneR.R:
--------------------------------------------------------------------------------
# OneR helper functions

#' Binning function
#'
#' Discretizes all numerical data in a data frame into categorical bins of equal length or content or based on automatically determined clusters.
#' @param data data frame or vector which contains the data.
#' @param nbins number of bins (= levels).
#' @param labels character vector of labels for the resulting category.
#' @param method character string specifying the binning method, see 'Details'; can be abbreviated.
#' @param na.omit logical value whether instances with missing values should be removed.
#' @return A data frame or vector.
#' @keywords binning discretization discretize clusters Jenks breaks
#' @details Character strings and logical vectors are coerced into factors. Matrices are coerced into data frames. When called with a single vector only the respective factor (and not a data frame) is returned.
#' Method \code{"length"} gives intervals of equal length, method \code{"content"} gives intervals of equal content (via quantiles).
#' Method \code{"clusters"} determines \code{"nbins"} clusters via 1D kmeans with deterministic seeding of the initial cluster centres (Jenks natural breaks optimization).
#'
#' When \code{"na.omit = FALSE"} an additional level \code{"NA"} is added to each factor with missing values.
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @seealso \code{\link{OneR}}, \code{\link{optbin}}
#' @examples
#' data <- iris
#' str(data)
#' str(bin(data))
#' str(bin(data, nbins = 3))
#' str(bin(data, nbins = 3, labels = c("small", "medium", "large")))
#'
#' ## Difference between methods "length" and "content"
#' set.seed(1); table(bin(rnorm(900), nbins = 3))
#' set.seed(1); table(bin(rnorm(900), nbins = 3, method = "content"))
#'
#' ## Method "clusters"
#' intervals <- paste(levels(bin(faithful$waiting, nbins = 2, method = "cluster")), collapse = " ")
#' hist(faithful$waiting, main = paste("Intervals:", intervals))
#' abline(v = c(42.9, 67.5, 96.1), col = "blue")
#'
#' ## Missing values
#' bin(c(1:10, NA), nbins = 2, na.omit = FALSE) # adds new level "NA"
#' bin(c(1:10, NA), nbins = 2) # omits missing values by default (with warning)
#' @importFrom stats quantile
#' @importFrom stats kmeans
#' @export
bin <- function(data, nbins = 5, labels = NULL, method = c("length", "content", "clusters"), na.omit = TRUE) {
  method <- match.arg(method)
  vec <- FALSE
  if (is.atomic(data) == TRUE & is.null(dim(data)) == TRUE) { vec <- TRUE; data <- data.frame(data) }
  # could be a matrix -> data frame (even with only one column)
  if (is.list(data) == FALSE) data <- data.frame(data)
  if (na.omit == TRUE) {
    len_rows_orig <- nrow(data)
    data <- na.omit(data)
    len_rows_new <- nrow(data)
    no_removed <- len_rows_orig - len_rows_new
    if (no_removed > 0) warning(paste(no_removed, "instance(s) removed due to missing values"))
  }
  if (!is.null(labels)) if (nbins != length(labels)) stop("number of 'nbins' and 'labels' differ")
  if (nbins <= 1) stop("nbins must be bigger than 1")
  data[] <- lapply(data, function(x) if (is.numeric(x)) {
    if (length(unique(x)) <= nbins) as.factor(x)
    else {
      if (method == "content") nbins <- add_range(x, na.omit(quantile(x, (1:(nbins-1)/nbins), na.rm = TRUE)))
      if (method == "clusters") {
        midpoints <- sort(kmeans(na.omit(x), centers = seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE), length = nbins))$centers)
        nbins <- add_range(x, na.omit(filter(midpoints, c(1/2, 1/2))))
      }
      CUT(x, breaks = unique(nbins), labels = labels)
    }
  } else as.factor(x))
  data[] <- lapply(data, function(x) if(any(is.na(as.character(x)))) ADDNA(x) else x)
  if (vec) { data <- unlist(data); names(data) <- NULL }
  data
}

#' Optimal Binning function
#'
#' Discretizes all numerical data in a data frame into categorical bins where the cut points are optimally aligned with the target categories; the binned columns are returned as factors.
#' When building a OneR model this could result in fewer rules with enhanced accuracy.
#' @param x data frame with the last column containing the target variable.
#' @param formula formula, additionally the argument \code{data} is needed.
#' @param data data frame which contains the data, only needed when using the formula interface.
#' @param method character string specifying the method for optimal binning, see 'Details'; can be abbreviated.
#' @param na.omit logical value whether instances with missing values should be removed.
#' @param ... arguments passed to or from other methods.
#' @return A data frame with the target variable being in the last column.
#' @keywords binning discretization discretize
#' @details The cut points are calculated by pairwise logistic regressions (method \code{"logreg"}), information gain (method \code{"infogain"}) or as the means of the expected values of the respective classes (\code{"naive"}).
#' The function is likely to give unsatisfactory results when the distributions of the respective classes are not (linearly) separable. Method \code{"naive"} should only be used when distributions are (approximately) normal,
#' although in this case \code{"logreg"} should give comparable results, which makes \code{"logreg"} the preferable (and therefore default) method.
#'
#' Method \code{"infogain"} is an entropy based method which calculates cut points based on information gain. The idea is that uncertainty is minimized by making the resulting bins as pure as possible. This method is the standard method of many decision tree algorithms.
#'
#' Character strings and logical vectors are coerced into factors. Matrices are coerced into data frames. If the target is numeric it is turned into a factor with the number of levels equal to the number of unique values. Additionally a warning is given.
#'
#' When \code{"na.omit = FALSE"} an additional level \code{"NA"} is added to each factor with missing values.
#' If the target contains unused factor levels (e.g. due to subsetting) these are ignored and a warning is given.
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @seealso \code{\link{OneR}}, \code{\link{bin}}
#' @examples
#' data <- iris # without optimal binning
#' model <- OneR(data, verbose = TRUE)
#' summary(model)
#'
#' data_opt <- optbin(iris) # with optimal binning
#' model_opt <- OneR(data_opt, verbose = TRUE)
#' summary(model_opt)
#'
#' ## The same with the formula interface:
#' data_opt <- optbin(Species ~., data = iris)
#' model_opt <- OneR(data_opt, verbose = TRUE)
#' summary(model_opt)
#'
#' @export
optbin <- function(x, ...) UseMethod("optbin")

#' @export
optbin.default <- function(x, ...) {
  stop("data type not supported")
}

#' @export
#' @describeIn optbin method for formulas.
optbin.formula <- function(formula, data, method = c("logreg", "infogain", "naive"), na.omit = TRUE, ...) {
  method <- match.arg(method)
  mf <- model.frame(formula = formula, data = data, na.action = NULL)
  data <- mf[c(2:ncol(mf), 1)]
  optbin.data.frame(x = data, method = method, na.omit = na.omit)
}

#' @export
#' @describeIn optbin method for data frames.
optbin.data.frame <- function(x, method = c("logreg", "infogain", "naive"), na.omit = TRUE, ...)
{
  method <- match.arg(method)
  data <- x
  if (dim(data)[2] < 2) stop("data must have at least two columns")
  if (is.numeric(data[ , ncol(data)]) == TRUE) warning("target is numeric")
  data[ncol(data)] <- as.factor(data[ , ncol(data)])
  if (na.omit == TRUE) {
    len_rows_orig <- nrow(data)
    data <- na.omit(data)
    len_rows_new <- nrow(data)
    no_removed <- len_rows_orig - len_rows_new
    if (no_removed > 0) warning(paste(no_removed, "instance(s) removed due to missing values"))
  } else {
    # only add NA to target
    if(any(is.na(as.character(data[ , ncol(data)])))) data[ncol(data)] <- ADDNA(data[ , ncol(data)])
  }
  target <- data[ , ncol(data)]
  # Test if unused factor levels and drop them for analysis
  nlevels_orig <- nlevels(target)
  target <- droplevels(target)
  nbins <- nlevels(target)
  if (nbins < nlevels_orig) warning("target contains unused factor levels")
  if (nbins <= 1) stop("number of target levels must be bigger than 1")
  data[] <- lapply(data, function(x) if (is.numeric(x)) {
    if (length(unique(x)) <= nbins) as.factor(x) else optcut(x, target, method)
  } else as.factor(x))
  data[] <- lapply(data, function(x) if(any(is.na(as.character(x)))) ADDNA(x) else x)
  data
}

#' Remove factors with too many levels
#'
#' Removes all columns of a data frame where a factor (or character string) has more than a maximum number of levels.
#' @param data data frame which contains the data.
#' @param maxlevels number of maximum factor levels.
#' @param na.omit logical value whether missing values should be treated as a level, defaults to omit missing values before counting.
#' @return A data frame.
#' @details Often categories that have very many levels are not useful in modelling OneR rules because they result in too many rules and tend to overfit.
#' Examples are IDs or names.
#'
#' Character strings are treated as factors although they keep their datatype. Numeric data is left untouched.
#' If data contains unused factor levels (e.g. due to subsetting) these are ignored and a warning is given.
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @seealso \code{\link{OneR}}
#' @examples
#' df <- data.frame(numeric = c(1:26), alphabet = letters)
#' str(df)
#' str(maxlevels(df))
#' @export
maxlevels <- function(data, maxlevels = 20, na.omit = TRUE) {
  if (is.list(data) == FALSE) stop("data must be a data frame")
  if (maxlevels <= 2) stop("maxlevels must be bigger than 2")
  tmp <- suppressWarnings(bin(data, nbins = 2, na.omit = na.omit))
  # Test if unused factor levels and drop them for analysis
  nlevels_orig <- sapply(tmp, nlevels)
  tmp <- droplevels(tmp)
  nlevels_new <- sapply(tmp, nlevels)
  if (sum(nlevels_new) < sum(nlevels_orig)) warning("data contains unused factor levels")
  cols <- nlevels_new <= maxlevels
  data[cols]
}

#' Predict method for OneR models
#'
#' Predict cases or probabilities based on OneR model object.
#' @param object object of class \code{"OneR"}.
#' @param newdata data frame in which to look for the feature variable with which to predict.
#' @param type character string denoting the type of predicted value returned. Default \code{"class"} gives a named vector with the predicted classes, \code{"prob"} gives a matrix whose columns are the probability of the first, second, etc. class.
#' @param ... further arguments passed to or from other methods.
#' @return The default is a factor with the predicted classes, if \code{"type = prob"} a matrix is returned whose columns are the probability of the first, second, etc. class.
#' @details \code{newdata} can have the same format as used for building the model but must at least have the feature variable that is used in the OneR rules.
#' If cases appear that were not present when building the model the predicted case is \code{UNSEEN} or \code{NA} when \code{"type = prob"}.
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @seealso \code{\link{OneR}}
#' @examples
#' model <- OneR(iris)
#' prediction <- predict(model, iris[1:4])
#' eval_model(prediction, iris[5])
#'
#' ## type prob
#' predict(model, data.frame(Petal.Width = seq(0, 3, 0.5)))
#' predict(model, data.frame(Petal.Width = seq(0, 3, 0.5)), type = "prob")
#' @export
predict.OneR <- function(object, newdata, type = c("class", "prob"), ...) {
  type <- match.arg(type)
  if (is.list(newdata) == FALSE) stop("newdata must be a data frame")
  if (all(names(newdata) != object$feature)) stop("cannot find feature column in newdata")
  model <- object
  data <- newdata
  index <- which(names(data) == model$feature)[1]
  if (is.numeric(data[ , index])) {
    levels <- names(model$rules)
    if (substring(levels[1], 1, 1) == "(" & grepl(",", levels[1]) == TRUE & substring(levels[1], nchar(levels[1]), nchar(levels[1])) == "]") {
      features <- as.character(cut(data[ , index], breaks = c(-Inf, get_breaks(levels), Inf)))
    } else features <- as.character(data[ , index])
  } else features <- as.character(data[ , index])
  features[is.na(features)] <- "NA"
  if (type == "prob") {
    probs <- prop.table(model$cont_table, margin = 2)
    probrules <- lapply(names(model$rules), function(x) probs[ , x])
    names(probrules) <- names(model$rules)
    M <- t(sapply(features, function(x) if (is.null(probrules[[x]]) == TRUE) rep(NA, dim(model$cont_table)[1]) else probrules[[x]]))
    colnames(M) <- rownames(model$cont_table)
    return(M)
  }
  factor(sapply(features, function(x) if (is.null(model$rules[[x]]) == TRUE) "UNSEEN" else model$rules[[x]]))
}

#' Summarize OneR models
#'
#' \code{summary} method for class \code{OneR}.
#' @param object object of class \code{"OneR"}.
#' @param ... further arguments passed to or from other methods.
#' @details Prints the rules of the OneR model, the accuracy, a contingency table of the feature attribute and the target and performs a chi-squared test on this table.
#'
#' In the contingency table the maximum values in each column are highlighted by adding a '*', thereby representing the rules of the OneR model.
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @seealso \code{\link{OneR}}
#' @keywords diagnostics
#' @examples
#' model <- OneR(iris)
#' summary(model)
#' @importFrom stats addmargins
#' @importFrom stats chisq.test
#' @export
summary.OneR <- function(object, ...)
{
  model <- object
  print(model)
  tbl <- model$cont_table
  pos <- cbind(apply(tbl, 2, which.max), 1:dim(tbl)[2])
  tbl <- addmargins(tbl)
  tbl[pos] <- paste("*", tbl[pos])
  cat("Contingency table:\n")
  print(tbl, quote = FALSE, right = TRUE)
  cat("---\nMaximum in each column: '*'\n")
  # chi-squared test
  digits <- getOption("digits")
  x <- suppressWarnings(chisq.test(model$cont_table))
  cat("\nPearson's Chi-squared test:\n")
  out <- character()
  if (!is.null(x$statistic))
    out <- c(out, paste(names(x$statistic), "=", format(signif(x$statistic, max(1L, digits - 2L)))))
  if (!is.null(x$parameter))
    out <- c(out, paste(names(x$parameter), "=", format(signif(x$parameter, max(1L, digits - 2L)))))
  if (!is.null(x$p.value)) {
    fp <- format.pval(x$p.value, digits = max(1L, digits - 3L))
    out <- c(out, paste("p-value", if (substr(fp, 1L, 1L) == "<") fp else paste("=", fp)))
  }
  cat(strwrap(paste(out, collapse = ", ")), sep = "\n")
  cat("\n")
}

#' Print OneR models
#'
#' \code{print} method for class \code{OneR}.
#' @param x object of class \code{"OneR"}.
#' @param ... further arguments passed to or from other methods.
#' @details Prints the rules and the accuracy of an OneR model.
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @seealso \code{\link{OneR}}
#' @examples
#' model <- OneR(iris)
#' print(model)
#' @export
print.OneR <- function(x, ...) {
  model <- x
  cat("\nCall:\n")
  print(model$call)
  cat("\nRules:\n")
  longest <- max(nchar(names(model$rules)))
  for (iter in 1:length(model$rules)) {
    len <- longest - nchar(names(model$rules[iter]))
    cat("If ", model$feature, " = ", names(model$rules[iter]), rep(" ", len), " then ", model$target, " = ", model$rules[[iter]], "\n", sep = "")
  }
  cat("\nAccuracy:\n")
  cat(model$correct_instances, " of ", model$total_instances, " instances classified correctly (", round(100 * model$correct_instances / model$total_instances, 2), "%)\n\n", sep = "")
}

#' Plot Diagnostics for an OneR object
#'
#' Plots a mosaic plot for the feature attribute and the target of the OneR model.
#' @param x object of class \code{"OneR"}.
#' @param ... further arguments passed to or from other methods.
#' @details If more than 20 levels are present for either the feature attribute or the target the function stops with an error.
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @seealso \code{\link{OneR}}
#' @keywords diagnostics
#' @examples
#' model <- OneR(iris)
#' plot(model)
#' @importFrom graphics mosaicplot
#' @export
plot.OneR <- function(x, ...) {
  model <- x
  if (any(dim(model$cont_table) > 20)) stop("cannot plot more than 20 levels")
  mosaicplot(t(model$cont_table), color = TRUE, main = "OneR model diagnostic plot")
}

#' Test OneR model objects
#'
#' Test if object is a OneR model.
#' @param x object to be tested.
#' @return a logical whether object is of class "OneR".
#' @keywords OneR model
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @examples
#' model <- OneR(iris)
#' is.OneR(model) # evaluates to TRUE
#' @export
is.OneR <- function(x) inherits(x, "OneR")

#' Classification Evaluation function
#'
#' Function for evaluating a OneR classification model. Prints confusion matrices with prediction vs. actual in absolute and relative numbers. Additionally it gives the accuracy, error rate as well as the error rate reduction versus the base rate accuracy together with a p-value.
#' @param prediction vector which contains the predicted values.
#' @param actual data frame which contains the actual data. When there is more than one column the last column is taken. A single vector is allowed too.
#' @param dimnames character vector of printed dimnames for the confusion matrices.
#' @param zero.print character specifying how zeros should be printed; for sparse confusion matrices, using "." can produce more readable results.
#' @details Error rate reduction versus the base rate accuracy is calculated by the following formula:\cr\cr
#' \eqn{(Accuracy(Prediction) - Accuracy(Baserate)) / (1 - Accuracy(Baserate))},\cr\cr
#' giving a number between 0 (no error reduction) and 1 (no error).\cr\cr
#' In some borderline cases, when the model is performing worse than the base rate, negative numbers can result. This shows that something is seriously wrong with the model generating this prediction.\cr\cr
#' The provided p-value gives the probability of obtaining a distribution of predictions like this (or an even more unambiguous one) under the assumption that the real accuracy is equal to or lower than the base rate accuracy.
#' More technically it is derived from a one-sided binomial test with the alternative hypothesis that the prediction's accuracy is bigger than the base rate accuracy.
#' Loosely speaking, a low p-value (< 0.05) signifies that the model really is able to give predictions that are better than the base rate.
#' @return Invisibly returns a list with the number of correctly classified and total instances and a confusion matrix with the absolute numbers.
#' @author Holger von Jouanne-Diedrich
#' @references \url{https://github.com/vonjd/OneR}
#' @keywords evaluation accuracy
#' @examples
#' data <- iris
#' model <- OneR(data)
#' summary(model)
#' prediction <- predict(model, data)
#' eval_model(prediction, data)
#' @importFrom stats addmargins
#' @importFrom stats binom.test
#' @export
eval_model <- function (prediction, actual, dimnames = c("Prediction", "Actual"), zero.print = "0") {
  if (any(is.na(prediction))) stop("prediction contains missing values")
  prediction <- factor(prediction)
  if (!is.list(actual)) actual <- data.frame(actual)
  actual <- actual[ , ncol(actual)]
  actual <- factor(actual)
  if (any(is.na(actual))) actual <- ADDNA(actual)
  # make sure that all levels are included in the same format and order in each set
  all_levels <- sort(unique(c(levels(prediction), levels(actual))))
  prediction <- factor(prediction, levels = all_levels, labels = all_levels)
  actual <- factor(actual, levels = all_levels, labels = all_levels)
  if (length(prediction) != length(actual)) stop("prediction and actual must have the same length")
  # create and print confusion matrices
  conf <- table(prediction, actual, dnn = dimnames)
  conf.m <- addmargins(conf)
  cat("\nConfusion matrix (absolute):\n")
  print(conf.m, zero.print = zero.print)
  conf.p <- prop.table(conf)
  conf.pm <- addmargins(conf.p)
  cat("\nConfusion matrix (relative):\n")
  print(round(conf.pm, 2), zero.print = zero.print)
  # calculate and print performance measures
  N <- sum(conf)
  correct_class <- sum(diag(conf))
  acc <- correct_class / N
  cat("\nAccuracy:\n", round(acc, 4), " (", correct_class, "/", N, ")\n", sep = "")
  error.rt <- 1 - acc
  cat("\nError rate:\n", round(error.rt, 4), " (", N - correct_class, "/", N, ")\n", sep = "")
  base.rt <- max(conf.pm[nrow(conf.pm), 1:(ncol(conf.pm) - 1)])
  errordown.p <- (acc - base.rt) / (1 - base.rt)
  # binomial test
  digits <- getOption("digits")
  out <- character()
  x <- binom.test(correct_class, N, p = base.rt, alternative = "greater")
  if (!is.null(x$p.value)) {
    fp <- format.pval(x$p.value, digits = max(1L, digits - 3L))
    out <- c(out, paste("p-value", if (substr(fp, 1L, 1L) == "<") fp else paste("=", fp)))
  }
  cat("\nError rate reduction (vs. base rate):\n", round(errordown.p, 4), " (", out, ")\n\n", sep = "")
  # return list invisibly
  invisible(list(correct_instances = correct_class, total_instances = N, conf_matrix = conf))
}
--------------------------------------------------------------------------------
/vignettes/OneR.html:
--------------------------------------------------------------------------------

# OneR - Establishing a New Baseline for Machine Learning Classification Models

*An R package by Holger K. von Jouanne-Diedrich*

2017-05-05

Note: You can find a step-by-step introduction on YouTube: [Quick Start Guide for the OneR package](https://www.youtube.com/watch?v=AGC0oRlXxgU)

## Introduction

The following story is one of the most often told in the Data Science community: some time ago the military built a system whose aim it was to distinguish military vehicles from civilian ones. They chose a neural network approach and trained the system with pictures of tanks, humvees and missile launchers on the one hand and normal cars, pickups and trucks on the other. After having reached a satisfactory accuracy they brought the system into the field (quite literally). It failed completely, performing no better than a coin toss. What had happened? No one knew, so they re-engineered the black box (no small feat in itself) and found that most of the military pics were taken at dusk or dawn and most civilian pics under brighter weather conditions. The neural net had learned the difference between light and dark!

Although this might be an urban legend, the fact that it is so often told tells us something:

1. Many of our Machine Learning models are so complex that we cannot understand them ourselves.
2. Because of 1. we cannot differentiate between the simpler aspects of a problem, which can be tackled by simple models, and the more sophisticated ones, which need specialized treatment.

The above is not only true for neural networks (and especially deep neural networks) but for most of the methods used today, especially Support Vector Machines, Random Forests and in general all kinds of ensemble based methods.

In a word: we need a good baseline that builds "the best simple model", i.e. one that strikes a balance between the best accuracy possible and a model that is still simple enough to understand. I have developed the OneR package for finding this sweet spot and thereby establishing a new baseline for classification models in Machine Learning (ML).

This package fills a longstanding gap because so far only a Java-based implementation was available (the RWeka package as an interface for the OneR Java class). Additionally several enhancements have been made (see below).

## Design principles for the OneR package

The following design principles were followed in programming the package:

- Easy: the learning curve for new users should be minimal. Results should be obtained with ease, and only minimal preprocessing and modeling steps should be necessary.
- Versatile: all types of data, i.e. categorical and numeric, should be computable, both as input variables and as the target.
- Fast: the running times of model trainings should be short.
- Accurate: the accuracy of trained models should be good overall.
- Robust: models should not be prone to overfitting; the accuracy reached on training data should be comparable to the accuracy of predictions on new, unseen cases.
- Comprehensible: it should be easy to understand which rules the model has learned. The rules should not only be easy to grasp but also serve as heuristics that are usable even without a computer.
- Reproducible: because the algorithms used are strictly deterministic, one will always get the same model on the same data. Many ML algorithms have stochastic components, so the data scientist gets a different model every time.
- Intuitive: model diagnostics should be presented in the form of simple tables and plots.
- Native R: the whole package is written in native R code. This makes the source code easy to check and keeps the package very lean. Additionally, the package has no dependencies other than base R itself.

As the name might reveal, the package is based on the One Rule classification algorithm [Holte93]. Although the underlying method is simple enough (basically 1-level decision trees; you can find out more here: [OneR](http://www.saedsayad.com/oner.htm)), several enhancements have been made:

- Discretization of numeric data: the OneR algorithm can only handle categorical data, so numeric data has to be discretized. The original OneR algorithm separates the respective values into ever smaller buckets until the best possible accuracy is reached. It can be argued that this is the very definition of overfitting and contradicts the original spirit of OneR, because it produces tons of rules (one for every bucket). One could of course introduce a new parameter "maximum bucket size", but finding the right value for it doesn't come naturally either. Therefore I take a radically different approach: the package offers several methods for handling numeric data (in the bin and the optbin functions); the most promising one is the (default) "logreg" method of the optbin function, which gives only as many bins as there are target categories and optimizes the cut points via pairwise logistic regressions (see the short check after this list).
- Missing values: in the original algorithm missing values were always handled as a separate level of the respective attribute. While missing values can sometimes reveal interesting patterns, in other cases they are, well, just values that are missing. In the OneR package missing values can be handled as separate levels (level "NA") or they can be omitted (the default).
- Tie breaking: sometimes the OneR algorithm finds several attributes whose rules all give the same best accuracy. The original algorithm just took the first attribute. While this is also the default in the OneR function, a different tie-breaking method can be chosen: the contingency tables of all "best" rules are tested against each other with a Pearson's chi-squared test and the one with the smallest p-value is chosen. The rationale behind this is that the attribute with the best signal-to-noise ratio is thereby found.
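
As a quick check of the "logreg" behaviour described above, the binned feature indeed ends up with exactly one level per target class; a small sketch:

```r
data_opt <- optbin(iris)         # default method "logreg"
nlevels(data_opt$Petal.Width)    # 3 bins ...
## [1] 3
nlevels(iris$Species)            # ... one per target class
## [1] 3
```
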
## Getting started with a simple example

You can also watch this video, which goes through the following example step by step:

[Quick Start Guide for the OneR package (Video)](https://www.youtube.com/watch?v=AGC0oRlXxgU)

After installing from CRAN, load the package

```r
library(OneR)
```

Use the famous Iris dataset and determine optimal bins for numeric data

```r
data <- optbin(iris)
```

Build model with best predictor

```r
model <- OneR(data, verbose = TRUE)
## 
##     Attribute    Accuracy
## 1 * Petal.Width  96%
## 2   Petal.Length 95.33%
## 3   Sepal.Length 74.67%
## 4   Sepal.Width  55.33%
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
```

Show learned rules and model diagnostics

```r
summary(model)
## 
## Call:
## OneR.data.frame(x = data, verbose = TRUE)
## 
## Rules:
## If Petal.Width = (0.0976,0.791] then Species = setosa
## If Petal.Width = (0.791,1.63]   then Species = versicolor
## If Petal.Width = (1.63,2.5]     then Species = virginica
## 
## Accuracy:
## 144 of 150 instances classified correctly (96%)
## 
## Contingency table:
##             Petal.Width
## Species      (0.0976,0.791] (0.791,1.63] (1.63,2.5] Sum
##   setosa               * 50            0          0  50
##   versicolor              0         * 48          2  50
##   virginica               0            4       * 46  50
##   Sum                    50           52         48 150
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 266.35, df = 4, p-value < 2.2e-16
```

Plot model diagnostics

```r
plot(model)
```

[mosaic plot: "OneR model diagnostic plot", Petal.Width vs. Species]

Use model to predict data

```r
prediction <- predict(model, data)
```

Evaluate prediction statistics

```r
eval_model(prediction, data)
## 
## Confusion matrix (absolute):
##             Actual
## Prediction   setosa versicolor virginica Sum
##   setosa         50          0         0  50
##   versicolor      0         48         4  52
##   virginica       0          2        46  48
##   Sum            50         50        50 150
## 
## Confusion matrix (relative):
##             Actual
## Prediction   setosa versicolor virginica  Sum
##   setosa       0.33       0.00      0.00 0.33
##   versicolor   0.00       0.32      0.03 0.35
##   virginica    0.00       0.01      0.31 0.32
##   Sum          0.33       0.33      0.33 1.00
## 
## Accuracy:
## 0.96 (144/150)
## 
## Error rate:
## 0.04 (6/150)
## 
## Error rate reduction (vs. base rate):
## 0.94 (p-value < 2.2e-16)
```

Please note that the very good accuracy of 96% is reached effortlessly.

"Petal.Width" is identified as the attribute with the highest predictive value. The cut points of the intervals are found automatically (via the included optbin function). The result is three very simple, yet accurate, rules to predict the respective species.

The nearly perfect separation of the areas in the diagnostic plot gives a good indication of the model's ability to separate the different species.
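
One more detail worth knowing: because the learned rules store the interval boundaries, predict can also be applied to the raw, un-binned data; the numeric feature is then cut at those boundaries internally (see predict.OneR in the package source). A small sketch:

```r
prediction_raw <- predict(model, iris)   # raw iris instead of the binned data
eval_model(prediction_raw, iris)         # evaluates exactly as above
```
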

## A more sophisticated real-world example

The next example tries to find a model for the identification of breast cancer. The data were obtained from the UCI machine learning repository (see also the package documentation). According to this source the best out-of-sample performance was 95.9%, so let's see what we can achieve with the OneR package...

```r
data(breastcancer)
data <- breastcancer
```

Divide into training (80%) and test set (20%)

```r
set.seed(12) # for reproducibility
random <- sample(1:nrow(data), 0.8 * nrow(data))
data_train <- optbin(data[random, ], method = "infogain")
## Warning in optbin.data.frame(data[random, ], method = "infogain"): 12
## instance(s) removed due to missing values
data_test <- data[-random, ]
```

Train OneR model on training set

```r
model_train <- OneR(data_train, verbose = TRUE)
## 
##     Attribute                   Accuracy
## 1 * Uniformity of Cell Size     92.32%
## 2   Uniformity of Cell Shape    91.59%
## 3   Bare Nuclei                 90.68%
## 4   Bland Chromatin             90.31%
## 5   Normal Nucleoli             90.13%
## 6   Single Epithelial Cell Size 89.4%
## 7   Marginal Adhesion           85.92%
## 8   Clump Thickness             84.28%
## 9   Mitoses                     78.24%
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
```

Show model and diagnostics

```r
summary(model_train)
## 
## Call:
## OneR.data.frame(x = data_train, verbose = TRUE)
## 
## Rules:
## If Uniformity of Cell Size = (0.991,2] then Class = benign
## If Uniformity of Cell Size = (2,10]    then Class = malignant
## 
## Accuracy:
## 505 of 547 instances classified correctly (92.32%)
## 
## Contingency table:
##            Uniformity of Cell Size
## Class       (0.991,2] (2,10] Sum
##   benign        * 318     30 348
##   malignant        12  * 187 199
##   Sum             330    217 547
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 381.78, df = 1, p-value < 2.2e-16
```

Plot model diagnostics

```r
plot(model_train)
```

[mosaic plot: "OneR model diagnostic plot", Uniformity of Cell Size vs. Class]

Use trained model to predict test set

```r
prediction <- predict(model_train, data_test)
```

Evaluate model performance on test set

```r
eval_model(prediction, data_test)
## 
## Confusion matrix (absolute):
##            Actual
## Prediction  benign malignant Sum
##   benign        92         0  92
##   malignant      8        40  48
##   Sum          100        40 140
## 
## Confusion matrix (relative):
##            Actual
## Prediction  benign malignant  Sum
##   benign      0.66      0.00 0.66
##   malignant   0.06      0.29 0.34
##   Sum         0.71      0.29 1.00
## 
## Accuracy:
## 0.9429 (132/140)
## 
## Error rate:
## 0.0571 (8/140)
## 
## Error rate reduction (vs. base rate):
## 0.8 (p-value = 7.993e-12)
```

The best reported out-of-sample accuracy on this dataset was 95.9%, and it was reached with considerable effort. The accuracy reached on the test set here is 94.3%! This is achieved with just one simple rule: when "Uniformity of Cell Size" is bigger than 2, the examined tissue is malignant. The cut points of the intervals are again found automatically (via the included optbin function). The very good separation of the areas in the diagnostic plot gives a good indication of the model's ability to differentiate between benign and malignant tissue. Additionally, when you look at the distribution of misclassifications, not a single malignant instance is missed, which is obviously very desirable in a clinical context.

## Included functions

### OneR

OneR is the main function of the package. It builds a model according to the One Rule machine learning algorithm for categorical data. All numerical data is automatically converted into five categorical bins of equal length. When `verbose = TRUE` it prints the predictive accuracy of all attributes in decreasing order.
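
Besides the data-frame interface, where the target must be in the last column, OneR can also be called with a formula, analogous to the formula examples for optbin in the package documentation; a brief sketch:

```r
model <- OneR(Species ~ ., data = optbin(iris))                # all predictors
model_pw <- OneR(Species ~ Petal.Width, data = optbin(iris))   # just one predictor
```
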

### bin

bin discretizes all numerical data in a data frame into categorical bins of equal length or equal content, or based on automatically determined clusters.

Examples

```r
data <- iris
str(data)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
str(bin(data))
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: Factor w/ 5 levels "(4.3,5.02]","(5.02,5.74]",..: 2 1 1 1 1 2 1 1 1 1 ...
##  $ Sepal.Width : Factor w/ 5 levels "(2,2.48]","(2.48,2.96]",..: 4 3 3 3 4 4 3 3 2 3 ...
##  $ Petal.Length: Factor w/ 5 levels "(0.994,2.18]",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Petal.Width : Factor w/ 5 levels "(0.0976,0.58]",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
str(bin(data, nbins = 3))
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: Factor w/ 3 levels "(4.3,5.5]","(5.5,6.7]",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Sepal.Width : Factor w/ 3 levels "(2,2.8]","(2.8,3.6]",..: 2 2 2 2 2 3 2 2 2 2 ...
##  $ Petal.Length: Factor w/ 3 levels "(0.994,2.97]",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Petal.Width : Factor w/ 3 levels "(0.0976,0.9]",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
str(bin(data, nbins = 3, labels = c("small", "medium", "large")))
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: Factor w/ 3 levels "small","medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Sepal.Width : Factor w/ 3 levels "small","medium",..: 2 2 2 2 2 3 2 2 2 2 ...
##  $ Petal.Length: Factor w/ 3 levels "small","medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Petal.Width : Factor w/ 3 levels "small","medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
```

Difference between methods "length" and "content"

```r
set.seed(1); table(bin(rnorm(900), nbins = 3))
## 
## (-3.01,-0.735]  (-0.735,1.54]    (1.54,3.82] 
##            212            623             65
set.seed(1); table(bin(rnorm(900), nbins = 3, method = "content"))
## 
## (-3.01,-0.423] (-0.423,0.444]   (0.444,3.82] 
##            300            300            300
```

Method "clusters"

```r
intervals <- paste(levels(bin(faithful$waiting, nbins = 2, method = "cluster")), collapse = " ")
hist(faithful$waiting, main = paste("Intervals:", intervals))
abline(v = c(42.9, 67.5, 96.1), col = "blue")
```

[histogram of faithful$waiting with the two cluster intervals marked in blue]

Handling of missing values

```r
bin(c(1:10, NA), nbins = 2, na.omit = FALSE) # adds new level "NA"
##  [1] (0.991,5.5] (0.991,5.5] (0.991,5.5] (0.991,5.5] (0.991,5.5]
##  [6] (5.5,10]    (5.5,10]    (5.5,10]    (5.5,10]    (5.5,10]   
## [11] NA         
## Levels: (0.991,5.5] (5.5,10] NA
bin(c(1:10, NA), nbins = 2)
## Warning in bin(c(1:10, NA), nbins = 2): 1 instance(s) removed due to
## missing values
##  [1] (0.991,5.5] (0.991,5.5] (0.991,5.5] (0.991,5.5] (0.991,5.5]
##  [6] (5.5,10]    (5.5,10]    (5.5,10]    (5.5,10]    (5.5,10]   
## Levels: (0.991,5.5] (5.5,10]
```

### optbin

optbin discretizes all numerical data in a data frame into categorical bins where the cut points are optimally aligned with the target categories; the binned columns are returned as factors. When building a OneR model this could result in fewer rules with enhanced accuracy. The cut points are calculated by pairwise logistic regressions (method "logreg") or as the means of the expected values of the respective classes ("naive"). The function is likely to give unsatisfactory results when the distributions of the respective classes are not (linearly) separable. Method "naive" should only be used when distributions are (approximately) normal, although in this case "logreg" should give comparable results, which makes "logreg" the preferable (and therefore default) method.

Method "infogain" is an entropy based method which calculates cut points based on information gain. The idea is that uncertainty is minimized by making the resulting bins as pure as possible. This method is the standard method of many decision tree algorithms.
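
All three methods are selected via the method argument; the method names below follow the package source and can be abbreviated:

```r
str(optbin(iris, method = "logreg"))     # the default
str(optbin(iris, method = "naive"))
str(optbin(iris, method = "infogain"))
```
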
### maxlevels

maxlevels removes all columns of a data frame where a factor (or character string) has more than a maximum number of levels. Often categories that have very many levels are not useful in modelling OneR rules because they result in too many rules and tend to overfit. Examples are IDs or names.

```r
df <- data.frame(numeric = c(1:26), alphabet = letters)
str(df)
## 'data.frame':    26 obs. of  2 variables:
##  $ numeric : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ alphabet: Factor w/ 26 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10 ...
str(maxlevels(df))
## 'data.frame':    26 obs. of  1 variable:
##  $ numeric: int  1 2 3 4 5 6 7 8 9 10 ...
```
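
The threshold defaults to 20 levels and can be raised or lowered via the maxlevels argument (as in the function signature in the package source):

```r
str(maxlevels(df, maxlevels = 26))   # with 26 allowed levels both columns are kept
```
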
### predict

predict is an S3 method for predicting cases or probabilities based on OneR model objects. The second argument "newdata" can have the same format as used for building the model but must at least contain the feature variable that is used in the OneR rules. The default output is a factor with the predicted classes; cases that were not present when building the model are predicted as "UNSEEN".

```r
model <- OneR(iris)
predict(model, data.frame(Petal.Width = seq(0, 3, 0.5)))
## (-Inf,0.0976] (0.0976,0.58]   (0.58,1.06]   (1.06,1.54]   (1.54,2.02] 
##        UNSEEN        setosa    versicolor    versicolor     virginica 
##    (2.02,2.5]    (2.5, Inf] 
##     virginica        UNSEEN 
## Levels: UNSEEN setosa versicolor virginica
```

If `type = "prob"` a matrix is returned whose columns are the probabilities of the first, second, etc. class.

```r
predict(model, data.frame(Petal.Width = seq(0, 3, 0.5)), type = "prob")
##               setosa versicolor  virginica
## (-Inf,0.0976]     NA         NA         NA
## (0.0976,0.58]  1.000  0.0000000 0.00000000
## (0.58,1.06]    0.125  0.8750000 0.00000000
## (1.06,1.54]    0.000  0.9268293 0.07317073
## (1.54,2.02]    0.000  0.1724138 0.82758621
## (2.02,2.5]     0.000  0.0000000 1.00000000
## (2.5, Inf]        NA         NA         NA
```

### eval_model

eval_model is a simple function for evaluating a OneR classification model. It prints confusion matrices with prediction vs. actual in absolute and relative numbers. Additionally it gives the accuracy and error rate, as well as the error rate reduction versus the base rate accuracy, together with a p-value. The second argument "actual" is a data frame which contains the actual data in the last column. A single vector is allowed too.
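
eval_model also invisibly returns its results, so the printed numbers can be reused programmatically. A small sketch, using the breast cancer example from above; the element names follow the eval_model source in R/OneR.R:

```r
prediction <- predict(model_train, data_test)    # breast cancer example from above
res <- eval_model(prediction, data_test)
res$correct_instances / res$total_instances      # accuracy: 132/140 = 0.9429
# error rate reduction vs. the base rate of 100/140 = 0.7143 (all benign):
# (0.9429 - 0.7143) / (1 - 0.7143) = 0.8
```
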

For the details please consult the available help entries.

## Help overview

From within R:

```r
help(package = OneR)
```

...or as a pdf here: [OneR.pdf](https://cran.r-project.org/package=OneR/OneR.pdf)

Issues can be posted here: https://github.com/vonjd/OneR/issues

The latest version of the package (and the full source code) can be found here: https://github.com/vonjd/OneR

## Sources

[Holte93] R. Holte: Very Simple Classification Rules Perform Well on Most Commonly Used Datasets, 1993. Available online here: https://link.springer.com/article/10.1023/A:1022631118932

## Contact

I would love to hear about your experiences with the OneR package. Please drop me a note - you can reach me at my university account: [Holger K. von Jouanne-Diedrich](https://www.h-ab.de/nc/eng/about-aschaffenburg-university-of-applied-sciences/organisation/personal/?tx_fhapersonal_pi1%5BshowUid%5D=jouanne-diedrich)

## License

This package is under [MIT License](https://cran.r-project.org/package=OneR/LICENSE).
--------------------------------------------------------------------------------