├── .Rbuildignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── cutoff.R ├── discretize.R ├── misc.R ├── normalize.R ├── search.best.first.R ├── search.exhaustive.R ├── search.greedy.R ├── search.hill.climbing.R ├── search.misc.R ├── selector.cfs.R ├── selector.chi.squared.R ├── selector.consistency.R ├── selector.correlation.R ├── selector.info.gain.R ├── selector.oneR.R ├── selector.random.forest.R └── selector.relief.R ├── README.md └── man ├── FSelector-package.Rd ├── as.simple.formula.Rd ├── best.first.search.Rd ├── cfs.Rd ├── chi.squared.Rd ├── consistency.Rd ├── correlation.Rd ├── cutoff.Rd ├── exhaustive.search.Rd ├── greedy.search.Rd ├── hill.climbing.search.Rd ├── information.gain.Rd ├── oneR.Rd ├── random.forest.importance.Rd └── relief.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | makeR 2 | Makefile 3 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: FSelector 3 | Title: Selecting Attributes 4 | Version: 0.34 5 | Date: 2023-08-22 6 | Author: Piotr Romanski, Lars Kotthoff, Patrick Schratz 7 | Maintainer: Lars Kotthoff 8 | BugReports: https://github.com/larskotthoff/fselector/issues 9 | URL: https://github.com/larskotthoff/fselector 10 | Description: Functions for selecting attributes from a given 11 | dataset. Attribute subset selection is the process of identifying and 12 | removing as much of the irrelevant and redundant information as 13 | possible. 14 | License: GPL-2 15 | Imports: 16 | digest, 17 | entropy, 18 | randomForest, 19 | RWeka 20 | Suggests: 21 | mlbench, 22 | rpart 23 | Encoding: UTF-8 24 | LazyLoad: yes 25 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | export("as.simple.formula", 2 | "cutoff.biggest.diff", 3 | "cutoff.k", 4 | "cutoff.k.percent", 5 | "best.first.search", 6 | "backward.search", 7 | "forward.search", 8 | "hill.climbing.search", 9 | "exhaustive.search", 10 | "cfs", 11 | "chi.squared", 12 | "information.gain", 13 | "gain.ratio", 14 | "symmetrical.uncertainty", 15 | "linear.correlation", 16 | "rank.correlation", 17 | "oneR", 18 | "relief", 19 | "consistency", 20 | "random.forest.importance") 21 | importFrom(randomForest, randomForest, importance) 22 | importFrom(RWeka, Discretize) 23 | importFrom(digest, digest) 24 | importFrom(entropy, entropy) 25 | importFrom("stats", "aggregate", "as.formula", "complete.cases", "cor", "model.frame", "na.pass", "sd") 26 | importFrom("utils", "combn") 27 | -------------------------------------------------------------------------------- /R/cutoff.R: -------------------------------------------------------------------------------- 1 | cutoff.k <- function(attrs, k) { 2 | if(dim(attrs)[1] == 0) 3 | return(character(0)) 4 | if(k < 1) 5 | stop("k too small") 6 | if(k > dim(attrs)[1]) 7 | k = dim(attrs)[1] 8 | sorted_names = rownames(attrs)[do.call(order, c(attrs, decreasing = TRUE))] 9 | return(sorted_names[1:k]) 10 | } 11 | 12 | cutoff.k.percent <- function(attrs, k) { 13 | if(dim(attrs)[1] == 0) 14 | return(character(0)) 15 | if(k <= 0) 16 | stop("k too small") 17 | if(k > 1) { 18 | warning("Assumed k=1") 19 | k = 1 20 | } 21 | sorted_names = rownames(attrs)[do.call(order, c(attrs, decreasing = TRUE))] 22 | return(sorted_names[1:round((k * length(sorted_names)))]) 23 | } 24 | 25 | 
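# --- Illustration (hedged sketch, not part of the original package source) ---
# cutoff.k() and cutoff.k.percent() expect a data.frame of attribute weights:
# importance values in the first column and attribute names as row names,
# which is exactly the shape the filter functions below (e.g. information.gain())
# return. A minimal hand-made example:
#
#   weights <- data.frame(attr_importance = c(0.9, 0.5, 0.1),
#                         row.names = c("a", "b", "c"))
#   cutoff.k(weights, 2)           # -> c("a", "b")
#   cutoff.k.percent(weights, 2/3) # -> best two thirds, here c("a", "b")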
cutoff.biggest.diff <- function(attrs) { 26 | if(dim(attrs)[1] == 0) 27 | return(character(0)) 28 | else if(dim(attrs)[1] == 1) 29 | return(dimnames(attrs)[[1]]) 30 | 31 | perm = order(attrs[,1], decreasing = TRUE) 32 | attrs = attrs[perm, , drop = FALSE] 33 | 34 | intervals = sapply(1:(dim(attrs)[1] - 1), function(idx) { 35 | attrs[idx, 1] - attrs[idx + 1, 1] 36 | }) 37 | 38 | return(dimnames(attrs)[[1]][1:(which.max(intervals))]) 39 | } 40 | -------------------------------------------------------------------------------- /R/discretize.R: -------------------------------------------------------------------------------- 1 | discretize.all <- function(formula, data) { 2 | new_data = get.data.frame.from.formula(formula, data) 3 | 4 | dest_column_name = dimnames(new_data)[[2]][1] 5 | if(!is.factor(new_data[[1]])) { 6 | new_data[[1]] = equal.frequency.binning.discretization(new_data[[1]], 5) 7 | } 8 | 9 | new_data = supervised.discretization(formula, data = new_data) 10 | 11 | # reorder attributes 12 | new_data = get.data.frame.from.formula(formula, new_data) 13 | return(new_data) 14 | } 15 | 16 | # unupervised 17 | equal.frequency.binning.discretization <- function(data, bins) { 18 | bins = as.integer(bins) 19 | if (!is.numeric(data)) 20 | stop("Data must be numeric") 21 | if(bins < 1) 22 | stop("Number of bins too small") 23 | 24 | complete = complete.cases(data) 25 | ord = do.call(order, list(data)) 26 | len = length(data[complete]) 27 | blen = len / bins 28 | new_data = data 29 | 30 | p1 = p2 = 0 31 | 32 | for(i in 1:bins) { 33 | p1 = p2 + 1 34 | p2 = round(i * blen) 35 | new_data[ord[p1:min(p2, len)]] = i 36 | } 37 | 38 | return(factor(new_data)) 39 | } 40 | 41 | # unupervised 42 | equal.width.binning.discretization <- function(data, bins) { 43 | if (!is.numeric(data)) 44 | stop("Data must be numeric") 45 | if(bins < 1) 46 | stop("Number of bins too small") 47 | return(cut(data, bins)) 48 | } 49 | 50 | #MDL - Fayyad, Irani 51 | supervised.discretization <- function(formula, data) { 52 | data = get.data.frame.from.formula(formula, data) 53 | complete = complete.cases(data[[1]]) 54 | all.complete = all(complete) 55 | if(!all.complete) { 56 | new_data = data[complete, , drop=FALSE] 57 | result = Discretize(formula, data = new_data, na.action = na.pass) 58 | return(result) 59 | } else { 60 | return(Discretize(formula, data = data, na.action = na.pass)) 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /R/misc.R: -------------------------------------------------------------------------------- 1 | as.simple.formula <- function(attributes, class) { 2 | return(as.formula(paste(class, paste(attributes, sep = "", collapse = " + "), sep = " ~ "))) 3 | } 4 | 5 | get.data.frame.from.formula <- function(formula, data) { 6 | d = model.frame(formula, data, na.action = NULL) 7 | for(i in 1:dim(d)[2]) { 8 | if(is.factor(d[[i]]) || is.logical(d[[i]]) || is.character(d[[i]])) 9 | d[[i]] = factor(d[[i]]) 10 | } 11 | return(d) 12 | } 13 | 14 | entropyHelper <- function(x, unit = "log") { 15 | return(entropy(table(x, useNA="always"), unit = unit)) 16 | } 17 | -------------------------------------------------------------------------------- /R/normalize.R: -------------------------------------------------------------------------------- 1 | normalize.min.max <- function(data) { 2 | attr_count = dim(data)[2] 3 | if(attr_count == 0) 4 | return(data) 5 | for(i in 1:attr_count) { 6 | if(!is.numeric(data[, i])) 7 | next() 8 | if(!any(complete.cases(data[, i]))) 9 | 
next() 10 | mm = range(data[, i], na.rm = TRUE) 11 | minimum = mm[1] 12 | maximum = mm[2] 13 | if(minimum == maximum) 14 | data[, i] = data[, i] / minimum 15 | else 16 | data[, i] = (data[, i] - minimum) / (maximum - minimum) 17 | } 18 | 19 | return(data) 20 | } 21 | -------------------------------------------------------------------------------- /R/search.best.first.R: -------------------------------------------------------------------------------- 1 | best.first.search <- function(attributes, eval.fun, max.backtracks = 5) { 2 | if(length(attributes) == 0) 3 | stop("Attributes not specified") 4 | 5 | eval.fun = match.fun(eval.fun) 6 | 7 | attach_children <- function(states, best) { 8 | parent_state = states$attrs[best$idx,] 9 | children = create.children(parent_state, "forward", omit.func = function(...) { 10 | length(find.subset(states$attrs, ...)) > 0 11 | }) 12 | children_len = ifelse(is.null(children), 0, dim(children)[1]) 13 | if(children_len > 0) { 14 | states$attrs = rbind(states$attrs, children) 15 | states$open = c(states$open, rep(TRUE, children_len)) 16 | states$results = c(states$results, rep(NA, children_len)) 17 | } 18 | return(states) 19 | } 20 | 21 | states = list( 22 | attrs = diag(length(attributes)), 23 | open = rep(TRUE, length(attributes)), 24 | results = rep(NA, length(attributes)) 25 | ) 26 | colnames(states$attrs) = attributes 27 | 28 | best = list( 29 | result = -Inf, 30 | idx = NULL 31 | ) 32 | 33 | repeat { 34 | # calculate merit for every open and not evaluated subset of attributes 35 | rows_to_eval = states$open & is.na(states$results) 36 | if(any(rows_to_eval)) { 37 | states$results[rows_to_eval] = 38 | apply(states$attrs[rows_to_eval,, drop=FALSE], 1, function(vec) { 39 | attrs = attributes[as.logical(vec)] 40 | return(eval.fun(attrs)) 41 | }) 42 | } 43 | #find best 44 | new_best = find.best(states$results, states$open) 45 | 46 | #check if better 47 | if(is.null(new_best$result)) 48 | break() 49 | states$open[new_best$idx] = FALSE 50 | if(new_best$result > best$result) { 51 | best = new_best 52 | states = attach_children(states, best) 53 | } else { 54 | if(max.backtracks > 0) { 55 | max.backtracks = max.backtracks - 1 56 | } else 57 | break 58 | } 59 | } 60 | return(attributes[as.logical(states$attrs[best$idx, ])]) 61 | 62 | } 63 | -------------------------------------------------------------------------------- /R/search.exhaustive.R: -------------------------------------------------------------------------------- 1 | exhaustive.search <- function(attributes, eval.fun) { 2 | len = length(attributes) 3 | if(len == 0) 4 | stop("Attributes not specified") 5 | 6 | eval.fun = match.fun(eval.fun) 7 | best = list( 8 | result = -Inf, 9 | attrs = rep(0, len) 10 | ) 11 | 12 | # main loop 13 | # for each subset size 14 | for(size in 1:len) { 15 | child_comb = combn(1:len, size) 16 | # for each child 17 | for(i in 1:dim(child_comb)[2]) { 18 | subset = rep(0, len) 19 | subset[child_comb[, i]] = 1 20 | result = eval.fun(attributes[as.logical(subset)]) 21 | if(result > best$result) { 22 | best$result = result 23 | best$attrs = subset 24 | } 25 | } 26 | } 27 | return(attributes[as.logical(best$attrs)]) 28 | } -------------------------------------------------------------------------------- /R/search.greedy.R: -------------------------------------------------------------------------------- 1 | 2 | forward.search <- function(attributes, eval.fun) { 3 | return(greedy.search(attributes, eval.fun, TRUE)) 4 | } 5 | 6 | backward.search <- function(attributes, eval.fun) { 7 | 
return(greedy.search(attributes, eval.fun, FALSE)) 8 | } 9 | 10 | greedy.search <- function(attributes, eval.fun, forward = TRUE) { 11 | if(length(attributes) == 0) 12 | stop("Attributes not specified") 13 | 14 | eval.fun = match.fun(eval.fun) 15 | best = list( 16 | result = -Inf, 17 | attrs = rep(as.numeric(!forward), length(attributes)) 18 | ) 19 | 20 | # initial evaluation for full set when backward 21 | if(!forward) { 22 | best$result = eval.fun(attributes[as.logical(best$attrs)]) 23 | } 24 | 25 | forward_text = ifelse(forward, "forward", "backward") 26 | 27 | # main loop 28 | repeat { 29 | # create new matrix of children to evaluate 30 | children = create.children(best$attrs, forward_text) 31 | if(is.null(children)) 32 | break() 33 | 34 | # evaluate and find the best of them 35 | children_results = apply(children, 1, function(vec) { 36 | eval.fun(attributes[as.logical(vec)]) 37 | }) 38 | local_best = find.best(children_results) 39 | 40 | # compare to the best so far 41 | if(local_best$result > best$result) { 42 | best$result = local_best$result 43 | best$attrs = children[local_best$idx,] 44 | } else { 45 | break() 46 | } 47 | } 48 | 49 | return(attributes[as.logical(best$attrs)]) 50 | 51 | } 52 | -------------------------------------------------------------------------------- /R/search.hill.climbing.R: -------------------------------------------------------------------------------- 1 | hill.climbing.search <- function(attributes, eval.fun) { 2 | if(length(attributes) == 0) 3 | stop("Attributes not specified") 4 | 5 | eval.fun = match.fun(eval.fun) 6 | best = list( 7 | result = -Inf, 8 | attrs = rep(0, length(attributes)) 9 | ) 10 | while(sum(best$attrs) == 0) 11 | best$attrs = sample(c(0,1), length(attributes), replace = TRUE) 12 | best$result = eval.fun(attributes[as.logical(best$attrs)]) 13 | 14 | evaluated_states = list( 15 | attrs = matrix(best$attrs, nrow = 1, ncol = length(attributes), byrow = TRUE), 16 | results = best$result 17 | ) 18 | 19 | eval_state <- function(state, evaluated_states) { 20 | idx = find.subset(evaluated_states$attrs, state) 21 | if(length(idx) == 0) { # needs to be evaluated 22 | return(list( 23 | to_be_saved = TRUE, 24 | result = eval.fun(attributes[as.logical(state)]) 25 | )) 26 | } else if(length(idx) == 1) { # already evaluated 27 | return(list( 28 | to_be_saved = FALSE, 29 | result = evaluated_states$results[idx] 30 | )) 31 | } else { 32 | stop("Internal error") 33 | } 34 | } 35 | 36 | # main loop 37 | repeat { 38 | # find neighbours 39 | children = create.children(best$attrs, direction = "both") 40 | if(is.null(children)) 41 | break() 42 | 43 | # evaluate and find the best of them 44 | children_evaluated = apply(children, 1, function(vec) { 45 | eval_state(vec, evaluated_states) 46 | }) 47 | 48 | children_results = sapply(children_evaluated, function(x) x$result) 49 | children_to_be_saved = sapply(children_evaluated, function(x) x$to_be_saved) 50 | 51 | # save children at evaluated_states 52 | evaluated_states$attrs = rbind(evaluated_states$attrs, children[children_to_be_saved,, drop = FALSE]) 53 | evaluated_states$results = c(evaluated_states$results, children_results[children_to_be_saved]) 54 | 55 | local_best = find.best(children_results) 56 | 57 | # compare to the best so far 58 | if(local_best$result > best$result) { 59 | best$result = local_best$result 60 | best$attrs = children[local_best$idx,] 61 | } else { 62 | break() 63 | } 64 | } 65 | return(attributes[as.logical(best$attrs)]) 66 | } 
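# --- Usage sketch (hedged; not part of the original package source) ----------
# All search routines here (best.first.search, exhaustive.search,
# forward.search / backward.search, hill.climbing.search) share one contract:
# eval.fun receives a character vector of attribute names and must return a
# single numeric merit, higher meaning better. A toy evaluator -- the scoring
# rule below is invented purely for illustration -- might look like this:
#
#   data(iris)
#   evaluator <- function(subset) {
#     if (length(subset) == 0) return(0)
#     # reward mean correlation with Petal.Length, penalise subset size
#     mean(abs(cor(iris[subset], iris$Petal.Length))) - 0.01 * length(subset)
#   }
#   chosen <- hill.climbing.search(names(iris)[1:4], evaluator)
#   as.simple.formula(chosen, "Species")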
-------------------------------------------------------------------------------- /R/search.misc.R: -------------------------------------------------------------------------------- 1 | # returns indicies 2 | find.subset <- function(subsets.matrix, subset) { 3 | subset = as.vector(subset) 4 | len = length(subset) 5 | if(len == 0) 6 | stop("Empty atrributes subset.") 7 | if(dim(subsets.matrix)[2] != len) 8 | stop("Incorrect dimensions.") 9 | 10 | if(dim(subsets.matrix)[1] == 0) 11 | return(as.integer(NULL)) 12 | 13 | cond = rep(TRUE, dim(subsets.matrix)[1]) 14 | for(i in 1:len) 15 | cond = cond & (subsets.matrix[,i] == subset[i]) 16 | return(which(cond)) 17 | } 18 | 19 | 20 | create.children <- function(parent, direction = c("forward", "backward", "both"), omit.func = NULL ) { 21 | direction = match.arg(direction) 22 | 23 | if(!is.null(omit.func)) { 24 | omit.func = match.fun(omit.func) 25 | } 26 | 27 | cols = length(parent) 28 | if(cols <= 0) 29 | stop("Parent attribute set cannot be empty.") 30 | 31 | m1 = NULL 32 | m2 = NULL 33 | 34 | if(direction == "forward" || direction == "both") { 35 | rows = cols - sum(parent) 36 | if(rows > 0) { 37 | m1 = matrix(parent, ncol = cols, nrow = rows, byrow = TRUE) 38 | 39 | current_row = 1 40 | current_col = 1 41 | repeat { 42 | if(current_col > cols || current_row > rows) 43 | break() 44 | 45 | if(m1[current_row, current_col] == 0) { 46 | m1[current_row, current_col] = 1 47 | current_row = current_row + 1 48 | } 49 | 50 | current_col = current_col + 1 51 | } 52 | } 53 | } 54 | 55 | if(direction == "backward" || direction == "both") { 56 | rows = sum(parent) 57 | if(rows > 1) { # skipped if only 0s 58 | m2 = matrix(parent, ncol = cols, nrow = rows, byrow = TRUE) 59 | 60 | current_row = 1 61 | current_col = 1 62 | repeat { 63 | if(current_col > cols || current_row > rows) 64 | break() 65 | 66 | if(m2[current_row, current_col] == 1) { 67 | m2[current_row, current_col] = 0 68 | current_row = current_row + 1 69 | } 70 | 71 | current_col = current_col + 1 72 | } 73 | } 74 | } 75 | 76 | m = rbind(m1, m2) 77 | if(is.null(m)) 78 | return(m) 79 | if(!is.null(omit.func)) { 80 | rows_to_omit = apply(m, 1, omit.func) 81 | return(m[!rows_to_omit,, drop = FALSE]) 82 | } else { 83 | return(m) 84 | } 85 | } 86 | 87 | find.best <- function(results, subset = rep(TRUE, length(results))) { 88 | best = list( 89 | result = NULL, 90 | idx = NULL 91 | ) 92 | 93 | w = which(subset) 94 | if(length(w) > 0) { 95 | children_results = results[w] 96 | max_idx = which.max(children_results) 97 | best$result = children_results[max_idx] 98 | best$idx = w[max_idx] 99 | } 100 | return(best) 101 | } 102 | -------------------------------------------------------------------------------- /R/selector.cfs.R: -------------------------------------------------------------------------------- 1 | ### CFS 2 | # classification and regression 3 | # continous and discrete data 4 | cfs <- function(formula, data) { 5 | cont_correlation <- function(a, b) { 6 | result = 0 7 | if(!is.factor(a) && !is.factor(b)) { # both continous 8 | complete = complete.cases(a) & complete.cases(b) 9 | if(!any(complete)) 10 | return(0) 11 | vec1 = a[complete] 12 | vec2 = b[complete] 13 | if(sd(vec1) == 0 || sd(vec2) == 0) 14 | return(0) 15 | result = cor(vec1, vec2) 16 | } else if(is.factor(a) && is.factor(b)) { # both discrete 17 | tab = table(a, b) 18 | alevels = rownames(tab) 19 | blevels = colnames(tab) 20 | 21 | result = sum(sapply(alevels, function(avalue) { 22 | avec = as.numeric(a == avalue) 23 | complete_a = 
complete.cases(a) 24 | return(sum(sapply(blevels, function(bvalue) { 25 | bvec = as.numeric(b == bvalue) 26 | complete_b = complete.cases(b) 27 | complete = complete_a & complete_b 28 | avec_complete_data = avec[complete] 29 | bvec_complete_data = bvec[complete] 30 | if(sd(avec_complete_data, na.rm=TRUE) == 0 || sd(bvec_complete_data, na.rm=TRUE) == 0) 31 | return(0) 32 | return(tab[avalue, bvalue] / length(a) * cor(avec_complete_data, bvec_complete_data)) 33 | }))) 34 | })) 35 | } else { # continous and discrete 36 | cont = NULL; 37 | disc = NULL; 38 | if(is.factor(a)) { 39 | cont = b 40 | disc = a 41 | } else { 42 | cont = a 43 | disc = b 44 | } 45 | 46 | cont_complete = complete.cases(cont) 47 | disc_table = table(disc) 48 | disc_levels = names(disc_table) 49 | 50 | if(length(disc_levels) == 0) { 51 | result = 0 52 | } else { 53 | result = sum(sapply(disc_levels, function(lev) { 54 | disc_vec = as.numeric(disc == lev) 55 | disc_vec_complete = complete.cases(disc_vec) 56 | complete = cont_complete & disc_vec_complete 57 | disc_vec_complete_data = disc_vec[complete] 58 | cont_complete_data = cont[complete] 59 | if(sd(cont_complete_data) == 0) 60 | return(0) 61 | return(disc_table[lev] / length(disc) * cor(disc_vec_complete_data, cont_complete_data)) 62 | })) 63 | } 64 | } 65 | return(result) 66 | } 67 | 68 | # uses parent.env (correlations) 69 | get_correlation <- function(attr1, attr2, classification, new_data, entropies) { 70 | #lazy evaluation 71 | if(!is.na(correlations[attr1, attr2])) { 72 | return(correlations[attr1, attr2]) 73 | } 74 | 75 | tmp_res = NA 76 | if(classification) { #discrete class 77 | tmp_res = 2.0 * (entropies[attr1] + entropies[attr2] - entropyHelper(data.frame(cbind(new_data[[attr1]], new_data[[attr2]])))) / (entropies[attr1] + entropies[attr2]) 78 | } else { #continous class 79 | tmp_res = cont_correlation(new_data[[attr1]], new_data[[attr2]]) 80 | } 81 | if(is.nan(tmp_res)) { 82 | # all entropies (individual + joint) are 0 83 | tmp_res = 0 84 | } 85 | 86 | correlations[attr1, attr2] <<- tmp_res 87 | correlations[attr2, attr1] <<- tmp_res 88 | 89 | return(tmp_res) 90 | } 91 | 92 | # uses parent.env 93 | evaluator <- function(attrs) { 94 | ff_sum = 0 95 | ff_count = 0 96 | fc_sum = 0 97 | attr_count = length(attrs) 98 | 99 | if(attr_count <= 0) 100 | stop("Attributes not specified") 101 | 102 | for(i in 1:attr_count) { 103 | attr1 = attrs[i] 104 | 105 | # feature-class correlation 106 | cor = get_correlation(attr1, 1, classification, new_data, entropies) 107 | fc_sum = fc_sum + cor 108 | 109 | # feature-feature correlation 110 | if(i == attr_count) { 111 | next() 112 | } 113 | for(j in (i+1):attr_count) { 114 | attr2 = attrs[j] 115 | cor = get_correlation(attr1, attr2, classification, new_data, entropies) 116 | ff_count = ff_count + 1 117 | ff_sum = ff_sum + cor 118 | } 119 | } 120 | 121 | ff_cor = ff_sum / ff_count 122 | fc_cor = fc_sum / attr_count 123 | 124 | if(attr_count == 1) 125 | return(fc_cor) 126 | else 127 | return(attr_count * fc_cor / sqrt(attr_count + attr_count * (attr_count - 1) * ff_cor)) 128 | } 129 | 130 | new_data = get.data.frame.from.formula(formula, data) 131 | 132 | # prepare correlation matrix 133 | classification = is.factor(new_data[[1]]) 134 | attr_count = dim(new_data)[2] 135 | attr_names = colnames(new_data) 136 | correlations = matrix(rep(NA, attr_count ^ 2), nrow = attr_count, ncol = attr_count, 137 | dimnames = list(attr_names, attr_names)) 138 | 139 | entropies = NULL 140 | if(classification) { 141 | new_data = 
supervised.discretization(formula, data = new_data) 142 | new_data = get.data.frame.from.formula(formula, new_data) 143 | entropies = sapply(new_data, entropyHelper) 144 | } 145 | 146 | result = best.first.search(names(new_data)[-1], evaluator) 147 | 148 | return(result) 149 | } 150 | -------------------------------------------------------------------------------- /R/selector.chi.squared.R: -------------------------------------------------------------------------------- 1 | ### CHI-SQUARED 2 | # classification and regression 3 | # continous and discrete data 4 | chi.squared <- function(formula, data) { 5 | 6 | new_data = get.data.frame.from.formula(formula, data) 7 | new_data = discretize.all(formula,new_data) 8 | 9 | class_data = new_data[[1]] 10 | new_data = new_data[-1] #new_data without class attr 11 | 12 | results = sapply(new_data, function(w) { 13 | cont = table(class_data, w) 14 | row_sums = apply(cont, 1, sum) 15 | col_sums = apply(cont, 2, sum) 16 | all_sum = sum(col_sums) 17 | expected_matrix = t(as.matrix(col_sums) %*% t(as.matrix(row_sums))) / all_sum 18 | chis = sum((cont - expected_matrix) ^ 2 / expected_matrix) 19 | 20 | if(chis == 0 || length(col_sums) < 2 || length (row_sums) < 2) { 21 | return(0) 22 | } else { 23 | # phi or Cramer's V 24 | return(sqrt(chis / (all_sum * min(length(col_sums) - 1, length(row_sums) - 1)))) 25 | } 26 | }) 27 | 28 | attr_names = dimnames(new_data)[[2]] 29 | return(data.frame(attr_importance = results, row.names = attr_names)) 30 | } 31 | -------------------------------------------------------------------------------- /R/selector.consistency.R: -------------------------------------------------------------------------------- 1 | ### Consistency 2 | # classification and regression 3 | # continous and discrete data 4 | consistency <- function(formula, data) { 5 | # uses parent.env 6 | evaluator <- function(attrs) { 7 | attr_data = new_data[, attrs, drop=FALSE] 8 | 9 | #deprecated 10 | #attr_data = as.data.frame(lapply(attr_data, function(vec) { addNA(vec, ifany=TRUE) })) 11 | #result = aggregate(new_data[[1]], as.list(attr_data), function(classes) { 12 | 13 | hashvec = as.factor(apply(attr_data, 1, digest)) 14 | result = aggregate(new_data[[1]], list(hash=hashvec), function(classes) { 15 | return(max(as.vector(table(classes)))) 16 | }) 17 | result = sum(result[[dim(result)[2]]]) / dim(attr_data)[1] 18 | return(result) 19 | } 20 | new_data = get.data.frame.from.formula(formula, data) 21 | new_data = discretize.all(formula, new_data) 22 | column_names = names(new_data) 23 | 24 | result = best.first.search(column_names[-1], evaluator) 25 | return(result) 26 | } 27 | -------------------------------------------------------------------------------- /R/selector.correlation.R: -------------------------------------------------------------------------------- 1 | ### correlation.body 2 | # regression 3 | # continous data 4 | correlation.body <- function(formula, data, type = c("pearson", "spearman")) { 5 | type = match.arg(type) 6 | 7 | new_data = get.data.frame.from.formula(formula, data) 8 | 9 | lapply(new_data, function(vec) { 10 | if(is.factor(vec)) 11 | stop("All data must be continous.") 12 | }) 13 | 14 | class_data = new_data[[1]] 15 | new_data = new_data[-1] #new_data without class attr 16 | 17 | class_data_complete = complete.cases(class_data) 18 | results = abs(sapply(new_data, function(attr_data) { 19 | complete = complete.cases(attr_data) & class_data_complete 20 | if(!any(complete)) 21 | return(NA) 22 | vec1 = class_data[complete] 23 | vec2 = 
attr_data[complete] 24 | if(sd(vec1) == 0 || sd(vec2) == 0) 25 | return(NA) 26 | return(cor(vec1, vec2, method = type)) 27 | })) 28 | 29 | attr_names = dimnames(new_data)[[2]] 30 | return(data.frame(attr_importance = results, row.names = attr_names)) 31 | } 32 | 33 | linear.correlation <- function(formula, data) { 34 | return(correlation.body(formula, data, "pearson")) 35 | } 36 | 37 | rank.correlation <- function(formula, data) { 38 | return(correlation.body(formula, data, "spearman")) 39 | } 40 | -------------------------------------------------------------------------------- /R/selector.info.gain.R: -------------------------------------------------------------------------------- 1 | ### INFORMATION GAIN BASED ALGORITHMS 2 | # classification and regression 3 | # continous and discrete data 4 | 5 | information.gain <- function(formula, data, unit = "log") { 6 | information.gain.body(formula, data, type = "infogain", unit) 7 | } 8 | 9 | gain.ratio <- function(formula, data, unit = "log") { 10 | information.gain.body(formula, data, type = "gainratio", unit) 11 | } 12 | 13 | symmetrical.uncertainty <- function(formula, data, unit = "log") { 14 | information.gain.body(formula, data, type = "symuncert", unit) 15 | } 16 | 17 | information.gain.body <- function(formula, data, type = c("infogain", "gainratio", "symuncert"), unit) { 18 | type = match.arg(type) 19 | new_data = get.data.frame.from.formula(formula, data) 20 | new_data = discretize.all(formula, new_data) 21 | 22 | attr_entropies = sapply(new_data, entropyHelper, unit) 23 | class_entropy = attr_entropies[1] 24 | attr_entropies = attr_entropies[-1] 25 | joint_entropies = sapply(new_data[-1], function(t) { 26 | entropyHelper(data.frame(cbind(new_data[[1]], t)), unit) 27 | }) 28 | results = class_entropy + attr_entropies - joint_entropies 29 | 30 | if(type == "gainratio") { 31 | results = ifelse(attr_entropies == 0, 0, results / attr_entropies) 32 | } else if(type == "symuncert") { 33 | results = 2 * results / (attr_entropies + class_entropy) 34 | } 35 | 36 | attr_names = dimnames(new_data)[[2]][-1] 37 | return(data.frame(attr_importance = results, row.names = attr_names)) 38 | } 39 | -------------------------------------------------------------------------------- /R/selector.oneR.R: -------------------------------------------------------------------------------- 1 | ### 1R 2 | # classification and regression 3 | # continous and discrete data 4 | oneR <- function(formula, data) { 5 | 6 | new_data = get.data.frame.from.formula(formula, data) 7 | new_data = discretize.all(formula,new_data) 8 | 9 | class_data = new_data[[1]] 10 | new_data = new_data[-1] #new_data without class attr 11 | 12 | results = sapply(new_data, function(vec) { 13 | vec = factor(vec) 14 | errors = sapply(levels(vec), function(val) { 15 | cvaluestab = as.vector(table(class_data[ which(vec == val) ])) 16 | return(sum(cvaluestab[ -which.max(cvaluestab) ]) / length(class_data)) 17 | }) 18 | return(sum(errors)) 19 | }) 20 | 21 | results = max(results) + min(results) - results 22 | 23 | attr_names = dimnames(new_data)[[2]] 24 | return(data.frame(attr_importance = results, row.names = attr_names)) 25 | } 26 | -------------------------------------------------------------------------------- /R/selector.random.forest.R: -------------------------------------------------------------------------------- 1 | ### RANDOM FOREST 2 | # classification and regression 3 | # continous and discrete data 4 | # NA deleted 5 | random.forest.importance <- function(formula, data, importance.type = 
1) { 6 | new_data = get.data.frame.from.formula(formula, data) 7 | 8 | # get rid of NAs 9 | no_na = rep(TRUE, dim(new_data)[1]) 10 | for(i in 1:dim(new_data)[2]) { 11 | no_na = no_na & complete.cases(new_data[, i]) 12 | } 13 | new_data = new_data[no_na, , drop=FALSE] 14 | 15 | forest = randomForest(formula, new_data, 16 | ntree = 1000, keep.forest = FALSE, importance = TRUE) 17 | 18 | res = as.data.frame(importance(forest, type = importance.type)) 19 | colnames(res)[1] = "attr_importance" 20 | return(res) 21 | } 22 | -------------------------------------------------------------------------------- /R/selector.relief.R: -------------------------------------------------------------------------------- 1 | ### RELIEF 2 | # classification and regression 3 | # continous and discrete data 4 | relief <- function(formula, data, neighbours.count = 5, sample.size = 10) { 5 | # uses parent.env 6 | find_neighbours <- function(instance_idx) { 7 | instance = new_data[instance_idx,, drop = FALSE] 8 | 9 | # for every other instance 10 | for(current_idx in 1:instances_count) { 11 | if(instance_idx == current_idx) 12 | next() 13 | current_instance = new_data[current_idx,, drop = FALSE] 14 | if(is.na(current_instance[1, 1])) 15 | next() 16 | 17 | dist = instance_distance(instance, current_instance) 18 | 19 | if(classification) 20 | class_no = which(classes == current_instance[[1]]) 21 | else 22 | class_no = 1 23 | if(nn_stored_count[class_no] < neighbours.count) { 24 | nn_stored_count[class_no] <<- nn_stored_count[class_no] + 1 25 | n_array[class_no, nn_stored_count[class_no], ] <<- c(dist, current_idx) 26 | } else { 27 | max_idx = which.max(n_array[class_no, , 1]) 28 | max_value = n_array[class_no, max_idx, 1] 29 | if(dist < max_value) { 30 | n_array[class_no, max_idx, ] <<- c(dist, current_idx) 31 | } 32 | } 33 | } 34 | } 35 | 36 | # uses parent.env 37 | update_weights <- function(instance_idx) { 38 | instance = new_data[instance_idx,, drop = FALSE] 39 | instance_class = instance[1, 1] 40 | instance_class_no = which(classes == instance_class) 41 | 42 | if(classification) { 43 | # for each attribute 44 | for(attr_idx in 1:attributes_count) { 45 | col_idx = attr_idx + 1 46 | 47 | # nearest hits 48 | hits_sum = 0 49 | if(nn_stored_count[instance_class_no] > 0) { 50 | hits_sum = sum(sapply(1:nn_stored_count[instance_class_no], function(n_idx) { 51 | n_instance_idx = n_array[instance_class_no, n_idx, 2] 52 | n_instance = new_data[n_instance_idx,, drop = FALSE] 53 | return(field_distance(col_idx, instance, n_instance)) 54 | })) 55 | hits_sum = hits_sum / nn_stored_count[instance_class_no] 56 | } 57 | 58 | # nearest misses 59 | misses_sum = 0 60 | if(class_count > 1) { 61 | misses_sum = sum(sapply((1:class_count)[-instance_class_no], function(class_no) { 62 | class_misses_sum = 0 63 | if(nn_stored_count[class_no] > 0) { 64 | class_misses_sum = sum(sapply(1:nn_stored_count[class_no], function(n_idx) { 65 | n_instance_idx = n_array[class_no, n_idx, 2] 66 | n_instance = new_data[n_instance_idx,, drop = FALSE] 67 | return(field_distance(col_idx, instance, n_instance)) 68 | })) 69 | class_misses_sum = class_misses_sum * class_prob[class_no] / nn_stored_count[class_no] 70 | } 71 | return(class_misses_sum) 72 | })) 73 | 74 | 75 | misses_sum = misses_sum / (1 - class_prob[instance_class_no]) 76 | } 77 | results[attr_idx] <<- results[attr_idx] - hits_sum + misses_sum 78 | } 79 | } else { 80 | if(nn_stored_count[1] > 0) { 81 | for(n_idx in 1:nn_stored_count[1]) { 82 | n_instance_idx = n_array[1, n_idx, 2] 83 | n_instance = 
new_data[n_instance_idx,, drop = FALSE] 84 | class_diff = field_distance(1, instance, n_instance) 85 | ndc <<- ndc + class_diff / nn_stored_count[1] 86 | for(attr_idx in 1:attributes_count) { 87 | col_idx = attr_idx + 1 88 | attr_diff_norm = field_distance(col_idx, instance, n_instance) / nn_stored_count[1] 89 | nda[attr_idx] <<- nda[attr_idx] + attr_diff_norm 90 | ndcda[attr_idx] <<- ndcda[attr_idx] + class_diff * attr_diff_norm 91 | } 92 | } 93 | } 94 | } 95 | } 96 | 97 | # parameters: data.frame, data.frame 98 | instance_distance <- function(instance1, instance2) { 99 | len = dim(instance1)[2] 100 | if(len != dim(instance2)[2]) 101 | stop("Instances of different lengths") 102 | if(len <= 1) 103 | stop("Too few attributes") 104 | 105 | result = sapply(2:len, function(i) { 106 | return(field_distance(i, instance1, instance2)) 107 | }) 108 | #return(sqrt(sum(result ^ 2))) #sqrt not needed 109 | res = sum(result ^ 2) 110 | if(is.na(res)) { 111 | stop("Internal error. Distance NA.") 112 | } 113 | return(res) 114 | } 115 | 116 | # uses parent.env 117 | # parameters: index, data.frame, data.frame 118 | field_distance <- function(col_idx, instance1, instance2) { 119 | value1 = instance1[1, col_idx] 120 | value2 = instance2[1, col_idx] 121 | attr_idx = col_idx - 1 # skip class 122 | 123 | if(is.factor(value1) && is.factor(value2)) { 124 | if(is.na(value1) && is.na(value2)) { 125 | if(classification) 126 | return(1 - sum(p_val_in_class[[attr_idx]][, instance1[1, 1]] * p_val_in_class[[attr_idx]][, instance2[1, 1]])) 127 | else 128 | return(1 - p_same_val[[attr_idx]]) 129 | } else if(is.na(value1) || is.na(value2)) { 130 | if(is.na(value1)) { 131 | known_value = value2 132 | unknown_class = instance1[1, 1] 133 | } else { 134 | known_value = value1 135 | unknown_class = instance2[1, 1] 136 | } 137 | if(classification) 138 | return(1 - p_val_in_class[[attr_idx]][known_value, unknown_class]) 139 | else 140 | return(1 - p_val[[attr_idx]][known_value]) 141 | } else if(value1 == value2) { 142 | return(0) 143 | } else { #if(value1 != value2) 144 | return(1) 145 | } 146 | } else if(is.numeric(value1) && is.numeric(value2)) { 147 | if(is.na(value1) && is.na(value2)) { 148 | return(1) 149 | } else if(is.na(value1)) { 150 | return(max(value2, 1 - value2)) 151 | } else if(is.na(value2)) { 152 | return(max(value1, 1 - value1)) 153 | } else { 154 | return(abs(value1 - value2)) 155 | } 156 | } else { 157 | stop("Unsupported value type") 158 | } 159 | } 160 | 161 | new_data = get.data.frame.from.formula(formula, data) 162 | new_data = normalize.min.max(new_data) 163 | 164 | # for discrete classes 165 | class_vector = NULL 166 | class_count = NULL 167 | class_prob = NULL 168 | classes = NULL 169 | p_val_in_class = NULL 170 | p_val = NULL 171 | p_same_val = NULL 172 | 173 | # for continous class 174 | ndc = NULL 175 | nda = NULL 176 | ndcda = NULL 177 | 178 | results = NULL 179 | n_array = NULL 180 | nn_stored_count = NULL 181 | classification = NULL 182 | sample_instances_idx = NULL 183 | 184 | instances_count = dim(new_data)[1] 185 | attributes_count = dim(new_data)[2] - 1 186 | attr_names = dimnames(new_data)[[2]][-1] 187 | 188 | if(neighbours.count < 1) { 189 | neighbours.count = 1 190 | warning(paste("Assumed: neighbours.count = ", neighbours.count)) 191 | } 192 | 193 | if(sample.size < 1) { 194 | warning(paste("Assumed: sample.size = ", sample.size)) 195 | sample.size = 1 196 | sample_instances_idx = sample(1:instances_count, 1) 197 | } else if(sample.size > instances_count) { 198 | warning(paste("Assumed: 
sample.size = ", sample.size)) 199 | sample.size = instances_count 200 | sample_instances_idx = 1:instances_count 201 | } else { 202 | sample_instances_idx = sort(sample(1:instances_count, sample.size, replace=TRUE)) 203 | } 204 | 205 | classification = is.factor(new_data[[1]]) 206 | if(classification) { 207 | class_vector = new_data[[1]] 208 | class_prob = table(class_vector) 209 | class_prob = class_prob / sum(class_prob) 210 | classes = names(class_prob) 211 | class_count = length(classes) 212 | 213 | p_val_in_class = lapply(new_data[-1], function(vec) { 214 | if(!is.factor(vec) || !any(is.na(vec))) 215 | return(NULL) 216 | tab = table(vec, class_vector) 217 | return(apply(tab, 2, function(x) { 218 | s = sum(x) 219 | if(s == 0) 220 | return(x) 221 | else 222 | return(x / s) 223 | })) 224 | }) 225 | } else { 226 | class_count = 1 227 | ndc = 0 228 | nda = array(0, attributes_count) 229 | ndcda = array(0, attributes_count) 230 | 231 | p_val = lapply(new_data[-1], function(vec) { 232 | if(!is.factor(vec) || !any(is.na(vec))) 233 | return(NULL) 234 | tab = table(vec) 235 | if(sum(tab) != 0) { 236 | tab = tab / sum(tab) 237 | } 238 | return(tab) 239 | }) 240 | p_same_val = lapply(p_val, function(attr) { 241 | if(is.null(attr)) 242 | return(NULL) 243 | return(sum(attr ^ 2)) 244 | }) 245 | } 246 | 247 | n_array = array(0, c(class_count, neighbours.count, 2)) 248 | nn_stored_count = array(0, class_count) 249 | results = rep(0, attributes_count) 250 | 251 | sapply(sample_instances_idx, function(current_instance_idx) { 252 | current_instance = new_data[current_instance_idx,, drop = FALSE] 253 | if(is.na(current_instance[[1]])) 254 | return(NULL) 255 | 256 | nn_stored_count[] <<- 0 257 | n_array[] <<- Inf 258 | find_neighbours(current_instance_idx) 259 | update_weights(current_instance_idx) 260 | }) 261 | 262 | 263 | if(classification) { 264 | results = results / sample.size 265 | return(data.frame(attr_importance = results, row.names = attr_names)) 266 | } else { 267 | results = ndcda / ndc - ((nda - ndcda) / (sample.size - ndc)) 268 | results = data.frame(attr_importance = results, row.names = attr_names) 269 | #results = normalize.min.max(results) 270 | return(results) 271 | } 272 | 273 | } 274 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | FSelector: Selecting attributes 2 | =============================== 3 | 4 | This package provides functions for selecting attributes from a given dataset. Attribute subset selection is the process of identifying and removing as much of the irrelevant and redundant information as possible. 5 | 6 | CRAN page: http://cran.r-project.org/web/packages/FSelector/index.html 7 | -------------------------------------------------------------------------------- /man/FSelector-package.Rd: -------------------------------------------------------------------------------- 1 | \name{FSelector-package} 2 | \alias{FSelector-package} 3 | \alias{FSelector} 4 | \docType{package} 5 | \title{ 6 | Package for selecting attributes 7 | } 8 | \description{ 9 | Package containing functions for selecting attributes from a given dataset and a destination attribute. 
10 | } 11 | \details{ 12 | This package contains: 13 | \itemize{ 14 | \item{-}{Algorithms for filtering attributes: cfs, chi.squared, information.gain, gain.ratio, symmetrical.uncertainty, linear.correlation, rank.correlation, oneR, relief, consistency, random.forest.importance} 15 | \item{-}{Algorithms for wrapping classifiers and search attribute subset space: best.first.search, backward.search, forward.search, hill.climbing.search} 16 | \item{-}{Algorithm for choosing a subset of attributes based on attributes' weights: cutoff.k, cutoff.k.percent, cutoff.biggest.diff} 17 | \item{-}{Algorithm for creating formulas: as.simple.formula} 18 | } 19 | } 20 | \author{ 21 | Piotr Romanski\cr 22 | Maintainer: Lars Kotthoff 23 | } 24 | \keyword{ package } 25 | -------------------------------------------------------------------------------- /man/as.simple.formula.Rd: -------------------------------------------------------------------------------- 1 | \name{as.simple.formula} 2 | \alias{as.simple.formula} 3 | \title{ Converting to formulas } 4 | \description{ 5 | Converts character vector of atrributes' names and destination attribute's name to a simple formula. 6 | } 7 | \usage{ 8 | as.simple.formula(attributes, class) 9 | } 10 | \arguments{ 11 | \item{attributes}{ character vector of attributes' names } 12 | \item{class}{ name of destination attribute } 13 | } 14 | \value{ 15 | A simple formula like "class ~ attr1 + attr2" 16 | } 17 | \author{ Piotr Romanski } 18 | \examples{ 19 | data(iris) 20 | result <- cfs(Species ~ ., iris) 21 | f <- as.simple.formula(result, "Species") 22 | } -------------------------------------------------------------------------------- /man/best.first.search.Rd: -------------------------------------------------------------------------------- 1 | \name{best.first.search} 2 | \alias{best.first.search} 3 | \title{ Best-first search } 4 | \description{ 5 | The algorithm for searching atrribute subset space. 6 | } 7 | \usage{ 8 | best.first.search(attributes, eval.fun, max.backtracks = 5) 9 | } 10 | \arguments{ 11 | \item{attributes}{ a character vector of all attributes to search in } 12 | \item{eval.fun}{ a function taking as first parameter a character vector of all attributes and returning a numeric indicating how important a given subset is } 13 | \item{max.backtracks}{ an integer indicating a maximum allowed number of backtracks, default is 5 } 14 | } 15 | \details{ 16 | The algorithm is similar to \code{\link{forward.search}} besides the fact that is chooses the best node from all already evaluated ones and evaluates it. The selection of the best node is repeated approximately \code{max.backtracks} times in case no better node found. 17 | } 18 | \value{ 19 | A character vector of selected attributes. 
20 | } 21 | \author{ Piotr Romanski } 22 | \seealso{ \code{\link{forward.search}}, \code{\link{backward.search}}, \code{\link{hill.climbing.search}}, \code{\link{exhaustive.search}} } 23 | \examples{ 24 | library(rpart) 25 | data(iris) 26 | 27 | evaluator <- function(subset) { 28 | #k-fold cross validation 29 | k <- 5 30 | splits <- runif(nrow(iris)) 31 | results = sapply(1:k, function(i) { 32 | test.idx <- (splits >= (i - 1) / k) & (splits < i / k) 33 | train.idx <- !test.idx 34 | test <- iris[test.idx, , drop=FALSE] 35 | train <- iris[train.idx, , drop=FALSE] 36 | tree <- rpart(as.simple.formula(subset, "Species"), train) 37 | error.rate = sum(test$Species != predict(tree, test, type="c")) / nrow(test) 38 | return(1 - error.rate) 39 | }) 40 | print(subset) 41 | print(mean(results)) 42 | return(mean(results)) 43 | } 44 | 45 | subset <- best.first.search(names(iris)[-5], evaluator) 46 | f <- as.simple.formula(subset, "Species") 47 | print(f) 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /man/cfs.Rd: -------------------------------------------------------------------------------- 1 | \name{cfs} 2 | \alias{cfs} 3 | \title{ CFS filter } 4 | \description{ 5 | The algorithm finds attribute subset using correlation and entropy measures for continous and discrete data. 6 | } 7 | \usage{ 8 | cfs(formula, data) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process }} 13 | \details{ 14 | The alorithm makes use of \code{\link{best.first.search}} for searching the attribute subset space. 15 | } 16 | \seealso{ \code{\link{best.first.search}} } 17 | \value{ 18 | a character vector containing chosen attributes 19 | } 20 | \author{ Piotr Romanski } 21 | \examples{ 22 | data(iris) 23 | 24 | subset <- cfs(Species~., iris) 25 | f <- as.simple.formula(subset, "Species") 26 | print(f) 27 | } -------------------------------------------------------------------------------- /man/chi.squared.Rd: -------------------------------------------------------------------------------- 1 | \name{chi.squared} 2 | \alias{chi.squared} 3 | \title{ Chi-squared filter } 4 | \description{ 5 | The algorithm finds weights of discrete attributes basing on a chi-squared test. 6 | } 7 | \usage{ 8 | chi.squared(formula, data) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ a symbolic description of a model } 13 | } 14 | \details{ 15 | The result is equal to Cramer's V coefficient between source attributes and destination attribute. 16 | } 17 | \value{ 18 | a data.frame containing the worth of attributes in the first column and their names as row names 19 | } 20 | \author{ Piotr Romanski } 21 | \examples{ 22 | library(mlbench) 23 | data(HouseVotes84) 24 | 25 | weights <- chi.squared(Class~., HouseVotes84) 26 | print(weights) 27 | subset <- cutoff.k(weights, 5) 28 | f <- as.simple.formula(subset, "Class") 29 | print(f) 30 | } 31 | -------------------------------------------------------------------------------- /man/consistency.Rd: -------------------------------------------------------------------------------- 1 | \name{consistency} 2 | \alias{consistency} 3 | \title{ Consistency-based filter } 4 | \description{ 5 | The algorithm finds attribute subset using consistency measure for continous and discrete data. 
6 | } 7 | \usage{ 8 | consistency(formula, data) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process } 13 | } 14 | \details{ 15 | The alorithm makes use of \code{\link{best.first.search}} for searching the attribute subset space. 16 | } 17 | \seealso{ \code{\link{best.first.search}} } 18 | \value{ 19 | a character vector containing chosen attributes 20 | } 21 | \author{ Piotr Romanski } 22 | \examples{ 23 | \dontrun{ 24 | library(mlbench) 25 | data(HouseVotes84) 26 | 27 | subset <- consistency(Class~., HouseVotes84) 28 | f <- as.simple.formula(subset, "Class") 29 | print(f) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/correlation.Rd: -------------------------------------------------------------------------------- 1 | \name{correlation} 2 | \alias{linear.correlation} 3 | \alias{rank.correlation} 4 | \title{ Correlation filter} 5 | \description{ 6 | The algorithm finds weights of continous attributes basing on their correlation with continous class attribute. 7 | } 8 | \usage{ 9 | linear.correlation(formula, data) 10 | rank.correlation(formula, data) 11 | } 12 | \arguments{ 13 | \item{formula}{ a symbolic description of a model } 14 | \item{data}{ data to process } 15 | } 16 | \details{ 17 | \code{linear.correlation} uses Pearson's correlation 18 | 19 | \code{rank.correlation} uses Spearman's correlation 20 | 21 | Rows with \code{NA} values are not taken into consideration. 22 | } 23 | \value{ 24 | a data.frame containing the worth of attributes in the first column and their names as row names 25 | } 26 | \author{ Piotr Romanski } 27 | \examples{ 28 | library(mlbench) 29 | data(BostonHousing) 30 | d=BostonHousing[-4] # only numeric variables 31 | 32 | weights <- linear.correlation(medv~., d) 33 | print(weights) 34 | subset <- cutoff.k(weights, 3) 35 | f <- as.simple.formula(subset, "medv") 36 | print(f) 37 | 38 | weights <- rank.correlation(medv~., d) 39 | print(weights) 40 | subset <- cutoff.k(weights, 3) 41 | f <- as.simple.formula(subset, "medv") 42 | print(f) 43 | } 44 | -------------------------------------------------------------------------------- /man/cutoff.Rd: -------------------------------------------------------------------------------- 1 | \name{cutoff} 2 | \alias{cutoff.k} 3 | \alias{cutoff.k.percent} 4 | \alias{cutoff.biggest.diff} 5 | \title{ Cutoffs } 6 | \description{ 7 | The algorithms select a subset from a ranked attributes. 8 | } 9 | \usage{ 10 | cutoff.k(attrs, k) 11 | cutoff.k.percent(attrs, k) 12 | cutoff.biggest.diff(attrs) 13 | } 14 | \arguments{ 15 | \item{attrs}{ a data.frame containing ranks for attributes in the first column and their names as row names } 16 | \item{k}{ a positive integer in case of \code{cutoff.k} and a numeric between 0 and 1 in case of \code{cutoff.k.percent} } 17 | } 18 | \details{ 19 | \code{cutoff.k} chooses k best attributes 20 | 21 | \code{cutoff.k.percent} chooses best k * 100\% of attributes 22 | 23 | \code{cutoff.biggest.diff} chooses a subset of attributes which are significantly better than other. 24 | } 25 | \value{ 26 | A character vector containing selected attributes. 
27 | } 28 | \author{ Piotr Romanski } 29 | \examples{ 30 | data(iris) 31 | 32 | weights <- information.gain(Species~., iris) 33 | print(weights) 34 | 35 | subset <- cutoff.k(weights, 1) 36 | f <- as.simple.formula(subset, "Species") 37 | print(f) 38 | 39 | subset <- cutoff.k.percent(weights, 0.75) 40 | f <- as.simple.formula(subset, "Species") 41 | print(f) 42 | 43 | subset <- cutoff.biggest.diff(weights) 44 | f <- as.simple.formula(subset, "Species") 45 | print(f) 46 | 47 | } 48 | -------------------------------------------------------------------------------- /man/exhaustive.search.Rd: -------------------------------------------------------------------------------- 1 | \name{exhaustive.search} 2 | \alias{exhaustive.search} 3 | \title{ Exhaustive search } 4 | \description{ 5 | The algorithm for searching atrribute subset space. 6 | } 7 | \usage{ 8 | exhaustive.search(attributes, eval.fun) 9 | } 10 | \arguments{ 11 | \item{attributes}{ a character vector of all attributes to search in } 12 | \item{eval.fun}{ a function taking as first parameter a character vector of all attributes and returning a numeric indicating how important a given subset is } 13 | } 14 | \details{ 15 | The algorithm searches the whole attribute subset space in breadth-first order. 16 | } 17 | \value{ 18 | A character vector of selected attributes. 19 | } 20 | \author{ Piotr Romanski } 21 | \seealso{ \code{\link{forward.search}}, \code{\link{backward.search}}, \code{\link{best.first.search}}, \code{\link{hill.climbing.search}} } 22 | \examples{ 23 | library(rpart) 24 | data(iris) 25 | 26 | evaluator <- function(subset) { 27 | #k-fold cross validation 28 | k <- 5 29 | splits <- runif(nrow(iris)) 30 | results = sapply(1:k, function(i) { 31 | test.idx <- (splits >= (i - 1) / k) & (splits < i / k) 32 | train.idx <- !test.idx 33 | test <- iris[test.idx, , drop=FALSE] 34 | train <- iris[train.idx, , drop=FALSE] 35 | tree <- rpart(as.simple.formula(subset, "Species"), train) 36 | error.rate = sum(test$Species != predict(tree, test, type="c")) / nrow(test) 37 | return(1 - error.rate) 38 | }) 39 | print(subset) 40 | print(mean(results)) 41 | return(mean(results)) 42 | } 43 | 44 | subset <- exhaustive.search(names(iris)[-5], evaluator) 45 | f <- as.simple.formula(subset, "Species") 46 | print(f) 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/greedy.search.Rd: -------------------------------------------------------------------------------- 1 | \name{greedy.search} 2 | \alias{backward.search} 3 | \alias{forward.search} 4 | \title{ Greedy search } 5 | \description{ 6 | The algorithms for searching atrribute subset space. 7 | } 8 | \usage{ 9 | backward.search(attributes, eval.fun) 10 | forward.search(attributes, eval.fun) 11 | } 12 | \arguments{ 13 | \item{attributes}{ a character vector of all attributes to search in } 14 | \item{eval.fun}{ a function taking as first parameter a character vector of all attributes and returning a numeric indicating how important a given subset is } 15 | } 16 | \details{ 17 | These algorithms implement greedy search. At first, the algorithms expand starting node, evaluate its children and choose the best one which becomes a new starting node. This process goes only in one direction. \code{forward.search} starts from an empty and \code{backward.search} from a full set of attributes. 18 | } 19 | \value{ 20 | A character vector of selected attributes. 
21 | } 22 | \author{ Piotr Romanski } 23 | \seealso{ \code{\link{best.first.search}}, \code{\link{hill.climbing.search}}, \code{\link{exhaustive.search}} } 24 | \examples{ 25 | library(rpart) 26 | data(iris) 27 | 28 | evaluator <- function(subset) { 29 | #k-fold cross validation 30 | k <- 5 31 | splits <- runif(nrow(iris)) 32 | results = sapply(1:k, function(i) { 33 | test.idx <- (splits >= (i - 1) / k) & (splits < i / k) 34 | train.idx <- !test.idx 35 | test <- iris[test.idx, , drop=FALSE] 36 | train <- iris[train.idx, , drop=FALSE] 37 | tree <- rpart(as.simple.formula(subset, "Species"), train) 38 | error.rate = sum(test$Species != predict(tree, test, type="c")) / nrow(test) 39 | return(1 - error.rate) 40 | }) 41 | print(subset) 42 | print(mean(results)) 43 | return(mean(results)) 44 | } 45 | 46 | subset <- forward.search(names(iris)[-5], evaluator) 47 | f <- as.simple.formula(subset, "Species") 48 | print(f) 49 | 50 | 51 | } 52 | -------------------------------------------------------------------------------- /man/hill.climbing.search.Rd: -------------------------------------------------------------------------------- 1 | \name{hill.climbing.search} 2 | \alias{hill.climbing.search} 3 | \title{ Hill climbing search } 4 | \description{ 5 | The algorithm for searching atrribute subset space. 6 | } 7 | \usage{ 8 | hill.climbing.search(attributes, eval.fun) 9 | } 10 | \arguments{ 11 | \item{attributes}{ a character vector of all attributes to search in } 12 | \item{eval.fun}{ a function taking as first parameter a character vector of all attributes and returning a numeric indicating how important a given subset is } 13 | } 14 | \details{ 15 | The algorithm starts with a random attribute set. Then it evaluates all its neighbours and chooses the best one. It might be susceptible to local maximum. 16 | } 17 | \value{ 18 | A character vector of selected attributes. 19 | } 20 | \author{ Piotr Romanski } 21 | \seealso{ \code{\link{forward.search}}, \code{\link{backward.search}}, \code{\link{best.first.search}}, \code{\link{exhaustive.search}} } 22 | \examples{ 23 | library(rpart) 24 | data(iris) 25 | 26 | evaluator <- function(subset) { 27 | #k-fold cross validation 28 | k <- 5 29 | splits <- runif(nrow(iris)) 30 | results = sapply(1:k, function(i) { 31 | test.idx <- (splits >= (i - 1) / k) & (splits < i / k) 32 | train.idx <- !test.idx 33 | test <- iris[test.idx, , drop=FALSE] 34 | train <- iris[train.idx, , drop=FALSE] 35 | tree <- rpart(as.simple.formula(subset, "Species"), train) 36 | error.rate = sum(test$Species != predict(tree, test, type="c")) / nrow(test) 37 | return(1 - error.rate) 38 | }) 39 | print(subset) 40 | print(mean(results)) 41 | return(mean(results)) 42 | } 43 | 44 | subset <- hill.climbing.search(names(iris)[-5], evaluator) 45 | f <- as.simple.formula(subset, "Species") 46 | print(f) 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/information.gain.Rd: -------------------------------------------------------------------------------- 1 | \name{entropy.based} 2 | \alias{information.gain} 3 | \alias{gain.ratio} 4 | \alias{symmetrical.uncertainty} 5 | \title{ Entropy-based filters } 6 | \description{ 7 | The algorithms find weights of discrete attributes basing on their correlation with continous class attribute. 
8 | } 9 | \usage{ 10 | information.gain(formula, data, unit) 11 | gain.ratio(formula, data, unit) 12 | symmetrical.uncertainty(formula, data, unit) 13 | } 14 | \arguments{ 15 | \item{formula}{ A symbolic description of a model. } 16 | \item{data}{ Data to process. } 17 | \item{unit}{ Unit for computing entropy (passed to \code{\link[entropy]{entropy}}. Default is "log".} 18 | } 19 | \details{ 20 | \code{information.gain} is \deqn{H(Class) + H(Attribute) - H(Class, Attribute)}{H(Class) + H(Attribute) - H(Class, Attribute)}. 21 | 22 | \code{gain.ratio} is \deqn{\frac{H(Class) + H(Attribute) - H(Class, Attribute)}{H(Attribute)}}{(H(Class) + H(Attribute) - H(Class, Attribute)) / H(Attribute)} 23 | 24 | \code{symmetrical.uncertainty} is \deqn{2\frac{H(Class) + H(Attribute) - H(Class, Attribute)}{H(Attribute) + H(Class)}}{2 * (H(Class) + H(Attribute) - H(Class, Attribute)) / (H(Attribute) + H(Class))} 25 | } 26 | \value{ 27 | a data.frame containing the worth of attributes in the first column and their names as row names 28 | } 29 | \author{ Piotr Romanski, Lars Kotthoff } 30 | \examples{ 31 | data(iris) 32 | 33 | weights <- information.gain(Species~., iris) 34 | print(weights) 35 | subset <- cutoff.k(weights, 2) 36 | f <- as.simple.formula(subset, "Species") 37 | print(f) 38 | 39 | weights <- information.gain(Species~., iris, unit = "log2") 40 | print(weights) 41 | 42 | weights <- gain.ratio(Species~., iris) 43 | print(weights) 44 | subset <- cutoff.k(weights, 2) 45 | f <- as.simple.formula(subset, "Species") 46 | print(f) 47 | 48 | weights <- symmetrical.uncertainty(Species~., iris) 49 | print(weights) 50 | subset <- cutoff.biggest.diff(weights) 51 | f <- as.simple.formula(subset, "Species") 52 | print(f) 53 | 54 | } 55 | -------------------------------------------------------------------------------- /man/oneR.Rd: -------------------------------------------------------------------------------- 1 | \name{oneR} 2 | \alias{oneR} 3 | \title{ OneR algorithm } 4 | \description{ 5 | The algorithms find weights of discrete attributes basing on very simple association rules involving only one attribute in condition part. 6 | } 7 | \usage{ 8 | oneR(formula, data) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process } 13 | } 14 | \details{ 15 | The algorithm uses OneR classifier to find out the attributes' weights. For each attribute it creates a simple rule based only on that attribute and then calculates its error rate. 16 | } 17 | \value{ 18 | a data.frame containing the worth of attributes in the first column and their names as row names 19 | } 20 | \author{ Piotr Romanski } 21 | \examples{ 22 | library(mlbench) 23 | data(HouseVotes84) 24 | 25 | weights <- oneR(Class~., HouseVotes84) 26 | print(weights) 27 | subset <- cutoff.k(weights, 5) 28 | f <- as.simple.formula(subset, "Class") 29 | print(f) 30 | } 31 | -------------------------------------------------------------------------------- /man/random.forest.importance.Rd: -------------------------------------------------------------------------------- 1 | \name{random.forest.importance} 2 | \alias{random.forest.importance} 3 | \title{ RandomForest filter } 4 | \description{ 5 | The algorithm finds weights of attributes using RandomForest algorithm. 
6 | } 7 | \usage{ 8 | random.forest.importance(formula, data, importance.type = 1) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process } 13 | \item{importance.type}{ either 1 or 2, specifying the type of importance measure (1=mean decrease in accuracy, 2=mean decrease in node impurity) } 14 | } 15 | \details{ 16 | This is a wrapper for \code{\link[randomForest]{importance}.} 17 | } 18 | \value{ 19 | a data.frame containing the worth of attributes in the first column and their names as row names 20 | } 21 | \author{ Piotr Romanski } 22 | \examples{ 23 | library(mlbench) 24 | data(HouseVotes84) 25 | 26 | weights <- random.forest.importance(Class~., HouseVotes84, importance.type = 1) 27 | print(weights) 28 | subset <- cutoff.k(weights, 5) 29 | f <- as.simple.formula(subset, "Class") 30 | print(f) 31 | } 32 | -------------------------------------------------------------------------------- /man/relief.Rd: -------------------------------------------------------------------------------- 1 | \name{relief} 2 | \alias{relief} 3 | \title{ RReliefF filter } 4 | \description{ 5 | The algorithm finds weights of continous and discrete attributes basing on a distance between instances. 6 | } 7 | \usage{ 8 | relief(formula, data, neighbours.count = 5, sample.size = 10) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process } 13 | \item{neighbours.count}{ number of neighbours to find for every sampled instance } 14 | \item{sample.size}{ number of instances to sample } 15 | } 16 | \details{ 17 | The algorithm samples instances and finds their nearest hits and misses. Considering that result, it evaluates weights of attributes. 18 | } 19 | \references{ 20 | \itemize{ 21 | \item{-}{Igor Kononenko: Estimating Attributes: Analysis and Extensions of RELIEF. In: European Conference on Machine Learning, 171-182, 1994.} 22 | \item{-}{Marko Robnik-Sikonja, Igor Kononenko: An adaptation of Relief for attribute estimation in regression. In: Fourteenth International Conference on Machine Learning, 296-304, 1997.} 23 | } 24 | } 25 | \value{ 26 | a data.frame containing the worth of attributes in the first column and their names as row names 27 | } 28 | \author{ Piotr Romanski } 29 | \examples{ 30 | data(iris) 31 | 32 | weights <- relief(Species~., iris, neighbours.count = 5, sample.size = 20) 33 | print(weights) 34 | subset <- cutoff.k(weights, 2) 35 | f <- as.simple.formula(subset, "Species") 36 | print(f) 37 | } 38 | --------------------------------------------------------------------------------
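A supplementary usage sketch (not part of the original package documentation; the parameter values are chosen purely for illustration): relief() also accepts a continuous class attribute, in which case the RReliefF regression variant is used, whereas the man page above only demonstrates the classification case. Assuming the suggested mlbench package is installed:

library(FSelector)
library(mlbench)                 # suggested dependency, assumed to be installed
data(BostonHousing)

# RReliefF-style weights for a regression target (medv is continuous)
weights <- relief(medv ~ ., BostonHousing, neighbours.count = 5, sample.size = 20)
print(weights)

subset <- cutoff.k(weights, 3)   # keep the three highest-ranked attributes
print(as.simple.formula(subset, "medv"))

The resulting weights can be post-processed with any of the cutoff functions, just as in the classification examples above.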