├── .Rbuildignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── cutoff.R ├── discretize.R ├── misc.R ├── normalize.R ├── search.best.first.R ├── search.exhaustive.R ├── search.greedy.R ├── search.hill.climbing.R ├── search.misc.R ├── selector.cfs.R ├── selector.chi.squared.R ├── selector.consistency.R ├── selector.correlation.R ├── selector.info.gain.R ├── selector.oneR.R ├── selector.random.forest.R └── selector.relief.R ├── README.md └── man ├── FSelector-package.Rd ├── as.simple.formula.Rd ├── best.first.search.Rd ├── cfs.Rd ├── chi.squared.Rd ├── consistency.Rd ├── correlation.Rd ├── cutoff.Rd ├── exhaustive.search.Rd ├── greedy.search.Rd ├── hill.climbing.search.Rd ├── information.gain.Rd ├── oneR.Rd ├── random.forest.importance.Rd └── relief.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | makeR 2 | Makefile 3 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: FSelector 3 | Title: Selecting Attributes 4 | Version: 0.34 5 | Date: 2023-08-22 6 | Author: Piotr Romanski, Lars Kotthoff, Patrick Schratz 7 | Maintainer: Lars Kotthoff 8 | BugReports: https://github.com/larskotthoff/fselector/issues 9 | URL: https://github.com/larskotthoff/fselector 10 | Description: Functions for selecting attributes from a given 11 | dataset. Attribute subset selection is the process of identifying and 12 | removing as much of the irrelevant and redundant information as 13 | possible. 14 | License: GPL-2 15 | Imports: 16 | digest, 17 | entropy, 18 | randomForest, 19 | RWeka 20 | Suggests: 21 | mlbench, 22 | rpart 23 | Encoding: UTF-8 24 | LazyLoad: yes 25 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | export("as.simple.formula", 2 | "cutoff.biggest.diff", 3 | "cutoff.k", 4 | "cutoff.k.percent", 5 | "best.first.search", 6 | "backward.search", 7 | "forward.search", 8 | "hill.climbing.search", 9 | "exhaustive.search", 10 | "cfs", 11 | "chi.squared", 12 | "information.gain", 13 | "gain.ratio", 14 | "symmetrical.uncertainty", 15 | "linear.correlation", 16 | "rank.correlation", 17 | "oneR", 18 | "relief", 19 | "consistency", 20 | "random.forest.importance") 21 | importFrom(randomForest, randomForest, importance) 22 | importFrom(RWeka, Discretize) 23 | importFrom(digest, digest) 24 | importFrom(entropy, entropy) 25 | importFrom("stats", "aggregate", "as.formula", "complete.cases", "cor", "model.frame", "na.pass", "sd") 26 | importFrom("utils", "combn") 27 | -------------------------------------------------------------------------------- /R/cutoff.R: -------------------------------------------------------------------------------- 1 | cutoff.k <- function(attrs, k) { 2 | if(dim(attrs)[1] == 0) 3 | return(character(0)) 4 | if(k < 1) 5 | stop("k too small") 6 | if(k > dim(attrs)[1]) 7 | k = dim(attrs)[1] 8 | sorted_names = rownames(attrs)[do.call(order, c(attrs, decreasing = TRUE))] 9 | return(sorted_names[1:k]) 10 | } 11 | 12 | cutoff.k.percent <- function(attrs, k) { 13 | if(dim(attrs)[1] == 0) 14 | return(character(0)) 15 | if(k <= 0) 16 | stop("k too small") 17 | if(k > 1) { 18 | warning("Assumed k=1") 19 | k = 1 20 | } 21 | sorted_names = rownames(attrs)[do.call(order, c(attrs, decreasing = TRUE))] 22 | return(sorted_names[1:round((k * length(sorted_names)))]) 23 | } 24 | 25 | 
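# --- Illustration (hedged sketch, not part of the original package source) ---
# cutoff.k() and cutoff.k.percent() expect a data.frame of attribute weights:
# importance values in the first column and attribute names as row names,
# which is exactly the shape the filter functions below (e.g. information.gain())
# return. A minimal hand-made example:
#
#   weights <- data.frame(attr_importance = c(0.9, 0.5, 0.1),
#                         row.names = c("a", "b", "c"))
#   cutoff.k(weights, 2)           # -> c("a", "b")
#   cutoff.k.percent(weights, 2/3) # -> best two thirds, here c("a", "b")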
cutoff.biggest.diff <- function(attrs) { 26 | if(dim(attrs)[1] == 0) 27 | return(character(0)) 28 | else if(dim(attrs)[1] == 1) 29 | return(dimnames(attrs)[[1]]) 30 | 31 | perm = order(attrs[,1], decreasing = TRUE) 32 | attrs = attrs[perm, , drop = FALSE] 33 | 34 | intervals = sapply(1:(dim(attrs)[1] - 1), function(idx) { 35 | attrs[idx, 1] - attrs[idx + 1, 1] 36 | }) 37 | 38 | return(dimnames(attrs)[[1]][1:(which.max(intervals))]) 39 | } 40 | -------------------------------------------------------------------------------- /R/discretize.R: -------------------------------------------------------------------------------- 1 | discretize.all <- function(formula, data) { 2 | new_data = get.data.frame.from.formula(formula, data) 3 | 4 | dest_column_name = dimnames(new_data)[[2]][1] 5 | if(!is.factor(new_data[[1]])) { 6 | new_data[[1]] = equal.frequency.binning.discretization(new_data[[1]], 5) 7 | } 8 | 9 | new_data = supervised.discretization(formula, data = new_data) 10 | 11 | # reorder attributes 12 | new_data = get.data.frame.from.formula(formula, new_data) 13 | return(new_data) 14 | } 15 | 16 | # unupervised 17 | equal.frequency.binning.discretization <- function(data, bins) { 18 | bins = as.integer(bins) 19 | if (!is.numeric(data)) 20 | stop("Data must be numeric") 21 | if(bins < 1) 22 | stop("Number of bins too small") 23 | 24 | complete = complete.cases(data) 25 | ord = do.call(order, list(data)) 26 | len = length(data[complete]) 27 | blen = len / bins 28 | new_data = data 29 | 30 | p1 = p2 = 0 31 | 32 | for(i in 1:bins) { 33 | p1 = p2 + 1 34 | p2 = round(i * blen) 35 | new_data[ord[p1:min(p2, len)]] = i 36 | } 37 | 38 | return(factor(new_data)) 39 | } 40 | 41 | # unupervised 42 | equal.width.binning.discretization <- function(data, bins) { 43 | if (!is.numeric(data)) 44 | stop("Data must be numeric") 45 | if(bins < 1) 46 | stop("Number of bins too small") 47 | return(cut(data, bins)) 48 | } 49 | 50 | #MDL - Fayyad, Irani 51 | supervised.discretization <- function(formula, data) { 52 | data = get.data.frame.from.formula(formula, data) 53 | complete = complete.cases(data[[1]]) 54 | all.complete = all(complete) 55 | if(!all.complete) { 56 | new_data = data[complete, , drop=FALSE] 57 | result = Discretize(formula, data = new_data, na.action = na.pass) 58 | return(result) 59 | } else { 60 | return(Discretize(formula, data = data, na.action = na.pass)) 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /R/misc.R: -------------------------------------------------------------------------------- 1 | as.simple.formula <- function(attributes, class) { 2 | return(as.formula(paste(class, paste(attributes, sep = "", collapse = " + "), sep = " ~ "))) 3 | } 4 | 5 | get.data.frame.from.formula <- function(formula, data) { 6 | d = model.frame(formula, data, na.action = NULL) 7 | for(i in 1:dim(d)[2]) { 8 | if(is.factor(d[[i]]) || is.logical(d[[i]]) || is.character(d[[i]])) 9 | d[[i]] = factor(d[[i]]) 10 | } 11 | return(d) 12 | } 13 | 14 | entropyHelper <- function(x, unit = "log") { 15 | return(entropy(table(x, useNA="always"), unit = unit)) 16 | } 17 | -------------------------------------------------------------------------------- /R/normalize.R: -------------------------------------------------------------------------------- 1 | normalize.min.max <- function(data) { 2 | attr_count = dim(data)[2] 3 | if(attr_count == 0) 4 | return(data) 5 | for(i in 1:attr_count) { 6 | if(!is.numeric(data[, i])) 7 | next() 8 | if(!any(complete.cases(data[, i]))) 9 | 
next() 10 | mm = range(data[, i], na.rm = TRUE) 11 | minimum = mm[1] 12 | maximum = mm[2] 13 | if(minimum == maximum) 14 | data[, i] = data[, i] / minimum 15 | else 16 | data[, i] = (data[, i] - minimum) / (maximum - minimum) 17 | } 18 | 19 | return(data) 20 | } 21 | -------------------------------------------------------------------------------- /R/search.best.first.R: -------------------------------------------------------------------------------- 1 | best.first.search <- function(attributes, eval.fun, max.backtracks = 5) { 2 | if(length(attributes) == 0) 3 | stop("Attributes not specified") 4 | 5 | eval.fun = match.fun(eval.fun) 6 | 7 | attach_children <- function(states, best) { 8 | parent_state = states$attrs[best$idx,] 9 | children = create.children(parent_state, "forward", omit.func = function(...) { 10 | length(find.subset(states$attrs, ...)) > 0 11 | }) 12 | children_len = ifelse(is.null(children), 0, dim(children)[1]) 13 | if(children_len > 0) { 14 | states$attrs = rbind(states$attrs, children) 15 | states$open = c(states$open, rep(TRUE, children_len)) 16 | states$results = c(states$results, rep(NA, children_len)) 17 | } 18 | return(states) 19 | } 20 | 21 | states = list( 22 | attrs = diag(length(attributes)), 23 | open = rep(TRUE, length(attributes)), 24 | results = rep(NA, length(attributes)) 25 | ) 26 | colnames(states$attrs) = attributes 27 | 28 | best = list( 29 | result = -Inf, 30 | idx = NULL 31 | ) 32 | 33 | repeat { 34 | # calculate merit for every open and not evaluated subset of attributes 35 | rows_to_eval = states$open & is.na(states$results) 36 | if(any(rows_to_eval)) { 37 | states$results[rows_to_eval] = 38 | apply(states$attrs[rows_to_eval,, drop=FALSE], 1, function(vec) { 39 | attrs = attributes[as.logical(vec)] 40 | return(eval.fun(attrs)) 41 | }) 42 | } 43 | #find best 44 | new_best = find.best(states$results, states$open) 45 | 46 | #check if better 47 | if(is.null(new_best$result)) 48 | break() 49 | states$open[new_best$idx] = FALSE 50 | if(new_best$result > best$result) { 51 | best = new_best 52 | states = attach_children(states, best) 53 | } else { 54 | if(max.backtracks > 0) { 55 | max.backtracks = max.backtracks - 1 56 | } else 57 | break 58 | } 59 | } 60 | return(attributes[as.logical(states$attrs[best$idx, ])]) 61 | 62 | } 63 | -------------------------------------------------------------------------------- /R/search.exhaustive.R: -------------------------------------------------------------------------------- 1 | exhaustive.search <- function(attributes, eval.fun) { 2 | len = length(attributes) 3 | if(len == 0) 4 | stop("Attributes not specified") 5 | 6 | eval.fun = match.fun(eval.fun) 7 | best = list( 8 | result = -Inf, 9 | attrs = rep(0, len) 10 | ) 11 | 12 | # main loop 13 | # for each subset size 14 | for(size in 1:len) { 15 | child_comb = combn(1:len, size) 16 | # for each child 17 | for(i in 1:dim(child_comb)[2]) { 18 | subset = rep(0, len) 19 | subset[child_comb[, i]] = 1 20 | result = eval.fun(attributes[as.logical(subset)]) 21 | if(result > best$result) { 22 | best$result = result 23 | best$attrs = subset 24 | } 25 | } 26 | } 27 | return(attributes[as.logical(best$attrs)]) 28 | } -------------------------------------------------------------------------------- /R/search.greedy.R: -------------------------------------------------------------------------------- 1 | 2 | forward.search <- function(attributes, eval.fun) { 3 | return(greedy.search(attributes, eval.fun, TRUE)) 4 | } 5 | 6 | backward.search <- function(attributes, eval.fun) { 7 | 
return(greedy.search(attributes, eval.fun, FALSE)) 8 | } 9 | 10 | greedy.search <- function(attributes, eval.fun, forward = TRUE) { 11 | if(length(attributes) == 0) 12 | stop("Attributes not specified") 13 | 14 | eval.fun = match.fun(eval.fun) 15 | best = list( 16 | result = -Inf, 17 | attrs = rep(as.numeric(!forward), length(attributes)) 18 | ) 19 | 20 | # initial evaluation for full set when backward 21 | if(!forward) { 22 | best$result = eval.fun(attributes[as.logical(best$attrs)]) 23 | } 24 | 25 | forward_text = ifelse(forward, "forward", "backward") 26 | 27 | # main loop 28 | repeat { 29 | # create new matrix of children to evaluate 30 | children = create.children(best$attrs, forward_text) 31 | if(is.null(children)) 32 | break() 33 | 34 | # evaluate and find the best of them 35 | children_results = apply(children, 1, function(vec) { 36 | eval.fun(attributes[as.logical(vec)]) 37 | }) 38 | local_best = find.best(children_results) 39 | 40 | # compare to the best so far 41 | if(local_best$result > best$result) { 42 | best$result = local_best$result 43 | best$attrs = children[local_best$idx,] 44 | } else { 45 | break() 46 | } 47 | } 48 | 49 | return(attributes[as.logical(best$attrs)]) 50 | 51 | } 52 | -------------------------------------------------------------------------------- /R/search.hill.climbing.R: -------------------------------------------------------------------------------- 1 | hill.climbing.search <- function(attributes, eval.fun) { 2 | if(length(attributes) == 0) 3 | stop("Attributes not specified") 4 | 5 | eval.fun = match.fun(eval.fun) 6 | best = list( 7 | result = -Inf, 8 | attrs = rep(0, length(attributes)) 9 | ) 10 | while(sum(best$attrs) == 0) 11 | best$attrs = sample(c(0,1), length(attributes), replace = TRUE) 12 | best$result = eval.fun(attributes[as.logical(best$attrs)]) 13 | 14 | evaluated_states = list( 15 | attrs = matrix(best$attrs, nrow = 1, ncol = length(attributes), byrow = TRUE), 16 | results = best$result 17 | ) 18 | 19 | eval_state <- function(state, evaluated_states) { 20 | idx = find.subset(evaluated_states$attrs, state) 21 | if(length(idx) == 0) { # needs to be evaluated 22 | return(list( 23 | to_be_saved = TRUE, 24 | result = eval.fun(attributes[as.logical(state)]) 25 | )) 26 | } else if(length(idx) == 1) { # already evaluated 27 | return(list( 28 | to_be_saved = FALSE, 29 | result = evaluated_states$results[idx] 30 | )) 31 | } else { 32 | stop("Internal error") 33 | } 34 | } 35 | 36 | # main loop 37 | repeat { 38 | # find neighbours 39 | children = create.children(best$attrs, direction = "both") 40 | if(is.null(children)) 41 | break() 42 | 43 | # evaluate and find the best of them 44 | children_evaluated = apply(children, 1, function(vec) { 45 | eval_state(vec, evaluated_states) 46 | }) 47 | 48 | children_results = sapply(children_evaluated, function(x) x$result) 49 | children_to_be_saved = sapply(children_evaluated, function(x) x$to_be_saved) 50 | 51 | # save children at evaluated_states 52 | evaluated_states$attrs = rbind(evaluated_states$attrs, children[children_to_be_saved,, drop = FALSE]) 53 | evaluated_states$results = c(evaluated_states$results, children_results[children_to_be_saved]) 54 | 55 | local_best = find.best(children_results) 56 | 57 | # compare to the best so far 58 | if(local_best$result > best$result) { 59 | best$result = local_best$result 60 | best$attrs = children[local_best$idx,] 61 | } else { 62 | break() 63 | } 64 | } 65 | return(attributes[as.logical(best$attrs)]) 66 | } 
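# --- Usage sketch (hedged; not part of the original package source) ----------
# All search routines here (best.first.search, exhaustive.search,
# forward.search / backward.search, hill.climbing.search) share one contract:
# eval.fun receives a character vector of attribute names and must return a
# single numeric merit, higher meaning better. A toy evaluator -- the scoring
# rule below is invented purely for illustration -- might look like this:
#
#   data(iris)
#   evaluator <- function(subset) {
#     if (length(subset) == 0) return(0)
#     # reward mean correlation with Petal.Length, penalise subset size
#     mean(abs(cor(iris[subset], iris$Petal.Length))) - 0.01 * length(subset)
#   }
#   chosen <- hill.climbing.search(names(iris)[1:4], evaluator)
#   as.simple.formula(chosen, "Species")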
-------------------------------------------------------------------------------- /R/search.misc.R: -------------------------------------------------------------------------------- 1 | # returns indicies 2 | find.subset <- function(subsets.matrix, subset) { 3 | subset = as.vector(subset) 4 | len = length(subset) 5 | if(len == 0) 6 | stop("Empty atrributes subset.") 7 | if(dim(subsets.matrix)[2] != len) 8 | stop("Incorrect dimensions.") 9 | 10 | if(dim(subsets.matrix)[1] == 0) 11 | return(as.integer(NULL)) 12 | 13 | cond = rep(TRUE, dim(subsets.matrix)[1]) 14 | for(i in 1:len) 15 | cond = cond & (subsets.matrix[,i] == subset[i]) 16 | return(which(cond)) 17 | } 18 | 19 | 20 | create.children <- function(parent, direction = c("forward", "backward", "both"), omit.func = NULL ) { 21 | direction = match.arg(direction) 22 | 23 | if(!is.null(omit.func)) { 24 | omit.func = match.fun(omit.func) 25 | } 26 | 27 | cols = length(parent) 28 | if(cols <= 0) 29 | stop("Parent attribute set cannot be empty.") 30 | 31 | m1 = NULL 32 | m2 = NULL 33 | 34 | if(direction == "forward" || direction == "both") { 35 | rows = cols - sum(parent) 36 | if(rows > 0) { 37 | m1 = matrix(parent, ncol = cols, nrow = rows, byrow = TRUE) 38 | 39 | current_row = 1 40 | current_col = 1 41 | repeat { 42 | if(current_col > cols || current_row > rows) 43 | break() 44 | 45 | if(m1[current_row, current_col] == 0) { 46 | m1[current_row, current_col] = 1 47 | current_row = current_row + 1 48 | } 49 | 50 | current_col = current_col + 1 51 | } 52 | } 53 | } 54 | 55 | if(direction == "backward" || direction == "both") { 56 | rows = sum(parent) 57 | if(rows > 1) { # skipped if only 0s 58 | m2 = matrix(parent, ncol = cols, nrow = rows, byrow = TRUE) 59 | 60 | current_row = 1 61 | current_col = 1 62 | repeat { 63 | if(current_col > cols || current_row > rows) 64 | break() 65 | 66 | if(m2[current_row, current_col] == 1) { 67 | m2[current_row, current_col] = 0 68 | current_row = current_row + 1 69 | } 70 | 71 | current_col = current_col + 1 72 | } 73 | } 74 | } 75 | 76 | m = rbind(m1, m2) 77 | if(is.null(m)) 78 | return(m) 79 | if(!is.null(omit.func)) { 80 | rows_to_omit = apply(m, 1, omit.func) 81 | return(m[!rows_to_omit,, drop = FALSE]) 82 | } else { 83 | return(m) 84 | } 85 | } 86 | 87 | find.best <- function(results, subset = rep(TRUE, length(results))) { 88 | best = list( 89 | result = NULL, 90 | idx = NULL 91 | ) 92 | 93 | w = which(subset) 94 | if(length(w) > 0) { 95 | children_results = results[w] 96 | max_idx = which.max(children_results) 97 | best$result = children_results[max_idx] 98 | best$idx = w[max_idx] 99 | } 100 | return(best) 101 | } 102 | -------------------------------------------------------------------------------- /R/selector.cfs.R: -------------------------------------------------------------------------------- 1 | ### CFS 2 | # classification and regression 3 | # continous and discrete data 4 | cfs <- function(formula, data) { 5 | cont_correlation <- function(a, b) { 6 | result = 0 7 | if(!is.factor(a) && !is.factor(b)) { # both continous 8 | complete = complete.cases(a) & complete.cases(b) 9 | if(!any(complete)) 10 | return(0) 11 | vec1 = a[complete] 12 | vec2 = b[complete] 13 | if(sd(vec1) == 0 || sd(vec2) == 0) 14 | return(0) 15 | result = cor(vec1, vec2) 16 | } else if(is.factor(a) && is.factor(b)) { # both discrete 17 | tab = table(a, b) 18 | alevels = rownames(tab) 19 | blevels = colnames(tab) 20 | 21 | result = sum(sapply(alevels, function(avalue) { 22 | avec = as.numeric(a == avalue) 23 | complete_a = 
complete.cases(a) 24 | return(sum(sapply(blevels, function(bvalue) { 25 | bvec = as.numeric(b == bvalue) 26 | complete_b = complete.cases(b) 27 | complete = complete_a & complete_b 28 | avec_complete_data = avec[complete] 29 | bvec_complete_data = bvec[complete] 30 | if(sd(avec_complete_data, na.rm=TRUE) == 0 || sd(bvec_complete_data, na.rm=TRUE) == 0) 31 | return(0) 32 | return(tab[avalue, bvalue] / length(a) * cor(avec_complete_data, bvec_complete_data)) 33 | }))) 34 | })) 35 | } else { # continous and discrete 36 | cont = NULL; 37 | disc = NULL; 38 | if(is.factor(a)) { 39 | cont = b 40 | disc = a 41 | } else { 42 | cont = a 43 | disc = b 44 | } 45 | 46 | cont_complete = complete.cases(cont) 47 | disc_table = table(disc) 48 | disc_levels = names(disc_table) 49 | 50 | if(length(disc_levels) == 0) { 51 | result = 0 52 | } else { 53 | result = sum(sapply(disc_levels, function(lev) { 54 | disc_vec = as.numeric(disc == lev) 55 | disc_vec_complete = complete.cases(disc_vec) 56 | complete = cont_complete & disc_vec_complete 57 | disc_vec_complete_data = disc_vec[complete] 58 | cont_complete_data = cont[complete] 59 | if(sd(cont_complete_data) == 0) 60 | return(0) 61 | return(disc_table[lev] / length(disc) * cor(disc_vec_complete_data, cont_complete_data)) 62 | })) 63 | } 64 | } 65 | return(result) 66 | } 67 | 68 | # uses parent.env (correlations) 69 | get_correlation <- function(attr1, attr2, classification, new_data, entropies) { 70 | #lazy evaluation 71 | if(!is.na(correlations[attr1, attr2])) { 72 | return(correlations[attr1, attr2]) 73 | } 74 | 75 | tmp_res = NA 76 | if(classification) { #discrete class 77 | tmp_res = 2.0 * (entropies[attr1] + entropies[attr2] - entropyHelper(data.frame(cbind(new_data[[attr1]], new_data[[attr2]])))) / (entropies[attr1] + entropies[attr2]) 78 | } else { #continous class 79 | tmp_res = cont_correlation(new_data[[attr1]], new_data[[attr2]]) 80 | } 81 | if(is.nan(tmp_res)) { 82 | # all entropies (individual + joint) are 0 83 | tmp_res = 0 84 | } 85 | 86 | correlations[attr1, attr2] <<- tmp_res 87 | correlations[attr2, attr1] <<- tmp_res 88 | 89 | return(tmp_res) 90 | } 91 | 92 | # uses parent.env 93 | evaluator <- function(attrs) { 94 | ff_sum = 0 95 | ff_count = 0 96 | fc_sum = 0 97 | attr_count = length(attrs) 98 | 99 | if(attr_count <= 0) 100 | stop("Attributes not specified") 101 | 102 | for(i in 1:attr_count) { 103 | attr1 = attrs[i] 104 | 105 | # feature-class correlation 106 | cor = get_correlation(attr1, 1, classification, new_data, entropies) 107 | fc_sum = fc_sum + cor 108 | 109 | # feature-feature correlation 110 | if(i == attr_count) { 111 | next() 112 | } 113 | for(j in (i+1):attr_count) { 114 | attr2 = attrs[j] 115 | cor = get_correlation(attr1, attr2, classification, new_data, entropies) 116 | ff_count = ff_count + 1 117 | ff_sum = ff_sum + cor 118 | } 119 | } 120 | 121 | ff_cor = ff_sum / ff_count 122 | fc_cor = fc_sum / attr_count 123 | 124 | if(attr_count == 1) 125 | return(fc_cor) 126 | else 127 | return(attr_count * fc_cor / sqrt(attr_count + attr_count * (attr_count - 1) * ff_cor)) 128 | } 129 | 130 | new_data = get.data.frame.from.formula(formula, data) 131 | 132 | # prepare correlation matrix 133 | classification = is.factor(new_data[[1]]) 134 | attr_count = dim(new_data)[2] 135 | attr_names = colnames(new_data) 136 | correlations = matrix(rep(NA, attr_count ^ 2), nrow = attr_count, ncol = attr_count, 137 | dimnames = list(attr_names, attr_names)) 138 | 139 | entropies = NULL 140 | if(classification) { 141 | new_data = 
supervised.discretization(formula, data = new_data) 142 | new_data = get.data.frame.from.formula(formula, new_data) 143 | entropies = sapply(new_data, entropyHelper) 144 | } 145 | 146 | result = best.first.search(names(new_data)[-1], evaluator) 147 | 148 | return(result) 149 | } 150 | -------------------------------------------------------------------------------- /R/selector.chi.squared.R: -------------------------------------------------------------------------------- 1 | ### CHI-SQUARED 2 | # classification and regression 3 | # continous and discrete data 4 | chi.squared <- function(formula, data) { 5 | 6 | new_data = get.data.frame.from.formula(formula, data) 7 | new_data = discretize.all(formula,new_data) 8 | 9 | class_data = new_data[[1]] 10 | new_data = new_data[-1] #new_data without class attr 11 | 12 | results = sapply(new_data, function(w) { 13 | cont = table(class_data, w) 14 | row_sums = apply(cont, 1, sum) 15 | col_sums = apply(cont, 2, sum) 16 | all_sum = sum(col_sums) 17 | expected_matrix = t(as.matrix(col_sums) %*% t(as.matrix(row_sums))) / all_sum 18 | chis = sum((cont - expected_matrix) ^ 2 / expected_matrix) 19 | 20 | if(chis == 0 || length(col_sums) < 2 || length (row_sums) < 2) { 21 | return(0) 22 | } else { 23 | # phi or Cramer's V 24 | return(sqrt(chis / (all_sum * min(length(col_sums) - 1, length(row_sums) - 1)))) 25 | } 26 | }) 27 | 28 | attr_names = dimnames(new_data)[[2]] 29 | return(data.frame(attr_importance = results, row.names = attr_names)) 30 | } 31 | -------------------------------------------------------------------------------- /R/selector.consistency.R: -------------------------------------------------------------------------------- 1 | ### Consistency 2 | # classification and regression 3 | # continous and discrete data 4 | consistency <- function(formula, data) { 5 | # uses parent.env 6 | evaluator <- function(attrs) { 7 | attr_data = new_data[, attrs, drop=FALSE] 8 | 9 | #deprecated 10 | #attr_data = as.data.frame(lapply(attr_data, function(vec) { addNA(vec, ifany=TRUE) })) 11 | #result = aggregate(new_data[[1]], as.list(attr_data), function(classes) { 12 | 13 | hashvec = as.factor(apply(attr_data, 1, digest)) 14 | result = aggregate(new_data[[1]], list(hash=hashvec), function(classes) { 15 | return(max(as.vector(table(classes)))) 16 | }) 17 | result = sum(result[[dim(result)[2]]]) / dim(attr_data)[1] 18 | return(result) 19 | } 20 | new_data = get.data.frame.from.formula(formula, data) 21 | new_data = discretize.all(formula, new_data) 22 | column_names = names(new_data) 23 | 24 | result = best.first.search(column_names[-1], evaluator) 25 | return(result) 26 | } 27 | -------------------------------------------------------------------------------- /R/selector.correlation.R: -------------------------------------------------------------------------------- 1 | ### correlation.body 2 | # regression 3 | # continous data 4 | correlation.body <- function(formula, data, type = c("pearson", "spearman")) { 5 | type = match.arg(type) 6 | 7 | new_data = get.data.frame.from.formula(formula, data) 8 | 9 | lapply(new_data, function(vec) { 10 | if(is.factor(vec)) 11 | stop("All data must be continous.") 12 | }) 13 | 14 | class_data = new_data[[1]] 15 | new_data = new_data[-1] #new_data without class attr 16 | 17 | class_data_complete = complete.cases(class_data) 18 | results = abs(sapply(new_data, function(attr_data) { 19 | complete = complete.cases(attr_data) & class_data_complete 20 | if(!any(complete)) 21 | return(NA) 22 | vec1 = class_data[complete] 23 | vec2 = 
attr_data[complete] 24 | if(sd(vec1) == 0 || sd(vec2) == 0) 25 | return(NA) 26 | return(cor(vec1, vec2, method = type)) 27 | })) 28 | 29 | attr_names = dimnames(new_data)[[2]] 30 | return(data.frame(attr_importance = results, row.names = attr_names)) 31 | } 32 | 33 | linear.correlation <- function(formula, data) { 34 | return(correlation.body(formula, data, "pearson")) 35 | } 36 | 37 | rank.correlation <- function(formula, data) { 38 | return(correlation.body(formula, data, "spearman")) 39 | } 40 | -------------------------------------------------------------------------------- /R/selector.info.gain.R: -------------------------------------------------------------------------------- 1 | ### INFORMATION GAIN BASED ALGORITHMS 2 | # classification and regression 3 | # continous and discrete data 4 | 5 | information.gain <- function(formula, data, unit = "log") { 6 | information.gain.body(formula, data, type = "infogain", unit) 7 | } 8 | 9 | gain.ratio <- function(formula, data, unit = "log") { 10 | information.gain.body(formula, data, type = "gainratio", unit) 11 | } 12 | 13 | symmetrical.uncertainty <- function(formula, data, unit = "log") { 14 | information.gain.body(formula, data, type = "symuncert", unit) 15 | } 16 | 17 | information.gain.body <- function(formula, data, type = c("infogain", "gainratio", "symuncert"), unit) { 18 | type = match.arg(type) 19 | new_data = get.data.frame.from.formula(formula, data) 20 | new_data = discretize.all(formula, new_data) 21 | 22 | attr_entropies = sapply(new_data, entropyHelper, unit) 23 | class_entropy = attr_entropies[1] 24 | attr_entropies = attr_entropies[-1] 25 | joint_entropies = sapply(new_data[-1], function(t) { 26 | entropyHelper(data.frame(cbind(new_data[[1]], t)), unit) 27 | }) 28 | results = class_entropy + attr_entropies - joint_entropies 29 | 30 | if(type == "gainratio") { 31 | results = ifelse(attr_entropies == 0, 0, results / attr_entropies) 32 | } else if(type == "symuncert") { 33 | results = 2 * results / (attr_entropies + class_entropy) 34 | } 35 | 36 | attr_names = dimnames(new_data)[[2]][-1] 37 | return(data.frame(attr_importance = results, row.names = attr_names)) 38 | } 39 | -------------------------------------------------------------------------------- /R/selector.oneR.R: -------------------------------------------------------------------------------- 1 | ### 1R 2 | # classification and regression 3 | # continous and discrete data 4 | oneR <- function(formula, data) { 5 | 6 | new_data = get.data.frame.from.formula(formula, data) 7 | new_data = discretize.all(formula,new_data) 8 | 9 | class_data = new_data[[1]] 10 | new_data = new_data[-1] #new_data without class attr 11 | 12 | results = sapply(new_data, function(vec) { 13 | vec = factor(vec) 14 | errors = sapply(levels(vec), function(val) { 15 | cvaluestab = as.vector(table(class_data[ which(vec == val) ])) 16 | return(sum(cvaluestab[ -which.max(cvaluestab) ]) / length(class_data)) 17 | }) 18 | return(sum(errors)) 19 | }) 20 | 21 | results = max(results) + min(results) - results 22 | 23 | attr_names = dimnames(new_data)[[2]] 24 | return(data.frame(attr_importance = results, row.names = attr_names)) 25 | } 26 | -------------------------------------------------------------------------------- /R/selector.random.forest.R: -------------------------------------------------------------------------------- 1 | ### RANDOM FOREST 2 | # classification and regression 3 | # continous and discrete data 4 | # NA deleted 5 | random.forest.importance <- function(formula, data, importance.type = 
1) { 6 | new_data = get.data.frame.from.formula(formula, data) 7 | 8 | # get rid of NAs 9 | no_na = rep(TRUE, dim(new_data)[1]) 10 | for(i in 1:dim(new_data)[2]) { 11 | no_na = no_na & complete.cases(new_data[, i]) 12 | } 13 | new_data = new_data[no_na, , drop=FALSE] 14 | 15 | forest = randomForest(formula, new_data, 16 | ntree = 1000, keep.forest = FALSE, importance = TRUE) 17 | 18 | res = as.data.frame(importance(forest, type = importance.type)) 19 | colnames(res)[1] = "attr_importance" 20 | return(res) 21 | } 22 | -------------------------------------------------------------------------------- /R/selector.relief.R: -------------------------------------------------------------------------------- 1 | ### RELIEF 2 | # classification and regression 3 | # continous and discrete data 4 | relief <- function(formula, data, neighbours.count = 5, sample.size = 10) { 5 | # uses parent.env 6 | find_neighbours <- function(instance_idx) { 7 | instance = new_data[instance_idx,, drop = FALSE] 8 | 9 | # for every other instance 10 | for(current_idx in 1:instances_count) { 11 | if(instance_idx == current_idx) 12 | next() 13 | current_instance = new_data[current_idx,, drop = FALSE] 14 | if(is.na(current_instance[1, 1])) 15 | next() 16 | 17 | dist = instance_distance(instance, current_instance) 18 | 19 | if(classification) 20 | class_no = which(classes == current_instance[[1]]) 21 | else 22 | class_no = 1 23 | if(nn_stored_count[class_no] < neighbours.count) { 24 | nn_stored_count[class_no] <<- nn_stored_count[class_no] + 1 25 | n_array[class_no, nn_stored_count[class_no], ] <<- c(dist, current_idx) 26 | } else { 27 | max_idx = which.max(n_array[class_no, , 1]) 28 | max_value = n_array[class_no, max_idx, 1] 29 | if(dist < max_value) { 30 | n_array[class_no, max_idx, ] <<- c(dist, current_idx) 31 | } 32 | } 33 | } 34 | } 35 | 36 | # uses parent.env 37 | update_weights <- function(instance_idx) { 38 | instance = new_data[instance_idx,, drop = FALSE] 39 | instance_class = instance[1, 1] 40 | instance_class_no = which(classes == instance_class) 41 | 42 | if(classification) { 43 | # for each attribute 44 | for(attr_idx in 1:attributes_count) { 45 | col_idx = attr_idx + 1 46 | 47 | # nearest hits 48 | hits_sum = 0 49 | if(nn_stored_count[instance_class_no] > 0) { 50 | hits_sum = sum(sapply(1:nn_stored_count[instance_class_no], function(n_idx) { 51 | n_instance_idx = n_array[instance_class_no, n_idx, 2] 52 | n_instance = new_data[n_instance_idx,, drop = FALSE] 53 | return(field_distance(col_idx, instance, n_instance)) 54 | })) 55 | hits_sum = hits_sum / nn_stored_count[instance_class_no] 56 | } 57 | 58 | # nearest misses 59 | misses_sum = 0 60 | if(class_count > 1) { 61 | misses_sum = sum(sapply((1:class_count)[-instance_class_no], function(class_no) { 62 | class_misses_sum = 0 63 | if(nn_stored_count[class_no] > 0) { 64 | class_misses_sum = sum(sapply(1:nn_stored_count[class_no], function(n_idx) { 65 | n_instance_idx = n_array[class_no, n_idx, 2] 66 | n_instance = new_data[n_instance_idx,, drop = FALSE] 67 | return(field_distance(col_idx, instance, n_instance)) 68 | })) 69 | class_misses_sum = class_misses_sum * class_prob[class_no] / nn_stored_count[class_no] 70 | } 71 | return(class_misses_sum) 72 | })) 73 | 74 | 75 | misses_sum = misses_sum / (1 - class_prob[instance_class_no]) 76 | } 77 | results[attr_idx] <<- results[attr_idx] - hits_sum + misses_sum 78 | } 79 | } else { 80 | if(nn_stored_count[1] > 0) { 81 | for(n_idx in 1:nn_stored_count[1]) { 82 | n_instance_idx = n_array[1, n_idx, 2] 83 | n_instance = 
new_data[n_instance_idx,, drop = FALSE] 84 | class_diff = field_distance(1, instance, n_instance) 85 | ndc <<- ndc + class_diff / nn_stored_count[1] 86 | for(attr_idx in 1:attributes_count) { 87 | col_idx = attr_idx + 1 88 | attr_diff_norm = field_distance(col_idx, instance, n_instance) / nn_stored_count[1] 89 | nda[attr_idx] <<- nda[attr_idx] + attr_diff_norm 90 | ndcda[attr_idx] <<- ndcda[attr_idx] + class_diff * attr_diff_norm 91 | } 92 | } 93 | } 94 | } 95 | } 96 | 97 | # parameters: data.frame, data.frame 98 | instance_distance <- function(instance1, instance2) { 99 | len = dim(instance1)[2] 100 | if(len != dim(instance2)[2]) 101 | stop("Instances of different lengths") 102 | if(len <= 1) 103 | stop("Too few attributes") 104 | 105 | result = sapply(2:len, function(i) { 106 | return(field_distance(i, instance1, instance2)) 107 | }) 108 | #return(sqrt(sum(result ^ 2))) #sqrt not needed 109 | res = sum(result ^ 2) 110 | if(is.na(res)) { 111 | stop("Internal error. Distance NA.") 112 | } 113 | return(res) 114 | } 115 | 116 | # uses parent.env 117 | # parameters: index, data.frame, data.frame 118 | field_distance <- function(col_idx, instance1, instance2) { 119 | value1 = instance1[1, col_idx] 120 | value2 = instance2[1, col_idx] 121 | attr_idx = col_idx - 1 # skip class 122 | 123 | if(is.factor(value1) && is.factor(value2)) { 124 | if(is.na(value1) && is.na(value2)) { 125 | if(classification) 126 | return(1 - sum(p_val_in_class[[attr_idx]][, instance1[1, 1]] * p_val_in_class[[attr_idx]][, instance2[1, 1]])) 127 | else 128 | return(1 - p_same_val[[attr_idx]]) 129 | } else if(is.na(value1) || is.na(value2)) { 130 | if(is.na(value1)) { 131 | known_value = value2 132 | unknown_class = instance1[1, 1] 133 | } else { 134 | known_value = value1 135 | unknown_class = instance2[1, 1] 136 | } 137 | if(classification) 138 | return(1 - p_val_in_class[[attr_idx]][known_value, unknown_class]) 139 | else 140 | return(1 - p_val[[attr_idx]][known_value]) 141 | } else if(value1 == value2) { 142 | return(0) 143 | } else { #if(value1 != value2) 144 | return(1) 145 | } 146 | } else if(is.numeric(value1) && is.numeric(value2)) { 147 | if(is.na(value1) && is.na(value2)) { 148 | return(1) 149 | } else if(is.na(value1)) { 150 | return(max(value2, 1 - value2)) 151 | } else if(is.na(value2)) { 152 | return(max(value1, 1 - value1)) 153 | } else { 154 | return(abs(value1 - value2)) 155 | } 156 | } else { 157 | stop("Unsupported value type") 158 | } 159 | } 160 | 161 | new_data = get.data.frame.from.formula(formula, data) 162 | new_data = normalize.min.max(new_data) 163 | 164 | # for discrete classes 165 | class_vector = NULL 166 | class_count = NULL 167 | class_prob = NULL 168 | classes = NULL 169 | p_val_in_class = NULL 170 | p_val = NULL 171 | p_same_val = NULL 172 | 173 | # for continous class 174 | ndc = NULL 175 | nda = NULL 176 | ndcda = NULL 177 | 178 | results = NULL 179 | n_array = NULL 180 | nn_stored_count = NULL 181 | classification = NULL 182 | sample_instances_idx = NULL 183 | 184 | instances_count = dim(new_data)[1] 185 | attributes_count = dim(new_data)[2] - 1 186 | attr_names = dimnames(new_data)[[2]][-1] 187 | 188 | if(neighbours.count < 1) { 189 | neighbours.count = 1 190 | warning(paste("Assumed: neighbours.count = ", neighbours.count)) 191 | } 192 | 193 | if(sample.size < 1) { 194 | warning(paste("Assumed: sample.size = ", sample.size)) 195 | sample.size = 1 196 | sample_instances_idx = sample(1:instances_count, 1) 197 | } else if(sample.size > instances_count) { 198 | warning(paste("Assumed: 
sample.size = ", sample.size)) 199 | sample.size = instances_count 200 | sample_instances_idx = 1:instances_count 201 | } else { 202 | sample_instances_idx = sort(sample(1:instances_count, sample.size, replace=TRUE)) 203 | } 204 | 205 | classification = is.factor(new_data[[1]]) 206 | if(classification) { 207 | class_vector = new_data[[1]] 208 | class_prob = table(class_vector) 209 | class_prob = class_prob / sum(class_prob) 210 | classes = names(class_prob) 211 | class_count = length(classes) 212 | 213 | p_val_in_class = lapply(new_data[-1], function(vec) { 214 | if(!is.factor(vec) || !any(is.na(vec))) 215 | return(NULL) 216 | tab = table(vec, class_vector) 217 | return(apply(tab, 2, function(x) { 218 | s = sum(x) 219 | if(s == 0) 220 | return(x) 221 | else 222 | return(x / s) 223 | })) 224 | }) 225 | } else { 226 | class_count = 1 227 | ndc = 0 228 | nda = array(0, attributes_count) 229 | ndcda = array(0, attributes_count) 230 | 231 | p_val = lapply(new_data[-1], function(vec) { 232 | if(!is.factor(vec) || !any(is.na(vec))) 233 | return(NULL) 234 | tab = table(vec) 235 | if(sum(tab) != 0) { 236 | tab = tab / sum(tab) 237 | } 238 | return(tab) 239 | }) 240 | p_same_val = lapply(p_val, function(attr) { 241 | if(is.null(attr)) 242 | return(NULL) 243 | return(sum(attr ^ 2)) 244 | }) 245 | } 246 | 247 | n_array = array(0, c(class_count, neighbours.count, 2)) 248 | nn_stored_count = array(0, class_count) 249 | results = rep(0, attributes_count) 250 | 251 | sapply(sample_instances_idx, function(current_instance_idx) { 252 | current_instance = new_data[current_instance_idx,, drop = FALSE] 253 | if(is.na(current_instance[[1]])) 254 | return(NULL) 255 | 256 | nn_stored_count[] <<- 0 257 | n_array[] <<- Inf 258 | find_neighbours(current_instance_idx) 259 | update_weights(current_instance_idx) 260 | }) 261 | 262 | 263 | if(classification) { 264 | results = results / sample.size 265 | return(data.frame(attr_importance = results, row.names = attr_names)) 266 | } else { 267 | results = ndcda / ndc - ((nda - ndcda) / (sample.size - ndc)) 268 | results = data.frame(attr_importance = results, row.names = attr_names) 269 | #results = normalize.min.max(results) 270 | return(results) 271 | } 272 | 273 | } 274 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | FSelector: Selecting attributes 2 | =============================== 3 | 4 | This package provides functions for selecting attributes from a given dataset. Attribute subset selection is the process of identifying and removing as much of the irrelevant and redundant information as possible. 5 | 6 | CRAN page: http://cran.r-project.org/web/packages/FSelector/index.html 7 | -------------------------------------------------------------------------------- /man/FSelector-package.Rd: -------------------------------------------------------------------------------- 1 | \name{FSelector-package} 2 | \alias{FSelector-package} 3 | \alias{FSelector} 4 | \docType{package} 5 | \title{ 6 | Package for selecting attributes 7 | } 8 | \description{ 9 | Package containing functions for selecting attributes from a given dataset and a destination attribute. 
10 | } 11 | \details{ 12 | This package contains: 13 | \itemize{ 14 | \item{-}{Algorithms for filtering attributes: cfs, chi.squared, information.gain, gain.ratio, symmetrical.uncertainty, linear.correlation, rank.correlation, oneR, relief, consistency, random.forest.importance} 15 | \item{-}{Algorithms for wrapping classifiers and search attribute subset space: best.first.search, backward.search, forward.search, hill.climbing.search} 16 | \item{-}{Algorithm for choosing a subset of attributes based on attributes' weights: cutoff.k, cutoff.k.percent, cutoff.biggest.diff} 17 | \item{-}{Algorithm for creating formulas: as.simple.formula} 18 | } 19 | } 20 | \author{ 21 | Piotr Romanski\cr 22 | Maintainer: Lars Kotthoff 23 | } 24 | \keyword{ package } 25 | -------------------------------------------------------------------------------- /man/as.simple.formula.Rd: -------------------------------------------------------------------------------- 1 | \name{as.simple.formula} 2 | \alias{as.simple.formula} 3 | \title{ Converting to formulas } 4 | \description{ 5 | Converts character vector of atrributes' names and destination attribute's name to a simple formula. 6 | } 7 | \usage{ 8 | as.simple.formula(attributes, class) 9 | } 10 | \arguments{ 11 | \item{attributes}{ character vector of attributes' names } 12 | \item{class}{ name of destination attribute } 13 | } 14 | \value{ 15 | A simple formula like "class ~ attr1 + attr2" 16 | } 17 | \author{ Piotr Romanski } 18 | \examples{ 19 | data(iris) 20 | result <- cfs(Species ~ ., iris) 21 | f <- as.simple.formula(result, "Species") 22 | } -------------------------------------------------------------------------------- /man/best.first.search.Rd: -------------------------------------------------------------------------------- 1 | \name{best.first.search} 2 | \alias{best.first.search} 3 | \title{ Best-first search } 4 | \description{ 5 | The algorithm for searching atrribute subset space. 6 | } 7 | \usage{ 8 | best.first.search(attributes, eval.fun, max.backtracks = 5) 9 | } 10 | \arguments{ 11 | \item{attributes}{ a character vector of all attributes to search in } 12 | \item{eval.fun}{ a function taking as first parameter a character vector of all attributes and returning a numeric indicating how important a given subset is } 13 | \item{max.backtracks}{ an integer indicating a maximum allowed number of backtracks, default is 5 } 14 | } 15 | \details{ 16 | The algorithm is similar to \code{\link{forward.search}} besides the fact that is chooses the best node from all already evaluated ones and evaluates it. The selection of the best node is repeated approximately \code{max.backtracks} times in case no better node found. 17 | } 18 | \value{ 19 | A character vector of selected attributes. 
20 | } 21 | \author{ Piotr Romanski } 22 | \seealso{ \code{\link{forward.search}}, \code{\link{backward.search}}, \code{\link{hill.climbing.search}}, \code{\link{exhaustive.search}} } 23 | \examples{ 24 | library(rpart) 25 | data(iris) 26 | 27 | evaluator <- function(subset) { 28 | #k-fold cross validation 29 | k <- 5 30 | splits <- runif(nrow(iris)) 31 | results = sapply(1:k, function(i) { 32 | test.idx <- (splits >= (i - 1) / k) & (splits < i / k) 33 | train.idx <- !test.idx 34 | test <- iris[test.idx, , drop=FALSE] 35 | train <- iris[train.idx, , drop=FALSE] 36 | tree <- rpart(as.simple.formula(subset, "Species"), train) 37 | error.rate = sum(test$Species != predict(tree, test, type="c")) / nrow(test) 38 | return(1 - error.rate) 39 | }) 40 | print(subset) 41 | print(mean(results)) 42 | return(mean(results)) 43 | } 44 | 45 | subset <- best.first.search(names(iris)[-5], evaluator) 46 | f <- as.simple.formula(subset, "Species") 47 | print(f) 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /man/cfs.Rd: -------------------------------------------------------------------------------- 1 | \name{cfs} 2 | \alias{cfs} 3 | \title{ CFS filter } 4 | \description{ 5 | The algorithm finds attribute subset using correlation and entropy measures for continous and discrete data. 6 | } 7 | \usage{ 8 | cfs(formula, data) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process }} 13 | \details{ 14 | The alorithm makes use of \code{\link{best.first.search}} for searching the attribute subset space. 15 | } 16 | \seealso{ \code{\link{best.first.search}} } 17 | \value{ 18 | a character vector containing chosen attributes 19 | } 20 | \author{ Piotr Romanski } 21 | \examples{ 22 | data(iris) 23 | 24 | subset <- cfs(Species~., iris) 25 | f <- as.simple.formula(subset, "Species") 26 | print(f) 27 | } -------------------------------------------------------------------------------- /man/chi.squared.Rd: -------------------------------------------------------------------------------- 1 | \name{chi.squared} 2 | \alias{chi.squared} 3 | \title{ Chi-squared filter } 4 | \description{ 5 | The algorithm finds weights of discrete attributes basing on a chi-squared test. 6 | } 7 | \usage{ 8 | chi.squared(formula, data) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ a symbolic description of a model } 13 | } 14 | \details{ 15 | The result is equal to Cramer's V coefficient between source attributes and destination attribute. 16 | } 17 | \value{ 18 | a data.frame containing the worth of attributes in the first column and their names as row names 19 | } 20 | \author{ Piotr Romanski } 21 | \examples{ 22 | library(mlbench) 23 | data(HouseVotes84) 24 | 25 | weights <- chi.squared(Class~., HouseVotes84) 26 | print(weights) 27 | subset <- cutoff.k(weights, 5) 28 | f <- as.simple.formula(subset, "Class") 29 | print(f) 30 | } 31 | -------------------------------------------------------------------------------- /man/consistency.Rd: -------------------------------------------------------------------------------- 1 | \name{consistency} 2 | \alias{consistency} 3 | \title{ Consistency-based filter } 4 | \description{ 5 | The algorithm finds attribute subset using consistency measure for continous and discrete data. 
6 | } 7 | \usage{ 8 | consistency(formula, data) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process } 13 | } 14 | \details{ 15 | The alorithm makes use of \code{\link{best.first.search}} for searching the attribute subset space. 16 | } 17 | \seealso{ \code{\link{best.first.search}} } 18 | \value{ 19 | a character vector containing chosen attributes 20 | } 21 | \author{ Piotr Romanski } 22 | \examples{ 23 | \dontrun{ 24 | library(mlbench) 25 | data(HouseVotes84) 26 | 27 | subset <- consistency(Class~., HouseVotes84) 28 | f <- as.simple.formula(subset, "Class") 29 | print(f) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/correlation.Rd: -------------------------------------------------------------------------------- 1 | \name{correlation} 2 | \alias{linear.correlation} 3 | \alias{rank.correlation} 4 | \title{ Correlation filter} 5 | \description{ 6 | The algorithm finds weights of continous attributes basing on their correlation with continous class attribute. 7 | } 8 | \usage{ 9 | linear.correlation(formula, data) 10 | rank.correlation(formula, data) 11 | } 12 | \arguments{ 13 | \item{formula}{ a symbolic description of a model } 14 | \item{data}{ data to process } 15 | } 16 | \details{ 17 | \code{linear.correlation} uses Pearson's correlation 18 | 19 | \code{rank.correlation} uses Spearman's correlation 20 | 21 | Rows with \code{NA} values are not taken into consideration. 22 | } 23 | \value{ 24 | a data.frame containing the worth of attributes in the first column and their names as row names 25 | } 26 | \author{ Piotr Romanski } 27 | \examples{ 28 | library(mlbench) 29 | data(BostonHousing) 30 | d=BostonHousing[-4] # only numeric variables 31 | 32 | weights <- linear.correlation(medv~., d) 33 | print(weights) 34 | subset <- cutoff.k(weights, 3) 35 | f <- as.simple.formula(subset, "medv") 36 | print(f) 37 | 38 | weights <- rank.correlation(medv~., d) 39 | print(weights) 40 | subset <- cutoff.k(weights, 3) 41 | f <- as.simple.formula(subset, "medv") 42 | print(f) 43 | } 44 | -------------------------------------------------------------------------------- /man/cutoff.Rd: -------------------------------------------------------------------------------- 1 | \name{cutoff} 2 | \alias{cutoff.k} 3 | \alias{cutoff.k.percent} 4 | \alias{cutoff.biggest.diff} 5 | \title{ Cutoffs } 6 | \description{ 7 | The algorithms select a subset from a ranked attributes. 8 | } 9 | \usage{ 10 | cutoff.k(attrs, k) 11 | cutoff.k.percent(attrs, k) 12 | cutoff.biggest.diff(attrs) 13 | } 14 | \arguments{ 15 | \item{attrs}{ a data.frame containing ranks for attributes in the first column and their names as row names } 16 | \item{k}{ a positive integer in case of \code{cutoff.k} and a numeric between 0 and 1 in case of \code{cutoff.k.percent} } 17 | } 18 | \details{ 19 | \code{cutoff.k} chooses k best attributes 20 | 21 | \code{cutoff.k.percent} chooses best k * 100\% of attributes 22 | 23 | \code{cutoff.biggest.diff} chooses a subset of attributes which are significantly better than other. 24 | } 25 | \value{ 26 | A character vector containing selected attributes. 
27 | } 28 | \author{ Piotr Romanski } 29 | \examples{ 30 | data(iris) 31 | 32 | weights <- information.gain(Species~., iris) 33 | print(weights) 34 | 35 | subset <- cutoff.k(weights, 1) 36 | f <- as.simple.formula(subset, "Species") 37 | print(f) 38 | 39 | subset <- cutoff.k.percent(weights, 0.75) 40 | f <- as.simple.formula(subset, "Species") 41 | print(f) 42 | 43 | subset <- cutoff.biggest.diff(weights) 44 | f <- as.simple.formula(subset, "Species") 45 | print(f) 46 | 47 | } 48 | -------------------------------------------------------------------------------- /man/exhaustive.search.Rd: -------------------------------------------------------------------------------- 1 | \name{exhaustive.search} 2 | \alias{exhaustive.search} 3 | \title{ Exhaustive search } 4 | \description{ 5 | The algorithm for searching atrribute subset space. 6 | } 7 | \usage{ 8 | exhaustive.search(attributes, eval.fun) 9 | } 10 | \arguments{ 11 | \item{attributes}{ a character vector of all attributes to search in } 12 | \item{eval.fun}{ a function taking as first parameter a character vector of all attributes and returning a numeric indicating how important a given subset is } 13 | } 14 | \details{ 15 | The algorithm searches the whole attribute subset space in breadth-first order. 16 | } 17 | \value{ 18 | A character vector of selected attributes. 19 | } 20 | \author{ Piotr Romanski } 21 | \seealso{ \code{\link{forward.search}}, \code{\link{backward.search}}, \code{\link{best.first.search}}, \code{\link{hill.climbing.search}} } 22 | \examples{ 23 | library(rpart) 24 | data(iris) 25 | 26 | evaluator <- function(subset) { 27 | #k-fold cross validation 28 | k <- 5 29 | splits <- runif(nrow(iris)) 30 | results = sapply(1:k, function(i) { 31 | test.idx <- (splits >= (i - 1) / k) & (splits < i / k) 32 | train.idx <- !test.idx 33 | test <- iris[test.idx, , drop=FALSE] 34 | train <- iris[train.idx, , drop=FALSE] 35 | tree <- rpart(as.simple.formula(subset, "Species"), train) 36 | error.rate = sum(test$Species != predict(tree, test, type="c")) / nrow(test) 37 | return(1 - error.rate) 38 | }) 39 | print(subset) 40 | print(mean(results)) 41 | return(mean(results)) 42 | } 43 | 44 | subset <- exhaustive.search(names(iris)[-5], evaluator) 45 | f <- as.simple.formula(subset, "Species") 46 | print(f) 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/greedy.search.Rd: -------------------------------------------------------------------------------- 1 | \name{greedy.search} 2 | \alias{backward.search} 3 | \alias{forward.search} 4 | \title{ Greedy search } 5 | \description{ 6 | The algorithms for searching atrribute subset space. 7 | } 8 | \usage{ 9 | backward.search(attributes, eval.fun) 10 | forward.search(attributes, eval.fun) 11 | } 12 | \arguments{ 13 | \item{attributes}{ a character vector of all attributes to search in } 14 | \item{eval.fun}{ a function taking as first parameter a character vector of all attributes and returning a numeric indicating how important a given subset is } 15 | } 16 | \details{ 17 | These algorithms implement greedy search. At first, the algorithms expand starting node, evaluate its children and choose the best one which becomes a new starting node. This process goes only in one direction. \code{forward.search} starts from an empty and \code{backward.search} from a full set of attributes. 18 | } 19 | \value{ 20 | A character vector of selected attributes. 
21 | } 22 | \author{ Piotr Romanski } 23 | \seealso{ \code{\link{best.first.search}}, \code{\link{hill.climbing.search}}, \code{\link{exhaustive.search}} } 24 | \examples{ 25 | library(rpart) 26 | data(iris) 27 | 28 | evaluator <- function(subset) { 29 | #k-fold cross validation 30 | k <- 5 31 | splits <- runif(nrow(iris)) 32 | results = sapply(1:k, function(i) { 33 | test.idx <- (splits >= (i - 1) / k) & (splits < i / k) 34 | train.idx <- !test.idx 35 | test <- iris[test.idx, , drop=FALSE] 36 | train <- iris[train.idx, , drop=FALSE] 37 | tree <- rpart(as.simple.formula(subset, "Species"), train) 38 | error.rate = sum(test$Species != predict(tree, test, type="c")) / nrow(test) 39 | return(1 - error.rate) 40 | }) 41 | print(subset) 42 | print(mean(results)) 43 | return(mean(results)) 44 | } 45 | 46 | subset <- forward.search(names(iris)[-5], evaluator) 47 | f <- as.simple.formula(subset, "Species") 48 | print(f) 49 | 50 | 51 | } 52 | -------------------------------------------------------------------------------- /man/hill.climbing.search.Rd: -------------------------------------------------------------------------------- 1 | \name{hill.climbing.search} 2 | \alias{hill.climbing.search} 3 | \title{ Hill climbing search } 4 | \description{ 5 | The algorithm for searching atrribute subset space. 6 | } 7 | \usage{ 8 | hill.climbing.search(attributes, eval.fun) 9 | } 10 | \arguments{ 11 | \item{attributes}{ a character vector of all attributes to search in } 12 | \item{eval.fun}{ a function taking as first parameter a character vector of all attributes and returning a numeric indicating how important a given subset is } 13 | } 14 | \details{ 15 | The algorithm starts with a random attribute set. Then it evaluates all its neighbours and chooses the best one. It might be susceptible to local maximum. 16 | } 17 | \value{ 18 | A character vector of selected attributes. 19 | } 20 | \author{ Piotr Romanski } 21 | \seealso{ \code{\link{forward.search}}, \code{\link{backward.search}}, \code{\link{best.first.search}}, \code{\link{exhaustive.search}} } 22 | \examples{ 23 | library(rpart) 24 | data(iris) 25 | 26 | evaluator <- function(subset) { 27 | #k-fold cross validation 28 | k <- 5 29 | splits <- runif(nrow(iris)) 30 | results = sapply(1:k, function(i) { 31 | test.idx <- (splits >= (i - 1) / k) & (splits < i / k) 32 | train.idx <- !test.idx 33 | test <- iris[test.idx, , drop=FALSE] 34 | train <- iris[train.idx, , drop=FALSE] 35 | tree <- rpart(as.simple.formula(subset, "Species"), train) 36 | error.rate = sum(test$Species != predict(tree, test, type="c")) / nrow(test) 37 | return(1 - error.rate) 38 | }) 39 | print(subset) 40 | print(mean(results)) 41 | return(mean(results)) 42 | } 43 | 44 | subset <- hill.climbing.search(names(iris)[-5], evaluator) 45 | f <- as.simple.formula(subset, "Species") 46 | print(f) 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/information.gain.Rd: -------------------------------------------------------------------------------- 1 | \name{entropy.based} 2 | \alias{information.gain} 3 | \alias{gain.ratio} 4 | \alias{symmetrical.uncertainty} 5 | \title{ Entropy-based filters } 6 | \description{ 7 | The algorithms find weights of discrete attributes basing on their correlation with continous class attribute. 
8 | } 9 | \usage{ 10 | information.gain(formula, data, unit) 11 | gain.ratio(formula, data, unit) 12 | symmetrical.uncertainty(formula, data, unit) 13 | } 14 | \arguments{ 15 | \item{formula}{ A symbolic description of a model. } 16 | \item{data}{ Data to process. } 17 | \item{unit}{ Unit for computing entropy (passed to \code{\link[entropy]{entropy}}. Default is "log".} 18 | } 19 | \details{ 20 | \code{information.gain} is \deqn{H(Class) + H(Attribute) - H(Class, Attribute)}{H(Class) + H(Attribute) - H(Class, Attribute)}. 21 | 22 | \code{gain.ratio} is \deqn{\frac{H(Class) + H(Attribute) - H(Class, Attribute)}{H(Attribute)}}{(H(Class) + H(Attribute) - H(Class, Attribute)) / H(Attribute)} 23 | 24 | \code{symmetrical.uncertainty} is \deqn{2\frac{H(Class) + H(Attribute) - H(Class, Attribute)}{H(Attribute) + H(Class)}}{2 * (H(Class) + H(Attribute) - H(Class, Attribute)) / (H(Attribute) + H(Class))} 25 | } 26 | \value{ 27 | a data.frame containing the worth of attributes in the first column and their names as row names 28 | } 29 | \author{ Piotr Romanski, Lars Kotthoff } 30 | \examples{ 31 | data(iris) 32 | 33 | weights <- information.gain(Species~., iris) 34 | print(weights) 35 | subset <- cutoff.k(weights, 2) 36 | f <- as.simple.formula(subset, "Species") 37 | print(f) 38 | 39 | weights <- information.gain(Species~., iris, unit = "log2") 40 | print(weights) 41 | 42 | weights <- gain.ratio(Species~., iris) 43 | print(weights) 44 | subset <- cutoff.k(weights, 2) 45 | f <- as.simple.formula(subset, "Species") 46 | print(f) 47 | 48 | weights <- symmetrical.uncertainty(Species~., iris) 49 | print(weights) 50 | subset <- cutoff.biggest.diff(weights) 51 | f <- as.simple.formula(subset, "Species") 52 | print(f) 53 | 54 | } 55 | -------------------------------------------------------------------------------- /man/oneR.Rd: -------------------------------------------------------------------------------- 1 | \name{oneR} 2 | \alias{oneR} 3 | \title{ OneR algorithm } 4 | \description{ 5 | The algorithms find weights of discrete attributes basing on very simple association rules involving only one attribute in condition part. 6 | } 7 | \usage{ 8 | oneR(formula, data) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process } 13 | } 14 | \details{ 15 | The algorithm uses OneR classifier to find out the attributes' weights. For each attribute it creates a simple rule based only on that attribute and then calculates its error rate. 16 | } 17 | \value{ 18 | a data.frame containing the worth of attributes in the first column and their names as row names 19 | } 20 | \author{ Piotr Romanski } 21 | \examples{ 22 | library(mlbench) 23 | data(HouseVotes84) 24 | 25 | weights <- oneR(Class~., HouseVotes84) 26 | print(weights) 27 | subset <- cutoff.k(weights, 5) 28 | f <- as.simple.formula(subset, "Class") 29 | print(f) 30 | } 31 | -------------------------------------------------------------------------------- /man/random.forest.importance.Rd: -------------------------------------------------------------------------------- 1 | \name{random.forest.importance} 2 | \alias{random.forest.importance} 3 | \title{ RandomForest filter } 4 | \description{ 5 | The algorithm finds weights of attributes using RandomForest algorithm. 
6 | } 7 | \usage{ 8 | random.forest.importance(formula, data, importance.type = 1) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process } 13 | \item{importance.type}{ either 1 or 2, specifying the type of importance measure (1=mean decrease in accuracy, 2=mean decrease in node impurity) } 14 | } 15 | \details{ 16 | This is a wrapper for \code{\link[randomForest]{importance}.} 17 | } 18 | \value{ 19 | a data.frame containing the worth of attributes in the first column and their names as row names 20 | } 21 | \author{ Piotr Romanski } 22 | \examples{ 23 | library(mlbench) 24 | data(HouseVotes84) 25 | 26 | weights <- random.forest.importance(Class~., HouseVotes84, importance.type = 1) 27 | print(weights) 28 | subset <- cutoff.k(weights, 5) 29 | f <- as.simple.formula(subset, "Class") 30 | print(f) 31 | } 32 | -------------------------------------------------------------------------------- /man/relief.Rd: -------------------------------------------------------------------------------- 1 | \name{relief} 2 | \alias{relief} 3 | \title{ RReliefF filter } 4 | \description{ 5 | The algorithm finds weights of continous and discrete attributes basing on a distance between instances. 6 | } 7 | \usage{ 8 | relief(formula, data, neighbours.count = 5, sample.size = 10) 9 | } 10 | \arguments{ 11 | \item{formula}{ a symbolic description of a model } 12 | \item{data}{ data to process } 13 | \item{neighbours.count}{ number of neighbours to find for every sampled instance } 14 | \item{sample.size}{ number of instances to sample } 15 | } 16 | \details{ 17 | The algorithm samples instances and finds their nearest hits and misses. Considering that result, it evaluates weights of attributes. 18 | } 19 | \references{ 20 | \itemize{ 21 | \item{-}{Igor Kononenko: Estimating Attributes: Analysis and Extensions of RELIEF. In: European Conference on Machine Learning, 171-182, 1994.} 22 | \item{-}{Marko Robnik-Sikonja, Igor Kononenko: An adaptation of Relief for attribute estimation in regression. In: Fourteenth International Conference on Machine Learning, 296-304, 1997.} 23 | } 24 | } 25 | \value{ 26 | a data.frame containing the worth of attributes in the first column and their names as row names 27 | } 28 | \author{ Piotr Romanski } 29 | \examples{ 30 | data(iris) 31 | 32 | weights <- relief(Species~., iris, neighbours.count = 5, sample.size = 20) 33 | print(weights) 34 | subset <- cutoff.k(weights, 2) 35 | f <- as.simple.formula(subset, "Species") 36 | print(f) 37 | } 38 | --------------------------------------------------------------------------------
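A supplementary usage sketch (not part of the original package documentation; the parameter values are chosen purely for illustration): relief() also accepts a continuous class attribute, in which case the RReliefF regression variant is used, whereas the man page above only demonstrates the classification case. Assuming the suggested mlbench package is installed:

library(FSelector)
library(mlbench)                 # suggested dependency, assumed to be installed
data(BostonHousing)

# RReliefF-style weights for a regression target (medv is continuous)
weights <- relief(medv ~ ., BostonHousing, neighbours.count = 5, sample.size = 20)
print(weights)

subset <- cutoff.k(weights, 3)   # keep the three highest-ranked attributes
print(as.simple.formula(subset, "medv"))

The resulting weights can be post-processed with any of the cutoff functions, just as in the classification examples above.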