├── project
└── build.properties
├── images
├── NCL.png
├── SBC.png
├── ADASYN.png
├── IHTS.png
├── IPADE.png
├── MWMOTE.png
├── SMOTE.png
├── original.png
└── SafeLevelSMOTE.png
├── src
└── main
│ └── scala
│ └── soul
│ ├── data
│ ├── Data.scala
│ └── FileInfo.scala
│ ├── algorithm
│ ├── oversampling
│ │ ├── RO.scala
│ │ ├── SMOTE.scala
│ │ ├── SMOTEENN.scala
│ │ ├── SMOTETL.scala
│ │ ├── SafeLevelSMOTE.scala
│ │ ├── ADOMS.scala
│ │ ├── BorderlineSMOTE.scala
│ │ ├── ADASYN.scala
│ │ ├── MDO.scala
│ │ ├── SMOTERSB.scala
│ │ ├── Spider2.scala
│ │ ├── MWMOTE.scala
│ │ └── DBSMOTE.scala
│ └── undersampling
│ │ ├── RU.scala
│ │ ├── EE.scala
│ │ ├── ENN.scala
│ │ ├── OSS.scala
│ │ ├── IHTS.scala
│ │ ├── TL.scala
│ │ ├── ClusterOSS.scala
│ │ ├── CPM.scala
│ │ ├── NCL.scala
│ │ ├── BC.scala
│ │ ├── CNN.scala
│ │ ├── NM.scala
│ │ ├── EUS.scala
│ │ └── SBC.scala
│ ├── io
│ ├── Writer.scala
│ └── Reader.scala
│ └── util
│ └── KDTree.scala
└── README.md
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.2.3
2 |
--------------------------------------------------------------------------------
/images/NCL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/NCL.png
--------------------------------------------------------------------------------
/images/SBC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SBC.png
--------------------------------------------------------------------------------
/images/ADASYN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/ADASYN.png
--------------------------------------------------------------------------------
/images/IHTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/IHTS.png
--------------------------------------------------------------------------------
/images/IPADE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/IPADE.png
--------------------------------------------------------------------------------
/images/MWMOTE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/MWMOTE.png
--------------------------------------------------------------------------------
/images/SMOTE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SMOTE.png
--------------------------------------------------------------------------------
/images/original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/original.png
--------------------------------------------------------------------------------
/images/SafeLevelSMOTE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SafeLevelSMOTE.png
--------------------------------------------------------------------------------
/src/main/scala/soul/data/Data.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.data
18 |
19 | import scala.collection.mutable
20 |
/** Container for the dataset handled by the oversampling and undersampling algorithms.
  *
  * @param x        data associated to the file (x)
  * @param y        classes associated to the file (y)
  * @param index    randomIndex representing the kept elements
  * @param fileInfo object with the information needed to save the data into a file
  * @author Néstor Rodríguez Vico
  */
class Data private[soul](private[soul] val x: Array[Array[Any]], private[soul] val y: Array[Any],
                         private[soul] val index: Option[Array[Int]] = None, private[soul] val fileInfo: FileInfo) {

  // numeric representation of x; empty until populated elsewhere (presumably by the readers — confirm)
  private[soul] var processedData: Array[Array[Double]] = Array.empty[Array[Double]]
  // per-attribute mapping from the numeric encoding back to the original nominal value; empty until populated
  private[soul] var nomToNum: Array[mutable.Map[Double, Any]] = Array.empty[mutable.Map[Double, Any]]
}
35 |
--------------------------------------------------------------------------------
/src/main/scala/soul/data/FileInfo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.data
18 |
19 | import scala.collection.mutable
20 |
/** Data structure used by the arff classes
  *
  * @param _file             file containing the data
  * @param _comment          string indicating that a line is a comment
  * @param _columnClass      indicates which column represents the class in the file
  * @param _delimiter        string separating two elements
  * @param _missing          string indicating a element is missed
  * @param _header           header of the file. If it is _, there was no header
  * @param _relationName     name of the relation (arff files)
  * @param _attributes       map with the form: index -> attributeName
  * @param _attributesValues map with the form attributeName -> type (if it's nominal, possible values instead of type)
  * @param nominal           array to know which attributes are nominal
  * @author Néstor Rodríguez Vico
  */
class FileInfo private[soul](private[soul] val _file: String, private[soul] val _comment: String,
                             private[soul] val _columnClass: Int = -1,
                             private[soul] val _delimiter: String, private[soul] val _missing: String,
                             private[soul] val _header: Array[String], private[soul] val _relationName: String,
                             private[soul] val _attributes: mutable.Map[Int, String],
                             private[soul] val _attributesValues: mutable.Map[String, String],
                             private[soul] val nominal: Array[Int]) {

  // per-attribute maxima and minima, needed to denormalize the data; null until assigned elsewhere
  private[soul] var maxAttribs: Array[Double] = _
  private[soul] var minAttribs: Array[Double] = _

}
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/RO.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities._
21 |
22 | import scala.util.Random
23 |
/** Random Oversampling algorithm. Original paper: "A study of the behavior of several methods for balancing machine
  * learning training data" by Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina.
  *
  * @param data    data to work with
  * @param seed    seed to use. If it is not provided, it will use the system time
  * @param percent number of samples to create
  * @param verbose choose to display information about the execution or not
  * @author David López Pretel
  */
class RO(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, verbose: Boolean = false) {

  /** Compute the RO algorithm: replicate randomly chosen minority class samples.
    *
    * @return data structure with the original data plus the generated samples
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    if (percent < 0) {
      throw new Exception("Percent must be greater than 0")
    }

    val minorityClassIndex: Array[Int] = minority(data.y)
    val minorityClass: Any = data.y(minorityClassIndex(0))

    // output holds exactly `percent` generated samples
    val output: Array[Array[Double]] = Array.ofDim[Double](percent, data.processedData(0).length)

    val r: Random = new Random(seed)

    // each generated sample is an exact copy of a randomly picked minority class instance
    (0 until percent).par.foreach((i: Int) => {
      output(i) = data.processedData(minorityClassIndex(r.nextInt(minorityClassIndex.length)))
    })

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(data.x.length + output.length))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    // restore the nominal representation (or round to 2 decimals) before returning
    new Data(if (data.fileInfo.nominal.length == 0) {
      to2Decimals(Array.concat(data.processedData, output))
    } else {
      toNominal(Array.concat(data.processedData, output), data.nomToNum)
    }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/soul/io/Writer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.io
18 |
19 | import java.io.{File, PrintWriter}
20 |
21 | import soul.data.Data
22 |
23 | import scala.collection.immutable.ListMap
24 |
/** Class to write data files
  *
  * @author Néstor Rodríguez Vico
  */
object Writer {
  /** Store data into an arff file
    *
    * @param file filename where to store the data
    * @param data data to save to the file
    */
  def writeArff(file: String, data: Data): Unit = {
    // validate before touching the filesystem so an invalid call does not truncate an existing file
    if (data.fileInfo._attributes == null || data.fileInfo._attributesValues == null)
      throw new Exception("Unable to write arff: missing information")

    val pr = new PrintWriter(new File(file))
    try {
      pr.write("@relation %s\n".format(data.fileInfo._relationName))

      // attributes must be written in column order
      val orderedAttributes: Map[Int, String] = ListMap(data.fileInfo._attributes.toSeq.sortBy(_._1): _*)

      for (attribute <- orderedAttributes) {
        pr.write("@attribute %s %s\n".format(attribute._2, data.fileInfo._attributesValues(attribute._2)))
      }

      pr.write("@data\n")

      for (row <- data.x zip data.y) {
        // translate the internal missing-value marker to the arff representation
        val naIndex: Array[Int] = row._1.zipWithIndex.filter(_._1 == "soul_NA").map(_._2)
        val newRow: Array[Any] = row._1.clone()
        for (index <- naIndex) {
          newRow(index) = "?"
        }

        pr.write(newRow.mkString(",") + "," + row._2 + "\n")
      }
    } finally {
      // ensure the writer is released even if writing fails midway
      pr.close()
    }
  }

  /** Store data into a delimited text file
    *
    * @param file filename where to store the data
    * @param data data to save to the file
    */
  def writeDelimitedText(file: String, data: Data): Unit = {
    val delimiter: String = if (data.fileInfo._delimiter == null) "," else data.fileInfo._delimiter
    // fixed: the fallback previously used _delimiter instead of _missing
    val missing: String = if (data.fileInfo._missing == null) "?" else data.fileInfo._missing

    val pr = new PrintWriter(new File(file))
    try {
      if (data.fileInfo._header != null)
        pr.write(data.fileInfo._header.mkString(delimiter) + "\n")

      for (row <- data.x zip data.y) {
        // translate the internal missing-value marker to the configured missing string
        val naIndex: Array[Int] = row._1.zipWithIndex.filter(_._1 == "soul_NA").map(_._2)
        val newRow: Array[Any] = row._1.clone()
        for (index <- naIndex) {
          newRow(index) = missing
        }

        // fixed: the class column was previously always appended with "," regardless of the delimiter
        pr.write(newRow.mkString(delimiter) + delimiter + row._2 + "\n")
      }
    } finally {
      // ensure the writer is released even if writing fails midway
      pr.close()
    }
  }
}
--------------------------------------------------------------------------------
/src/main/scala/soul/util/KDTree.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.util
18 |
19 | import com.thesamet.spatial.{DimensionalOrdering, KDTreeMap, Metric}
20 |
21 | import scala.language.implicitConversions
22 | import scala.math.sqrt
23 |
/** Wrapper of a com.thesamet.spatial.KDTreeMap adapted for Arrays of Doubles
  *
  * @param x          data
  * @param y          labels
  * @param dimensions number of dimensions
  * @param which      if it's set to "nearest", return the nearest neighbours, if it sets "farthest", return the farthest ones
  * @author Néstor Rodríguez Vico
  */
class KDTree(x: Array[Array[Double]], y: Array[Any], dimensions: Int, which: String = "nearest") {

  // map from instance to (label, original position); the ordering decides whether findNearest
  // yields the closest ("nearest") or the most distant ("farthest") elements
  private[soul] var kDTreeMap: KDTreeMap[Array[Double], (Any, Int)] = {
    // x zip y.zipWithIndex already has the (point, (label, position)) shape fromSeq expects
    val entries: Seq[(Array[Double], (Any, Int))] = x zip y.zipWithIndex
    val ordering: DimensionalOrdering[Array[Double]] =
      if (which == "nearest") dimensionalOrderingForArray[Array[Double], Double](dimensions)
      else dimensionalReverseOrderingForArray[Array[Double], Double](dimensions)
    KDTreeMap.fromSeq(entries)(ordering)
  }

  /** Return the k neighbours of an instance: their data, their labels and their positions. */
  def nNeighbours(instance: Array[Double], k: Int, leaveOneOut: Boolean = true): (Seq[Array[Double]], Seq[Any], Seq[Int]) = {
    // when the query instance belongs to the tree, request one extra neighbour and drop
    // the first result (the instance itself)
    val requested: Int = if (leaveOneOut) k + 1 else k
    val skipped: Int = if (leaveOneOut) 1 else 0
    val (points, meta) = kDTreeMap.findNearest(instance, requested).drop(skipped).unzip
    val (labels, positions) = meta.unzip
    (points, labels, positions)
  }

  def apply(x: Array[Double]): (Any, Int) = kDTreeMap(x)

  /** Insert a new element into the tree. */
  def addElement(x: Array[Double], y: Any): Unit = {
    // NOTE(review): construction uses 0-based zipWithIndex positions, so size + 1 skips one
    // index — looks off by one; confirm against the callers before changing
    kDTreeMap = kDTreeMap + (x -> (y, kDTreeMap.size + 1))
  }

  /** Ordering that compares arrays component-wise in ascending order. */
  def dimensionalOrderingForArray[T <: Array[A], A](dim: Int)(implicit ord: Ordering[A]): DimensionalOrdering[T] =
    new DimensionalOrdering[T] {
      val dimensions: Int = dim

      def compareProjection(d: Int)(x: T, y: T): Int = ord.compare(x(d), y(d))
    }

  /** Ordering that compares arrays component-wise in descending order (used for "farthest"). */
  def dimensionalReverseOrderingForArray[T <: Array[A], A](dim: Int)(implicit ord: Ordering[A]): DimensionalOrdering[T] =
    new DimensionalOrdering[T] {
      val dimensions: Int = dim

      def compareProjection(d: Int)(x: T, y: T): Int = ord.compare(y(d), x(d))
    }

  /** Euclidean metric over Double arrays (zip truncates to the shorter array, as before). */
  implicit def metricFromArray(implicit n: Numeric[Double]): Metric[Array[Double], Double] = new Metric[Array[Double], Double] {
    override def distance(x: Array[Double], y: Array[Double]): Double =
      sqrt(x.zip(y).map { case (a, b) => (a - b) * (a - b) }.sum)

    override def planarDistance(dimension: Int)(x: Array[Double], y: Array[Double]): Double = {
      val diff = x(dimension) - y(dimension)
      diff * diff
    }
  }
}
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/RU.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities._
21 |
/** Compute a random algorithm. Original paper: "A study of the behavior of several methods for balancing machine
  * learning training data" by Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina.
  *
  * @param data        data to work with
  * @param seed        seed to use. If it is not provided, it will use the system time
  * @param ratio       ratio to know how many majority class examples to preserve. By default it's set to 1 so there
  *                    will be the same minority class examples as majority class examples. It will take
  *                    numMinorityInstances * ratio
  * @param replacement whether or not to sample randomly with replacement or not. false by default
  * @param verbose     choose to display information about the execution or not
  * @author Néstor Rodríguez Vico
  */
class RU(data: Data, seed: Long = System.currentTimeMillis(), ratio: Double = 1.0, replacement: Boolean = false, verbose: Boolean = false) {

  /** Compute the RU algorithm.
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    // the least represented class never loses examples
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)

    val minorityIndex: Array[Int] = data.y.zipWithIndex.collect { case (label, i) if label == untouchableClass => i }
    val majorityIndex: Array[Int] = random.shuffle(data.y.zipWithIndex.collect { case (label, i)
      if label != untouchableClass => i
    }.toList).toArray

    // documented contract: keep numMinorityInstances * ratio majority examples
    val targetSize: Int = (minorityIndex.length * ratio).toInt
    // fixed: the replacement branch previously drew one sample per majority example, ignoring ratio,
    // so no undersampling happened at all with replacement = true
    val selectedMajorityIndex: Array[Int] =
      if (replacement) Array.fill(targetSize)(majorityIndex(random.nextInt(majorityIndex.length)))
      else majorityIndex.take(targetSize)
    val finalIndex: Array[Int] = minorityIndex ++ selectedMajorityIndex
    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex map data.y).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / data.x.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/EE.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities._
21 |
/** Easy Ensemble algorithm. Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu,
  * Jianxin Wu and Zhi-Hua Zhou.
  *
  * @param data        data to work with
  * @param seed        seed to use. If it is not provided, it will use the system time
  * @param ratio       ratio to know how many majority class examples to preserve. By default it's set to 1 so there
  *                    will be the same minority class examples as majority class examples. It will take
  *                    numMinorityInstances * ratio
  * @param replacement whether or not to sample randomly with replacement or not. false by default
  * @param nTimes      times to perform the random algorithm
  * @param normalize   normalize the data or not
  * @param randomData  iterate through the data randomly or not
  * @param verbose     choose to display information about the execution or not
  * @author Néstor Rodríguez Vico
  */
class EE(data: Data, seed: Long = System.currentTimeMillis(), ratio: Double = 1.0, replacement: Boolean = false, nTimes: Int = 5,
         normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {

  /** Compute the EE algorithm.
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    // the least represented class never loses examples
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)

    // randomIndex(i) is the ORIGINAL position of the i-th element of the (possibly shuffled) working arrays
    val randomIndex: Array[Int] = if (randomData) random.shuffle(data.y.indices.toList).toArray else data.y.indices.toArray
    val normalizedData: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val dataToWorkWith: Array[Array[Double]] = if (randomData) randomIndex map normalizedData else normalizedData
    val classesToWorkWith: Array[Any] = if (randomData) randomIndex map data.y else data.y

    val minorityIndex: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label == untouchableClass => i }
    val majIndex: List[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label != untouchableClass => i }.toList
    // documented contract: keep numMinorityInstances * ratio majority examples
    val targetSize: Int = (minorityIndex.length * ratio).toInt

    // run the random undersampling nTimes and collect every selected majority position
    val majElements: Array[Int] = (0 until nTimes).flatMap { _: Int =>
      val shuffledMajority: Array[Int] = random.shuffle(majIndex).toArray
      // fixed: with replacement only targetSize samples are drawn per round (previously one per
      // majority example, which ignored ratio)
      if (replacement) Array.fill(targetSize)(shuffledMajority(random.nextInt(shuffledMajority.length)))
      else shuffledMajority.take(targetSize)
    }.toArray

    // Make an histogram and select the majority class examples that have been selected more times
    val majorityIndexHistogram: Array[(Int, Int)] = majElements.groupBy(identity).mapValues(_.length).toArray.sortBy(_._2).reverse
    val majorityIndex: Array[Int] = majorityIndexHistogram.take(targetSize).map(_._1)
    val finalIndex: Array[Int] = minorityIndex ++ majorityIndex
    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    // fixed: translate working-array positions back to original positions before indexing
    // data.x / data.y (previously the shuffled positions were applied to the unshuffled arrays,
    // selecting wrong instances whenever randomData = true)
    val originalIndex: Array[Int] = finalIndex map randomIndex
    new Data(originalIndex map data.x, originalIndex map data.y, Some(originalIndex), data.fileInfo)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/SMOTE.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.util.Random
25 |
/** SMOTE algorithm. Original paper: "SMOTE: Synthetic Minority Over-sampling Technique" by Nitesh V. Chawla, Kevin W.
  * Bowyer, Lawrence O. Hall and W. Philip Kegelmeyer.
  *
  * @param data      data to work with
  * @param seed      seed to use. If it is not provided, it will use the system time
  * @param percent   amount of SMOTE N%
  * @param k         number of minority class nearest neighbors
  * @param dist      object of Distance enumeration representing the distance to be used
  * @param normalize normalize the data or not
  * @param verbose   choose to display information about the execution or not
  * @author David López Pretel
  */
class SMOTE(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5,
            dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {

  /** Compute the SMOTE algorithm
    *
    * @return synthetic samples generated
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    if (percent > 100 && percent % 100 != 0) {
      throw new Exception("Percent must be a multiple of 100")
    }

    val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val minorityClassIndex: Array[Int] = minority(data.y)
    val minorityClass: Any = data.y(minorityClassIndex(0))

    // HVDM needs per-attribute statistics; they are expensive, so compute them only when required
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    val kdTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(samples, data.y, samples(0).length))
    } else {
      None
    }

    val r: Random = new Random(seed)

    // Following the original paper: if percent < 100, only a random percent% of the minority
    // samples is SMOTEd (one synthetic sample each); otherwise every minority sample generates
    // percent/100 synthetic samples.
    var N: Int = percent
    val sampleIndex: Array[Int] = if (N < 100) {
      // fixed: the previous computation (N / 100 * T) used integer division and was always 0,
      // and the loop then indexed past the empty output array
      val reducedT: Int = N * minorityClassIndex.length / 100
      N = 100
      r.shuffle(minorityClassIndex.toList).take(reducedT).toArray
    } else {
      minorityClassIndex
    }
    N = N / 100

    // output with N synthetic samples for each selected minority sample
    val output: Array[Array[Double]] = Array.ofDim[Double](N * sampleIndex.length, samples(0).length)

    // for each selected minority class sample
    sampleIndex.indices.par.foreach((i: Int) => {
      val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
        kdTree.get.nNeighbours(samples(sampleIndex(i)), k)._3.toArray
      } else {
        kNeighborsHVDM(samples, sampleIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }

      // compute populate for the sample
      (0 until N).par.foreach((n: Int) => {
        val nn: Int = neighbors(r.nextInt(neighbors.length))
        // interpolate each attribute between the sample and the chosen neighbour
        samples(0).indices.foreach((atrib: Int) => {
          val diff: Double = samples(nn)(atrib) - samples(sampleIndex(i))(atrib)
          val gap: Double = r.nextFloat()
          output(i * N + n)(atrib) = samples(sampleIndex(i))(atrib) + gap * diff
        })
      })
    })

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(data.x.length + output.length))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    // denormalize if needed and restore the nominal representation before returning
    new Data(if (data.fileInfo.nominal.length == 0) {
      to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else output))
    } else {
      toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else output), data.nomToNum)
    }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/ENN.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.collection.mutable.ArrayBuffer
25 |
26 | /** Edited Nearest Neighbour rule. Original paper: "Asymptotic Properties of Nearest Neighbor Rules Using Edited Data"
27 | * by Dennis L. Wilson.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param dist object of Distance enumeration representing the distance to be used
32 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours)
33 | * @param normalize normalize the data or not
34 | * @param randomData iterate through the data randomly or not
35 | * @param verbose choose to display information about the execution or not
36 | * @author Néstor Rodríguez Vico
37 | */
class ENN(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN,
          k: Int = 3, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {

  /** Compute the ENN algorithm.
    *
    * Every instance of the minority (untouchable) class is always kept. Every other instance is kept
    * only if the majority vote of its k nearest neighbours agrees with its own class; misclassified
    * instances are treated as noise and removed (Wilson's editing rule).
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    // the least frequent class is never removed
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)
    var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val classesToWorkWith: Array[Any] = if (randomData) {
      // shuffle instances and labels with the same permutation
      val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
      dataToWorkWith = (randomIndex map dataToWorkWith).toArray
      (randomIndex map data.y).toArray
    } else {
      data.y
    }

    // HVDM needs per-attribute statistics; left null for the Euclidean distance
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    val finalIndex = new ArrayBuffer[Int]()
    val majorityClassIndex = new ArrayBuffer[Int]()

    // minority instances are kept unconditionally; the rest are candidates for removal
    var j = 0
    while (j < classesToWorkWith.length) {
      if (classesToWorkWith(j) == untouchableClass) finalIndex += j else majorityClassIndex += j
      j += 1
    }

    val kdTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(dataToWorkWith, classesToWorkWith, dataToWorkWith(0).length))
    } else {
      None
    }

    // Keep an instance only when the k-NN vote matches its OWN class. The previous implementation
    // compared the vote against every non-minority class in turn, which in a multi-class problem also
    // kept instances whose vote matched a *different* majority class; this form is the classic ENN rule
    // and is identical in the binary case.
    // NOTE(review): the tree is built on the full data set, so for the Euclidean branch the query point
    // is presumably among its own neighbours (the HVDM branch excludes it via the index argument) — confirm
    // against KDTree.nNeighbours.
    val selected: Array[(Int, Boolean)] = majorityClassIndex.par.map { idx =>
      val label = if (dist == Distance.EUCLIDEAN) {
        mode(kdTree.get.nNeighbours(dataToWorkWith(idx), k)._2.toArray)
      } else {
        nnRuleHVDM(dataToWorkWith, dataToWorkWith(idx), idx, classesToWorkWith, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")._1
      }

      (idx, label == classesToWorkWith(idx))
    }.toArray

    selected.foreach(e => if (e._2) finalIndex += e._1)

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex.toArray map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(finalIndex.toArray map data.x, finalIndex.toArray map data.y, Some(finalIndex.toArray), data.fileInfo)
  }
}
121 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/OSS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | /** One-Side Selection. Original paper: "Addressing the Curse of Imbalanced
25 | * Training Sets: One-Side Selection" by Miroslav Kubat and Stan Matwin.
26 | *
27 | * @param data data to work with
28 | * @param seed seed to use. If it is not provided, it will use the system time
29 | * @param dist object of Distance enumeration representing the distance to be used
30 | * @param normalize normalize the data or not
31 | * @param randomData iterate through the data randomly or not
32 | * @param verbose choose to display information about the execution or not
33 | * @author Néstor Rodríguez Vico
34 | */
class OSS(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN,
          normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {

  /** Compute the OSS algorithm.
    *
    * Builds the paper's subset C (all minority instances plus one random majority instance), 1-NN
    * classifies the whole data set with C, grows C with the misclassified instances, and finally cleans
    * the result with Tomek links (TL).
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    // Note: the notation used to refers the subsets of data is the used in the original paper.
    val initTime: Long = System.nanoTime()

    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    // the least frequent class is never removed
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)
    var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val classesToWorkWith: Array[Any] = if (randomData) {
      // shuffle instances and labels with the same permutation
      val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
      dataToWorkWith = (randomIndex map dataToWorkWith).toArray
      (randomIndex map data.y).toArray
    } else {
      data.y
    }

    // HVDM needs per-attribute statistics; left null for the Euclidean distance
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    // C = all minority instances + one random majority instance
    // NOTE(review): a second RNG is built here instead of reusing `random`; presumably intentional so the
    // draw is independent of the shuffle above — confirm.
    val positives: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label == untouchableClass => i }
    val randomElement: Int = classesToWorkWith.indices.diff(positives)(new util.Random(seed).nextInt(classesToWorkWith.length - positives.length))
    val c: Array[Int] = positives ++ Array(randomElement)

    val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(c map dataToWorkWith, c map classesToWorkWith, dataToWorkWith(0).length))
    } else {
      None
    }

    // 1-NN classify every instance using only the members of C
    val labels: Seq[(Int, Any)] = if (dist == Distance.EUCLIDEAN) {
      dataToWorkWith.indices.map(i => (i, mode(KDTree.get.nNeighbours(dataToWorkWith(i), 1)._2.toArray)))
    } else {
      val neighbours = c map dataToWorkWith
      val classes = c map classesToWorkWith

      // NOTE(review): c.indexOf(i) is -1 when i is not in C — assumes nnRuleHVDM treats a negative index
      // as "no self-element to exclude"; verify against its implementation.
      dataToWorkWith.indices.map(i => (i, nnRuleHVDM(neighbours, dataToWorkWith(i), c.indexOf(i), classes, 1, data.fileInfo.nominal,
        sds, attrCounter, attrClassesCounter, "nearest")._1))
    }
    // instances C cannot classify correctly are added to C (paper's step 3)
    val misclassified: Array[Int] = labels.collect { case (i, label) if label != classesToWorkWith(i) => i }.toArray
    val finalC: Array[Int] = (misclassified ++ c).distinct

    // remove borderline/noisy majority instances from C via Tomek links
    val auxData: Data = new Data(x = toXData(finalC map dataToWorkWith), y = finalC map classesToWorkWith, fileInfo = data.fileInfo)
    auxData.processedData = finalC map dataToWorkWith
    val tl = new TL(auxData, dist = dist, minorityClass = Some(untouchableClass))
    val resultTL: Data = tl.compute()
    // TL's index refers to positions within auxData, so map them back to indices of the original data
    // NOTE(review): assumes TL.index contains the KEPT instances — confirm against TL's contract.
    val finalIndex: Array[Int] = (resultTL.index.get.toList map finalC).toArray
    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
  }
}
108 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/SMOTEENN.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.algorithm.undersampling.ENN
20 | import soul.data.Data
21 | import soul.util.KDTree
22 | import soul.util.Utilities.Distance.Distance
23 | import soul.util.Utilities._
24 |
25 | import scala.util.Random
26 |
27 | /** SMOTEENN algorithm. Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning
28 | * Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard.
29 | *
30 | * @param data data to work with
31 | * @param seed seed to use. If it is not provided, it will use the system time
32 | * @param percent amount of Smote N%
33 | * @param k number of minority class nearest neighbors
34 | * @param dist object of Distance enumeration representing the distance to be used
35 | * @param normalize normalize the data or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author David López Pretel
38 | */
class SMOTEENN(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5,
               dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {

  /** Compute the SMOTEENN algorithm: SMOTE oversampling followed by ENN noise cleaning.
    *
    * @return oversampled and cleaned data
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    // percent must describe whole SMOTE passes (100%, 200%, ...)
    if (percent > 100 && percent % 100 != 0) {
      throw new Exception("Percent must be a multiple of 100")
    }

    val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val minorityClassIndex: Array[Int] = minority(data.y)
    val minorityClass: Any = data.y(minorityClassIndex(0))

    // HVDM needs per-attribute statistics; left null for the Euclidean distance
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    val kdTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(samples, data.y, samples(0).length))
    } else {
      None
    }

    // translate percent into T (minority samples used) and N (synthetic samples per original)
    var T: Int = minorityClassIndex.length
    var N: Int = percent

    if (N < 100) {
      T = N / 100 * T
      N = 100
    }
    N = N / 100

    // output with a size of T*N synthetic samples
    val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length)

    val r: Random = new Random(seed)

    // standard SMOTE: for each minority sample, interpolate N synthetic points towards random neighbours
    minorityClassIndex.indices.par.foreach((i: Int) => {
      val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
        kdTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
      } else {
        kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }

      // compute populate for the sample
      (0 until N).par.foreach((n: Int) => {
        val nn: Int = neighbors(r.nextInt(neighbors.length))
        // each attribute is interpolated with its own random gap in [0, 1)
        samples(0).indices.foreach((atrib: Int) => {
          val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib)
          val gap: Double = r.nextFloat()
          output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff
        })
      })
    })

    val result: Array[Array[Double]] = Array.concat(samples, output)
    val resultClasses: Array[Any] = Array.concat(data.y, Array.fill(output.length)(minorityClass))

    // clean the oversampled data set with ENN
    val ennData: Data = new Data(x = toXData(result), y = resultClasses, fileInfo = data.fileInfo)
    ennData.processedData = result
    val enn = new ENN(ennData, dist = dist)
    val resultENN: Data = enn.compute()
    // ENN's index holds the instances it KEEPS. The previous diff(resultENN.index.get) inverted the
    // selection and retained exactly the noisy instances ENN had flagged for removal.
    val finalIndex: Array[Int] = resultENN.index.get

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(data.x.length + output.length))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    // undo the [0, 1] normalization only when it was applied; denormalizing raw data would corrupt it
    new Data(if (data.nomToNum(0).isEmpty) {
      to2Decimals(if (normalize) zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else finalIndex map result)
    } else {
      toNominal(if (normalize) zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else finalIndex map result, data.nomToNum)
    }, finalIndex map resultClasses, None, data.fileInfo)
  }
}
130 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/IHTS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities._
21 | import weka.classifiers.trees.J48
22 | import weka.core.Instances
23 |
24 |
25 | /** Instance Hardness Threshold. Original paper: "An Empirical Study of Instance Hardness" by Michael R. Smith,
26 | * Tony Martinez and Christophe Giraud-Carrier.
27 | *
28 | * @param data data to work with
29 | * @param seed seed to use. If it is not provided, it will use the system time
30 | * @param nFolds number of subsets to create when applying cross-validation
31 | * @param normalize normalize the data or not
32 | * @param randomData iterate through the data randomly or not
33 | * @param verbose choose to display information about the execution or not
34 | * @author Néstor Rodríguez Vico
35 | */
class IHTS(data: Data, seed: Long = System.currentTimeMillis(), nFolds: Int = 5,
           normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {

  /** Compute the IHTS algorithm.
    *
    * Instance hardness is estimated as the cross-validated probability a J48 tree assigns to each
    * instance's true class; majority-class instances whose probability falls below a class-size-balancing
    * percentile are discarded, minority instances are always kept.
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    // the least frequent class is never removed
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)
    var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val classesToWorkWith: Array[Any] = if (randomData) {
      // shuffle instances and labels with the same permutation
      val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
      dataToWorkWith = (randomIndex map dataToWorkWith).toArray
      (randomIndex map data.y).toArray
    } else {
      data.y
    }

    // Each element is the array of test indices of one cross-validation fold
    val indices: Array[Array[Int]] = random.shuffle(classesToWorkWith.indices.toList).toArray.grouped((classesToWorkWith.length.toFloat / nFolds).ceil.toInt).toArray
    val probabilities: Array[Double] = new Array[Double](classesToWorkWith.length)

    indices.foreach { testIndex: Array[Int] =>
      val trainIndex: Array[Int] = classesToWorkWith.indices.diff(testIndex).toArray

      // unpruned J48 tree with minimum leaf size 1
      val j48: J48 = new J48
      j48.setOptions(Array("-U", "-M", "1"))

      val trainInstances: Instances = buildInstances(data = trainIndex map dataToWorkWith,
        classes = trainIndex map classesToWorkWith, fileInfo = data.fileInfo)
      val testInstances: Instances = buildInstances(data = testIndex map dataToWorkWith,
        classes = testIndex map classesToWorkWith, fileInfo = data.fileInfo)

      j48.buildClassifier(trainInstances)

      // probability the classifier assigns to each test instance's own class
      val probs: Array[Array[Double]] = testIndex.indices.map((i: Int) => j48.distributionForInstance(testInstances.instance(i))).toArray
      val classes: Array[Any] = (testIndex map classesToWorkWith).distinct
      val values: Array[Double] = (testIndex map classesToWorkWith).zipWithIndex.map((e: (Any, Int)) => probs(e._2)(classes.indexOf(e._1)))

      (testIndex zip values).foreach((i: (Int, Double)) => probabilities(i._1) = i._2)
    }

    val finalIndex: Array[Int] = classesToWorkWith.distinct.flatMap { targetClass: Any =>
      val indexTargetClass: Array[Int] = if (targetClass != untouchableClass) {
        // drop the fraction of this class with the lowest true-class probability so its size approaches
        // the minority class size
        val nSamples: Int = counter(untouchableClass)
        val targetIndex: Array[Int] = boolToIndex(classesToWorkWith.map((c: Any) => c == targetClass))
        val targetProbabilities: Array[Double] = targetIndex map probabilities
        // nSamples.toDouble: the original Int/Int division always evaluated to 0, forcing percentile = 100
        val percentile: Double = (1.0 - (nSamples.toDouble / counter(targetClass))) * 100.0
        val threshold: Double = targetProbabilities.sorted.apply(math.ceil((targetProbabilities.length - 1) * (percentile / 100.0)).toInt)
        // boolToIndex yields positions within the class subset; map them back to global instance indices
        // (the original used the subset-local positions directly as global indices)
        boolToIndex(targetProbabilities.map((e: Double) => e >= threshold)) map targetIndex
      } else {
        // minority instances are always kept
        classesToWorkWith.zipWithIndex.collect { case (c, i) if c == targetClass => i }
      }

      indexTargetClass
    }

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
  }
}
113 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/SMOTETL.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.algorithm.undersampling.TL
20 | import soul.data.Data
21 | import soul.util.KDTree
22 | import soul.util.Utilities.Distance.Distance
23 | import soul.util.Utilities._
24 |
25 | import scala.util.Random
26 |
27 | /** SMOTETL algorithm. Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning
28 | * Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard.
29 | *
30 | * @param data data to work with
31 | * @param seed seed to use. If it is not provided, it will use the system time
32 | * @param percent Amount of Smote N%
33 | * @param k Number of minority class nearest neighbors
34 | * @param dist object of Distance enumeration representing the distance to be used
35 | * @param normalize normalize the data or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author David López Pretel
38 | */
class SMOTETL(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5,
              dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {

  /** Compute the SMOTETL algorithm: SMOTE oversampling followed by Tomek-link cleaning.
    *
    * @return synthetic samples generated
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    // percent must describe whole SMOTE passes (100%, 200%, ...)
    if (percent > 100 && percent % 100 != 0) {
      throw new Exception("Percent must be a multiple of 100")
    }

    val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    // compute minority class
    val minorityClassIndex: Array[Int] = minority(data.y)
    val minorityClass: Any = data.y(minorityClassIndex(0))

    // HVDM needs per-attribute statistics; left null for the Euclidean distance
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(samples, data.y, samples(0).length))
    } else {
      None
    }

    // check if the percent is correct: T = minority samples used, N = synthetic samples per original
    var T: Int = minorityClassIndex.length
    var N: Int = percent

    if (N < 100) {
      T = N / 100 * T
      N = 100
    }
    N = N / 100

    // output with a size of T*N samples
    val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length)

    val r: Random = new Random(seed)

    // standard SMOTE: for each minority class sample, interpolate towards random neighbours
    minorityClassIndex.indices.par.foreach((i: Int) => {
      val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
        KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
      } else {
        kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }

      // compute populate for the sample
      (0 until N).par.foreach((n: Int) => {
        val nn: Int = neighbors(r.nextInt(neighbors.length))
        // compute attributes of the sample; each attribute gets its own random gap in [0, 1)
        samples(0).indices.foreach((atrib: Int) => {
          val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib)
          val gap: Double = r.nextFloat()
          output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff
        })
      })
    })
    val result: Array[Array[Double]] = Array.concat(samples, output)
    val resultClasses: Array[Any] = Array.concat(data.y, Array.fill(output.length)(minorityClass))

    // clean the oversampled data set with Tomek links
    val tlData: Data = new Data(x = toXData(result), y = resultClasses, fileInfo = data.fileInfo)
    tlData.processedData = result
    val tl = new TL(tlData, dist = dist, ratio = "all")
    val resultTL: Data = tl.compute()
    // NOTE(review): this treats resultTL.index as the REMOVED instances, but OSS in this library uses
    // resultTL.index as the KEPT subset — if TL's contract is the same here (ratio = "all"), this diff
    // inverts the selection; verify against TL.compute().
    val finalIndex: Array[Int] = result.indices.diff(resultTL.index.get).toArray

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(data.x.length + output.length))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    // NOTE(review): denormalization is applied even when normalize = false (compare SafeLevelSMOTE,
    // which guards it with `if (normalize)`) — presumably a defect when working on raw data; confirm.
    new Data(if (data.nomToNum(0).isEmpty) {
      to2Decimals(zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs))
    } else {
      toNominal(zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs), data.nomToNum)
    }, finalIndex map resultClasses, None, data.fileInfo)
  }
}
130 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/SafeLevelSMOTE.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.util.Random
25 |
26 | /** SafeLevel-SMOTE algorithm. Original paper: "Safe-Level-SMOTE: Safe-Level-Synthetic Minority Over-Sampling Technique
27 | * for Handling the Class Imbalanced Problem" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran, and Chidchanok Lursinsap.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param k Number of nearest neighbors
32 | * @param dist object of Distance enumeration representing the distance to be used
33 | * @param normalize normalize the data or not
34 | * @param verbose choose to display information about the execution or not
35 | * @author David López Pretel
36 | */
class SafeLevelSMOTE(data: Data, seed: Long = System.currentTimeMillis(), k: Int = 5,
                     dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {

  /** Compute the SafeLevelSMOTE algorithm.
    *
    * For each minority sample p a random neighbour n is chosen; the "safe levels" slp (of p) and sln
    * (of n) count how many of their k neighbours belong to the minority class. The ratio slp/sln decides
    * how close to p the synthetic sample is placed (cases 1-5 of the original paper).
    *
    * @return synthetic samples generated
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()
    val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    // compute minority class
    val minorityClassIndex: Array[Int] = minority(data.y)
    val minorityClass: Any = data.y(minorityClassIndex(0))

    // HVDM needs per-attribute statistics; left null for the Euclidean distance
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    val kdTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(samples, data.y, samples(0).length))
    } else {
      None
    }

    val r: Random = new Random(seed)

    val output: Array[Array[Double]] = minorityClassIndex.indices.par.map { i =>
      val p: Int = minorityClassIndex(i)

      // k neighbours of p; slp = number of minority instances among them
      val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
        kdTree.get.nNeighbours(samples(p), k)._3.toArray
      } else {
        // fixed: the original passed the loop counter i instead of the sample index minorityClassIndex(i)
        kNeighborsHVDM(samples, p, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }
      val n: Int = neighbors(r.nextInt(neighbors.length))
      val slp: Int = neighbors.count(data.y(_) == minorityClass)

      // k neighbours of n; sln = number of minority instances among them
      val selectedNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
        kdTree.get.nNeighbours(samples(n), k)._3.toArray
      } else {
        kNeighborsHVDM(samples, n, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }
      val sln: Int = selectedNeighbors.count(data.y(_) == minorityClass)

      if (sln == 0 && slp == 0) {
        // case 1: both p and n look like noise, do not create a synthetic instance
        None
      } else {
        // The ratio is local to this iteration: the original mutated a shared `var sl_ratio` from
        // parallel threads (a data race) and computed slp / sln with Int division, truncating the ratio
        // (e.g. 3/2 became 1 and anything below 1 became 0, breaking cases 3-5).
        val slRatio: Double = if (sln != 0) slp.toDouble / sln else Double.PositiveInfinity

        Some(samples(p).indices.map { atrib =>
          // each attribute is interpolated with its own gap, as in the original implementation
          val gap: Double = if (slRatio.isPosInfinity) {
            0.0 // case 2: only p is safe, duplicate p
          } else if (slRatio == 1) {
            r.nextFloat() // case 3: equally safe, anywhere on the segment
          } else if (slRatio > 1) {
            r.nextFloat() * (1 / slRatio) // case 4: p is safer, stay close to p
          } else {
            // case 5: n is safer, force the gap into [1 - slRatio, 1) so the sample stays close to n
            val g: Double = r.nextFloat()
            if (g < 1 - slRatio) g + 1 - slRatio else g
          }
          val diff: Double = samples(n)(atrib) - samples(p)(atrib)
          samples(p)(atrib) + gap * diff
        }.toArray)
      }
    }.toArray.flatten

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(data.x.length + output.length))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(if (data.fileInfo.nominal.length == 0) {
      to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else output))
    } else {
      toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else output), data.nomToNum)
    }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
  }
}
143 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/ADOMS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import breeze.linalg.{DenseMatrix, eigSym}
20 | import soul.data.Data
21 | import soul.util.KDTree
22 | import soul.util.Utilities.Distance.Distance
23 | import soul.util.Utilities._
24 |
25 | import scala.util.Random
26 |
27 | /** ADOMS algorithm. Original paper: "The Generation Mechanism of Synthetic Minority Class Examples" by Sheng TANG
28 | * and Si-ping CHEN.
29 | *
30 | * @param data data to work with
31 | * @param seed seed to use. If it is not provided, it will use the system time
32 | * @param percent amount of samples N%
33 | * @param k number of neighbors
34 | * @param dist object of Distance enumeration representing the distance to be used
35 | * @param normalize normalize the data or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author David López Pretel
38 | */
39 | class ADOMS(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 300, k: Int = 5,
40 |             dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
41 |
42 |   /** Compute the first principal component axis of the given data.
43 |    *
44 |    * @param A the data (rows are samples, columns are attributes)
45 |    * @return the first principal component axis
46 |    */
47 |   private def PCA(A: Array[Array[Double]]): Array[Double] = {
48 |     // mean of each attribute
49 |     val mean: Array[Double] = A.transpose.map(_.sum / A.length)
50 |     // center the data: subtract the mean of each attribute
51 |     val dataNoMean: DenseMatrix[Double] = DenseMatrix(A: _*) -:- DenseMatrix(A.map(_ => mean): _*)
52 |     // covariance matrix: (X^T * X) divided element-wise by the number of samples
53 |     val divisor: Array[Array[Double]] = Array.fill(dataNoMean.cols, dataNoMean.cols)(dataNoMean.rows)
54 |     val S: DenseMatrix[Double] = (dataNoMean.t * dataNoMean) /:/ DenseMatrix(divisor: _*)
55 |     // eigen decomposition of the (symmetric) covariance matrix
56 |     val eigen = eigSym(S)
57 |     // breeze's eigSym returns the eigenvalues in ascending order and the eigenvectors
58 |     // as the COLUMNS of the matrix, so the first principal component axis is the
59 |     // column paired with the largest eigenvalue, i.e. the last column. The previous
60 |     // code returned the first ROW, which is not an eigenvector at all.
61 |     eigen.eigenvectors(::, eigen.eigenvectors.cols - 1).toArray
62 |   }
63 |
64 |   /** Compute the ADOMS algorithm
65 |    *
66 |    * @return data structure with the synthetic samples concatenated to the original data
67 |    */
68 |   def compute(): Data = {
69 |     val initTime: Long = System.nanoTime()
70 |     val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
71 |     val minorityClassIndex: Array[Int] = minority(data.y)
72 |     val minorityClass: Any = data.y(minorityClassIndex(0))
73 |     // output with a size of T*N samples
74 |     val output: Array[Array[Double]] = Array.ofDim(minorityClassIndex.length * percent / 100, samples(0).length)
75 |     val r: Random = new Random(seed)
76 |
77 |     // HVDM needs, per attribute: value counts, value counts per class and standard deviation
78 |     val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
79 |       (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
80 |         samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
81 |         samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
82 |     } else {
83 |       (null, null, null)
84 |     }
85 |
86 |     val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
87 |       Some(new KDTree(samples, data.y, samples(0).length))
88 |     } else {
89 |       None
90 |     }
91 |
92 |     // number of synthetic samples created from each minority sample
93 |     val N: Int = percent / 100
94 |
95 |     (0 until N).par.foreach(nn => {
96 |       // for each minority class sample
97 |       minorityClassIndex.zipWithIndex.par.foreach(i => {
98 |         val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
99 |           KDTree.get.nNeighbours(samples(i._1), k)._3.toArray
100 |         } else {
101 |           // i._1 is the index of the sample inside the whole dataset; the previous
102 |           // code passed i._2, which is only its position inside minorityClassIndex
103 |           kNeighborsHVDM(samples, i._1, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
104 |         }
105 |
106 |         val n: Int = r.nextInt(neighbors.length)
107 |
108 |         // first principal component axis of the local data distribution
109 |         val l2: Array[Double] = PCA(neighbors map samples)
110 |         // projection of the difference vector (sample - chosen neighbour) onto l2
111 |         val dotMN: Double = l2.indices.map(j => {
112 |           samples(i._1)(j) - samples(neighbors(n))(j)
113 |         }).toArray.zipWithIndex.map(j => {
114 |           j._1 * l2(j._2)
115 |         }).sum
116 |         val dotMM: Double = l2.map(x => x * x).sum
117 |         // synthetic sample: project onto l2, then move a random fraction of the way
118 |         // back towards the original sample (same arithmetic as before, made readable)
119 |         val outIndex: Int = nn * minorityClassIndex.length + i._2
120 |         val projection: Array[Double] = l2.indices.map(j => samples(i._1)(j) + dotMN / dotMM * l2(j)).toArray
121 |         output(outIndex) = projection.indices.map(j => projection(j) + (samples(i._1)(j) - projection(j)) * r.nextFloat()).toArray
122 |       })
123 |     })
124 |
125 |     val finishTime: Long = System.nanoTime()
126 |
127 |     if (verbose) {
128 |       println("ORIGINAL SIZE: %d".format(data.x.length))
129 |       println("NEW DATA SIZE: %d".format(data.x.length + output.length))
130 |       println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
131 |     }
132 |
133 |     // denormalize if needed and convert nominal attributes back before returning
134 |     new Data(if (data.fileInfo.nominal.length == 0) {
135 |       to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
136 |         data.fileInfo.minAttribs) else output))
137 |     } else {
138 |       toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
139 |         data.fileInfo.minAttribs) else output), data.nomToNum)
140 |     }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
141 |   }
142 | }
134 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/BorderlineSMOTE.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.util.Random
25 |
26 | /** Borderline-SMOTE algorithm. Original paper: "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets
27 | * Learning." by Hui Han, Wen-Yuan Wang, and Bing-Huan Mao.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param m number of nearest neighbors
32 | * @param k number of minority class nearest neighbors
33 | * @param dist object of Distance enumeration representing the distance to be used
34 | * @param normalize normalize the data or not
35 | * @param verbose choose to display information about the execution or not
36 | * @author David López Pretel
37 | */
38 | class BorderlineSMOTE(data: Data, seed: Long = System.currentTimeMillis(), m: Int = 10, k: Int = 5,
39 |                       dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
40 |
41 |   /** Compute the BorderlineSMOTE algorithm
42 |    *
43 |    * @return data structure with the synthetic samples concatenated to the original data
44 |    */
45 |   def compute(): Data = {
46 |     val initTime: Long = System.nanoTime()
47 |     val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
48 |     val minorityClassIndex: Array[Int] = minority(data.y)
49 |     val minorityClass: Any = data.y(minorityClassIndex(0))
50 |     // position of every minority sample inside minorityClassIndex
51 |     val minorityPosition: Map[Int, Int] = minorityClassIndex.zipWithIndex.toMap
52 |
53 |     // HVDM needs, per attribute: value counts, value counts per class and standard deviation
54 |     val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
55 |       (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
56 |         samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
57 |         samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
58 |     } else {
59 |       (null, null, null)
60 |     }
61 |
62 |     // KDTree over the whole dataset, used for the m-neighbourhood
63 |     val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
64 |       Some(new KDTree(samples, data.y, samples(0).length))
65 |     } else {
66 |       None
67 |     }
68 |
69 |     // KDTree over the minority class only, used for the k-neighbourhood
70 |     val KDTreeMinority: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
71 |       Some(new KDTree(minorityClassIndex map samples, minorityClassIndex map data.y, samples(0).length))
72 |     } else {
73 |       None
74 |     }
75 |
76 |     // m nearest neighbours (whole dataset) of each minority sample
77 |     val minorityClassNeighbors: Array[Array[Int]] = new Array[Array[Int]](minorityClassIndex.length)
78 |     if (dist == Distance.EUCLIDEAN) {
79 |       // the danger criterion below is defined over m neighbours; the previous code
80 |       // queried the KDTree with k here, inconsistent with the HVDM branch
81 |       minorityClassIndex.indices.par.foreach(i => minorityClassNeighbors(i) = KDTree.get.nNeighbours(samples(minorityClassIndex(i)), m)._3.toArray)
82 |     } else {
83 |       minorityClassIndex.indices.par.foreach(i => minorityClassNeighbors(i) = kNeighborsHVDM(samples, minorityClassIndex(i), m, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter))
84 |     }
85 |
86 |     // borderline ("DANGER") samples: at least half, but not all, of their m neighbours
87 |     // belong to another class
88 |     val DangerNodes: Array[Int] = minorityClassNeighbors.map(neighbors => {
89 |       neighbors.count(neighbor => data.y(neighbor) != minorityClass)
90 |     }).zipWithIndex.collect {
91 |       case (nMajority, index) if nMajority >= (m / 2) && nMajority < m => minorityClassIndex(index)
92 |     }
93 |
94 |     val r: Random = new Random(seed)
95 |     // number of synthetic samples generated from each danger node
96 |     val s: Int = r.nextInt(k) + 1
97 |
98 |     // output with a size of s * |DangerNodes| samples
99 |     val output: Array[Array[Double]] = Array.ofDim(s * DangerNodes.length, samples(0).length)
100 |
101 |     // for each danger node
102 |     DangerNodes.zipWithIndex.par.foreach(i => {
103 |       // its k nearest minority neighbours, as indices into minorityClassIndex
104 |       val neighbors = if (dist == Distance.EUCLIDEAN) {
105 |         KDTreeMinority.get.nNeighbours(samples(i._1), k)._3.toArray
106 |       } else {
107 |         // the sample is located by its position inside the minority subset; the
108 |         // previous code used its position inside DangerNodes and mapped the result
109 |         // to global indices, inconsistent with the euclidean branch above (whose
110 |         // results are re-mapped through minorityClassIndex below)
111 |         kNeighborsHVDM(minorityClassIndex map samples, minorityPosition(i._1), k, data.fileInfo.nominal, sds, attrCounter,
112 |           attrClassesCounter)
113 |       }
114 |       val sNeighbors: Array[Int] = (0 until s).map(_ => r.nextInt(neighbors.length)).toArray
115 |       // interpolate s synthetic samples between the danger node and the chosen neighbours
116 |       (sNeighbors map neighbors).zipWithIndex.par.foreach(j => {
117 |         samples(i._1).indices.foreach(attrib => {
118 |           val diff: Double = samples(minorityClassIndex(j._1))(attrib) - samples(i._1)(attrib)
119 |           val gap: Float = r.nextFloat
120 |           output(i._2 * s + j._2)(attrib) = samples(i._1)(attrib) + gap * diff
121 |         })
122 |       })
123 |     })
124 |
125 |     val finishTime: Long = System.nanoTime()
126 |
127 |     if (verbose) {
128 |       println("ORIGINAL SIZE: %d".format(data.x.length))
129 |       println("NEW DATA SIZE: %d".format(data.x.length + output.length))
130 |       println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
131 |     }
132 |
133 |     // denormalize if needed and convert nominal attributes back before returning
134 |     new Data(if (data.fileInfo.nominal.length == 0) {
135 |       to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
136 |         data.fileInfo.minAttribs) else output))
137 |     } else {
138 |       toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
139 |         data.fileInfo.minAttribs) else output), data.nomToNum)
140 |     }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
141 |   }
142 | }
139 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/TL.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 |
23 | /** Tomek Link. Original paper: "Two Modifications of CNN" by Ivan Tomek.
24 | *
25 | * @param data data to work with
26 | * @param seed seed to use. If it is not provided, it will use the system time
27 | * @param dist object of Distance enumeration representing the distance to be used
28 | * @param ratio indicates the instances of the Tomek Links that are going to be remove. "all" will remove all instances,
29 | * "minority" will remove instances of the minority class and "not minority" will remove all the instances
30 | * except the ones of the minority class.
31 | * @param minorityClass minority class. If set to None, it will be computed
32 | * @param normalize normalize the data or not
33 | * @param randomData iterate through the data randomly or not
34 | * @param verbose choose to display information about the execution or not
35 | * @author Néstor Rodríguez Vico
36 | */
37 | class TL(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, ratio: String = "not minority",
38 |          val minorityClass: Option[Any] = None, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {
39 |
40 |   /** Compute the TL algorithm.
41 |    *
42 |    * @return undersampled data structure
43 |    */
44 |   def compute(): Data = {
45 |     val initTime: Long = System.nanoTime()
46 |     val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
47 |     // the class that must never be removed: the given one, or the least frequent one
48 |     val untouchableClass: Any = if (minorityClass.isDefined) minorityClass.get else counter.minBy((c: (Any, Int)) => c._2)._1
49 |     val random: scala.util.Random = new scala.util.Random(seed)
50 |
51 |     var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
52 |     val classesToWorkWith: Array[Any] = if (randomData) {
53 |       // NOTE(review): when randomData is true, finalIndex (computed below) refers to the
54 |       // SHUFFLED ordering but is applied to the unshuffled data.x / data.y at the end -
55 |       // confirm this is the intended behaviour across the library
56 |       val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
57 |       dataToWorkWith = (randomIndex map dataToWorkWith).toArray
58 |       (randomIndex map data.y).toArray
59 |     } else {
60 |       data.y
61 |     }
62 |
63 |     // HVDM needs, per attribute: value counts, value counts per class and standard deviation
64 |     val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
65 |       (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
66 |         dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
67 |         dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
68 |     } else {
69 |       (null, null, null)
70 |     }
71 |
72 |     // for every class, the indices of the instances that belong to ANY OTHER class
73 |     val candidates: Map[Any, Array[Int]] = classesToWorkWith.distinct.map {
74 |       c: Any =>
75 |         c -> classesToWorkWith.zipWithIndex.collect {
76 |           case (a, b) if a != c => b
77 |         }
78 |     }.toMap
79 |
80 |     // full pairwise (symmetric) distance matrix
81 |     val distances: Array[Array[Double]] = Array.fill[Array[Double]](dataToWorkWith.length)(new Array[Double](dataToWorkWith.length))
82 |
83 |     if (dist == Distance.EUCLIDEAN) {
84 |       dataToWorkWith.indices.par.foreach { i: Int =>
85 |         dataToWorkWith.indices.drop(i).par.foreach { j: Int =>
86 |           distances(i)(j) = euclidean(dataToWorkWith(i), dataToWorkWith(j))
87 |           distances(j)(i) = distances(i)(j)
88 |         }
89 |       }
90 |     } else {
91 |       dataToWorkWith.indices.par.foreach { i: Int =>
92 |         dataToWorkWith.indices.drop(i).par.foreach { j: Int =>
93 |           distances(i)(j) = HVDM(dataToWorkWith(i), dataToWorkWith(j), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
94 |           distances(j)(i) = distances(i)(j)
95 |         }
96 |       }
97 |     }
98 |
99 |     // Look for the nearest neighbour in the rest of the classes
100 |     val nearestNeighbour: Array[Int] = distances.zipWithIndex.map((row: (Array[Double], Int)) => row._1.indexOf((candidates(classesToWorkWith(row._2)) map row._1).min))
101 |     // If the nearest neighbour of I is J and the nearest neighbour of J is I, then I and J form a Tomek link
102 |     val tomekLinks: Array[(Int, Int)] = nearestNeighbour.zipWithIndex.filter((pair: (Int, Int)) => nearestNeighbour(pair._1) == pair._2)
103 |     val targetInstances: Array[Int] = tomekLinks.flatMap((x: (Int, Int)) => List(x._1, x._2)).distinct
104 |     // the user chooses which members of the Tomek links are actually removed.
105 |     // Bug fix: the previous version compared the instance INDEX against the class label
106 |     // (always false for non-integer labels) and collected positions inside
107 |     // targetInstances instead of instance indices; filter by the instance's class instead
108 |     val removedInstances: Array[Int] = if (ratio == "all") {
109 |       targetInstances
110 |     } else if (ratio == "minority") {
111 |       targetInstances.filter((i: Int) => classesToWorkWith(i) == untouchableClass)
112 |     } else if (ratio == "not minority") {
113 |       targetInstances.filter((i: Int) => classesToWorkWith(i) != untouchableClass)
114 |     } else {
115 |       throw new Exception("Incorrect value of ratio. Possible options: all, minority, not minority")
116 |     }
117 |     val finalIndex: Array[Int] = dataToWorkWith.indices.diff(removedInstances).toArray
118 |     val finishTime: Long = System.nanoTime()
119 |
120 |     if (verbose) {
121 |       val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
122 |       println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
123 |       println("NEW DATA SIZE: %d".format(finalIndex.length))
124 |       println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
125 |       println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
126 |       println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
127 |       println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
128 |       println("REMOVED INSTANCES: %s".format(ratio))
129 |     }
130 |
131 |     new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
132 |   }
133 | }
123 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/ADASYN.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.util.Random
25 |
26 | /** ADASYN algorithm. Original paper: "ADASYN: Adaptive Synthetic Sampling Approach for Imbalanced Learning" by Haibo He,
27 | * Yang Bai, Edwardo A. Garcia, and Shutao Li.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param d preset threshold for the maximum tolerated degree of class imbalance radio
32 | * @param B balance level after generation of synthetic data
33 | * @param k number of neighbors
34 | * @param dist object of Distance enumeration representing the distance to be used
35 | * @param normalize normalize the data or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author David López Pretel
38 | */
39 | class ADASYN(data: Data, seed: Long = System.currentTimeMillis(), d: Double = 1, B: Double = 1, k: Int = 5,
40 |              dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
41 |
42 |   /** Compute the ADASYN algorithm
43 |    *
44 |    * @return data structure with the synthetic samples concatenated to the original data
45 |    */
46 |   def compute(): Data = {
47 |     val initTime: Long = System.nanoTime()
48 |
49 |     if (B > 1 || B < 0) {
50 |       throw new Exception("B must be between 0 and 1, both included")
51 |     }
52 |
53 |     if (d > 1 || d <= 0) {
54 |       throw new Exception("d must be between 0 and 1, zero not included")
55 |     }
56 |
57 |     val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
58 |
59 |     // HVDM needs, per attribute: value counts, value counts per class and standard deviation
60 |     val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
61 |       (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
62 |         samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
63 |         samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
64 |     } else {
65 |       (null, null, null)
66 |     }
67 |
68 |     val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
69 |       Some(new KDTree(samples, data.y, samples(0).length))
70 |     } else {
71 |       None
72 |     }
73 |
74 |     val minorityClassIndex: Array[Int] = minority(data.y)
75 |     val minorityClass: Any = data.y(minorityClassIndex(0))
76 |
77 |     // G: total number of synthetic samples to generate
78 |     val ms: Int = minorityClassIndex.length
79 |     val ml: Int = data.y.length - ms
80 |     val G: Int = ((ml - ms) * B).toInt
81 |
82 |     // k nearest neighbours (over the WHOLE dataset) of each minority sample
83 |     val neighbors: Array[Array[Int]] = new Array[Array[Int]](minorityClassIndex.length)
84 |     minorityClassIndex.indices.par.foreach { i =>
85 |       if (dist == Distance.EUCLIDEAN) {
86 |         neighbors(i) = KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
87 |       } else {
88 |         neighbors(i) = kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
89 |       }
90 |     }
91 |
92 |     // ratio of majority neighbours for each minority sample (how "hard" the sample is)
93 |     val ratio: Array[Double] = neighbors.map(neighborsOfX =>
94 |       neighborsOfX.count(neighbor => data.y(neighbor) != minorityClass).toDouble / k)
95 |
96 |     // normalize the ratios so that they form a density distribution (sum to 1)
97 |     val sumRatios: Double = ratio.sum
98 |     ratio.indices.par.foreach(i => ratio(i) = ratio(i) / sumRatios)
99 |
100 |     // number of synthetic samples to create from each minority sample
101 |     val g: Array[Int] = ratio.map(ri => (ri * G).toInt)
102 |
103 |     // output with a size of sum(gi) samples
104 |     val output: Array[Array[Double]] = Array.ofDim(g.sum, samples(0).length)
105 |
106 |     val r: Random = new Random(seed)
107 |
108 |     // offset in output where each minority sample's synthetic block starts
109 |     // (exclusive prefix sums of g)
110 |     val increment: Array[Int] = g.scanLeft(0)(_ + _).init
111 |
112 |     // for each minority class sample xi, create g(xi) synthetic samples
113 |     minorityClassIndex.indices.zip(increment).foreach(xi => {
114 |       (0 until g(xi._1)).foreach(n => {
115 |         // choose ONE neighbour xzi per synthetic sample: si = (xzi - xi) * lambda + xi.
116 |         // Bug fix: the previous code drew a different neighbour for EVERY attribute,
117 |         // mixing several neighbours inside one synthetic sample
118 |         // NOTE(review): the neighbourhood is computed over the whole dataset, so xzi may
119 |         // belong to the majority class; the original paper draws xzi from the minority
120 |         // neighbours - confirm intended behaviour
121 |         val nn: Int = neighbors(xi._1)(r.nextInt(neighbors(xi._1).length))
122 |         samples(0).indices.foreach(atrib => {
123 |           val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(xi._1))(atrib)
124 |           val gap: Float = r.nextFloat
125 |           output(xi._2 + n)(atrib) = samples(minorityClassIndex(xi._1))(atrib) + gap * diff
126 |         })
127 |       })
128 |     })
129 |
130 |     val finishTime: Long = System.nanoTime()
131 |
132 |     if (verbose) {
133 |       println("ORIGINAL SIZE: %d".format(data.x.length))
134 |       println("NEW DATA SIZE: %d".format(data.x.length + output.length))
135 |       println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
136 |     }
137 |
138 |     // denormalize if needed and convert nominal attributes back before returning
139 |     new Data(if (data.fileInfo.nominal.length == 0) {
140 |       to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
141 |         data.fileInfo.minAttribs) else output))
142 |     } else {
143 |       toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
144 |         data.fileInfo.minAttribs) else output), data.nomToNum)
145 |     }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
146 |   }
147 | }
152 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/MDO.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import breeze.linalg.{DenseMatrix, DenseVector, eigSym, inv, sum}
20 | import soul.data.Data
21 | import soul.util.Utilities._
22 |
23 | import scala.util.Random
24 |
25 | /** MDO algorithm. Original paper: "To combat multi-class imbalanced problems by means of over-sampling and boosting
26 | * techniques" by Lida Adbi and Sattar Hashemi.
27 | *
28 | * @param data data to work with
29 | * @param seed seed to use. If it is not provided, it will use the system time
30 | * @param normalize normalize the data or not
31 | * @param verbose choose to display information about the execution or not
32 | * @author David López Pretel
33 | */
34 | class MDO(data: Data, seed: Long = System.currentTimeMillis(), normalize: Boolean = false, verbose: Boolean = false) {
35 |
36 |   /** Compute the MDO algorithm
37 |    *
38 |    * @return data structure with the synthetic samples concatenated to the original data
39 |    */
40 |   def compute(): Data = {
41 |     val initTime: Long = System.nanoTime()
42 |     val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
43 |     // compute minority class and its representative label
44 |     val minorityClassIndex: Array[Int] = minority(data.y)
45 |     val minorityClass: Any = data.y(minorityClassIndex(0))
46 |     // every sample that is not minority is treated as majority
47 |     val majorityClassIndex: Array[Int] = samples.indices.diff(minorityClassIndex.toList).toArray
48 |
49 |     // mean of each attribute over the minority class
50 |     val mean: Array[Double] = (minorityClassIndex map samples).transpose.map(_.sum / minorityClassIndex.length)
51 |
52 |     // center the minority samples, then build the covariance matrix S = (Z^T * Z) / n
53 |     val Zi: DenseMatrix[Double] = DenseMatrix(minorityClassIndex map samples: _*) -:- DenseMatrix(minorityClassIndex.map(_ => mean): _*)
54 |     // despite the name, every entry holds n (the element-wise divisor), not 1/n
55 |     val oneDividedByN: Array[Array[Double]] = Array.fill(Zi.cols, Zi.cols)(Zi.rows)
56 |     val S: DenseMatrix[Double] = (Zi.t * Zi) /:/ DenseMatrix(oneDividedByN: _*)
57 |     // eigen decomposition of the (symmetric) covariance matrix
58 |     val eigen = eigSym(S)
59 |     // express the centered minority samples in the eigenvector basis (change of basis);
60 |     // rows of Ti are the transformed samples
61 |     val Ti: DenseMatrix[Double] = (eigen.eigenvectors * Zi.t).t
62 |     // the eigenvalues are the variances along each new axis
63 |     val V: DenseVector[Double] = eigen.eigenvalues
64 |
65 |     // generate (majority - minority) synthetic samples inside the ellipse defined by V
66 |     val newSamples: Array[Array[Double]] = MDO_oversampling(Ti, mean, V, majorityClassIndex.length - minorityClassIndex.length, seed)
67 |
68 |     // transform the synthetic samples back to the original basis
69 |     val newSamplesToOriginalSpace: DenseMatrix[Double] = (inv(eigen.eigenvectors) * DenseMatrix(newSamples: _*).t).t
70 |
71 |     // add the minority mean back (undo the centering)
72 |     val samplesWithMean: DenseMatrix[Double] = newSamplesToOriginalSpace +:+ DenseMatrix((0 until newSamplesToOriginalSpace.rows).map(_ => mean): _*)
73 |
74 |     // matrix rows -> plain arrays for the output
75 |     val output: Array[Array[Double]] = Array.range(0, samplesWithMean.rows).map(i => samplesWithMean(i, ::).t.toArray)
76 |
77 |     val finishTime: Long = System.nanoTime()
78 |
79 |     if (verbose) {
80 |       println("ORIGINAL SIZE: %d".format(data.x.length))
81 |       println("NEW DATA SIZE: %d".format(data.x.length + output.length))
82 |       println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
83 |     }
84 |
85 |     // denormalize if needed and convert nominal attributes back before returning
86 |     new Data(if (data.fileInfo.nominal.length == 0) {
87 |       to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
88 |         data.fileInfo.minAttribs) else output))
89 |     } else {
90 |       toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
91 |         data.fileInfo.minAttribs) else output), data.nomToNum)
92 |     }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
93 |   }
94 |
95 |   /** Create the new samples for the MDO algorithm.
96 |    *
97 |    * @param Ti    the minority samples expressed in the eigenvector basis (rows)
98 |    * @param mean  the mean of every attribute (not used in this method; kept for interface stability)
99 |    * @param V     the eigenvalues, i.e. the variance along each axis
100 |    * @param Orate majoritySamples - minoritySamples: number of samples to generate
101 |    * @param seed  seed for the random generator
102 |    * @return the new samples generated
103 |    */
104 |   def MDO_oversampling(Ti: DenseMatrix[Double], mean: Array[Double], V: DenseVector[Double], Orate: Int, seed: Long): Array[Array[Double]] = {
105 |     // N samples are created from each of the I seed samples; if there are more minority
106 |     // samples than samples to create, use only the first Orate samples, one sample each
107 |     var I: Int = Ti.rows
108 |     var N: Int = Orate / I
109 |     if (I > Orate) {
110 |       N = 1
111 |       I = Orate
112 |     }
113 |
114 |     val output: Array[Array[Double]] = Array.fill(Orate, Ti.cols)(0.0)
115 |     var newIndex: Int = 0
116 |     // the global Random object is (re)seeded here for reproducibility
117 |     val rand: Random.type = scala.util.Random
118 |     rand.setSeed(seed)
119 |
120 |     (0 until I).foreach(i => {
121 |       // square of each coordinate of the sample
122 |       val x: DenseVector[Double] = Ti(i, ::).t *:* Ti(i, ::).t
123 |       // alpha = sum(t_p^2 / v_p); alpha x V forms the denominators of the ellipse equation
124 |       val alpha: Double = sum(x /:/ V)
125 |       val alphaV: DenseVector[Double] = V *:* alpha
126 |       (0 until N).foreach(_ => {
127 |         var s: Double = 0.0
128 |         (0 until Ti.cols - 1).foreach(p => {
129 |           // random number in [-alphaV(p)/(cols-1), +alphaV(p)/(cols-1)]
130 |           // NOTE(review): the original comment claimed +-sqrt(alphaV(p)), but no sqrt is
131 |           // applied anywhere in this method - confirm against the MDO paper
132 |           val r: Double = -alphaV(p) / (Ti.cols - 1) + rand.nextFloat() * (alphaV(p) / (Ti.cols - 1) + alphaV(p) / (Ti.cols - 1))
133 |           // this number is the value for the attribute p
134 |           output(newIndex)(p) = r
135 |           // accumulate r^2 / alphaV(p), used to solve the ellipse equation for the last attribute
136 |           s = s + (r * r / alphaV(p))
137 |         })
138 |         // last attribute chosen so the point satisfies the ellipse equation, with a random sign
139 |         // NOTE(review): solving the ellipse equation would give sqrt((1 - s) * alphaV(last));
140 |         // the code omits the sqrt - confirm intended
141 |         val lastFeaVal: Double = (1 - s) * alphaV(alphaV.length - 1)
142 |         output(newIndex)(alphaV.size - 1) = if (rand.nextInt() % 2 == 0) -lastFeaVal else lastFeaVal
143 |         newIndex += 1
144 |       })
145 |     })
146 |     output
147 |   }
148 | }
140 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/ClusterOSS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | /** ClusterOSS. Original paper: "ClusterOSS: a new undersampling method for imbalanced learning."
25 | * by Victor H Barella, Eduardo P Costa and André C P L F Carvalho.
26 | *
27 | * @param data data to work with
28 | * @param seed seed to use. If it is not provided, it will use the system time
29 | * @param dist object of Distance enumeration representing the distance to be used
30 | * @param numClusters number of clusters to be created by KMeans algorithm
31 | * @param restarts number of times to relaunch KMeans algorithm
32 | * @param minDispersion stop KMeans core if dispersion is lower than this value
33 | * @param maxIterations number of iterations to be done in KMeans algorithm
34 | * @param normalize normalize the data or not
35 | * @param randomData iterate through the data randomly or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author Néstor Rodríguez Vico
38 | */
class ClusterOSS(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN,
                 numClusters: Int = 15, restarts: Int = 5, minDispersion: Double = 0.0001, maxIterations: Int = 100,
                 normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {

  /** Compute the ClusterOSS algorithm: cluster the majority class, keep the instance closest to
    * each centroid, keep the instances those representatives misclassify with 1-NN, and finally
    * clean the result with Tomek Links.
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    // the minority (least frequent) class must never be removed
    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)
    var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val classesToWorkWith: Array[Any] = if (randomData) {
      val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
      dataToWorkWith = (randomIndex map dataToWorkWith).toArray
      (randomIndex map data.y).toArray
    } else {
      data.y
    }

    // statistics required by the HVDM distance; left as null when the Euclidean distance is used
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    // cluster only the majority (removable) instances
    val majElements: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label != untouchableClass => i }
    val (_, centroids, assignment) = kMeans(data = majElements map dataToWorkWith, nominal = data.fileInfo.nominal,
      numClusters = numClusters, restarts = restarts, minDispersion = minDispersion, maxIterations = maxIterations, seed = seed)

    // For every cluster, keep the instance closest to its centroid; the rest become test candidates.
    // FIX: the distance to the centroid now honours the dist parameter — the original
    // unconditionally used the Euclidean distance here, even when Distance.HVDM was requested.
    val (closestInstances, restOfInstances) = assignment.par.map { cluster: (Int, Array[Int]) =>
      val distances: Array[(Int, Double)] = cluster._2.map { instance: Int =>
        val d: Double = if (dist == Distance.EUCLIDEAN) {
          euclidean(dataToWorkWith(instance), centroids(cluster._1))
        } else {
          HVDM(dataToWorkWith(instance), centroids(cluster._1), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
        }
        (instance, d)
      }

      // -1 marks an empty cluster; these placeholder values are filtered out below
      val closestInstance: Int = if (distances.isEmpty) -1 else distances.minBy(_._2)._1
      (closestInstance, cluster._2.diff(List(closestInstance)))
    }.toArray.unzip

    // Remove foo values (empty-cluster markers)
    val train: Array[Int] = closestInstances.diff(List(-1))
    // Flatten all the clusters
    val test: Array[Int] = restOfInstances.flatten
    val neighbours: Array[Array[Double]] = train map dataToWorkWith
    val classes: Array[Any] = train map classesToWorkWith

    val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(neighbours, classes, dataToWorkWith(0).length))
    } else {
      None
    }

    // classify every non-representative instance with the 1-NN rule trained on the representatives
    val calculatedLabels: Array[(Int, Any)] = test.zipWithIndex.map { i =>
      val label: Any = if (dist == Distance.EUCLIDEAN) {
        val labels = KDTree.get.nNeighbours(dataToWorkWith(i._1), 1)._2
        mode(labels.toArray)
      } else {
        nnRuleHVDM(neighbours, dataToWorkWith(i._1), -1, classes, 1, data.fileInfo.nominal, sds, attrCounter,
          attrClassesCounter, "nearest")._1
      }
      (i._1, label)
    }

    // misclassified instances are kept: they lie near the decision boundary and are handed,
    // together with the cluster representatives, to Tomek Links for the final cleaning
    // (the original comment claimed well-classified elements were kept, which contradicted the code)
    val misclassified: Array[Int] = calculatedLabels.collect { case (i, label) if label != classesToWorkWith(i) => i }
    val newDataIndex: Array[Int] = misclassified ++ train

    // Construct a data object to be passed to Tomek Link
    val auxData: Data = new Data(x = toXData(newDataIndex map dataToWorkWith),
      y = newDataIndex map classesToWorkWith, fileInfo = data.fileInfo)
    auxData.processedData = newDataIndex map dataToWorkWith
    val tl = new TL(auxData, dist = dist, minorityClass = Some(untouchableClass))
    val resultTL: Data = tl.compute()
    // The final instances is the result of applying Tomek Link to the content of newDataIndex
    val finalIndex: Array[Int] = (resultTL.index.get.toList map newDataIndex).toArray
    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/CPM.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 |
23 | import scala.collection.mutable.ArrayBuffer
24 | import scala.math.min
25 |
26 | /** Class Purity Maximization. Original paper: "An Unsupervised Learning Approach to Resolving the
27 | * Data Imbalanced Issue in Supervised Learning Problems in Functional Genomics" by Kihoon Yoon and Stephen Kwek.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param dist object of Distance enumeration representing the distance to be used
32 | * @param normalize normalize the data or not
33 | * @param randomData iterate through the data randomly or not
34 | * @param verbose choose to display information about the execution or not
35 | * @author Néstor Rodríguez Vico
36 | */
class CPM(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN,
          normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {

  /** Compute the CPM algorithm: recursively bisect the data into purer and purer clusters and
    * return only the cluster centres as the undersampled dataset.
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    // the minority (least frequent) class drives the purity computation
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)
    // indices of the cluster centres selected as the undersampled result
    val centers: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
    var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val classesToWorkWith: Array[Any] = if (randomData) {
      val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
      dataToWorkWith = (randomIndex map dataToWorkWith).toArray
      (randomIndex map data.y).toArray
    } else {
      data.y
    }

    // NOTE(review): takes the first entry of the counter map as the positive class; Map iteration
    // order is not guaranteed — confirm against the paper's definition of the initial impurity
    val posElements: Int = counter.head._2
    val negElements: Int = counter.tail.values.sum
    val impurity: Double = posElements.asInstanceOf[Double] / negElements.asInstanceOf[Double]
    // the initial cluster contains every instance
    val cluster: Array[Int] = new Array[Int](dataToWorkWith.length).indices.toArray

    // statistics required by the HVDM distance; left as null when the Euclidean distance is used
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    /** Recursively split parentCluster in two until the impurity no longer improves,
      * accumulating the centre of every terminal cluster into centers. */
    def purityMaximization(parentImpurity: Double, parentCluster: Array[Int], center: Int): Unit = {
      val cluster1: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
      val cluster2: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
      val posElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
      val negElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)

      var center1: Int = 0
      var center2: Int = 0
      var pointer: Int = 0
      var impurity: Double = Double.PositiveInfinity
      var impurity1: Double = Double.PositiveInfinity
      var impurity2: Double = Double.PositiveInfinity

      // FIX: labels are read from classesToWorkWith instead of data.y so that they stay
      // aligned with dataToWorkWith when randomData shuffled the instances
      parentCluster.foreach((f: Int) => if (classesToWorkWith(f) == untouchableClass) posElements += f else negElements += f)

      // every (negative, positive) pair of instances is a candidate pair of centres
      val pairs: ArrayBuffer[(Int, Int)] = for {x <- negElements; y <- posElements} yield (x, y)

      while (parentImpurity <= impurity) {
        // no candidate pair improves on the parent's impurity: keep the current centre
        if (pointer >= pairs.length) {
          centers += center
          return
        }

        center1 = pairs(pointer)._1
        center2 = pairs(pointer)._2

        // assign every instance of the parent cluster to the closest candidate centre
        // NOTE(review): cluster1/cluster2 are not cleared between candidate pairs, so instances
        // accumulate across iterations of this while loop — confirm this is intended
        parentCluster.foreach { element: Int =>
          val d1: Double = if (dist == Distance.EUCLIDEAN) {
            euclidean(dataToWorkWith(element), dataToWorkWith(center1))
          } else {
            HVDM(dataToWorkWith(element), dataToWorkWith(center1), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
          }

          val d2: Double = if (dist == Distance.EUCLIDEAN) {
            euclidean(dataToWorkWith(element), dataToWorkWith(center2))
          } else {
            HVDM(dataToWorkWith(element), dataToWorkWith(center2), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
          }

          if (d1 < d2)
            cluster1 += element else cluster2 += element
        }

        // impurity = fraction of minority instances in the cluster
        if (cluster1.nonEmpty)
          impurity1 = cluster1.count((element: Int) => classesToWorkWith(element) == untouchableClass).toDouble / cluster1.length
        else {
          // one side is empty: the other centre represents the whole cluster
          centers += center2
          return
        }

        if (cluster2.nonEmpty)
          impurity2 = cluster2.count((element: Int) => classesToWorkWith(element) == untouchableClass).toDouble / cluster2.length
        else {
          centers += center1
          return
        }

        impurity = min(impurity1, impurity2)
        pointer += 1
      }

      purityMaximization(impurity1, cluster1.toArray, center1)
      purityMaximization(impurity2, cluster2.toArray, center2)
    }

    purityMaximization(impurity, cluster, 0)

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (centers.toArray map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(centers.toArray.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (centers.toArray.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(centers.toArray map data.x, centers.toArray map data.y, Some(centers.toArray), data.fileInfo)
  }
}
155 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/SMOTERSB.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.Array._
25 | import scala.collection.mutable.ArrayBuffer
26 | import scala.util.Random
27 |
28 | /** SMOTERSB algorithm. Original paper: "kNN Approach to Unbalanced Data Distribution: SMOTE-RSB: a hybrid preprocessing
29 | * approach based on oversampling and undersampling for high imbalanced data-sets using SMOTE and rough sets theory"
30 | * by Enislay Ramentol, Yailé Caballero, Rafael Bello and Francisco Herrera.
31 | *
32 | * @param data data to work with
33 | * @param seed seed to use. If it is not provided, it will use the system time
34 | * @param percent amount of Smote N%
35 | * @param k number of minority class nearest neighbors
36 | * @param dist object of Distance enumeration representing the distance to be used
37 | * @param normalize normalize the data or not
38 | * @param verbose choose to display information about the execution or not
39 | * @author David López Pretel
40 | */
class SMOTERSB(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5,
               dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {

  /** Compute the SMOTERSB algorithm: generate SMOTE synthetic samples, then keep only those
    * belonging to the lower approximation of the minority class (rough-set similarity filter).
    *
    * @return synthetic samples generated
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    if (percent > 100 && percent % 100 != 0) {
      throw new Exception("Percent must be a multiple of 100")
    }

    val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val minorityClassIndex: Array[Int] = minority(data.y)
    val minorityClass: Any = data.y(minorityClassIndex(0))

    // statistics required by the HVDM distance; left as null when the Euclidean distance is used
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(samples, data.y, samples(0).length))
    } else {
      None
    }

    // check if the percent is correct: T seed samples, N synthetic samples per seed
    var T: Int = minorityClassIndex.length
    var N: Int = percent

    if (N < 100) {
      // percent < 100 means only a random fraction of the minority samples is used as seeds.
      // FIX: the original computed T = N / 100 * T, whose integer division always yielded
      // T = 0 (no synthetic samples at all); multiply before dividing instead
      T = N * T / 100
      N = 100
    }
    N = N / 100

    // output with a size of T*N samples
    val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length)

    val r: Random = new Random(seed)

    // for each minority class sample
    minorityClassIndex.indices.par.foreach((i: Int) => {
      val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
        KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
      } else {
        kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }

      // compute populate for the sample: N interpolated samples between it and random neighbours
      (0 until N).par.foreach((n: Int) => {
        val nn: Int = neighbors(r.nextInt(neighbors.length))
        // compute attributes of the sample: seed + gap * (neighbour - seed)
        samples(0).indices.foreach((atrib: Int) => {
          val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib)
          val gap: Double = r.nextFloat()
          output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff
        })
      })
    })

    //compute the majority class
    val majorityClassIndex: Array[Int] = samples.indices.diff(minorityClassIndex.toList).toArray

    // minimum and maximum value for each attrib
    val maxMinValues: Array[(Double, Double)] = Array.concat(majorityClassIndex map samples, output).transpose.map(column => (column.max, column.min))

    // similarity between every synthetic sample and every majority sample (rows x columns)
    val similarityMatrix: Array[Array[Double]] = Array.ofDim(output.length, majorityClassIndex.length)
    output.indices.par.foreach(i => {
      (majorityClassIndex map samples).zipWithIndex.par.foreach(j => {
        similarityMatrix(i)(j._2) = output(i).indices.map(k => {
          if (data.nomToNum(0).isEmpty) {
            1 - (Math.abs(output(i)(k) - j._1(k)) / (maxMinValues(k)._1 - maxMinValues(k)._2)) // this expression must be multiplied by wk
          } else { // but all the features are included, so wk is 1
            if (output(i)(k) == j._1(k)) 1 else 0
          }
        }).sum / output(i).length
      })
    })

    // a synthetic sample belongs to the lower approximation if it is not too similar to any
    // majority sample; the similarity threshold is swept from 0.4 to 0.9 in steps of 0.05
    var result: ArrayBuffer[Int] = ArrayBuffer()
    var similarityValue: Double = 0.4
    var lowerApproximation: Boolean = true
    while (similarityValue < 0.9) {
      output.indices.foreach(i => {
        lowerApproximation = true
        majorityClassIndex.indices.foreach(j => {
          if (similarityMatrix(i)(j) > similarityValue)
            lowerApproximation = false
        })
        if (lowerApproximation) result += i
      })
      similarityValue += 0.05
    }

    //if there are not synthetic samples with lower approximation, return all synthetic samples
    if (result.isEmpty) {
      result = ArrayBuffer.range(0, output.length)
    } else {
      result = result.distinct
    }

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(data.x.length + output.length))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(if (data.fileInfo.nominal.length == 0) {
      to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(result.toArray map output, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else result.toArray map output))
    } else {
      toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(result.toArray map output, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else result.toArray map output), data.nomToNum)
    }, Array.concat(data.y, Array.fill((result.toArray map output).length)(minorityClass)), None, data.fileInfo)
  }
}
167 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/NCL.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.collection.mutable.ArrayBuffer
25 |
26 | /** Neighbourhood Cleaning Rule. Original paper: "Improving Identification of Difficult Small Classes by Balancing Class
27 | * Distribution" by J. Laurikkala.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param dist object of Distance enumeration representing the distance to be used
32 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours)
33 | * @param threshold consider a class to be undersampled if the number of instances of this class is
34 | * greater than data.size * threshold
35 | * @param normalize normalize the data or not
36 | * @param randomData iterate through the data randomly or not
37 | * @param verbose choose to display information about the execution or not
38 | * @author Néstor Rodríguez Vico
39 | */
class NCL(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, k: Int = 3,
          threshold: Double = 0.5, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {
  /** Compute the NCL algorithm: remove noisy majority instances with ENN (A1), then remove
    * majority neighbours of misclassified minority instances (A2).
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    // Note: the notation used to refers the subsets of data is the used in the original paper.
    val initTime: Long = System.nanoTime()

    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    // the minority (least frequent) class must never be removed
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)
    var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val classesToWorkWith: Array[Any] = if (randomData) {
      val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
      dataToWorkWith = (randomIndex map dataToWorkWith).toArray
      (randomIndex map data.y).toArray
    } else {
      data.y
    }

    // statistics required by the HVDM distance; left as null when the Euclidean distance is used
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    val minorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
    val majorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0)

    var i = 0
    while (i < classesToWorkWith.length) {
      if (classesToWorkWith(i) == untouchableClass) minorityIndex += i else majorityIndex += i
      i += 1
    }

    // ENN can not be applied when only one class is in the less important group
    val indexA1: Array[Int] = if (classesToWorkWith.distinct.length > 2) {
      val ennData = new Data(toXData((majorityIndex map dataToWorkWith).toArray), (majorityIndex map classesToWorkWith).toArray, None, data.fileInfo)
      ennData.processedData = (majorityIndex map dataToWorkWith).toArray
      val enn = new ENN(ennData, dist = dist, k = k)
      val resultENN: Data = enn.compute()
      classesToWorkWith.indices.diff(resultENN.index.get).toArray
    } else {
      new Array[Int](0)
    }

    val uniqueMajClasses = (majorityIndex map classesToWorkWith).distinct
    // a class is only undersampled while it still has more than ratio instances
    val ratio: Double = dataToWorkWith.length * threshold

    // k-NN search structure over the whole dataset.
    // FIX: the original built the tree from the minority *data* paired with the majority *labels*
    // (two arrays of different lengths); it now uses the full dataset, mirroring the HVDM branch
    val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
      Some(new KDTree(dataToWorkWith, classesToWorkWith, dataToWorkWith(0).length))
    } else {
      None
    }

    // Euclidean variant: if the k-NN misclassify minority instance l, select its removable majority neighbours
    def selectNeighbours(l: Int): ArrayBuffer[Int] = {
      var selectedElements = new ArrayBuffer[Int](0)
      // NOTE(review): the query point itself may come back among its own neighbours —
      // confirm the KDTree semantics (the HVDM branch excludes l explicitly)
      val (_, labels, index) = KDTree.get.nNeighbours(dataToWorkWith(l), k)
      val label = mode(labels.toArray)

      if (label != classesToWorkWith(l)) {
        index.foreach { n =>
          if (classesToWorkWith(n) != untouchableClass && counter(classesToWorkWith(n)) > ratio) {
            selectedElements += n
          }
        }
      }
      selectedElements
    }

    // HVDM variant of selectNeighbours, using the brute-force nearest-neighbour rule
    def selectNeighboursHVDM(l: Int): ArrayBuffer[Int] = {
      val selectedElements = new ArrayBuffer[Int]()
      val (label, nNeighbours, _) = nnRuleHVDM(dataToWorkWith, dataToWorkWith(l), l, classesToWorkWith, k, data.fileInfo.nominal,
        sds, attrCounter, attrClassesCounter, "nearest")

      if (label != classesToWorkWith(l)) {
        nNeighbours.foreach { n =>
          val nNeighbourClass: Any = classesToWorkWith(n)
          if (nNeighbourClass != untouchableClass && counter(nNeighbourClass) > ratio) {
            selectedElements += n
          }
        }
      }
      selectedElements
    }

    // NOTE(review): every pass of this loop performs the identical computation once per unique
    // majority class — confirm whether a per-class pass was intended instead
    var j = 0
    val indexA2 = new ArrayBuffer[Int](0)
    while (j < uniqueMajClasses.length) {
      val selectedNeighbours: Array[ArrayBuffer[Int]] = if (dist == Distance.EUCLIDEAN) {
        minorityIndex.par.map(l => selectNeighbours(l)).toArray
      } else {
        minorityIndex.par.map(l => selectNeighboursHVDM(l)).toArray
      }

      selectedNeighbours.flatten.distinct.foreach(e => indexA2 += e)
      j += 1
    }

    // keep everything except A1 (ENN removals) and A2 (neighbourhood cleaning removals)
    val finalIndex: Array[Int] = classesToWorkWith.indices.diff(indexA1.toList ++ indexA2.distinct).toArray
    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/BC.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 |
23 | import scala.collection.mutable.ArrayBuffer
24 |
25 | /** Balance Cascade algorithm. Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu,
26 | * Jianxin Wu and Zhi-Hua Zhou.
27 | *
28 | * @param data data to work with
29 | * @param seed seed to use. If it is not provided, it will use the system time
30 | * @param dist object of Distance enumeration representing the distance to be used
31 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours)
32 | * @param nMaxSubsets maximum number of subsets to generate
33 | * @param nFolds number of subsets to create when applying cross-validation
34 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1 so there
35 | * will be the same minority class examples as majority class examples. It will take
36 | * numMinorityInstances * ratio
37 | * @param normalize normalize the data or not
38 | * @param randomData iterate through the data randomly or not
39 | * @param verbose choose to display information about the execution or not
40 | * @author Néstor Rodríguez Vico
41 | */
42 | class BC(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN,
43 | k: Int = 3, nMaxSubsets: Int = 5, nFolds: Int = 5, ratio: Double = 1.0, normalize: Boolean = false,
44 | randomData: Boolean = false, verbose: Boolean = false) {
45 |
46 | /** Compute the BC algorithm.
47 | *
48 | * @return undersampled data structure
49 | */
50 | def compute(): Data = {
51 | val initTime: Long = System.nanoTime()
52 |
53 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
54 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
55 | val random: scala.util.Random = new scala.util.Random(seed)
56 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
57 | val classesToWorkWith: Array[Any] = if (randomData) {
58 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
59 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray
60 | (randomIndex map data.y).toArray
61 | } else {
62 | data.y
63 | }
64 |
   | // HVDM needs per-attribute statistics; they are only computed when that distance is selected.
65 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
66 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
67 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
68 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
69 | } else {
70 | (null, null, null)
71 | }
72 |
73 | var search: Boolean = true
74 | var subsetsCounter: Int = 0
   | // mask(i) == true means instance i is still available for sampling into a subset.
75 | val mask: Array[Boolean] = Array.fill(classesToWorkWith.length)(true)
76 | val subsets: ArrayBuffer[Array[Int]] = new ArrayBuffer[Array[Int]](0)
77 | val minorityElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
78 | val majorityElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
79 |
   | // Balance Cascade main loop: each pass builds a balanced subset (all minority instances plus,
   | // per majority class, a random sample of the still-available instances, capped at the minority
   | // count), evaluates it with cross-validated k-NN, and removes from future consideration the
   | // majority instances that were classified correctly.
80 | while (search) {
81 | val indexToUnderSample: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
82 | val minorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
83 | val classesCounter: Map[Any, Int] = (boolToIndex(mask) map classesToWorkWith).groupBy(identity).mapValues(_.length)
84 |
85 | classesCounter.foreach { target: (Any, Int) =>
86 | val indexClass: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (c, i) if c == target._1 => i }
87 | if (target._1 != untouchableClass) {
88 | val sameClassBool: Array[Boolean] = mask.zipWithIndex.collect { case (c, i) if classesToWorkWith(i) == target._1 => c }
89 | val indexClassInterest: Array[Int] = boolToIndex(sameClassBool) map indexClass
90 | val indexTargetClass: List[Int] = random.shuffle((indexClassInterest map classesToWorkWith).indices.toList).take(counter(untouchableClass))
91 | indexToUnderSample ++= (indexTargetClass map indexClassInterest)
92 | majorityElements ++= (indexTargetClass map indexClassInterest)
93 | } else {
94 | minorityIndex ++= indexClass
95 | minorityElements ++= indexClass
96 | }
97 | }
98 |
99 | subsetsCounter += 1
100 | val subset: Array[Int] = (indexToUnderSample ++ minorityIndex).toArray
101 | subsets += subset // NOTE(review): subsets is only written, never read in this block — candidate for removal
102 |
103 | val classesToWorkWithSubset: Array[Any] = subset map classesToWorkWith
104 | val dataToWorkWithSubset: Array[Array[Double]] = subset map dataToWorkWith
    | // The prediction array is ordered like `subset`, so take() keeps only the majority part.
105 | val prediction: Array[Any] = (if (dist == Distance.EUCLIDEAN) {
106 | kFoldPrediction(dataToWorkWithSubset, classesToWorkWithSubset, k, nFolds, "nearest")
107 | } else {
108 | kFoldPredictionHVDM(dataToWorkWithSubset, classesToWorkWithSubset, k, nFolds, data.fileInfo.nominal, sds, attrCounter,
109 | attrClassesCounter, "nearest")
110 | }).take(indexToUnderSample.length)
111 |
    | // Majority instances the cross-validated classifier got right are masked out so later
    | // subsets concentrate on the harder examples.
112 | val classifiedInstances: Array[Boolean] = ((indexToUnderSample.indices map classesToWorkWithSubset)
113 | zip prediction).map((e: (Any, Any)) => e._1 == e._2).toArray
114 | (boolToIndex(classifiedInstances) map indexToUnderSample).foreach((i: Int) => mask(i) = false)
115 |
116 | if (subsetsCounter == nMaxSubsets) search = false
117 |
    | // Also stop when any majority class has fewer remaining instances than the minority count.
118 | val finalTargetStats: Map[Any, Int] = (boolToIndex(mask) map classesToWorkWith).groupBy(identity).mapValues(_.length)
119 | classesToWorkWith.distinct.filter((c: Any) => c != untouchableClass).foreach { c: Any =>
120 | if (finalTargetStats(c) < counter(untouchableClass)) search = false
121 | }
122 | }
123 |
    | // Rank majority instances by how often they were sampled across subsets and keep the
    | // most frequent ones, up to (number of distinct minority instances) * ratio.
124 | val majorityIndexHistogram: Array[(Int, Int)] = majorityElements.groupBy(identity).mapValues(_.length).toArray.sortBy(_._2).reverse
125 | val majorityIndex: Array[Int] = majorityIndexHistogram.take((minorityElements.distinct.length * ratio).toInt).map(_._1)
126 | val finalIndex: Array[Int] = minorityElements.distinct.toArray ++ majorityIndex
127 | val finishTime: Long = System.nanoTime()
128 |
129 | if (verbose) {
130 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
131 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
132 | println("NEW DATA SIZE: %d".format(finalIndex.length))
133 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
134 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
135 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
136 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
137 | }
138 |
139 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
140 | }
141 | }
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/CNN.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.collection.mutable.ListBuffer
25 |
26 | /** Condensed Nearest Neighbor decision rule. Original paper: "The Condensed Nearest Neighbor Rule" by P. Hart.
27 | *
28 | * @param data data to work with
29 | * @param seed seed to use. If it is not provided, it will use the system time
30 | * @param dist object of Distance enumeration representing the distance to be used
31 | * @param normalize normalize the data or not
32 | * @param randomData iterate through the data randomly or not
33 | * @param verbose choose to display information about the execution or not
34 | * @author Néstor Rodríguez Vico
35 | */
36 | class CNN(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN,
37 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {
38 |
39 | /** Compute the CNN algorithm
40 | *
41 | * @return undersampled data structure
42 | */
43 | def compute(): Data = {
44 | val initTime: Long = System.nanoTime()
45 |
46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
47 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
48 | val random: scala.util.Random = new scala.util.Random(seed)
49 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
50 | val classesToWorkWith: Array[Any] = if (randomData) {
51 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
52 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray
53 | (randomIndex map data.y).toArray
54 | } else {
55 | data.y
56 | }
57 |
   | // HVDM needs per-attribute statistics; they are only computed when that distance is selected.
58 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
59 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
60 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
61 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
62 | } else {
63 | (null, null, null)
64 | }
65 |
66 | val finalIndex: Array[Int] = if (dist == Distance.HVDM) {
67 | // Indicate the corresponding group: 1 for store, 0 for unknown, -1 for grabbag
68 | val location: Array[Int] = List.fill(dataToWorkWith.length)(0).toArray
69 | // The first element is added to store
70 | location(0) = 1
71 | var changed = true
72 |
73 | // Iterate the data, x (except the first instance)
74 | dataToWorkWith.zipWithIndex.tail.foreach { element: (Array[Double], Int) =>
75 | // and classify each element with the actual content of store
76 | val index: Array[Int] = location.zipWithIndex.collect { case (a, b) if a == 1 => b }
77 | val neighbours: Array[Array[Double]] = index map dataToWorkWith
78 | val classes: Array[Any] = index map classesToWorkWith
79 | val label: (Any, Array[Int], Array[Double]) = nnRuleHVDM(neighbours, element._1, -1, classes, 1, data.fileInfo.nominal,
80 | sds, attrCounter, attrClassesCounter, "nearest")
81 |
82 | // If it is misclassified by the current content of store it is added to store; otherwise, it is added to grabbag
83 | location(element._2) = if (label._1 != classesToWorkWith(element._2)) 1 else -1
84 | }
85 |
86 | // After a first pass, iterate grabbag until it is exhausted:
87 | // 1. There is no element in grabbag or
88 | // 2. There is no data change between grabbag and store after a full iteration
89 | while (location.count((z: Int) => z == -1) != 0 && changed) {
90 | changed = false
91 | // Now, instead of iterating x, we iterate grabbag
92 | location.zipWithIndex.filter((x: (Int, Int)) => x._1 == -1).foreach { element: (Int, Int) =>
93 | val index: Array[Int] = location.zipWithIndex.collect { case (a, b) if a == 1 => b }
94 | val neighbours: Array[Array[Double]] = index map dataToWorkWith
95 | val classes: Array[Any] = index map classesToWorkWith
96 | val label: Any = nnRuleHVDM(neighbours, dataToWorkWith(element._2), -1, classes, 1, data.fileInfo.nominal,
97 | sds, attrCounter, attrClassesCounter, "nearest")._1
98 | // If it is misclassified by the current content of store it is added to store; otherwise, it stays in grabbag
99 | location(element._2) = if (label != classesToWorkWith(element._2)) {
100 | changed = true
101 | 1
102 | } else -1
103 | }
104 | }
105 |
106 | location.zipWithIndex.filter((x: (Int, Int)) => x._1 == 1).collect { case (_, a) => a }
107 | } else {
    | // Euclidean branch: same Hart procedure, but store is kept in a KD-tree for fast 1-NN queries.
108 | val store: KDTree = new KDTree(Array(dataToWorkWith(0)), Array(classesToWorkWith(0)), dataToWorkWith(0).length)
109 | var grabbag: ListBuffer[(Array[Double], Int)] = new ListBuffer[(Array[Double], Int)]()
110 | var newGrabbag: ListBuffer[(Array[Double], Int)] = new ListBuffer[(Array[Double], Int)]()
111 |
112 | // Iterate the data, x (except the first instance)
113 | dataToWorkWith.zipWithIndex.tail.foreach { instance: (Array[Double], Int) =>
114 | val label = mode(store.nNeighbours(instance._1, k = 1, leaveOneOut = false)._2.toArray)
115 | if (label != classesToWorkWith(instance._2)) {
116 | store.addElement(instance._1, classesToWorkWith(instance._2))
117 | } else {
118 | grabbag += instance
119 | }
120 | }
121 |
122 | var changed = true
123 | while (grabbag.nonEmpty && changed) {
124 | changed = false
125 | grabbag.foreach { instance =>
126 | val label = mode(store.nNeighbours(instance._1, k = 1, leaveOneOut = false)._2.toArray)
127 | if (label != classesToWorkWith(instance._2)) {
128 | store.addElement(instance._1, classesToWorkWith(instance._2))
129 | changed = true
130 | } else {
131 | newGrabbag += instance
132 | }
133 | }
134 |
135 | grabbag = newGrabbag
136 | newGrabbag = new ListBuffer[(Array[Double], Int)]()
137 | }
138 |
    | // NOTE(review): indices are recovered from the KD-tree's internal map; assumes KDTree keeps
    | // the original dataset index alongside each stored instance — confirm in soul.util.KDTree.
139 | store.kDTreeMap.values.unzip._2.toArray
140 | }
141 |
142 | val finishTime: Long = System.nanoTime()
143 |
144 | if (verbose) {
145 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
146 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
147 | println("NEW DATA SIZE: %d".format(finalIndex.length))
148 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
149 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
150 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
151 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
152 | }
153 |
154 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
155 | }
156 | }
157 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/NM.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.collection.mutable.ArrayBuffer
25 | import scala.util.Random
26 |
27 | /** NearMiss. Original paper: "kNN Approach to Unbalanced Data Distribution: A Case Study involving Information
28 | * Extraction" by Jianping Zhang and Inderjeet Mani.
29 | *
30 | * @param data data to work with
31 | * @param seed seed to use. If it is not provided, it will use the system time
32 | * @param dist object of Distance enumeration representing the distance to be used
33 | * @param version version of the algorithm to execute (1, 2 or 3)
34 | * @param nNeighbours number of neighbours to take for each minority example (only used if version is set to 3)
35 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1 so there
36 | * will be the same minority class examples as majority class examples. It will take
37 | * numMinorityInstances * ratio
38 | * @param normalize normalize the data or not
39 | * @param randomData iterate through the data randomly or not
40 | * @param verbose choose to display information about the execution or not
41 | * @author Néstor Rodríguez Vico
42 | */
43 | class NM(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, version: Int = 1,
44 | nNeighbours: Int = 3, ratio: Double = 1.0, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {
45 |
46 | /** Compute the NM algorithm.
47 | *
48 | * @return undersampled data structure
49 | */
50 | def compute(): Data = {
51 | val initTime: Long = System.nanoTime()
52 |
53 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
54 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
55 | val random: scala.util.Random = new scala.util.Random(seed)
56 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
57 | val classesToWorkWith: Array[Any] = if (randomData) {
58 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
59 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray
60 | (randomIndex map data.y).toArray
61 | } else {
62 | data.y
63 | }
64 |
   | // HVDM needs per-attribute statistics; they are only computed when that distance is selected.
65 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
66 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
67 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
68 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
69 | } else {
70 | (null, null, null)
71 | }
72 |
   | // Split indices, instances and labels into minority (untouchable) and majority groups.
73 | val majElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
74 | val minElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
75 | classesToWorkWith.zipWithIndex.foreach(i => if (i._1 == untouchableClass) minElements += i._2 else majElements += i._2)
76 | val minNeighbours: Array[Array[Double]] = minElements.toArray map dataToWorkWith
77 | val majNeighbours: Array[Array[Double]] = majElements.toArray map dataToWorkWith
78 | val minClasses: Array[Any] = minElements.toArray map classesToWorkWith
79 | val majClasses: Array[Any] = majElements.toArray map classesToWorkWith
80 |
   | // KD-trees are only built for the Euclidean distance; HVDM falls back to nnRuleHVDM.
81 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
82 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length))
83 | } else {
84 | None
85 | }
86 |
87 | val majorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
88 | Some(new KDTree(majNeighbours, majClasses, dataToWorkWith(0).length))
89 | } else {
90 | None
91 | }
92 |
93 | val reverseKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
94 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length, which = "farthest"))
95 | } else {
96 | None
97 | }
98 |
   | // Version 1: keep majority examples with the smallest mean distance to their 3 nearest minority neighbours.
99 | val selectedMajElements: Array[Int] = if (version == 1) {
100 | majElements.map { i: Int =>
101 | if (dist == Distance.EUCLIDEAN) {
    | // NOTE(review): ._3 indices come from a KD-tree built on minority instances only; verify
    | // they are global dataset indices (see soul.util.KDTree) before trusting dataToWorkWith(j).
102 | val index = KDTree.get.nNeighbours(dataToWorkWith(i), 3)._3
103 | (i, index.map(j => euclidean(dataToWorkWith(i), dataToWorkWith(j))).sum / index.length)
104 | } else {
105 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i), -1, minClasses, 3,
106 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")
107 | (i, (result._2 map result._3).sum / result._2.length)
108 | }
109 | }.toArray.sortBy(_._2).map(_._1)
110 | } else if (version == 2) {
    | // Version 2: keep majority examples with the smallest mean distance to their 3 farthest minority neighbours.
111 | majElements.map { i: Int =>
112 | if (dist == Distance.EUCLIDEAN) {
113 | val index = reverseKDTree.get.nNeighbours(dataToWorkWith(i), 3)._3
114 | (i, index.map(j => euclidean(dataToWorkWith(i), dataToWorkWith(j))).sum / index.length)
115 | } else {
116 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i), -1, minClasses, 3,
117 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "farthest")
118 | (i, (result._2 map result._3).sum / result._2.length)
119 | }
120 | }.toArray.sortBy(_._2).map(_._1)
121 | } else if (version == 3) {
122 | // Version 3: take the nNeighbours nearest majority neighbours of every minority example.
123 | // We shuffle because, in the end, we take at most minElements.length * ratio elements; without
124 | // shuffling we would only keep majority examples near the first minority class examples.
125 | new Random(seed).shuffle(minElements.flatMap { i: Int =>
126 | if (dist == Distance.EUCLIDEAN) {
    | // FIX: honour the nNeighbours parameter (was hard-coded to 3, so the parameter documented as
    | // "only used if version is set to 3" was silently ignored on the Euclidean path; the HVDM
    | // branch below already uses nNeighbours).
127 | majorityKDTree.get.nNeighbours(dataToWorkWith(i), nNeighbours)._3
128 | } else {
129 | nnRuleHVDM(majNeighbours, dataToWorkWith(i), -1, majClasses, nNeighbours, data.fileInfo.nominal, sds, attrCounter,
130 | attrClassesCounter, "nearest")._2
131 | }
132 | }.distinct.toList).toArray
133 | } else {
134 | throw new Exception("Invalid argument: version should be: 1, 2 or 3")
135 | }
136 |
    | // Keep all minority instances plus the first minElements.length * ratio selected majority instances.
137 | val finalIndex: Array[Int] = minElements.toArray ++ selectedMajElements.take((minElements.length * ratio).toInt)
138 | val finishTime: Long = System.nanoTime()
139 |
140 | if (verbose) {
141 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
142 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
143 | println("NEW DATA SIZE: %d".format(finalIndex.length))
144 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
145 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
146 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
147 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
148 | }
149 |
150 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
151 | }
152 | }
152 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SOUL
2 |
3 | ### Scala Oversampling and Undersampling Library
4 |
5 | Included algorithms for oversampling:
6 |
7 | * **Random Oversampling.** Original paper: "A study of the behavior of several methods for balancing machine learning training data" by Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina.
8 |
9 | * **SMOTE.** Original paper: "SMOTE: Synthetic Minority Over-sampling Technique" by Nitesh V. Chawla, Kevin W. Bowyer, Lawrence O. Hall and W. Philip Kegelmeyer.
10 |
11 | * **SMOTE + ENN.** Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard.
12 |
13 | * **SMOTE + TL.** Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard.
14 |
15 | * **Borderline-SMOTE.** Original paper: "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets Learning." by Hui Han, Wen-Yuan Wang, and Bing-Huan Mao.
16 |
17 | * **Adasyn.** Original paper: "ADASYN: Adaptive Synthetic Sampling Approach for Imbalanced Learning" by Haibo He, Yang Bai, Edwardo A. Garcia, and Shutao Li.
18 |
19 | * **Adoms.** Original paper: "The Generation Mechanism of Synthetic Minority Class Examples" by Sheng TANG and Si-ping CHEN.
20 |
21 | * **SafeLevel-SMOTE.** Original paper: "Safe-Level-SMOTE: Safe-Level-Synthetic Minority Over-Sampling TEchnique for Handling the Class Imbalanced Problem" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran, and Chidchanok Lursinsap.
22 |
23 | * **Spider2.** Original paper: "Learning from Imbalanced Data in Presence of Noisy and Borderline Examples" by Krystyna Napierała, Jerzy Stefanowski and Szymon Wilk.
24 |
25 | * **DBSMOTE.** Original paper: "DBSMOTE: Density-Based Synthetic Minority Over-sampling Technique" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran and Chidchanok Lursinsap.
26 |
27 | * **SMOTE-RSB.** Original paper: "SMOTE-RSB: a hybrid preprocessing approach based on oversampling and undersampling for high imbalanced data-sets using SMOTE and rough sets theory" by Enislay Ramentol, Yailé Caballero, Rafael Bello and Francisco Herrera.
28 |
29 | * **MWMOTE.** Original paper: "MWMOTE—Majority Weighted Minority Oversampling Technique for Imbalanced Data Set Learning" by Sukarna Barua, Md. Monirul Islam, Xin Yao and Kazuyuki Murase.
30 |
31 | * **MDO.** Original paper: "To combat multi-class imbalanced problems by means of over-sampling and boosting techniques" by Lida Abdi and Sattar Hashemi.
32 |
33 | Included algorithms for undersampling:
34 |
35 | * **Random Undersampling.** Original paper: "A study of the behavior of several methods for balancing machine learning training data" by Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina.
36 |
37 | * **Condensed Nearest Neighbor decision rule.** Original paper: "The Condensed Nearest Neighbor Rule" by P. Hart.
38 |
39 | * **Edited Nearest Neighbour rule.** Original paper: "Asymptotic Properties of Nearest Neighbor Rules Using Edited Data" by Dennis L. Wilson.
40 |
41 | * **Tomek Link.** Original paper: "Two Modifications of CNN" by Ivan Tomek.
42 |
43 | * **One-Sided Selection.** Original paper: "Addressing the Curse of Imbalanced Training Sets: One-Sided Selection" by Miroslav Kubat and Stan Matwin.
44 |
45 | * **Neighbourhood Cleaning Rule.** Original paper: "Improving Identification of Difficult Small Classes by Balancing Class Distribution" by J. Laurikkala.
46 |
47 | * **NearMiss.** Original paper: "kNN Approach to Unbalanced Data Distribution: A Case Study involving Information Extraction" by Jianping Zhang and Inderjeet Mani.
48 |
49 | * **Class Purity Maximization algorithm.** Original paper: "An Unsupervised Learning Approach to Resolving the Data Imbalanced Issue in Supervised Learning Problems in Functional Genomics" by Kihoon Yoon and Stephen Kwek.
50 |
51 | * **Undersampling Based on Clustering.** Original paper: "Under-Sampling Approaches for Improving Prediction of the Minority Class in an Imbalanced Dataset" by Show-Jane Yen and Yue-Shi Lee.
52 |
53 | * **Balance Cascade.** Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, Jianxin Wu and Zhi-Hua Zhou.
54 |
55 | * **Easy Ensemble.** Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, Jianxin Wu and Zhi-Hua Zhou.
56 |
57 | * **Evolutionary Undersampling.** Original paper: "Evolutionary Under-Sampling for Classification with Imbalanced Data Sets: Proposals and Taxonomy" by Salvador Garcia and Francisco Herrera.
58 |
59 | * **Instance Hardness Threshold.** Original paper: "An Empirical Study of Instance Hardness" by Michael R. Smith, Tony Martinez and Christophe Giraud-Carrier.
60 |
61 | * **ClusterOSS.** Original paper: "ClusterOSS: a new undersampling method for imbalanced learning." by Victor H Barella, Eduardo P Costa and André C. P. L. F. Carvalho.
62 |
63 | * **Iterative Instance Adjustment for Imbalanced Domains.** Original paper: "Addressing imbalanced classification with instance generation techniques: IPADE-ID" by Victoria López, Isaac Triguero, Cristóbal J. Carmona, Salvador García and Francisco Herrera.
64 |
65 | ### How-to use it
66 |
67 | If you are going to use this library from another `sbt` project, you just need to clone the original repository, in the root folder of the cloned repository execute `sbt publishLocal` and add the following dependency to the `build.sbt` file of your project:
68 |
69 | ```scala
70 | libraryDependencies += "com.github.soul" %% "soul" % "1.0.0"
71 | ```
72 |
73 | To read a data file you only need to do this:
74 |
75 | ```scala
76 | import soul.io.Reader
77 | import soul.data.Data
78 |
79 | /* Read a csv file or any delimited text file */
80 | val csvData: Data = Reader.readDelimitedText(file = "path/to/file.csv")
81 | /* Read a WEKA arff file */
82 | val arffData: Data = Reader.readArff(file = "path/to/file.arff")
83 | ```
84 |
85 | Now we're going to run an undersampling algorithm:
86 |
87 | ```scala
88 | import soul.algorithm.undersampling.NCL
89 | import soul.data.Data
90 |
91 | val nclCSV = new NCL(csvData)
92 | val resultCSV: Data = nclCSV.compute()
93 |
94 | val nclARFF = new NCL(arffData)
95 | val resultARFF: Data = nclARFF.compute()
96 | ```
97 |
98 | In this example we've used an undersampling algorithm but it's the same for an oversampling one. All the algorithm's parameters have default values so you don't need to specify any of them.
99 |
100 | Finally, we only need to save the result to a file:
101 |
102 | ```scala
103 | import soul.io.Writer
104 |
105 | Writer.writeDelimitedText(file = "path/to/output.csv", data = resultCSV)
106 | Writer.writeArff(file = "path/to/output.arff", data = resultARFF)
107 | ```
108 |
109 | ### Experiments
110 |
111 | With the objective of showing the capabilities of **SOUL**, we have generated a two dimension synthetic imbalanced dataset with 1,871 instances. Among them, 1,600 instances belong to the majority class and the remaining 271 belongs to the minority class, leading to about a 17% of minority instances in the whole dataset (IR=5.9). The representation of this dataset can be found below, where we may observe a clear overlapping between the classes, as well as a cluster of minority instances in the middle of the majority instances.
112 |
113 | Next, we have used the following parameters of the algorithms to perform an experiment with some relevant oversampling and undersampling approaches:
114 |
115 |
116 | * **MWMOTE**: *seed*: 0, *N*: 1400, *k1*: 5, *k2*: 5, *k3*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false.
117 |
118 | * **SMOTE**: *seed*: 0, *percent*: 500, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false.
119 |
120 | * **ADASYN**: *seed*: 0, *d*: 1, *B*: 1, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false.
121 |
122 | * **SafeLevelSMOTE**: *seed*: 0, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false.
123 |
124 | * **IHTS**: *seed* = 0, *nFolds* = 5, *normalize* = false, *randomData* = false, *verbose* = false
125 |
126 | * **IPADE**: *seed* = 0, *iterations* = 100, *strategy* = 1, *randomChoice* = true, *normalize* = false, *randomData* = false, *verbose* = false
127 |
128 | * **NCL**: *seed* = 0, *dist* = euclidean, *k* = 3, *threshold* = 0.5, *normalize* = false, *randomData* = false, *verbose* = false
129 |
130 | * **SBC**: *seed* = 0, *method* = "NearMiss1", *m* = 1.0, *k* = 3, *numClusters* = 50, *restarts* = 1, *minDispersion* = 0.0001, *maxIterations* = 200, *dist* = euclidean, *normalize* = false, *randomData* = false, *verbose* = false
131 |
132 |
133 | 
134 |
135 |
136 | |  |  |
137 | | ------------- | ------------- |
138 |  | 
139 |
140 |
141 | |  |  |
142 | | ------------- | ------------- |
143 |  | 
144 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/Spider2.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 |
23 | import scala.collection.mutable.ArrayBuffer
24 |
25 | /** Spider2 algorithm. Original paper: "Learning from Imbalanced Data in Presence of Noisy and Borderline Examples" by
26 | * Krystyna Napierała, Jerzy Stefanowski and Szymon Wilk.
27 | *
28 | * @param data data to work with
29 | * @param seed seed to use. If it is not provided, it will use the system time
30 | * @param relabel relabeling option
31 | * @param ampl amplification option
32 | * @param k number of minority class nearest neighbors
33 | * @param dist object of Distance enumeration representing the distance to be used
34 | * @param normalize normalize the data or not
35 | * @param verbose choose to display information about the execution or not
36 | * @author David López Pretel
37 | */
38 | class Spider2(data: Data, seed: Long = System.currentTimeMillis(), relabel: String = "yes", ampl: String = "weak", k: Int = 5,
39 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
40 |
41 | /** Compute the Spider2 algorithm
42 | *
43 | * @return synthetic samples generated
44 | */
45 | def compute(): Data = {
46 | val initTime: Long = System.nanoTime()
47 |
48 | if (relabel != "no" && relabel != "yes") {
49 | throw new Exception("relabel must be yes or no.")
50 | }
51 |
52 | if (ampl != "weak" && ampl != "strong" && ampl != "no") {
53 | throw new Exception("amplification must be weak or strong or no.")
54 | }
55 |
56 | var minorityClassIndex: Array[Int] = minority(data.y)
57 | val minorityClass: Any = data.y(minorityClassIndex(0))
58 | var majorityClassIndex: Array[Int] = data.processedData.indices.diff(minorityClassIndex.toList).toArray
59 | val output: ArrayBuffer[Array[Double]] = ArrayBuffer()
60 | var resultClasses: Array[Any] = new Array[Any](0)
61 |
62 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
63 |
64 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
65 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
66 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
67 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
68 | } else {
69 | (null, null, null)
70 | }
71 |
72 | def flagged(c: Array[Int], f: Array[Boolean]): Array[Int] = {
73 | c.map(classes => {
74 | if (!f(classes)) Some(classes) else None
75 | }).filterNot(_.forall(_ == None)).map(_.get)
76 | }
77 |
78 | def amplify(x: Int, k: Int): Unit = {
79 | // compute the neighborhood for the majority and minority class
80 | val majNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
81 | kNeighbors(majorityClassIndex map output, output(x), k)
82 | } else {
83 | kNeighborsHVDM(majorityClassIndex map output, output(x), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
84 | }
85 |
86 | val minNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
87 | kNeighbors(minorityClassIndex map output, output(x), k)
88 | } else {
89 | kNeighborsHVDM(minorityClassIndex map output, output(x), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
90 | }
91 |
92 | // compute the number of copies to create
93 | val S: Int = Math.abs(majNeighbors.length - minNeighbors.length) + 1
94 | // need to know the size of the output to save the randomIndex of the elements inserted
95 | val outputSize: Int = output.length
96 | (0 until S).foreach(_ => {
97 | output ++= Traversable(output(x))
98 | })
99 | // add n copies to the output
100 | if (resultClasses(x) == minorityClass) {
101 | minorityClassIndex = minorityClassIndex ++ (outputSize until outputSize + S)
102 | } else {
103 | majorityClassIndex = majorityClassIndex ++ (outputSize until outputSize + S)
104 | }
105 | resultClasses = resultClasses ++ Array.fill(S)(resultClasses(x))
106 | }
107 |
108 | def correct(x: Int, k: Int, out: Boolean): Boolean = {
109 | // compute the neighbors
110 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
111 | kNeighbors(if (out) samples else output.toArray, if (out) samples(x) else output(x), k)
112 | } else {
113 | kNeighborsHVDM(if (out) samples else output.toArray, if (out) samples(x) else output(x), k,
114 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
115 | }
116 | val classes: scala.collection.mutable.Map[Any, Int] = scala.collection.mutable.Map()
117 | // compute the number of samples for each class in the neighborhood
118 | neighbors.foreach(neighbor => classes += data.y(neighbor) -> 0)
119 | neighbors.foreach(neighbor => classes(data.y(neighbor)) += 1)
120 |
121 | // if the majority class in neighborhood is the minority class return true
122 | if (classes.reduceLeft((x: (Any, Int), y: (Any, Int)) => if (x._2 > y._2) x else y)._1 == data.y(x))
123 | true
124 | else
125 | false
126 | }
127 |
128 | // array with the randomIndex of each sample
129 | var DS: Array[Int] = Array.range(0, samples.length)
130 | // at the beginning there are not safe samples
131 | var safeSamples: Array[Boolean] = Array.fill(samples.length)(false)
132 |
133 | // for each sample in majority class check if the neighbors has the same class
134 | majorityClassIndex.foreach(index => if (correct(index, k, out = true)) safeSamples(index) = true)
135 |
136 | // return a subset of samples that are not safe and belong to the majority class
137 | val RS: Array[Int] = flagged(majorityClassIndex, safeSamples)
138 | if (relabel == "yes") {
139 | //add the RS samples to the minority set
140 | minorityClassIndex = minorityClassIndex ++ RS
141 | resultClasses = data.y
142 | RS.foreach(resultClasses(_) = minorityClass)
143 | } else {
144 | // eliminate the samples from the initial set, first we recalculate the randomIndex for min and maj class
145 | var newIndex: Int = 0
146 | minorityClassIndex = minorityClassIndex.map(minor => {
147 | newIndex = minor
148 | RS.foreach(index => if (index < minor) newIndex -= 1)
149 | newIndex
150 | })
151 | majorityClassIndex = majorityClassIndex.map(major => {
152 | newIndex = major
153 | RS.foreach(index => if (index < major) newIndex -= 1)
154 | newIndex
155 | })
156 | DS = DS.diff(RS)
157 | safeSamples = DS map safeSamples
158 | resultClasses = DS map data.y
159 | }
160 |
161 | // the output is DS if ampl is not weak or strong
162 | output ++= (DS map samples)
163 |
164 | // if the neighbors of each sample in minority class belong to it, flag as safe
165 | minorityClassIndex.foreach(index => if (correct(index, k, out = false)) safeSamples(index) = true)
166 | if (ampl == "weak") {
167 | // for each sample returned by flagged amplify the data creating n copies (n calculated in amplify)
168 | flagged(minorityClassIndex, safeSamples).foreach(amplify(_, k))
169 | } else if (ampl == "strong") {
170 | // if the sample is correct amplify with k, else amplify with k + 2 (k is not n)
171 | flagged(minorityClassIndex, safeSamples).foreach(x => {
172 | if (correct(x, k + 2, out = false)) amplify(x, k) else amplify(x, k + 2)
173 | })
174 | }
175 |
176 | val finishTime: Long = System.nanoTime()
177 |
178 | if (verbose) {
179 | println("ORIGINAL SIZE: %d".format(data.x.length))
180 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
181 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
182 | }
183 |
184 | new Data(if (data.fileInfo.nominal.length == 0) {
185 | to2Decimals(if (normalize) zeroOneDenormalization(output.toArray, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output.toArray)
186 | } else {
187 | toNominal(if (normalize) zeroOneDenormalization(output.toArray, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output.toArray, data.nomToNum)
188 | }, resultClasses, None, data.fileInfo)
189 | }
190 | }
191 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/MWMOTE.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 |
23 | import scala.collection.mutable.ArrayBuffer
24 | import scala.util.Random
25 |
/** MWMOTE algorithm. Original paper: "MWMOTE—Majority Weighted Minority Oversampling Technique for Imbalanced Data Set
  * Learning" by Sukarna Barua, Md. Monirul Islam, Xin Yao, Fellow, IEEE, and Kazuyuki Murase.
  *
  * @param data      data to work with
  * @param seed      seed to use. If it is not provided, it will use the system time
  * @param N         number of synthetic samples to be generated
  * @param k1        number of neighbors used for predicting noisy minority class samples
  * @param k2        number of majority neighbors used for constructing informative minority set
  * @param k3        number of minority neighbors used for constructing informative minority set
  * @param dist      object of Distance enumeration representing the distance to be used
  * @param normalize normalize the data or not
  * @param verbose   choose to display information about the execution or not
  * @author David López Pretel
  */
class MWMOTE(data: Data, seed: Long = System.currentTimeMillis(), N: Int = 500, k1: Int = 5, k2: Int = 5, k3: Int = 5,
             dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {

  /** Compute the MWMOTE algorithm.
    *
    * Pipeline: filter noisy minority samples (Sminf), find borderline majority
    * samples (Sbmaj) and the informative minority set (Simin), weight each
    * informative sample, cluster the minority class, then interpolate N synthetic
    * samples inside clusters with probability proportional to the weights.
    *
    * @return synthetic samples generated
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()
    val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData

    // HVDM needs per-attribute statistics; only computed when that distance is selected
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    // clip value at cut (the paper's cut function)
    def f(value: Double, cut: Double): Double = {
      if (value < cut) value else cut
    }

    // closeness factor of minority sample x w.r.t. borderline majority sample y
    // (y = (sample index, position in Sbmaj)); zero when x is not in y's minority neighborhood
    def Cf(y: (Int, Int), x: Int, Nmin: Array[Array[Int]]): Double = {
      val cut: Double = 5 // values used in the paper
      val CMAX: Double = 2

      if (!Nmin(y._2).contains(x)) {
        val D: Double = if (dist == Distance.EUCLIDEAN) {
          euclidean(samples(y._1), samples(x))
        } else {
          HVDM(samples(y._1), samples(x), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
        }
        // dimensionality-normalized inverse distance, clipped at cut
        f(samples(0).length / D, cut) * CMAX
      } else
        0.0
    }

    // information weight: closeness factor times its density factor (share of the total
    // closeness of y over the whole informative minority set)
    def Iw(y: (Int, Int), x: Int, Nmin: Array[Array[Int]], Simin: Array[Int]): Double = {
      val cf = Cf(y, x, Nmin)
      val df = cf / Simin.map(Cf(y, _, Nmin)).sum
      cf + df
    }

    // distance between the centroids of two clusters
    def clusterDistance(cluster1: Array[Int], cluster2: Array[Int]): Double = {
      val centroid1: Array[Double] = (cluster1 map samples).transpose.map(_.sum / cluster1.length)
      val centroid2: Array[Double] = (cluster2 map samples).transpose.map(_.sum / cluster2.length)

      if (dist == Distance.EUCLIDEAN) {
        euclidean(centroid1, centroid2)
      } else {
        HVDM(centroid1, centroid2, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }
    }

    // closest pair of clusters: (index1, index2, distance)
    // NOTE(review): with fewer than two clusters no pair is examined and the
    // (0, 0, 99999999) sentinel is returned unchanged — confirm callers handle this
    def minDistance(cluster: ArrayBuffer[ArrayBuffer[Int]]): (Int, Int, Double) = {
      var minDist: (Int, Int, Double) = (0, 0, 99999999)
      var i, j: Int = 0
      while (i < cluster.length) {
        j = 0
        while (j < cluster.length) {
          if (i != j) {
            val dist = clusterDistance(cluster(i).toArray, cluster(j).toArray)
            if (dist < minDist._3) minDist = (i, j, dist)
          }
          j += 1
        }
        i += 1
      }
      minDist
    }

    // average-linkage style agglomerative clustering of Sminf: start with singleton
    // clusters and merge the closest pair until the minimum distance reaches threshold Th
    def cluster(Sminf: Array[Int]): Array[Array[Int]] = {
      val distances: Array[Array[Double]] = Array.fill(Sminf.length, Sminf.length)(9999999.0)
      var i, j: Int = 0
      while (i < Sminf.length) {
        j = 0
        while (j < Sminf.length) {
          if (i != j) {
            distances(i)(j) = if (dist == Distance.EUCLIDEAN) {
              euclidean(samples(Sminf(i)), samples(Sminf(j)))
            } else {
              HVDM(samples(Sminf(i)), samples(Sminf(j)), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
            }
          }
          j += 1
        }
        i += 1
      }

      val Cp: Double = 3 // used in paper
      // Th = mean nearest-neighbor distance scaled by Cp
      val Th: Double = distances.map(_.min).sum / Sminf.length * Cp
      var minDist: (Int, Int, Double) = (0, 0, 0.0)
      val clusters: ArrayBuffer[ArrayBuffer[Int]] = Sminf.map(ArrayBuffer(_)).to[ArrayBuffer]
      // NOTE(review): termination relies on minDistance's 99999999 sentinel exceeding Th
      // once a single cluster remains — confirm behavior for degenerate/tiny inputs
      while (minDist._3 < Th) {
        //compute the min distance between each cluster
        minDist = minDistance(clusters)
        //merge the two more proximal clusters
        clusters(minDist._1) ++= clusters(minDist._2)
        clusters -= clusters(minDist._2)
      }

      clusters.map(_.toArray).toArray
    }

    // compute minority class
    val minorityClassIndex: Array[Int] = minority(data.y)
    val minorityClass: Any = data.y(minorityClassIndex(0))
    // compute majority class
    val majorityClassIndex: Array[Int] = samples.indices.par.diff(minorityClassIndex.toList).toArray

    // construct the filtered minority set: keep minority samples that have at least
    // one minority sample among their k1 nearest neighbors (drops isolated/noisy ones)
    val Sminf: Array[Int] = minorityClassIndex.par.map(index => {
      val neighbors = if (dist == Distance.EUCLIDEAN) {
        kNeighbors(samples, index, k1)
      } else {
        kNeighborsHVDM(samples, index, k1, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }
      if (neighbors map data.y contains data.y(minorityClassIndex(0))) {
        Some(index)
      } else {
        None
      }
    }).filterNot(_.forall(_ == None)).map(_.get).toArray

    //for each sample in Sminf compute the nearest majority set (borderline majority samples)
    val Sbmaj: Array[Int] = Sminf.par.flatMap { x =>
      if (dist == Distance.EUCLIDEAN) {
        kNeighbors(majorityClassIndex map samples, samples(x), k2)
      } else {
        kNeighborsHVDM(majorityClassIndex map samples, samples(x), k2, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }
    }.distinct.par.map(majorityClassIndex(_)).toArray

    // for each majority example in Sbmaj , compute the nearest minority set
    val Nmin: Array[Array[Int]] = Sbmaj.par.map { x =>
      (if (dist == Distance.EUCLIDEAN) {
        kNeighbors(minorityClassIndex map samples, samples(x), k3)
      } else {
        kNeighborsHVDM(minorityClassIndex map samples, samples(x), k3, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
      }).par.map(minorityClassIndex(_)).toArray
    }.toArray

    // find the informative minority set (union of all Nmin)
    val Simin: Array[Int] = Nmin.par.flatten.distinct.toArray
    // for each sample in Simin compute the selection weight
    val Sw: Array[Double] = Simin.par.map(x => Sbmaj.zipWithIndex.par.map(y => Iw(y, x, Nmin, Simin)).sum).toArray
    val sumSw: Double = Sw.sum
    // convert each Sw into probability (sorted descending for selection by probability)
    val Sp: Array[(Double, Int)] = Sw.par.map(_ / sumSw).toArray.zip(Simin).sortWith(_._1 > _._1)

    // compute the clusters
    val clusters: Array[Array[Int]] = cluster(minorityClassIndex) // cluster => index to processedData
    val clustersIndex: Map[Int, Int] = clusters.zipWithIndex.flatMap(c => {
      clusters(c._2).map(index => (index, c._2))
    }).toMap // index to processedData => cluster

    //output data
    val output: Array[Array[Double]] = Array.ofDim(N, samples(0).length)

    val probsSum: Double = Sp.map(_._1).sum
    val r: Random = new Random(seed)

    (0 until N).par.foreach(i => {
      // select a sample by weight, then select another randomly from the cluster containing it
      val x = chooseByProb(Sp, probsSum, r)
      val y = clusters(clustersIndex(x))(r.nextInt(clusters(clustersIndex(x)).length))
      // interpolate each attribute of the synthetic sample between x and y
      samples(0).indices.foreach(atrib => {
        val diff: Double = samples(y)(atrib) - samples(x)(atrib)
        val gap: Float = r.nextFloat
        output(i)(atrib) = samples(x)(atrib) + gap * diff
      })
    })

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(data.x.length + output.length))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    // append the synthetic samples to the original data, denormalizing/restoring nominals as needed
    new Data(if (data.fileInfo.nominal.length == 0) {
      to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output))
    } else {
      toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output), data.nomToNum)
    }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
  }
}
230 |
--------------------------------------------------------------------------------
/src/main/scala/soul/io/Reader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.io
18 |
19 | import java.io.{BufferedReader, FileInputStream, InputStreamReader}
20 | import java.text.ParseException
21 |
22 | import soul.data.{Data, FileInfo}
23 | import soul.util.Utilities.processData
24 |
25 | import scala.collection.mutable
26 | import scala.collection.mutable.ArrayBuffer
27 |
/** Class to read data files
  *
  * @author Néstor Rodríguez Vico
  */
object Reader {
  /** Parse an arff file
    *
    * @param file        file containing the data
    * @param columnClass indicates which column represents the class in the file. If it's set to -1, it will take the last column
    * @return a data object containing all the relevant information
    */
  def readArff(file: String, columnClass: Int = -1): Data = {
    val reader: BufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))
    // ensure the file handle is released even when parsing throws (the original leaked it)
    try {
      var line: String = reader.readLine()
      var relationName: String = ""
      // index -> attributeName
      val attributes: mutable.Map[Int, String] = collection.mutable.Map[Int, String]()
      // attributeName -> type (if it's nominal, possible values instead of type)
      val attributesValues: mutable.Map[String, String] = collection.mutable.Map[String, String]()

      var dataDetected: Boolean = false
      var counter: Int = 0

      // header section: collect @relation and @attribute entries until @data is found
      while (line != null && !dataDetected) {
        // ignore comments/description lines
        if (line.isEmpty || line.startsWith("%")) {
          line = reader.readLine
        } else {
          // take care if the relation name has commas, tabs, multiple spaces...
          val parts: Array[String] = line.replaceAll("\t", " ").replaceAll("\\s{2,}", " ").split(" ", 3)
          if (parts(0).equalsIgnoreCase("@relation")) {
            // drop the identifier and group all the possible parts separated by a space
            relationName = parts.drop(1).mkString(" ")
          } else if (parts(0).equalsIgnoreCase("@attribute")) {
            attributes += (counter -> parts(1))
            attributesValues += (parts(1) -> parts(2))
            counter += 1
          } else if (parts(0).equalsIgnoreCase("@data")) {
            dataDetected = true
          }

          line = reader.readLine
        }
      }

      // reject indices that cannot address an attribute (also catches values below -1,
      // which the original accepted and let fail later)
      if (columnClass < -1 || columnClass >= attributes.size)
        throw new ParseException("Invalid response variable index: " + columnClass, columnClass)

      val response: Int = if (columnClass == -1) attributes.size - 1 else columnClass
      val readData: ArrayBuffer[Array[String]] = new ArrayBuffer[Array[String]](0)

      // Now we have the attributes, let's save the data
      while (line != null) {
        if (line.isEmpty || line.startsWith("%")) {
          line = reader.readLine
        } else {
          val parts: Array[String] = line.replaceAll("\t", " ").replaceAll("\\s{2,}", " ").split(",")
          // fast path: no quotation-mark merging needed (redundant no-op asInstanceOf removed)
          // NOTE(review): this compares against attributes.size + 1, so rows with exactly
          // attributes.size columns fall through to the quotation branch (which still
          // reassembles them unchanged) — confirm the intended fast-path condition
          if (parts.length == (attributes.size + 1)) {
            readData += parts
          } else {
            // if there are quotation marks, they are going to be in pairs
            val subParts: Array[Array[Int]] = parts.zipWithIndex.filter((x: (String, Int)) => x._1.contains("\"")).collect { case (_, a) => a }.grouped(2).toArray
            // separators indicates the index of the elements that need to be merged into one class
            val separators = new ArrayBuffer[Array[Int]](0)
            for (quotationMarks <- subParts)
              separators += (quotationMarks(0) to quotationMarks(1)).toArray

            val separatedValues: ArrayBuffer[String] = new ArrayBuffer[String]()
            // append all the parts into one value
            for (pair <- subParts)
              separatedValues += ((pair(0) to pair(1)).toArray map parts).mkString(",")

            val nonSeparatedValuesIndex: Array[Int] = parts.indices.diff(separators.flatten.toList).toArray
            val nonSeparatedValues: Array[String] = nonSeparatedValuesIndex map parts
            // append all the data
            val values: Array[String] = (separatedValues ++ nonSeparatedValues).toArray
            // make an index array merging all the index: take care with the separatedValuesIndex because there are more than one
            // index for each value, so we compute the mean for all the numbers associated to one value
            val index: Array[Double] = separators.map((a: Array[Int]) => a.sum.toDouble / a.length).toArray ++ nonSeparatedValuesIndex.map(_.toDouble)
            // finally, construct an array to sort the values
            val indexForMap: Array[Int] = index.zipWithIndex.sortBy((pair: (Double, Int)) => pair._1).map((pair: (Double, Int)) => pair._2)
            // get the final values
            val finalValues: Array[String] = indexForMap map values
            if (finalValues.length != attributes.size)
              throw new ParseException("%d columns, expected %d".format(finalValues.length, attributes.size), finalValues.length)

            readData += finalValues
          }
          line = reader.readLine
        }
      }

      val finalData: ArrayBuffer[Array[Any]] = new ArrayBuffer[Array[Any]](0)
      val readClasses: ArrayBuffer[Any] = new ArrayBuffer[Any](0)
      val readNominal: ArrayBuffer[Int] = new ArrayBuffer[Int](0)

      // split each row into class value, numeric attributes and nominal attributes
      for (row <- readData) {
        val r = new ArrayBuffer[Any](0)
        for (e <- row.zipWithIndex) {
          if (e._2 == response)
            readClasses += e._1
          else if (e._1.matches("-?\\d+(\\.\\d+)?"))
            r += e._1.toDouble
          else {
            if (e._1 == "?" || e._1 == "'?'")
              r += "soul_NA"
            else {
              r += e._1
              // attribute indices are shifted because the class column is removed
              readNominal += (if (e._2 >= response) e._2 - 1 else e._2)
            }
          }
        }

        finalData += r.toArray
      }

      val fileInfo = new FileInfo(_file = file, _comment = "%", _columnClass = response, _delimiter = null, _missing = "?", _header = null,
        _relationName = relationName, _attributes = attributes, _attributesValues = attributesValues, nominal = readNominal.distinct.toArray)
      val data: Data = new Data(x = finalData.toArray, y = readClasses.toArray, fileInfo = fileInfo)
      val (processedData, nomToNum) = processData(data)
      data.processedData = processedData
      data.nomToNum = nomToNum
      data
    } finally {
      reader.close()
    }
  }

  /** Parse a delimited text data file
    *
    * @param file        file containing the data
    * @param comment     string indicating that a line is a comment
    * @param delimiter   string separating two elements
    * @param missing     string indicating a element is missed
    * @param header      indicates if the file contains a header or not
    * @param columnClass indicates which column represents the class in the file. If it's set to -1, it will take the last column
    * @return a data object containing all the relevant information
    */
  def readDelimitedText(file: String, comment: String = "#", delimiter: String = ",", missing: String = "?", header: Boolean = true, columnClass: Int = -1): Data = {
    val reader: BufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))
    // ensure the file handle is released even when parsing throws (the original leaked it)
    try {
      reader.mark(100)
      val firstLine: String = reader.readLine
      // number of columns expected on every row (computed once instead of re-splitting firstLine)
      val numColumns: Int = firstLine.split(delimiter).length
      if (columnClass < -1 || columnClass >= numColumns) throw new ParseException("Invalid response variable index: " + columnClass, columnClass)
      val response: Int = if (columnClass == -1) numColumns - 1 else columnClass
      reader.reset()

      val headerArray: Array[String] = if (header) reader.readLine.split(delimiter) else null
      var line: String = reader.readLine
      val readData: ArrayBuffer[Array[Any]] = new ArrayBuffer[Array[Any]](0)
      val readClasses: ArrayBuffer[Any] = new ArrayBuffer[Any](0)
      val readNominal: ArrayBuffer[Int] = new ArrayBuffer[Int](0)

      while (line != null) {
        if (line.isEmpty || line.startsWith(comment)) {
          line = reader.readLine
        } else {
          val elements: Array[String] = line.split(delimiter)

          // report the expected COLUMN count (the original message printed firstLine's
          // character count instead of its column count)
          if (elements.length != numColumns)
            throw new ParseException("%d columns, expected %d".format(elements.length, numColumns), elements.length)

          val row = new ArrayBuffer[Any](0)
          for (e <- elements.zipWithIndex) {
            if (e._2 == response)
              readClasses += e._1
            else if (e._1.replaceAll("\\s", "").matches("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"))
              row += e._1.replaceAll("\\s", "").toDouble
            else {
              if (e._1 == missing)
                row += "soul_NA"
              else {
                row += e._1
                // attribute indices are shifted because the class column is removed
                readNominal += (if (e._2 >= response) e._2 - 1 else e._2)
              }
            }
          }

          readData += row.toArray
          line = reader.readLine
        }
      }

      val attributesValues: mutable.Map[String, String] = collection.mutable.Map[String, String]()
      attributesValues += ("Class" -> readClasses.distinct.mkString(","))

      val fileInfo = new FileInfo(_file = file, _comment = "%", _columnClass = response, _delimiter = delimiter, _missing = missing, _header = headerArray, _relationName = null,
        _attributes = null, _attributesValues = attributesValues, nominal = readNominal.distinct.toArray)
      val data: Data = new Data(x = readData.toArray, y = readClasses.toArray, fileInfo = fileInfo)
      val (processedData, nomToNum) = processData(data)
      data.processedData = processedData
      data.nomToNum = nomToNum
      data
    } finally {
      reader.close()
    }
  }
}
220 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/EUS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 |
23 | import scala.collection.mutable.ArrayBuffer
24 | import scala.math.{abs, sqrt}
25 |
26 | /** Evolutionary Under Sampling. Original paper: "Evolutionary Under-Sampling for Classification with Imbalanced Data
27 | * Sets: Proposals and Taxonomy" by Salvador Garcia and Francisco Herrera.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param populationSize number of chromosomes to generate
32 | * @param maxEvaluations number of evaluations
33 | * @param algorithm version of core to execute. One of: EBUSGSGM, EBUSMSGM, EBUSGSAUC, EBUSMSAUC,
34 | * EUSCMGSGM, EUSCMMSGM, EUSCMGSAUC or EUSCMMSAUC
35 | * @param dist object of Distance enumeration representing the distance to be used
36 | * @param probHUX probability of changing a gen from 0 to 1 (used in crossover)
37 | * @param recombination recombination threshold (used in reinitialization)
38 | * @param prob0to1 probability of changing a gen from 0 to 1 (used in reinitialization)
39 | * @param normalize normalize the data or not
40 | * @param randomData iterate through the data randomly or not
41 | * @param verbose choose to display information about the execution or not
42 | * @author Néstor Rodríguez Vico
43 | */
class EUS(data: Data, seed: Long = System.currentTimeMillis(), populationSize: Int = 50, maxEvaluations: Int = 1000,
          algorithm: String = "EBUSMSGM", dist: Distance = Distance.EUCLIDEAN, probHUX: Double = 0.25,
          recombination: Double = 0.35, prob0to1: Double = 0.05, normalize: Boolean = false, randomData: Boolean = false,
          verbose: Boolean = false) {

  /** Compute the EUS algorithm: a CHC-style evolutionary search over binary chromosomes,
    * where each chromosome encodes which training instances are kept.
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    // Class frequencies; the least frequent class is "untouchable" (never removed).
    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)
    var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val classesToWorkWith: Array[Any] = if (randomData) {
      val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
      dataToWorkWith = (randomIndex map dataToWorkWith).toArray
      (randomIndex map data.y).toArray
    } else {
      data.y
    }

    // Per-attribute statistics, needed only by the HVDM distance.
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    // "MS" variants force every minority instance to stay selected in every chromosome.
    val majoritySelection: Boolean = algorithm.contains("MS")
    val targetInstances: Array[Int] = classesToWorkWith.indices.toArray
    val minorityElements: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (c, i) if c == untouchableClass => i }

    /** Fitness of a chromosome: 1-NN performance of the selected subset over all instances,
      * measured by AUC or the geometric mean of TPR/TNR, optionally penalised by how
      * unbalanced the selection is.
      *
      * @param instance binary chromosome (1 = instance selected)
      * @return fitness value; throws if `algorithm` is not a recognised variant
      */
    def fitnessFunction(instance: Array[Int]): Double = {
      val index: Array[Int] = zeroOneToIndex(instance) map targetInstances
      val neighbours: Array[Array[Double]] = index map dataToWorkWith
      val classes: Array[Any] = index map classesToWorkWith
      val predicted: Array[Any] = dataToWorkWith.indices.map { e: Int =>
        if (dist == Distance.EUCLIDEAN) {
          nnRule(neighbours, dataToWorkWith(e), index.indexOf(e), classes, 1, "nearest")._1
        } else {
          nnRuleHVDM(neighbours, dataToWorkWith(e), index.indexOf(e), classes, 1, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")._1
        }
      }.toArray

      val matrix: (Int, Int, Int, Int) = confusionMatrix(originalLabels = index map classesToWorkWith,
        predictedLabels = predicted, minorityClass = untouchableClass)

      val tp: Int = matrix._1
      val fp: Int = matrix._2
      val fn: Int = matrix._3
      val tn: Int = matrix._4

      val nPositives: Int = (index map classesToWorkWith).count(_ == untouchableClass)
      val nNegatives: Int = (index map classesToWorkWith).length - nPositives

      // The small epsilon avoids division by zero when a class is absent from the selection.
      val tpr: Double = tp / ((tp + fn) + 0.00000001)
      val fpr: Double = fp / ((fp + tn) + 0.00000001)
      val auc: Double = (1.0 + tpr - fpr) / 2.0
      val tnr: Double = tn / ((tn + fp) + 0.00000001)
      val g: Double = sqrt(tpr * tnr)

      val fitness: Double = if (algorithm == "EBUSGSGM") {
        g - abs(1 - (nPositives.toFloat / nNegatives)) * 20
      } else if (algorithm == "EBUSMSGM") {
        g - abs(1 - (counter(untouchableClass).toFloat / nNegatives)) * 20
      } else if (algorithm == "EUSCMGSGM") {
        g
      } else if (algorithm == "EUSCMMSGM") {
        g
      } else if (algorithm == "EBUSGSAUC") {
        auc - abs(1 - (nPositives.toFloat / nNegatives)) * 0.2
      } else if (algorithm == "EBUSMSAUC") {
        auc - abs(1 - (counter(untouchableClass).toFloat / nNegatives)) * 0.2
      } else if (algorithm == "EUSCMGSAUC") {
        auc
      } else if (algorithm == "EUSCMMSAUC") {
        auc
      } else {
        Double.NaN
      }

      if (fitness.isNaN)
        throw new Exception("Invalid argument: core should be: EBUSGSGM, EBUSMSGM, EBUSGSAUC, EBUSMSAUC, EUSCMGSGM, " +
          "EUSCMMSGM, EUSCMGSAUC or EUSCMMSAUC")

      fitness
    }

    // Random initial population of binary chromosomes.
    val population: Array[Array[Int]] = new Array[Array[Int]](populationSize)
    (0 until populationSize).foreach { i: Int =>
      val individual: Array[Int] = targetInstances.indices.map(_ => random.nextInt(2)).toArray
      if (majoritySelection) {
        minorityElements.foreach((m: Int) => individual(m) = 1)
      }
      population(i) = individual
    }

    val evaluations: Array[Double] = new Array[Double](population.length)
    population.zipWithIndex.foreach { chromosome: (Array[Int], Int) =>
      evaluations(chromosome._2) = fitnessFunction(chromosome._1)
    }

    // CHC incest-prevention threshold: parents closer than this are not crossed.
    var incestThreshold: Int = targetInstances.length / 4
    var actualEvaluations: Int = populationSize

    while (actualEvaluations < maxEvaluations) {
      val randomPopulation: Array[Array[Int]] = random.shuffle(population.indices.toList).toArray map population
      val newPopulation: ArrayBuffer[Array[Int]] = new ArrayBuffer[Array[Int]](0)

      // Pair consecutive parents. We stop before the last element so that an odd-sized
      // population cannot make randomPopulation(i + 1) index out of bounds.
      (0 until (randomPopulation.length - 1) by 2).foreach { i: Int =>
        val hammingDistance: Int = (randomPopulation(i) zip randomPopulation(i + 1)).count((pair: (Int, Int)) => pair._1 != pair._2)

        if ((hammingDistance / 2) > incestThreshold) {
          val desc1: Array[Int] = randomPopulation(i).clone
          val desc2: Array[Int] = randomPopulation(i + 1).clone

          // HUX-style crossover: roughly half of the differing genes are perturbed.
          desc1.indices.foreach { j: Int =>
            if (desc1(j) != desc2(j) && random.nextFloat < 0.5) {
              desc1(j) = if (desc1(j) == 1) 0 else if (random.nextFloat < probHUX) 1 else desc1(j)
              desc2(j) = if (desc2(j) == 1) 0 else if (random.nextFloat < probHUX) 1 else desc2(j)

              if (majoritySelection) {
                minorityElements.foreach((m: Int) => desc1(m) = 1)
                minorityElements.foreach((m: Int) => desc2(m) = 1)
              }
            }
          }

          newPopulation += desc1
          newPopulation += desc2
        }
      }

      val newEvaluations: Array[Double] = new Array[Double](newPopulation.length)
      newPopulation.zipWithIndex.foreach { chromosome: (Array[Int], Int) =>
        newEvaluations(chromosome._2) = fitnessFunction(chromosome._1)
      }

      actualEvaluations += newPopulation.length

      // We order the population. The best ones (greater evaluation value) are the first
      val populationOrder: Array[(Double, Int, String)] = evaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).map((e: (Double, Int)) => (e._1, e._2, "OLD"))
      val newPopulationOrder: Array[(Double, Int, String)] = newEvaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).map((e: (Double, Int)) => (e._1, e._2, "NEW"))

      // If no offspring beats the worst current individual, tighten the incest threshold.
      if (newPopulationOrder.length == 0 || populationOrder.last._1 > newPopulationOrder.head._1) {
        incestThreshold -= 1
      } else {
        // Elitist replacement: keep the best populationSize chromosomes of parents + offspring.
        val finalOrder: Array[(Double, Int, String)] = (populationOrder ++ newPopulationOrder).sortBy(_._1)(Ordering[Double].reverse).take(populationSize)

        finalOrder.zipWithIndex.foreach { e: ((Double, Int, String), Int) =>
          population(e._2) = if (e._1._3 == "OLD") population(e._1._2) else newPopulation(e._1._2)
          evaluations(e._2) = if (e._1._3 == "OLD") evaluations(e._1._2) else newEvaluations(e._1._2)
        }
      }

      if (incestThreshold <= 0) {
        // Cataclysmic restart: keep the best chromosome (population(0)) intact and rebuild
        // every other individual as a copy of it in which each gene is replaced, with
        // probability `recombination`, by a fresh random bit.
        // Bug fix: the previous code read the template with the *individual* index
        // (population(0)(i)) instead of the gene index, so every kept gene of individual i
        // collapsed to the single value population(0)(i).
        population.indices.tail.foreach { i: Int =>
          val individual: Array[Int] = population(0).map { gene: Int =>
            if (random.nextFloat < recombination) {
              if (random.nextFloat < prob0to1) 1 else 0
            } else {
              gene
            }
          }

          if (majoritySelection) {
            minorityElements.foreach((m: Int) => individual(m) = 1)
          }

          population(i) = individual
        }

        // Re-evaluate every rebuilt individual (the best one at index 0 keeps its fitness).
        population.zipWithIndex.tail.par.foreach { e: (Array[Int], Int) =>
          evaluations(e._2) = fitnessFunction(e._1)
        }

        actualEvaluations += (population.length - 1)

        incestThreshold = (recombination * (1.0 - recombination) * targetInstances.length.toFloat).toInt
      }
    }

    // The best chromosome found defines the final selection of instances.
    val bestChromosome: Array[Int] = population(evaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).head._2)
    val finalIndex: Array[Int] = zeroOneToIndex(bestChromosome) map targetInstances
    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
  }
}
242 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/DBSMOTE.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.oversampling
18 |
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 |
23 | import scala.collection.mutable.ArrayBuffer
24 | import scala.util.Random
25 |
26 | /** DBSMOTE algorithm. Original paper: "DBSMOTE: Density-Based Synthetic Minority Over-sampling Technique" by
27 | * Chumphol Bunkhumpornpat, Krung Sinapiromsaran and Chidchanok Lursinsap.
28 | *
29 | * @param data data to work with
30 | * @param eps epsilon to indicate the distance that must be between two points
31 | * @param k number of neighbors
32 | * @param dist object of Distance enumeration representing the distance to be used
33 | * @param seed seed to use. If it is not provided, a fixed default seed (5) is used
34 | * @param normalize normalize the data or not
35 | * @param verbose choose to display information about the execution or not
36 | * @author David López Pretel
37 | */
class DBSMOTE(data: Data, eps: Double = -1, k: Int = 5, dist: Distance = Distance.EUCLIDEAN,
              seed: Long = 5, normalize: Boolean = false, verbose: Boolean = false) {

  /** Compute the DBSMOTE algorithm: cluster the minority class with DBSCAN, build a
    * connectivity graph for each cluster and generate synthetic samples along the shortest
    * path between each minority sample and the pseudo-centroid of its cluster.
    *
    * @return data structure with the original data plus the synthetic samples generated
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()
    val minorityClassIndex: Array[Int] = minority(data.y)
    val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData

    // Per-attribute statistics, needed only by the HVDM distance.
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    /** Indices (positions inside minorityClassIndex) of all minority samples within `eps`
      * of `point`. The point itself is always included (distance 0 <= eps). */
    def regionQuery(point: Int, eps: Double): Array[Int] = {
      minorityClassIndex.indices.flatMap { sample =>
        val d: Double = if (dist == Distance.EUCLIDEAN) {
          euclidean(samples(minorityClassIndex(point)), samples(minorityClassIndex(sample)))
        } else {
          HVDM(samples(minorityClassIndex(point)), samples(minorityClassIndex(sample)), data.fileInfo.nominal, sds,
            attrCounter, attrClassesCounter)
        }
        if (d <= eps) Some(sample) else None
      }.toArray
    }

    /** DBSCAN cluster expansion starting at `point`. Marks reachable points with `clusterId`
      * in `clusterIds` (-1 = unclassified, -2 = noise).
      *
      * @return false when `point` is noise (too few neighbours), true otherwise
      */
    def expandCluster(point: Int, clusterId: Int, clusterIds: Array[Int], eps: Double, minPts: Int): Boolean = {
      val neighbors: ArrayBuffer[Int] = ArrayBuffer(regionQuery(point, eps): _*)
      if (neighbors.length < minPts) {
        clusterIds(point) = -2 // noise
        false
      } else {
        neighbors.foreach(clusterIds(_) = clusterId)
        clusterIds(point) = clusterId

        // Bug fixes vs. the previous version:
        //  * the region query must start from the actual neighbour point, neighbors(current),
        //    not from the buffer position `current`;
        //  * a `for (c <- 0 until n)` range is fixed when created, so growing the seed set
        //    was ignored; a while loop re-checks the buffer length each iteration.
        var current: Int = 0
        while (current < neighbors.length) {
          val neighborsOfCurrent: Array[Int] = regionQuery(neighbors(current), eps)
          if (neighborsOfCurrent.length >= minPts) {
            neighborsOfCurrent.foreach { neighbor =>
              if (clusterIds(neighbor) == -1 || clusterIds(neighbor) == -2) { // noise or unclassified
                if (clusterIds(neighbor) == -1) { // unclassified: expand from it later too
                  neighbors += neighbor
                }
                clusterIds(neighbor) = clusterId
              }
            }
          }
          current += 1
        }

        true
      }
    }

    /** Run DBSCAN over the minority samples.
      *
      * @return clusters as arrays of positions inside minorityClassIndex; if no cluster was
      *         found, a single cluster containing all minority samples
      */
    def dbscan(eps: Double, minPts: Int): Array[Array[Int]] = {
      var clusterId: Int = 0
      val clusterIds: Array[Int] = Array.fill(minorityClassIndex.length)(-1)
      minorityClassIndex.indices.foreach(point => {
        if (clusterIds(point) == -1) {
          if (expandCluster(point, clusterId, clusterIds, eps, minPts)) {
            clusterId += 1
          }
        }
      })

      if (clusterId != 0) {
        val clusters: Array[Array[Int]] = Array.fill(clusterId)(Array())
        (0 until clusterId).foreach(i => {
          clusters(i) = clusterIds.zipWithIndex.filter(_._1 == i).map(_._2)
        })
        clusters
      } else { // the cluster is all the data
        Array(Array.range(0, minorityClassIndex.length))
      }
    }

    /** Build the adjacency matrix of a cluster: nodes are connected when their distance is
      * in (0, eps] and, for clusters larger than minPts, the target node has at least minPts
      * neighbours within eps. */
    def buildGraph(cluster: Array[Int], eps: Double, minPts: Int): Array[Array[Boolean]] = {
      val graph: Array[Array[Boolean]] = Array.fill(cluster.length, cluster.length)(false)
      // distance between each pair of nodes
      val distances: Array[Array[Double]] = cluster.map { i =>
        cluster.map { j =>
          if (dist == Distance.EUCLIDEAN) {
            euclidean(samples(minorityClassIndex(i)), samples(minorityClassIndex(j)))
          } else {
            HVDM(samples(minorityClassIndex(i)), samples(minorityClassIndex(j)), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
          }

        }
      }

      // number of nodes connected to another which satisfied distance(a,b) <= eps
      val NNq: Array[Int] = distances.map(row => row.map(dist => if (dist <= eps) 1 else 0)).map(_.sum)

      // build the graph
      cluster.indices.foreach(i => {
        if (cluster.length >= minPts + 1) {
          distances(i).zipWithIndex.foreach(dist => {
            if (dist._1 <= eps && dist._1 > 0 && NNq(dist._2) >= minPts) {
              graph(i)(dist._2) = true
            }
          })
        } else {
          distances(i).zipWithIndex.foreach(dist => {
            if (dist._1 <= eps && dist._1 > 0) {
              graph(i)(dist._2) = true
            }
          })
        }
      })
      graph
    }

    /** Dijkstra shortest path from `source` to `target` over `graph` (nodes are positions in
      * `cluster`). Returns the path from target back to source, inclusive.
      *
      * NOTE(review): the early `return` inside the lambda is a nonlocal return; kept as-is
      * because the surrounding accumulation depends on it. Throws when target is unreachable. */
    def dijsktra(graph: Array[Array[Boolean]], source: Int, target: Int, cluster: Array[Int]): Array[Int] = {
      // distance from source to node, prev node, node visited or not
      val nodeInfo: Array[(Double, Int, Boolean)] = Array.fill(graph.length)((9999999, -1, false))
      nodeInfo(source) = (0.0, source, false)

      // Pick the unvisited node with the smallest tentative distance.
      val findMin = (x: ((Double, Int, Boolean), Int), y: ((Double, Int, Boolean), Int)) =>
        if ((x._1._1 < y._1._1 && !x._1._3) || (!x._1._3 && y._1._3)) x else y

      nodeInfo.indices.foreach(_ => {
        val u: Int = nodeInfo.zipWithIndex.reduceLeft(findMin)._2 //vertex with min distance
        nodeInfo(u) = (nodeInfo(u)._1, nodeInfo(u)._2, true)
        if (u == target) { // return shortest path
          val shortestPath: ArrayBuffer[Int] = ArrayBuffer()
          var current = target
          while (current != source) {
            shortestPath += current
            current = nodeInfo(current)._2
          }
          shortestPath += current
          return shortestPath.toArray
        }
        // Relax every unvisited neighbour of u.
        graph(u).indices.foreach(v => {
          if (graph(u)(v) && !nodeInfo(v)._3) {
            val d: Double = if (dist == Distance.EUCLIDEAN) {
              euclidean(samples(minorityClassIndex(cluster(u))),
                samples(minorityClassIndex(cluster(v))))
            } else {
              HVDM(samples(minorityClassIndex(cluster(u))), samples(minorityClassIndex(cluster(v))), data.fileInfo.nominal,
                sds, attrCounter, attrClassesCounter)
            }
            val alt = nodeInfo(u)._1 + d
            if (alt < nodeInfo(v)._1) nodeInfo(v) = (alt, u, nodeInfo(v)._3)
          }
        })
      })

      throw new Exception("Path not found")
    }

    val minorityClass: Any = data.y(minorityClassIndex(0))
    // If the user did not provide eps, use the mean pairwise distance of the whole dataset.
    var eps2 = eps
    if (eps == -1) {
      eps2 = samples.map { i =>
        samples.map { j =>
          if (dist == Distance.EUCLIDEAN) {
            euclidean(i, j)
          } else {
            HVDM(i, j, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
          }
        }.sum
      }.sum / (samples.length * samples.length)
    }

    // compute the clusters using dbscan
    val clusters: Array[Array[Int]] = dbscan(eps2, k)

    // the output of the algorithm: one synthetic sample per minority instance in a cluster
    val output: Array[Array[Double]] = Array.fill(clusters.map(_.length).sum, samples(0).length)(0)

    // for each cluster
    clusters.foreach(c => {
      // build a graph with the data of each cluster
      val graph: Array[Array[Boolean]] = buildGraph(c, eps2, k)
      val r: Random.type = scala.util.Random
      r.setSeed(seed)
      var newIndex: Int = 0
      // compute pseudo-centroid, centroid is the mean of the cluster
      val centroid = (c map samples).transpose.map(_.sum / c.length)
      var pseudoCentroid: (Int, Double) = (0, 99999999.0)
      // the pseudo-centroid is the sample that is closest to the centroid
      (c map samples).zipWithIndex.foreach(sample => {
        val d: Double = if (dist == Distance.EUCLIDEAN) {
          euclidean(sample._1, centroid)
        } else {
          HVDM(sample._1, centroid, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
        }
        if (d < pseudoCentroid._2) pseudoCentroid = (sample._2, d)
      })

      c.indices.foreach(p => {
        // compute the shortest path between the pseudo centroid and the samples in each cluster
        val shortestPath: Array[Int] = dijsktra(graph, p, pseudoCentroid._1, c)
        // a random sample in the path
        val e = r.nextInt(shortestPath.length)
        // get the nodes connected by e, then only the two first will be used
        // NOTE(review): assumes the chosen path node has at least two neighbours in the
        // graph; otherwise v1_v2(1) would throw — confirm against the paper's edge case.
        val v1_v2: Array[(Boolean, Int)] = graph(shortestPath(e)).zipWithIndex.filter(_._1 == true)
        samples(0).indices.foreach(attrib => {
          // v1(attrib) - v2(attrib)
          val dif: Double = samples(minorityClassIndex(c(v1_v2(1)._2)))(attrib) - samples(minorityClassIndex(c(v1_v2(0)._2)))(attrib)
          val gap: Double = r.nextFloat()
          // v1(attrib) + gap * dif
          output(newIndex)(attrib) = samples(minorityClassIndex(c(v1_v2(0)._2)))(attrib) + gap * dif
        })
        newIndex += 1
      })
    })

    val finishTime: Long = System.nanoTime()

    if (verbose) {
      println("ORIGINAL SIZE: %d".format(data.x.length))
      println("NEW DATA SIZE: %d".format(data.x.length + output.length))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    // Denormalize the synthetic samples if needed and convert back to nominal attributes.
    new Data(if (data.fileInfo.nominal.length == 0) {
      to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else output))
    } else {
      toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
        data.fileInfo.minAttribs) else output), data.nomToNum)
    }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
  }
}
276 |
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/SBC.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | package soul.algorithm.undersampling
18 |
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 |
24 | import scala.math.{max, min}
25 |
26 | /** Undersampling Based on Clustering. Original paper: "Under-Sampling Approaches for Improving Prediction of the
27 | * Minority Class in an Imbalanced Dataset" by Show-Jane Yen and Yue-Shi Lee.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param method selection method to apply. Possible options: random, NearMiss1, NearMiss2, NearMiss3, MostDistant and MostFar
32 | * @param m ratio used in the SSize calculation
33 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours)
34 | * @param numClusters number of clusters to be created by KMeans core
35 | * @param restarts number of times to relaunch KMeans core
36 | * @param minDispersion stop KMeans core if dispersion is lower than this value
37 | * @param maxIterations number of iterations to be done in KMeans core
38 | * @param dist object of Distance enumeration representing the distance to be used
39 | * @param normalize normalize the data or not
40 | * @param randomData iterate through the data randomly or not
41 | * @param verbose choose to display information about the execution or not
42 | * @author Néstor Rodríguez Vico
43 | */
class SBC(data: Data, seed: Long = System.currentTimeMillis(), method: String = "NearMiss1", m: Double = 1.0, k: Int = 3, numClusters: Int = 50,
          restarts: Int = 1, minDispersion: Double = 0.0001, maxIterations: Int = 200, val dist: Distance = Distance.EUCLIDEAN,
          normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {

  /** Compute the SBC algorithm: cluster the data with KMeans, decide how many majority
    * samples to keep per cluster (SSize), then select them with the requested method.
    *
    * @return undersampled data structure
    */
  def compute(): Data = {
    val initTime: Long = System.nanoTime()

    // Class frequencies; the least frequent class is "untouchable" (always kept in full).
    val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
    val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
    val random: scala.util.Random = new scala.util.Random(seed)
    var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
    val classesToWorkWith: Array[Any] = if (randomData) {
      val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
      dataToWorkWith = (randomIndex map dataToWorkWith).toArray
      (randomIndex map data.y).toArray
    } else {
      data.y
    }

    // Per-attribute statistics, needed only by the HVDM distance.
    val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
      (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
        dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
        dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
    } else {
      (null, null, null)
    }

    // Cluster the data and count minority/majority elements per cluster.
    val (_, centroids, assignment) = kMeans(dataToWorkWith, data.fileInfo.nominal, numClusters, restarts, minDispersion, maxIterations, seed)
    val minMajElements: List[(Int, Int)] = (0 until numClusters).toList.map { cluster: Int =>
      val elements = assignment(cluster)
      val minElements: Int = (elements map classesToWorkWith).count((c: Any) => c == untouchableClass)
      (minElements, elements.length - minElements)
    }
    // Total number of majority samples and the normalisation factor for SSize.
    val nPos: Double = minMajElements.unzip._2.sum.toDouble
    val sizeK: Double = minMajElements.map((pair: (Int, Int)) => pair._2.toDouble / max(pair._1, 1)).sum
    val sSizes: Array[(Int, Int)] = assignment.map { element: (Int, Array[Int]) =>
      val ratio: (Int, Int) = minMajElements(element._1)
      // The min is to prevent infinity values if no minority elements are added to the cluster
      (element._1, min(m * nPos * ((ratio._2.toDouble / (ratio._1 + 1)) / sizeK), ratio._2).toInt)
    }.toArray
    val minorityElements: Array[Int] = assignment.flatMap((element: (Int, Array[Int])) => element._2.filter((index: Int) =>
      classesToWorkWith(index) == untouchableClass)).toArray

    val majorityElements: Array[Int] = if (method.equals("random")) {
      sSizes.filter(_._2 != 0).flatMap { clusterIdSize: (Int, Int) =>
        random.shuffle(assignment(clusterIdSize._1).toList).filter((e: Int) =>
          classesToWorkWith(e) != untouchableClass).take(clusterIdSize._2)
      }
    } else {
      sSizes.filter(_._2 != 0).flatMap { clusteridSize: (Int, Int) =>
        val majorityElementsIndex: Array[(Int, Int)] = assignment(clusteridSize._1).zipWithIndex.filter((e: (Int, Int)) =>
          classesToWorkWith(e._1) != untouchableClass)

        // If no minority class elements are assigned to the cluster
        if (majorityElementsIndex.length == assignment(clusteridSize._1).length) {
          // Use the centroid as "minority class" element
          val distances: Array[Double] = assignment(clusteridSize._1).map { instance: Int =>
            euclidean(dataToWorkWith(instance), centroids(clusteridSize._1))
          }

          // Bug fix: sort by the distance to the centroid (_._1), not by the index (_._2);
          // the previous code ordered by index, making the take(...) selection arbitrary.
          distances.zipWithIndex.sortBy(_._1).take(clusteridSize._2).map(_._2) map assignment(clusteridSize._1)
        } else {
          val minorityElementsIndex: Array[(Int, Int)] = assignment(clusteridSize._1).zipWithIndex.filter((e: (Int, Int)) =>
            classesToWorkWith(e._1) == untouchableClass)

          // Neighbour pools for the NearMiss-style selection methods.
          val minNeighbours: Array[Array[Double]] = minorityElementsIndex.unzip._2 map dataToWorkWith
          val majNeighbours: Array[Array[Double]] = majorityElementsIndex.unzip._2 map dataToWorkWith
          val minClasses: Array[Any] = minorityElementsIndex.unzip._2 map classesToWorkWith
          val majClasses: Array[Any] = majorityElementsIndex.unzip._2 map classesToWorkWith

          // KD-trees only apply to the Euclidean distance; HVDM falls back to nnRuleHVDM.
          val minorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
            Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length))
          } else {
            None
          }

          val majorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
            Some(new KDTree(majNeighbours, majClasses, dataToWorkWith(0).length))
          } else {
            None
          }

          val reverseKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
            Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length, which = "farthest"))
          } else {
            None
          }

          if (method.equals("NearMiss1")) {
            // selects the majority class samples whose average distances to k nearest minority class samples in the ith cluster are the smallest.
            val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) =>
              if (dist == Distance.EUCLIDEAN) {
                val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3
                (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length)
              } else {
                val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k,
                  data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")
                (i._1, (result._2 map result._3).sum / result._2.length)
              }
            }
            meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1)
          } else if (method.equals("NearMiss2")) {
            // selects the majority class samples whose average distances to k farthest minority class samples in the ith cluster are the smallest.
            val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) =>
              if (dist == Distance.EUCLIDEAN) {
                val index = reverseKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3
                (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length)
              } else {
                val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k,
                  data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "farthest")
                (i._1, (result._2 map result._3).sum / result._2.length)
              }
            }
            meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1)
          } else if (method.equals("NearMiss3")) {
            // selects the majority class samples whose average distances to the closest minority class samples in the ith cluster are the smallest.
            val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) =>
              if (dist == Distance.EUCLIDEAN) {
                val index = majorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3
                (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length)
              } else {
                val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(majNeighbours, dataToWorkWith(i._1), -1, majClasses, k,
                  data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")
                (i._1, (result._2 map result._3).sum / result._2.length)
              }
            }
            meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1)
          } else if (method.equals("MostDistant")) {
            // selects the majority class samples whose average distances to M closest minority class samples in the ith cluster are the farthest.
            val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) =>
              if (dist == Distance.EUCLIDEAN) {
                val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3
                (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length)
              } else {
                val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k,
                  data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")
                (i._1, (result._2 map result._3).sum / result._2.length)
              }
            }
            meanDistances.sortBy((pair: (Int, Double)) => pair._2).reverse.take(clusteridSize._2).map(_._1)
          } else if (method.equals("MostFar")) {
            // selects the majority class samples whose average distances to all minority class samples in the cluster are the farthest
            val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) =>
              if (dist == Distance.EUCLIDEAN) {
                val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), minorityElementsIndex.length)._3
                (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length)
              } else {
                val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses,
                  minorityElementsIndex.length, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")
                (i._1, (result._2 map result._3).sum / result._2.length)
              }
            }
            meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1)
          } else {
            throw new Exception("Invalid argument: method should be: random, NearMiss1, NearMiss2, NearMiss3, MostDistant or MostFar")
          }
        }
      }
    }

    // All minority instances plus the selected majority instances.
    val finalIndex: Array[Int] = minorityElements.distinct ++ majorityElements.distinct
    val finishTime: Long = System.nanoTime()

    if (verbose) {
      val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
      println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
      println("NEW DATA SIZE: %d".format(finalIndex.length))
      println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
      println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
      println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
      println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
    }

    new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
  }
}
--------------------------------------------------------------------------------