├── project
│   └── build.properties
├── images
│   ├── NCL.png
│   ├── SBC.png
│   ├── ADASYN.png
│   ├── IHTS.png
│   ├── IPADE.png
│   ├── MWMOTE.png
│   ├── SMOTE.png
│   ├── original.png
│   └── SafeLevelSMOTE.png
├── src
│   └── main
│       └── scala
│           └── soul
│               ├── data
│               │   ├── Data.scala
│               │   └── FileInfo.scala
│               ├── algorithm
│               │   ├── oversampling
│               │   │   ├── RO.scala
│               │   │   ├── SMOTE.scala
│               │   │   ├── SMOTEENN.scala
│               │   │   ├── SMOTETL.scala
│               │   │   ├── SafeLevelSMOTE.scala
│               │   │   ├── ADOMS.scala
│               │   │   ├── BorderlineSMOTE.scala
│               │   │   ├── ADASYN.scala
│               │   │   ├── MDO.scala
│               │   │   ├── SMOTERSB.scala
│               │   │   ├── Spider2.scala
│               │   │   ├── MWMOTE.scala
│               │   │   └── DBSMOTE.scala
│               │   └── undersampling
│               │       ├── RU.scala
│               │       ├── EE.scala
│               │       ├── ENN.scala
│               │       ├── OSS.scala
│               │       ├── IHTS.scala
│               │       ├── TL.scala
│               │       ├── ClusterOSS.scala
│               │       ├── CPM.scala
│               │       ├── NCL.scala
│               │       ├── BC.scala
│               │       ├── CNN.scala
│               │       ├── NM.scala
│               │       ├── EUS.scala
│               │       └── SBC.scala
│               ├── io
│               │   ├── Writer.scala
│               │   └── Reader.scala
│               └── util
│                   └── KDTree.scala
└── README.md
/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.2.3 2 | -------------------------------------------------------------------------------- /images/NCL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/NCL.png -------------------------------------------------------------------------------- /images/SBC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SBC.png -------------------------------------------------------------------------------- /images/ADASYN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/ADASYN.png -------------------------------------------------------------------------------- /images/IHTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/IHTS.png -------------------------------------------------------------------------------- /images/IPADE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/IPADE.png -------------------------------------------------------------------------------- /images/MWMOTE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/MWMOTE.png -------------------------------------------------------------------------------- /images/SMOTE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SMOTE.png -------------------------------------------------------------------------------- /images/original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/original.png -------------------------------------------------------------------------------- /images/SafeLevelSMOTE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SafeLevelSMOTE.png -------------------------------------------------------------------------------- /src/main/scala/soul/data/Data.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.data 18 | 19 | import scala.collection.mutable 20 | 21 | /** Data structure used by the algorithms 22 | * 23 | * @param x data associated to the file (x) 24 | * @param y classes associated to the file (y) 25 | * @param index indices representing the kept elements 26 | * @param fileInfo object with the information needed to save the data into a file 27 | * @author Néstor Rodríguez Vico 28 | */ 29 | class Data private[soul](private[soul] val x: Array[Array[Any]], private[soul] val y: Array[Any], 30 | private[soul] val index: Option[Array[Int]] = None, private[soul] val fileInfo: FileInfo) { 31 | 32 | private[soul] var processedData: Array[Array[Double]] = new Array[Array[Double]](0) 33 | private[soul] var nomToNum: Array[mutable.Map[Double, Any]] = new Array[mutable.Map[Double, Any]](0) 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/soul/data/FileInfo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.data 18 | 19 | import scala.collection.mutable 20 | 21 | /** Data structure used by the arff classes 22 | * 23 | * @param _file file containing the data 24 | * @param _comment string indicating that a line is a comment 25 | * @param _columnClass indicates which column represents the class in the file 26 | * @param _delimiter string separating two elements 27 | * @param _missing string indicating an element is missing 28 | * @param _header header of the file.
If it is _, there was no header 29 | * @param _attributes map with the form: index -> attributeName 30 | * @param _attributesValues map with the form attributeName -> type (if it's nominal, possible values instead of type) 31 | * @param nominal array to know which attributes are nominal 32 | * @author Néstor Rodríguez Vico 33 | */ 34 | class FileInfo private[soul](private[soul] val _file: String, private[soul] val _comment: String, 35 | private[soul] val _columnClass: Int = -1, 36 | private[soul] val _delimiter: String, private[soul] val _missing: String, 37 | private[soul] val _header: Array[String], private[soul] val _relationName: String, 38 | private[soul] val _attributes: mutable.Map[Int, String], 39 | private[soul] val _attributesValues: mutable.Map[String, String], 40 | private[soul] val nominal: Array[Int]) { 41 | 42 | // data necessary to denormalize the data 43 | private[soul] var maxAttribs: Array[Double] = _ 44 | private[soul] var minAttribs: Array[Double] = _ 45 | 46 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/RO.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities._ 21 | 22 | import scala.util.Random 23 | 24 | /** Random Oversampling algorithm. Original paper: "A study of the behavior of several methods for balancing machine 25 | * learning training data" by Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina. 26 | * 27 | * @param data data to work with 28 | * @param seed seed to use.
If it is not provided, it will use the system time 29 | * @param percent number of samples to create 30 | * @param verbose choose to display information about the execution or not 31 | * @author David López Pretel 32 | */ 33 | class RO(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, verbose: Boolean = false) { 34 | 35 | /** Compute the RO algorithm 36 | * 37 | * @return synthetic samples generated 38 | */ 39 | def compute(): Data = { 40 | val initTime: Long = System.nanoTime() 41 | 42 | if (percent < 0) { 43 | throw new Exception("Percent must be greater than or equal to 0") 44 | } 45 | 46 | val minorityClassIndex: Array[Int] = minority(data.y) 47 | val minorityClass: Any = data.y(minorityClassIndex(0)) 48 | 49 | // output with a size of percent samples 50 | val output: Array[Array[Double]] = Array.ofDim[Double](percent, data.processedData(0).length) 51 | 52 | val r: Random = new Random(seed) 53 | 54 | // each synthetic sample is a copy of a randomly chosen minority class sample 55 | (0 until percent).par.foreach((i: Int) => { 56 | output(i) = data.processedData(minorityClassIndex(r.nextInt(minorityClassIndex.length))) 57 | }) 58 | 59 | val finishTime: Long = System.nanoTime() 60 | 61 | if (verbose) { 62 | println("ORIGINAL SIZE: %d".format(data.x.length)) 63 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 64 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 65 | } 66 | 67 | new Data(if (data.fileInfo.nominal.length == 0) { 68 | to2Decimals(Array.concat(data.processedData, output)) 69 | } else { 70 | toNominal(Array.concat(data.processedData, output), data.nomToNum) 71 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 72 | } 73 | } -------------------------------------------------------------------------------- /src/main/scala/soul/io/Writer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
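// A minimal usage sketch for the RO class above. Reader.readArff is an assumed helper
// from soul.io (the Reader API is not reproduced in this dump), so the exact loading
// call may differ; everything else uses the constructor and compute() shown above.
import soul.algorithm.oversampling.RO
import soul.data.Data
import soul.io.{Reader, Writer}

object ROExample {
  def main(args: Array[String]): Unit = {
    val data: Data = Reader.readArff("imbalanced.arff") // hypothetical loading call
    val ro = new RO(data, seed = 42L, percent = 500, verbose = true)
    val oversampled: Data = ro.compute() // adds 500 copies of random minority samples
    Writer.writeArff("oversampled.arff", oversampled)
  }
}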
16 | */ 17 | package soul.io 18 | 19 | import java.io.{File, PrintWriter} 20 | 21 | import soul.data.Data 22 | 23 | import scala.collection.immutable.ListMap 24 | 25 | /** Class to write data files 26 | * 27 | * @author Néstor Rodríguez Vico 28 | */ 29 | object Writer { 30 | /** Store data into an arff file 31 | * 32 | * @param file filename where to store the data 33 | * @param data data to save to the file 34 | */ 35 | def writeArff(file: String, data: Data): Unit = { 36 | val pr = new PrintWriter(new File(file)) 37 | pr.write("@relation %s\n".format(data.fileInfo._relationName)) 38 | 39 | if (data.fileInfo._attributes == null || data.fileInfo._attributesValues == null) 40 | throw new Exception("Unable to write arff: missing information") 41 | 42 | val orderedAttributes: Map[Int, String] = ListMap(data.fileInfo._attributes.toSeq.sortBy(_._1): _*) 43 | 44 | for (attribute <- orderedAttributes) { 45 | pr.write("@attribute %s %s\n".format(attribute._2, data.fileInfo._attributesValues(attribute._2))) 46 | } 47 | 48 | pr.write("@data\n") 49 | 50 | for (row <- data.x zip data.y) { 51 | val naIndex: Array[Int] = row._1.zipWithIndex.filter(_._1 == "soul_NA").map(_._2) 52 | val newRow: Array[Any] = row._1.clone() 53 | for (index <- naIndex) { 54 | newRow(index) = "?" 55 | } 56 | 57 | pr.write(newRow.mkString(",") + "," + row._2 + "\n") 58 | } 59 | 60 | pr.close() 61 | } 62 | 63 | /** Store data into a delimited text file 64 | * 65 | * @param file filename where to store the data 66 | * @param data data to save to the file 67 | */ 68 | def writeDelimitedText(file: String, data: Data): Unit = { 69 | val delimiter: String = if (data.fileInfo._delimiter == null) "," else data.fileInfo._delimiter 70 | val missing: String = if (data.fileInfo._missing == null) "?" else data.fileInfo._missing 71 | 72 | val pr = new PrintWriter(new File(file)) 73 | if (data.fileInfo._header != null) 74 | pr.write(data.fileInfo._header.mkString(delimiter) + "\n") 75 | 76 | for (row <- data.x zip data.y) { 77 | val naIndex: Array[Int] = row._1.zipWithIndex.filter(_._1 == "soul_NA").map(_._2) 78 | val newRow: Array[Any] = row._1.clone() 79 | for (index <- naIndex) { 80 | newRow(index) = missing 81 | } 82 | 83 | pr.write(newRow.mkString(delimiter) + delimiter + row._2 + "\n") 84 | } 85 | 86 | pr.close() 87 | } 88 | } -------------------------------------------------------------------------------- /src/main/scala/soul/util/KDTree.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
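// A short sketch of the two Writer entry points defined above; `result` is assumed to
// be a soul.data.Data value produced by a Reader plus some algorithm's compute().
Writer.writeArff("result.arff", result)         // ARFF output, missing values written as "?"
Writer.writeDelimitedText("result.csv", result) // delimited output, using fileInfo._delimiter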
16 | */ 17 | package soul.util 18 | 19 | import com.thesamet.spatial.{DimensionalOrdering, KDTreeMap, Metric} 20 | 21 | import scala.language.implicitConversions 22 | import scala.math.sqrt 23 | 24 | /** Wrapper of a com.thesamet.spatial.KDTreeMap adapted for Arrays of Doubles 25 | * 26 | * @param x data 27 | * @param y labels 28 | * @param dimensions number of dimensions 29 | * @param which if it is set to "nearest", return the nearest neighbours; if it is set to "farthest", return the farthest ones 30 | * @author Néstor Rodríguez Vico 31 | */ 32 | class KDTree(x: Array[Array[Double]], y: Array[Any], dimensions: Int, which: String = "nearest") { 33 | 34 | private[soul] var kDTreeMap: KDTreeMap[Array[Double], (Any, Int)] = if (which == "nearest") { 35 | KDTreeMap.fromSeq((x zip y.zipWithIndex).map(f => f._1 -> (f._2._1, f._2._2)))(dimensionalOrderingForArray[Array[Double], Double](dimensions)) 36 | } else { 37 | KDTreeMap.fromSeq((x zip y.zipWithIndex).map(f => f._1 -> (f._2._1, f._2._2)))(dimensionalReverseOrderingForArray[Array[Double], Double](dimensions)) 38 | } 39 | 40 | def nNeighbours(instance: Array[Double], k: Int, leaveOneOut: Boolean = true): (Seq[Array[Double]], Seq[Any], Seq[Int]) = { 41 | val realK: Int = if (leaveOneOut) k + 1 else k 42 | val drop: Int = if (leaveOneOut) 1 else 0 43 | val instances: (Seq[Array[Double]], Seq[(Any, Int)]) = kDTreeMap.findNearest(instance, realK).drop(drop).unzip 44 | val (labels, index) = instances._2.unzip 45 | (instances._1, labels, index) 46 | } 47 | 48 | def apply(x: Array[Double]): (Any, Int) = kDTreeMap(x) 49 | 50 | def addElement(x: Array[Double], y: Any): Unit = { 51 | kDTreeMap = kDTreeMap + (x -> (y, kDTreeMap.size + 1)) 52 | } 53 | 54 | def dimensionalOrderingForArray[T <: Array[A], A](dim: Int)(implicit ord: Ordering[A]): DimensionalOrdering[T] = 55 | new DimensionalOrdering[T] { 56 | val dimensions: Int = dim 57 | 58 | def compareProjection(d: Int)(x: T, y: T): Int = ord.compare(x(d), y(d)) 59 | } 60 | 61 | def dimensionalReverseOrderingForArray[T <: Array[A], A](dim: Int)(implicit ord: Ordering[A]): DimensionalOrdering[T] = 62 | new DimensionalOrdering[T] { 63 | val dimensions: Int = dim 64 | 65 | def compareProjection(d: Int)(x: T, y: T): Int = ord.compare(y(d), x(d)) 66 | } 67 | 68 | implicit def metricFromArray(implicit n: Numeric[Double]): Metric[Array[Double], Double] = new Metric[Array[Double], Double] { 69 | override def distance(x: Array[Double], y: Array[Double]): Double = sqrt(x.zip(y).map { z => 70 | val d = z._1 - z._2 71 | d * d 72 | }.sum) 73 | 74 | override def planarDistance(dimension: Int)(x: Array[Double], y: Array[Double]): Double = { 75 | val dd = x(dimension) - y(dimension) 76 | dd * dd 77 | } 78 | } 79 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/RU.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details.
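// A self-contained sketch of the KDTree API defined above, since it is the workhorse
// behind the Distance.EUCLIDEAN branches of the algorithms that follow.
import soul.util.KDTree

object KDTreeExample {
  def main(args: Array[String]): Unit = {
    val x: Array[Array[Double]] = Array(Array(0.0, 0.0), Array(1.0, 1.0), Array(0.9, 1.1), Array(5.0, 5.0))
    val y: Array[Any] = Array("neg", "pos", "pos", "neg")
    val tree = new KDTree(x, y, dimensions = 2)
    // leaveOneOut defaults to true, so a query point stored in the tree skips itself
    val (points, labels, indices) = tree.nNeighbours(Array(1.0, 1.0), k = 2)
    println(labels.mkString(", ")) // labels of the two closest other points
  }
}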
13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities._ 21 | 22 | /** Random Undersampling algorithm. Original paper: "A study of the behavior of several methods for balancing machine 23 | * learning training data" by Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina. 24 | * 25 | * @param data data to work with 26 | * @param seed seed to use. If it is not provided, it will use the system time 27 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1, so as many 28 | * majority class examples as minority class examples will be kept. It will take 29 | * numMinorityInstances * ratio 30 | * @param replacement whether to sample with replacement or not. false by default 31 | * @param verbose choose to display information about the execution or not 32 | * @author Néstor Rodríguez Vico 33 | */ 34 | class RU(data: Data, seed: Long = System.currentTimeMillis(), ratio: Double = 1.0, replacement: Boolean = false, verbose: Boolean = false) { 35 | 36 | /** Compute the RU algorithm. 37 | * 38 | * @return undersampled data structure 39 | */ 40 | def compute(): Data = { 41 | val initTime: Long = System.nanoTime() 42 | 43 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 44 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 45 | val random: scala.util.Random = new scala.util.Random(seed) 46 | 47 | val minorityIndex: Array[Int] = data.y.zipWithIndex.collect { case (label, i) if label == untouchableClass => i } 48 | val majorityIndex: Array[Int] = random.shuffle(data.y.zipWithIndex.collect { case (label, i) 49 | if label != untouchableClass => i 50 | }.toList).toArray 51 | val selectedMajorityIndex: Array[Int] = if (!replacement) majorityIndex.take((minorityIndex.length * ratio).toInt) else 52 | (0 until (minorityIndex.length * ratio).toInt).map(_ => random.nextInt(majorityIndex.length)).toArray map majorityIndex 53 | val finalIndex: Array[Int] = minorityIndex ++ selectedMajorityIndex 54 | val finishTime: Long = System.nanoTime() 55 | 56 | if (verbose) { 57 | val newCounter: Map[Any, Int] = (finalIndex map data.y).groupBy(identity).mapValues(_.length) 58 | println("ORIGINAL SIZE: %d".format(data.x.length)) 59 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 60 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / data.x.length) * 100)) 61 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 62 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 63 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 64 | } 65 | 66 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 67 | } 68 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/EE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License.
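// To make the ratio parameter of RU concrete: with 100 minority and 900 majority
// examples, ratio = 1.0 keeps 100 randomly chosen majority examples and ratio = 2.0
// keeps 200, plus all minority examples in both cases. A sketch, where `data` is
// assumed to be an already-loaded soul.data.Data:
import soul.algorithm.undersampling.RU

val ru = new RU(data, seed = 0L, ratio = 2.0, replacement = false)
val undersampled = ru.compute() // all minority examples + numMinorityInstances * 2 majority examples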
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities._ 21 | 22 | /** Easy Ensemble algorithm. Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, 23 | * Jianxin Wu and Zhi-Hua Zhou. 24 | * 25 | * @param data data to work with 26 | * @param seed seed to use. If it is not provided, it will use the system time 27 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1, so as many 28 | * majority class examples as minority class examples will be kept. It will take 29 | * numMinorityInstances * ratio 30 | * @param replacement whether to sample with replacement or not. false by default 31 | * @param nTimes times to perform the random undersampling 32 | * @param normalize normalize the data or not 33 | * @param randomData iterate through the data randomly or not 34 | * @param verbose choose to display information about the execution or not 35 | * @author Néstor Rodríguez Vico 36 | */ 37 | class EE(data: Data, seed: Long = System.currentTimeMillis(), ratio: Double = 1.0, replacement: Boolean = false, nTimes: Int = 5, 38 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 39 | 40 | /** Compute the EE algorithm. 41 | * 42 | * @return undersampled data structure 43 | */ 44 | def compute(): Data = { 45 | val initTime: Long = System.nanoTime() 46 | 47 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 48 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 49 | val random: scala.util.Random = new scala.util.Random(seed) 50 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 51 | val classesToWorkWith: Array[Any] = if (randomData) { 52 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 53 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 54 | (randomIndex map data.y).toArray 55 | } else { 56 | data.y 57 | } 58 | 59 | val minorityIndex: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label == untouchableClass => i } 60 | val majIndex: List[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label != untouchableClass => i }.toList 61 | val majElements: Array[Int] = (0 until nTimes).flatMap { _: Int => 62 | val majorityIndex: Array[Int] = random.shuffle(majIndex).toArray 63 | if (!replacement) majorityIndex.take((minorityIndex.length * ratio).toInt) else majorityIndex.indices.map(_ => 64 | random.nextInt(majorityIndex.length)).toArray map majorityIndex 65 | }.toArray 66 | 67 | // Make a histogram and select the majority class examples that have been selected the most times 68 | val majorityIndexHistogram: Array[(Int, Int)] = majElements.groupBy(identity).mapValues(_.length).toArray.sortBy(_._2).reverse 69 | val majorityIndex: Array[Int] = majorityIndexHistogram.take((minorityIndex.length * ratio).toInt).map(_._1) 70 | val finalIndex: Array[Int] = minorityIndex ++ majorityIndex 71 | val finishTime: Long = System.nanoTime() 72 | 73 | if (verbose) { 74 | val
newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 75 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 76 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 77 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 78 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 79 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 80 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 81 | } 82 | 83 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 84 | } 85 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/SMOTE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.util.Random 25 | 26 | /** SMOTE algorithm. Original paper: "SMOTE: Synthetic Minority Over-sampling Technique" by Nitesh V. Chawla, Kevin W. 27 | * Bowyer, Lawrence O. Hall and W. Philip Kegelmeyer. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. 
If it is not provided, it will use the system time 31 | * @param percent amount of SMOTE N% 32 | * @param k number of minority class nearest neighbors 33 | * @param dist object of Distance enumeration representing the distance to be used 34 | * @param normalize normalize the data or not 35 | * @param verbose choose to display information about the execution or not 36 | * @author David López Pretel 37 | */ 38 | class SMOTE(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5, 39 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 40 | 41 | /** Compute the SMOTE algorithm 42 | * 43 | * @return synthetic samples generated 44 | */ 45 | def compute(): Data = { 46 | val initTime: Long = System.nanoTime() 47 | 48 | if (percent > 100 && percent % 100 != 0) { 49 | throw new Exception("Percent must be a multiple of 100") 50 | } 51 | 52 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 53 | val minorityClassIndex: Array[Int] = minority(data.y) 54 | val minorityClass: Any = data.y(minorityClassIndex(0)) 55 | 56 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 57 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 58 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 59 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 60 | } else { 61 | (null, null, null) 62 | } 63 | 64 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 65 | Some(new KDTree(samples, data.y, samples(0).length)) 66 | } else { 67 | None 68 | } 69 | 70 | // check if the percent is correct 71 | var T: Int = minorityClassIndex.length 72 | var N: Int = percent 73 | 74 | if (N < 100) { 75 | T = N / 100 * T 76 | N = 100 77 | } 78 | N = N / 100 79 | 80 | // output with a size of T*N samples 81 | val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length) 82 | 83 | val r: Random = new Random(seed) 84 | 85 | // for each minority class sample 86 | minorityClassIndex.indices.par.foreach((i: Int) => { 87 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 88 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray 89 | } else { 90 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 91 | } 92 | 93 | // compute populate for the sample 94 | (0 until N).par.foreach((n: Int) => { 95 | val nn: Int = neighbors(r.nextInt(neighbors.length)) 96 | // compute attributes of the sample 97 | samples(0).indices.foreach((atrib: Int) => { 98 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib) 99 | val gap: Double = r.nextFloat() 100 | output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff 101 | }) 102 | }) 103 | }) 104 | 105 | val finishTime: Long = System.nanoTime() 106 | 107 | if (verbose) { 108 | println("ORIGINAL SIZE: %d".format(data.x.length)) 109 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 110 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 111 | } 112 | 113 | new Data(if (data.fileInfo.nominal.length == 0) { 114 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 115 | data.fileInfo.minAttribs) else output)) 116 | } else { 117 | 
toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 118 | data.fileInfo.minAttribs) else output), data.nomToNum) 119 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 120 | } 121 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/ENN.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | 26 | /** Edited Nearest Neighbour rule. Original paper: "Asymptotic Properties of Nearest Neighbor Rules Using Edited Data" 27 | * by Dennis L. Wilson. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. If it is not provided, it will use the system time 31 | * @param dist object of Distance enumeration representing the distance to be used 32 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours) 33 | * @param normalize normalize the data or not 34 | * @param randomData iterate through the data randomly or not 35 | * @param verbose choose to display information about the execution or not 36 | * @author Néstor Rodríguez Vico 37 | */ 38 | class ENN(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 39 | k: Int = 3, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 40 | 41 | /** Compute the ENN algorithm. 
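// Stepping back to SMOTE, which ends just above: the heart of its loop is linear
// interpolation between a minority sample and one of its k nearest neighbours.
// A standalone illustration of that single step, independent of the library:
import scala.util.Random

object SmoteStep {
  def main(args: Array[String]): Unit = {
    val r = new Random(42)
    val sample = Array(1.0, 2.0)
    val neighbour = Array(3.0, 4.0)
    // each attribute moves a random fraction of the way towards the neighbour,
    // so the synthetic point lies inside the box spanned by the two points
    val synthetic = sample.indices.map { a =>
      val gap = r.nextFloat() // in [0, 1)
      sample(a) + gap * (neighbour(a) - sample(a))
    }.toArray
    println(synthetic.mkString("[", ", ", "]"))
  }
}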
42 | * 43 | * @return undersampled data structure 44 | */ 45 | def compute(): Data = { 46 | val initTime: Long = System.nanoTime() 47 | 48 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 49 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 50 | val random: scala.util.Random = new scala.util.Random(seed) 51 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 52 | val classesToWorkWith: Array[Any] = if (randomData) { 53 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 54 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 55 | (randomIndex map data.y).toArray 56 | } else { 57 | data.y 58 | } 59 | 60 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 61 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 62 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 63 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 64 | } else { 65 | (null, null, null) 66 | } 67 | 68 | val finalIndex = new ArrayBuffer[Int]() 69 | val uniqueClasses = classesToWorkWith.distinct 70 | 71 | var j = 0 72 | val majorityClassIndex = new ArrayBuffer[Int]() 73 | while (j < classesToWorkWith.length) { 74 | if (classesToWorkWith(j) == untouchableClass) finalIndex += j else majorityClassIndex += j 75 | j += 1 76 | } 77 | 78 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 79 | Some(new KDTree(dataToWorkWith, classesToWorkWith, dataToWorkWith(0).length)) 80 | } else { 81 | None 82 | } 83 | 84 | var i = 0 85 | while (i < uniqueClasses.length) { 86 | val targetClass = uniqueClasses(i) 87 | val selected: Array[(Int, Boolean)] = if (targetClass != untouchableClass) { 88 | majorityClassIndex.par.map { j => 89 | val label = if (dist == Distance.EUCLIDEAN) { 90 | mode(KDTree.get.nNeighbours(dataToWorkWith(j), k)._2.toArray) 91 | } else { 92 | nnRuleHVDM(dataToWorkWith, dataToWorkWith(j), j, classesToWorkWith, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")._1 93 | } 94 | 95 | (j, label == targetClass) 96 | }.toArray 97 | } else { 98 | new Array[(Int, Boolean)](0) 99 | } 100 | 101 | selected.foreach(e => if (e._2) finalIndex += e._1) 102 | 103 | i += 1 104 | } 105 | 106 | val finishTime: Long = System.nanoTime() 107 | 108 | if (verbose) { 109 | val newCounter: Map[Any, Int] = (finalIndex.toArray map classesToWorkWith).groupBy(identity).mapValues(_.length) 110 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 111 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 112 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 113 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 114 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 115 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 116 | } 117 | 118 | new Data(finalIndex.toArray map data.x, finalIndex.toArray map data.y, Some(finalIndex.toArray), data.fileInfo) 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/OSS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and 
Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | /** One-Sided Selection. Original paper: "Addressing the Curse of Imbalanced 25 | * Training Sets: One-Sided Selection" by Miroslav Kubat and Stan Matwin. 26 | * 27 | * @param data data to work with 28 | * @param seed seed to use. If it is not provided, it will use the system time 29 | * @param dist object of Distance enumeration representing the distance to be used 30 | * @param normalize normalize the data or not 31 | * @param randomData iterate through the data randomly or not 32 | * @param verbose choose to display information about the execution or not 33 | * @author Néstor Rodríguez Vico 34 | */ 35 | class OSS(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 36 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 37 | 38 | /** Compute the OSS algorithm. 39 | * 40 | * @return undersampled data structure 41 | */ 42 | def compute(): Data = { 43 | // Note: the notation used to refer to the subsets of data is the one used in the original paper.
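// Outline of the steps below: C starts as all minority ("positive") examples plus one
// randomly chosen majority example; every instance is then classified with the 1-NN
// rule using C as the reference set, the misclassified instances are added to C, and
// finally the TL (Tomek links) step cleans the majority side of the resulting subset.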
44 | val initTime: Long = System.nanoTime() 45 | 46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 47 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 48 | val random: scala.util.Random = new scala.util.Random(seed) 49 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 50 | val classesToWorkWith: Array[Any] = if (randomData) { 51 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 52 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 53 | (randomIndex map data.y).toArray 54 | } else { 55 | data.y 56 | } 57 | 58 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 59 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 60 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 61 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 62 | } else { 63 | (null, null, null) 64 | } 65 | 66 | val positives: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label == untouchableClass => i } 67 | val randomElement: Int = classesToWorkWith.indices.diff(positives)(new util.Random(seed).nextInt(classesToWorkWith.length - positives.length)) 68 | val c: Array[Int] = positives ++ Array(randomElement) 69 | 70 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 71 | Some(new KDTree(c map dataToWorkWith, c map classesToWorkWith, dataToWorkWith(0).length)) 72 | } else { 73 | None 74 | } 75 | 76 | val labels: Seq[(Int, Any)] = if (dist == Distance.EUCLIDEAN) { 77 | dataToWorkWith.indices.map(i => (i, mode(KDTree.get.nNeighbours(dataToWorkWith(i), 1)._2.toArray))) 78 | } else { 79 | val neighbours = c map dataToWorkWith 80 | val classes = c map classesToWorkWith 81 | 82 | dataToWorkWith.indices.map(i => (i, nnRuleHVDM(neighbours, dataToWorkWith(i), c.indexOf(i), classes, 1, data.fileInfo.nominal, 83 | sds, attrCounter, attrClassesCounter, "nearest")._1)) 84 | } 85 | val misclassified: Array[Int] = labels.collect { case (i, label) if label != classesToWorkWith(i) => i }.toArray 86 | val finalC: Array[Int] = (misclassified ++ c).distinct 87 | 88 | val auxData: Data = new Data(x = toXData(finalC map dataToWorkWith), y = finalC map classesToWorkWith, fileInfo = data.fileInfo) 89 | auxData.processedData = finalC map dataToWorkWith 90 | val tl = new TL(auxData, dist = dist, minorityClass = Some(untouchableClass)) 91 | val resultTL: Data = tl.compute() 92 | val finalIndex: Array[Int] = (resultTL.index.get.toList map finalC).toArray 93 | val finishTime: Long = System.nanoTime() 94 | 95 | if (verbose) { 96 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 97 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 98 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 99 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 100 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 101 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 102 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 103 | } 104 | 105 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 106 | } 107 | } 108 | 
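A hedged usage sketch for OSS, assuming `data` was loaded beforehand through soul.io.Reader (whose API is not shown in this section):

import soul.algorithm.undersampling.OSS

val oss = new OSS(data, seed = 1L, verbose = true)
val reduced = oss.compute()
// the returned Data records the indices of the kept instances internally (its index field)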
-------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/SMOTEENN.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.algorithm.undersampling.ENN 20 | import soul.data.Data 21 | import soul.util.KDTree 22 | import soul.util.Utilities.Distance.Distance 23 | import soul.util.Utilities._ 24 | 25 | import scala.util.Random 26 | 27 | /** SMOTEENN algorithm. Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning 28 | * Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 29 | * 30 | * @param data data to work with 31 | * @param seed seed to use. If it is not provided, it will use the system time 32 | * @param percent amount of Smote N% 33 | * @param k number of minority class nearest neighbors 34 | * @param dist object of Distance enumeration representing the distance to be used 35 | * @param normalize normalize the data or not 36 | * @param verbose choose to display information about the execution or not 37 | * @author David López Pretel 38 | */ 39 | class SMOTEENN(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5, 40 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 41 | 42 | /** Compute the SMOTEENN algorithm 43 | * 44 | * @return synthetic samples generated 45 | */ 46 | def compute(): Data = { 47 | val initTime: Long = System.nanoTime() 48 | 49 | if (percent > 100 && percent % 100 != 0) { 50 | throw new Exception("Percent must be a multiple of 100") 51 | } 52 | 53 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 54 | val minorityClassIndex: Array[Int] = minority(data.y) 55 | val minorityClass: Any = data.y(minorityClassIndex(0)) 56 | 57 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 58 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 59 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 60 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 61 | } else { 62 | (null, null, null) 63 | } 64 | 65 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 66 | Some(new KDTree(samples, data.y, samples(0).length)) 67 | } else { 68 | None 69 | } 70 | 71 | // check if the percent is correct 72 | var T: Int = minorityClassIndex.length 73 | var N: Int = percent 74 | 75 | if (N < 100) { 76 | T = N / 100 * T 77 | N = 100 78 | } 79 | N = N / 100 80 | 81 | // output with a size of T*N samples 82 | val output: Array[Array[Double]] = Array.ofDim[Double](N * T, 
samples(0).length) 83 | 84 | val r: Random = new Random(seed) 85 | 86 | // for each minority class sample 87 | minorityClassIndex.indices.par.foreach((i: Int) => { 88 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 89 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray 90 | } else { 91 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 92 | } 93 | 94 | // compute populate for the sample 95 | (0 until N).par.foreach((n: Int) => { 96 | val nn: Int = neighbors(r.nextInt(neighbors.length)) 97 | // compute attributes of the sample 98 | samples(0).indices.foreach((atrib: Int) => { 99 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib) 100 | val gap: Double = r.nextFloat() 101 | output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff 102 | }) 103 | }) 104 | }) 105 | 106 | val result: Array[Array[Double]] = Array.concat(samples, output) 107 | val resultClasses: Array[Any] = Array.concat(data.y, Array.fill(output.length)(minorityClass)) 108 | 109 | val ennData: Data = new Data(x = toXData(result), y = resultClasses, fileInfo = data.fileInfo) 110 | ennData.processedData = result 111 | val enn = new ENN(ennData, dist = dist) 112 | val resultENN: Data = enn.compute() 113 | val finalIndex: Array[Int] = result.indices.diff(resultENN.index.get).toArray 114 | 115 | val finishTime: Long = System.nanoTime() 116 | 117 | if (verbose) { 118 | println("ORIGINAL SIZE: %d".format(data.x.length)) 119 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 120 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 121 | } 122 | 123 | new Data(if (data.nomToNum(0).isEmpty) { 124 | to2Decimals(zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs)) 125 | } else { 126 | toNominal(zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs), data.nomToNum) 127 | }, finalIndex map resultClasses, None, data.fileInfo) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/IHTS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities._ 21 | import weka.classifiers.trees.J48 22 | import weka.core.Instances 23 | 24 | 25 | /** Instance Hardness Threshold. Original paper: "An Empirical Study of Instance Hardness" by Michael R. Smith, 26 | * Tony Martinez and Christophe Giraud-Carrier. 27 | * 28 | * @param data data to work with 29 | * @param seed seed to use. 
If it is not provided, it will use the system time 30 | * @param nFolds number of subsets to create when applying cross-validation 31 | * @param normalize normalize the data or not 32 | * @param randomData iterate through the data randomly or not 33 | * @param verbose choose to display information about the execution or not 34 | * @author Néstor Rodríguez Vico 35 | */ 36 | class IHTS(data: Data, seed: Long = System.currentTimeMillis(), nFolds: Int = 5, 37 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 38 | 39 | /** Compute the IHTS algorithm. 40 | * 41 | * @return undersampled data structure 42 | */ 43 | def compute(): Data = { 44 | val initTime: Long = System.nanoTime() 45 | 46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 47 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 48 | val random: scala.util.Random = new scala.util.Random(seed) 49 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 50 | val classesToWorkWith: Array[Any] = if (randomData) { 51 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 52 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 53 | (randomIndex map data.y).toArray 54 | } else { 55 | data.y 56 | } 57 | 58 | // Each element holds the indices of the test elements of one fold 59 | val indices: Array[Array[Int]] = random.shuffle(classesToWorkWith.indices.toList).toArray.grouped((classesToWorkWith.length.toFloat / nFolds).ceil.toInt).toArray 60 | val probabilities: Array[Double] = new Array[Double](classesToWorkWith.length) 61 | 62 | indices.foreach { testIndex: Array[Int] => 63 | val trainIndex: Array[Int] = classesToWorkWith.indices.diff(testIndex).toArray 64 | 65 | val j48: J48 = new J48 66 | j48.setOptions(Array("-U", "-M", "1")) 67 | 68 | val trainInstances: Instances = buildInstances(data = trainIndex map dataToWorkWith, 69 | classes = trainIndex map classesToWorkWith, fileInfo = data.fileInfo) 70 | val testInstances: Instances = buildInstances(data = testIndex map dataToWorkWith, 71 | classes = testIndex map classesToWorkWith, fileInfo = data.fileInfo) 72 | 73 | j48.buildClassifier(trainInstances) 74 | 75 | val probs: Array[Array[Double]] = testIndex.indices.map((i: Int) => j48.distributionForInstance(testInstances.instance(i))).toArray 76 | val classes: Array[Any] = (testIndex map classesToWorkWith).distinct 77 | val values: Array[Double] = (testIndex map classesToWorkWith).zipWithIndex.map((e: (Any, Int)) => probs(e._2)(classes.indexOf(e._1))) 78 | 79 | (testIndex zip values).foreach((i: (Int, Double)) => probabilities(i._1) = i._2) 80 | } 81 | 82 | val finalIndex: Array[Int] = classesToWorkWith.distinct.flatMap { targetClass: Any => 83 | val indexTargetClass: Array[Int] = if (targetClass != untouchableClass) { 84 | val nSamples: Int = counter(untouchableClass) 85 | val targetIndex: Array[Int] = boolToIndex(classesToWorkWith.map((c: Any) => c == targetClass)) 86 | val targetProbabilities: Array[Double] = targetIndex map probabilities 87 | val percentile: Double = (1.0 - (nSamples.toDouble / counter(targetClass))) * 100.0 88 | val threshold: Double = targetProbabilities.sorted.apply(math.ceil((targetProbabilities.length - 1) * (percentile / 100.0)).toInt) 89 | boolToIndex((targetIndex map probabilities).map((e: Double) => e >= threshold)) 90 | } 91 | else { 92 | classesToWorkWith.zipWithIndex.collect { case (c, i) if c == targetClass => i } 93 | } 94 | 95 | indexTargetClass 96 | } 97 | 98 |
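// Worked example of the thresholding above: with 100 minority and 400 majority
// examples, percentile = (1.0 - 100.0 / 400) * 100.0 = 75.0, so the threshold sits at
// the 75th percentile of the majority class probabilities and only the ~25% of
// majority instances classified correctly with the highest confidence (the "easiest"
// ones, i.e. lowest instance hardness) are kept.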
val finishTime: Long = System.nanoTime() 99 | 100 | if (verbose) { 101 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 102 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 103 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 104 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 105 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 106 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 107 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 108 | } 109 | 110 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/SMOTETL.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.algorithm.undersampling.TL 20 | import soul.data.Data 21 | import soul.util.KDTree 22 | import soul.util.Utilities.Distance.Distance 23 | import soul.util.Utilities._ 24 | 25 | import scala.util.Random 26 | 27 | /** SMOTETL algorithm. Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning 28 | * Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 29 | * 30 | * @param data data to work with 31 | * @param seed seed to use. 
If it is not provided, it will use the system time 32 | * @param percent Amount of Smote N% 33 | * @param k Number of minority class nearest neighbors 34 | * @param dist object of Distance enumeration representing the distance to be used 35 | * @param normalize normalize the data or not 36 | * @param verbose choose to display information about the execution or not 37 | * @author David López Pretel 38 | */ 39 | class SMOTETL(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5, 40 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 41 | 42 | /** Compute the SMOTETL algorithm 43 | * 44 | * @return synthetic samples generated 45 | */ 46 | def compute(): Data = { 47 | val initTime: Long = System.nanoTime() 48 | 49 | if (percent > 100 && percent % 100 != 0) { 50 | throw new Exception("Percent must be a multiple of 100") 51 | } 52 | 53 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 54 | // compute minority class 55 | val minorityClassIndex: Array[Int] = minority(data.y) 56 | val minorityClass: Any = data.y(minorityClassIndex(0)) 57 | 58 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 59 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 60 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 61 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 62 | } else { 63 | (null, null, null) 64 | } 65 | 66 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 67 | Some(new KDTree(samples, data.y, samples(0).length)) 68 | } else { 69 | None 70 | } 71 | 72 | // check if the percent is correct 73 | var T: Int = minorityClassIndex.length 74 | var N: Int = percent 75 | 76 | if (N < 100) { 77 | T = N / 100 * T 78 | N = 100 79 | } 80 | N = N / 100 81 | 82 | // output with a size of T*N samples 83 | val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length) 84 | 85 | val r: Random = new Random(seed) 86 | 87 | // for each minority class sample 88 | minorityClassIndex.indices.par.foreach((i: Int) => { 89 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 90 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray 91 | } else { 92 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 93 | } 94 | 95 | // compute populate for the sample 96 | (0 until N).par.foreach((n: Int) => { 97 | val nn: Int = neighbors(r.nextInt(neighbors.length)) 98 | // compute attributes of the sample 99 | samples(0).indices.foreach((atrib: Int) => { 100 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib) 101 | val gap: Double = r.nextFloat() 102 | output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff 103 | }) 104 | }) 105 | }) 106 | val result: Array[Array[Double]] = Array.concat(samples, output) 107 | val resultClasses: Array[Any] = Array.concat(data.y, Array.fill(output.length)(minorityClass)) 108 | 109 | val tlData: Data = new Data(x = toXData(result), y = resultClasses, fileInfo = data.fileInfo) 110 | tlData.processedData = result 111 | val tl = new TL(tlData, dist = dist, ratio = "all") 112 | val resultTL: Data = tl.compute() 113 | val finalIndex: Array[Int] = result.indices.diff(resultTL.index.get).toArray 114 | 115 | val finishTime: Long = System.nanoTime() 116 | 117 | 
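// `finalIndex` now selects the subset of `result` (original plus synthetic samples)
// remaining after the Tomek-link post-processing performed by TL with ratio = "all".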
119 | if (verbose) {
120 | println("ORIGINAL SIZE: %d".format(data.x.length))
121 | println("NEW DATA SIZE: %d".format(finalIndex.length))
122 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
123 | }
124 | 
125 | new Data(if (data.nomToNum(0).isEmpty) {
126 | to2Decimals(if (normalize) zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else finalIndex map result)
127 | } else {
128 | toNominal(if (normalize) zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else finalIndex map result, data.nomToNum)
129 | }, finalIndex map resultClasses, None, data.fileInfo)
130 | }
131 | }
132 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/SafeLevelSMOTE.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.oversampling
18 | 
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 | 
24 | import scala.util.Random
25 | 
26 | /** SafeLevel-SMOTE algorithm. Original paper: "Safe-Level-SMOTE: Safe-Level-Synthetic Minority Over-Sampling Technique
27 | * for Handling the Class Imbalanced Problem" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran, and Chidchanok Lursinsap.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use.
If it is not provided, it will use the system time
31 | * @param k number of nearest neighbors
32 | * @param dist object of Distance enumeration representing the distance to be used
33 | * @param normalize normalize the data or not
34 | * @param verbose choose to display information about the execution or not
35 | * @author David López Pretel
36 | */
37 | class SafeLevelSMOTE(data: Data, seed: Long = System.currentTimeMillis(), k: Int = 5,
38 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
39 | 
40 | /** Compute the SafeLevelSMOTE algorithm
41 | *
42 | * @return synthetic samples generated
43 | */
44 | def compute(): Data = {
45 | val initTime: Long = System.nanoTime()
46 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
47 | // compute the minority class
48 | val minorityClassIndex: Array[Int] = minority(data.y)
49 | val minorityClass: Any = data.y(minorityClassIndex(0))
50 | 
51 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
52 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
53 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
54 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
55 | } else {
56 | (null, null, null)
57 | }
58 | 
59 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
60 | Some(new KDTree(samples, data.y, samples(0).length))
61 | } else {
62 | None
63 | }
64 | 
65 | val r: Random = new Random(seed)
66 | 
67 | val output: Array[Array[Double]] = minorityClassIndex.indices.par.map(i => {
68 | // compute the k neighbours of p and count how many of them belong to the minority class
69 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
70 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
71 | } else {
72 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
73 | }
74 | val n: Int = neighbors(r.nextInt(neighbors.length))
75 | val slp: Int = neighbors.count(neighbor => data.y(neighbor) == minorityClass)
76 | // compute the k neighbours of n and count how many of them belong to the minority class
77 | val selectedNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
78 | KDTree.get.nNeighbours(samples(n), k)._3.toArray
79 | } else {
80 | kNeighborsHVDM(samples, n, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
81 | }
82 | val sln: Int = selectedNeighbors.count(neighbor => data.y(neighbor) == minorityClass)
83 | // the safe level ratio is kept local to each sample so the parallel map stays race-free;
84 | // it is infinite when n has no minority class neighbours
85 | val slRatio: Double = if (sln != 0) slp.toDouble / sln else Double.PositiveInfinity
86 | if (slRatio.isPosInfinity && slp == 0) {
87 | // case 1: do not create a synthetic instance
88 | None
89 | } else {
90 | // calculate the synthetic sample
91 | Some(samples(minorityClassIndex(i)).indices.map(atrib => {
92 | val gap: Double = if (slRatio.isPosInfinity) {
93 | 0.0 // case 2: replicate p
94 | } else if (slRatio == 1) {
95 | r.nextFloat() // case 3: gap in [0, 1]
96 | } else if (slRatio > 1) {
97 | r.nextFloat() * (1 / slRatio) // case 4: gap in [0, 1 / slRatio], closer to p
98 | } else {
99 | 1 - slRatio + r.nextFloat() * slRatio // case 5: gap in [1 - slRatio, 1], closer to n
100 | }
101 | val diff: Double = samples(n)(atrib) - samples(minorityClassIndex(i))(atrib)
102 | samples(minorityClassIndex(i))(atrib) + gap * diff
103 | }).toArray)
104 | }
105 | }).flatten.toArray
106 | 
107 | val finishTime: Long = System.nanoTime()
108 | 
109 | if (verbose) {
110 | println("ORIGINAL SIZE: %d".format(data.x.length))
111 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
112 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
113 | }
114 | 
115 | new Data(if (data.fileInfo.nominal.length == 0) {
116 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
117 | data.fileInfo.minAttribs) else output))
118 | } else {
119 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
120 | data.fileInfo.minAttribs) else output), data.nomToNum)
121 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
122 | }
123 | }
124 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/ADOMS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.oversampling
18 | 
19 | import breeze.linalg.{DenseMatrix, eigSym}
20 | import soul.data.Data
21 | import soul.util.KDTree
22 | import soul.util.Utilities.Distance.Distance
23 | import soul.util.Utilities._
24 | 
25 | import scala.util.Random
26 | 
27 | /** ADOMS algorithm. Original paper: "The Generation Mechanism of Synthetic Minority Class Examples" by Sheng TANG
28 | * and Si-ping CHEN.
29 | *
30 | * @param data data to work with
31 | * @param seed seed to use.
If it is not provided, it will use the system time
32 | * @param percent amount of samples N%
33 | * @param k number of neighbors
34 | * @param dist object of Distance enumeration representing the distance to be used
35 | * @param normalize normalize the data or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author David López Pretel
38 | */
39 | class ADOMS(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 300, k: Int = 5,
40 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
41 | 
42 | /** Compute the first principal component axis
43 | *
44 | * @param A the data
45 | * @return the first principal component axis
46 | */
47 | private def PCA(A: Array[Array[Double]]): Array[Double] = {
48 | val mean: Array[Double] = A.transpose.map(_.sum / A.length)
49 | // subtract the mean from the data
50 | val dataNoMean: DenseMatrix[Double] = DenseMatrix(A: _*) -:- DenseMatrix(A.map(_ => mean): _*)
51 | // get the covariance matrix: divide Z^T * Z element-wise by N
52 | val oneDividedByN: Array[Array[Double]] = Array.fill(dataNoMean.cols, dataNoMean.cols)(dataNoMean.rows)
53 | val S: DenseMatrix[Double] = (dataNoMean.t * dataNoMean) /:/ DenseMatrix(oneDividedByN: _*)
54 | // compute the eigenvectors and eigenvalues of S
55 | val eigen = eigSym(S)
56 | // eigSym returns the eigenvalues in ascending order with the eigenvectors as columns, so the
57 | // first principal component axis is the column paired with the largest eigenvalue
58 | eigen.eigenvectors(::, eigen.eigenvalues.length - 1).toArray
59 | }
60 | 
61 | /** Compute the ADOMS algorithm
62 | *
63 | * @return synthetic samples generated
64 | */
65 | def compute(): Data = {
66 | val initTime: Long = System.nanoTime()
67 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
68 | val minorityClassIndex: Array[Int] = minority(data.y)
69 | val minorityClass: Any = data.y(minorityClassIndex(0))
70 | // output with a size of T*N samples
71 | val output: Array[Array[Double]] = Array.ofDim(minorityClassIndex.length * percent / 100, samples(0).length)
72 | // random generator used to pick the neighbour for each synthetic sample
73 | val r: Random = new Random(seed)
74 | 
75 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
76 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
77 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
78 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
79 | } else {
80 | (null, null, null)
81 | }
82 | 
83 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
84 | Some(new KDTree(samples, data.y, samples(0).length))
85 | } else {
86 | None
87 | }
88 | 
89 | val N: Int = percent / 100
90 | 
91 | (0 until N).par.foreach(nn => {
92 | // for each minority class sample
93 | minorityClassIndex.zipWithIndex.par.foreach(i => {
94 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
95 | KDTree.get.nNeighbours(samples(i._1), k)._3.toArray
96 | } else {
97 | kNeighborsHVDM(samples, i._1, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
98 | }
99 | 
100 | val n: Int = r.nextInt(neighbors.length)
101 | 
102 | // calculate the first principal component axis of the local data distribution
103 | val l2: Array[Double] = PCA(neighbors map samples)
104 | // dotMN: scalar product of (x - n) with l2, the numerator of the projection coefficient
105 | val dotMN: Double = l2.indices.map(j => {
106 | samples(i._1)(j) - samples(neighbors(n))(j)
107 | }).toArray.zipWithIndex.map(j => {
108 | j._1 * l2(j._2)
109 | }).sum
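// Added commentary (not in the original sources): dotMN is the scalar product of (x - n)
// with the axis l2 and dotMM below is |l2|^2, so dotMN / dotMM * l2 shifts x along the
// first principal axis; the second assignment then pulls the projected point a random
// fraction of the way back towards x.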
110 | val dotMM: Double = l2.map(x => x * x).sum 111 | // create synthetic sample 112 | output(nn * minorityClassIndex.length + i._2) = l2.indices.map(j => samples(i._1)(j) + dotMN / dotMM * l2(j)).toArray 113 | output(nn * minorityClassIndex.length + i._2) = output(nn * minorityClassIndex.length + i._2).indices.map(j => output(nn * minorityClassIndex.length + i._2)(j) + (samples(i._1)(j) - output(nn * minorityClassIndex.length + i._2)(j)) * r.nextFloat()).toArray 114 | }) 115 | }) 116 | 117 | val finishTime: Long = System.nanoTime() 118 | 119 | if (verbose) { 120 | println("ORIGINAL SIZE: %d".format(data.x.length)) 121 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 122 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 123 | } 124 | 125 | new Data(if (data.fileInfo.nominal.length == 0) { 126 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 127 | data.fileInfo.minAttribs) else output)) 128 | } else { 129 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 130 | data.fileInfo.minAttribs) else output), data.nomToNum) 131 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/BorderlineSMOTE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.util.Random 25 | 26 | /** Borderline-SMOTE algorithm. Original paper: "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets 27 | * Learning." by Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. 
If it is not provided, it will use the system time
31 | * @param m number of nearest neighbors
32 | * @param k number of minority class nearest neighbors
33 | * @param dist object of Distance enumeration representing the distance to be used
34 | * @param normalize normalize the data or not
35 | * @param verbose choose to display information about the execution or not
36 | * @author David López Pretel
37 | */
38 | class BorderlineSMOTE(data: Data, seed: Long = System.currentTimeMillis(), m: Int = 10, k: Int = 5,
39 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
40 | 
41 | /** Compute the BorderlineSMOTE algorithm
42 | *
43 | * @return synthetic samples generated
44 | */
45 | def compute(): Data = {
46 | val initTime: Long = System.nanoTime()
47 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
48 | val minorityClassIndex: Array[Int] = minority(data.y)
49 | val minorityClass: Any = data.y(minorityClassIndex(0))
50 | 
51 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
52 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
53 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
54 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
55 | } else {
56 | (null, null, null)
57 | }
58 | 
59 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
60 | Some(new KDTree(samples, data.y, samples(0).length))
61 | } else {
62 | None
63 | }
64 | 
65 | val KDTreeMinority: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
66 | Some(new KDTree(minorityClassIndex map samples, minorityClassIndex map data.y, samples(0).length))
67 | } else {
68 | None
69 | }
70 | 
71 | // compute the m nearest neighbours of each minority class sample
72 | val minorityClassNeighbors: Array[Array[Int]] = new Array[Array[Int]](minorityClassIndex.length)
73 | if (dist == Distance.EUCLIDEAN) {
74 | minorityClassIndex.indices.par.foreach(i => minorityClassNeighbors(i) = KDTree.get.nNeighbours(samples(minorityClassIndex(i)), m)._3.toArray)
75 | } else {
76 | minorityClassIndex.indices.par.foreach(i => minorityClassNeighbors(i) = kNeighborsHVDM(samples, minorityClassIndex(i), m, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter))
77 | }
78 | 
79 | // compute the nodes in the borderline
80 | val DangerNodes: Array[Int] = minorityClassNeighbors.map(neighbors => {
81 | var counter = 0
82 | neighbors.foreach(neighbor => {
83 | if (data.y(neighbor) != minorityClass) {
84 | counter += 1
85 | }
86 | })
87 | counter
88 | }).zipWithIndex.map(nNonMinorityClass => {
89 | if (nNonMinorityClass._1 >= (m / 2) && nNonMinorityClass._1 < m) {
90 | Some(nNonMinorityClass._2)
91 | } else {
92 | None
93 | }
94 | }).flatten.map(minorityClassIndex(_))
95 | 
96 | val r: Random = new Random(seed)
97 | val s: Int = r.nextInt(k) + 1
98 | 
99 | // output with a size of s * DangerNodes.length samples
100 | val output: Array[Array[Double]] = Array.ofDim(s * DangerNodes.length, samples(0).length)
101 | 
102 | // for each danger node
103 | DangerNodes.zipWithIndex.par.foreach(i => {
104 | val neighbors = if (dist == Distance.EUCLIDEAN) {
105 | KDTreeMinority.get.nNeighbours(samples(i._1), k)._3.toArray
106 | } else {
107 | // both branches return indices into the minority class subset
108 | kNeighborsHVDM(minorityClassIndex map samples, minorityClassIndex.indexOf(i._1), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
109 | }
110 | val sNeighbors:
Array[Int] = (0 until s).map(_ => r.nextInt(neighbors.length)).toArray
111 | // run the Populate step for the sample
112 | (sNeighbors map neighbors).zipWithIndex.par.foreach(j => {
113 | // calculate the attributes of the sample
114 | samples(i._1).indices.foreach(attrib => {
115 | val diff: Double = samples(minorityClassIndex(j._1))(attrib) - samples(i._1)(attrib)
116 | val gap: Float = r.nextFloat
117 | output(i._2 * s + j._2)(attrib) = samples(i._1)(attrib) + gap * diff
118 | })
119 | })
120 | })
121 | 
122 | val finishTime: Long = System.nanoTime()
123 | 
124 | if (verbose) {
125 | println("ORIGINAL SIZE: %d".format(data.x.length))
126 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
127 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
128 | }
129 | 
130 | new Data(if (data.fileInfo.nominal.length == 0) {
131 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
132 | data.fileInfo.minAttribs) else output))
133 | } else {
134 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
135 | data.fileInfo.minAttribs) else output), data.nomToNum)
136 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
137 | }
138 | }
139 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/TL.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.undersampling
18 | 
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 | 
23 | /** Tomek Link. Original paper: "Two Modifications of CNN" by Ivan Tomek.
24 | *
25 | * @param data data to work with
26 | * @param seed seed to use. If it is not provided, it will use the system time
27 | * @param dist object of Distance enumeration representing the distance to be used
28 | * @param ratio indicates the instances of the Tomek Links that are going to be removed. "all" will remove all instances,
29 | * "minority" will remove instances of the minority class and "not minority" will remove all the instances
30 | * except the ones of the minority class.
31 | * @param minorityClass minority class.
If set to None, it will be computed 32 | * @param normalize normalize the data or not 33 | * @param randomData iterate through the data randomly or not 34 | * @param verbose choose to display information about the execution or not 35 | * @author Néstor Rodríguez Vico 36 | */ 37 | class TL(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, ratio: String = "not minority", 38 | val minorityClass: Option[Any] = None, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 39 | 40 | /** Compute the TL algorithm. 41 | * 42 | * @return undersampled data structure 43 | */ 44 | def compute(): Data = { 45 | val initTime: Long = System.nanoTime() 46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 47 | val untouchableClass: Any = if (minorityClass.isDefined) minorityClass.get else counter.minBy((c: (Any, Int)) => c._2)._1 48 | val random: scala.util.Random = new scala.util.Random(seed) 49 | 50 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 51 | val classesToWorkWith: Array[Any] = if (randomData) { 52 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 53 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 54 | (randomIndex map data.y).toArray 55 | } else { 56 | data.y 57 | } 58 | 59 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 60 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 61 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 62 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 63 | } else { 64 | (null, null, null) 65 | } 66 | 67 | val candidates: Map[Any, Array[Int]] = classesToWorkWith.distinct.map { 68 | c: Any => 69 | c -> classesToWorkWith.zipWithIndex.collect { 70 | case (a, b) if a != c => b 71 | } 72 | }.toMap 73 | 74 | val distances: Array[Array[Double]] = Array.fill[Array[Double]](dataToWorkWith.length)(new Array[Double](dataToWorkWith.length)) 75 | 76 | if (dist == Distance.EUCLIDEAN) { 77 | dataToWorkWith.indices.par.foreach { i: Int => 78 | dataToWorkWith.indices.drop(i).par.foreach { j: Int => 79 | distances(i)(j) = euclidean(dataToWorkWith(i), dataToWorkWith(j)) 80 | distances(j)(i) = distances(i)(j) 81 | } 82 | } 83 | } else { 84 | dataToWorkWith.indices.par.foreach { i: Int => 85 | dataToWorkWith.indices.drop(i).par.foreach { j: Int => 86 | distances(i)(j) = HVDM(dataToWorkWith(i), dataToWorkWith(j), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 87 | distances(j)(i) = distances(i)(j) 88 | } 89 | } 90 | } 91 | 92 | // Look for the nearest neighbour in the rest of the classes 93 | val nearestNeighbour: Array[Int] = distances.zipWithIndex.map((row: (Array[Double], Int)) => row._1.indexOf((candidates(classesToWorkWith(row._2)) map row._1).min)) 94 | // For each instance, I: If my nearest neighbour is J and the nearest neighbour of J it's me, I, I and J form a Tomek link 95 | val tomekLinks: Array[(Int, Int)] = nearestNeighbour.zipWithIndex.filter((pair: (Int, Int)) => nearestNeighbour(pair._1) == pair._2) 96 | val targetInstances: Array[Int] = tomekLinks.flatMap((x: (Int, Int)) => List(x._1, x._2)).distinct 97 | // but the user can choose which of them should be removed 98 | val removedInstances: Array[Int] = if (ratio == "all") targetInstances else if (ratio == "minority") 99 | 
targetInstances.collect {
100 | case i if classesToWorkWith(i) == untouchableClass => i
101 | } else if (ratio == "not minority")
102 | targetInstances.collect {
103 | case i if classesToWorkWith(i) != untouchableClass => i
104 | } else
105 | throw new Exception("Incorrect value of ratio. Possible options: all, minority, not minority")
106 | val finalIndex: Array[Int] = dataToWorkWith.indices.diff(removedInstances).toArray
107 | val finishTime: Long = System.nanoTime()
108 | 
109 | if (verbose) {
110 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
111 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
112 | println("NEW DATA SIZE: %d".format(finalIndex.length))
113 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
114 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
115 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
116 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
117 | println("REMOVED INSTANCES: %s".format(ratio))
118 | }
119 | 
120 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
121 | }
122 | }
123 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/ADASYN.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.oversampling
18 | 
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 | 
24 | import scala.util.Random
25 | 
26 | /** ADASYN algorithm. Original paper: "ADASYN: Adaptive Synthetic Sampling Approach for Imbalanced Learning" by Haibo He,
27 | * Yang Bai, Edwardo A. Garcia, and Shutao Li.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use.
If it is not provided, it will use the system time
31 | * @param d preset threshold for the maximum tolerated degree of class imbalance ratio
32 | * @param B balance level after generation of synthetic data
33 | * @param k number of neighbors
34 | * @param dist object of Distance enumeration representing the distance to be used
35 | * @param normalize normalize the data or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author David López Pretel
38 | */
39 | class ADASYN(data: Data, seed: Long = System.currentTimeMillis(), d: Double = 1, B: Double = 1, k: Int = 5,
40 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
41 | 
42 | /** Compute the ADASYN algorithm
43 | *
44 | * @return synthetic samples generated
45 | */
46 | def compute(): Data = {
47 | val initTime: Long = System.nanoTime()
48 | 
49 | if (B > 1 || B < 0) {
50 | throw new Exception("B must be between 0 and 1, both included")
51 | }
52 | 
53 | if (d > 1 || d <= 0) {
54 | throw new Exception("d must be between 0 and 1, zero not included")
55 | }
56 | 
57 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
58 | 
59 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
60 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
61 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
62 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
63 | } else {
64 | (null, null, null)
65 | }
66 | 
67 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
68 | Some(new KDTree(samples, data.y, samples(0).length))
69 | } else {
70 | None
71 | }
72 | 
73 | val minorityClassIndex: Array[Int] = minority(data.y)
74 | val minorityClass: Any = data.y(minorityClassIndex(0))
75 | 
76 | // calculate the size of the output
77 | val ms: Int = minorityClassIndex.length
78 | val ml: Int = data.y.length - ms
79 | val G: Int = ((ml - ms) * B).toInt
80 | 
81 | // k neighbors of each minority sample
82 | val neighbors: Array[Array[Int]] = new Array[Array[Int]](minorityClassIndex.length)
83 | minorityClassIndex.indices.par.foreach { i =>
84 | if (dist == Distance.EUCLIDEAN) {
85 | neighbors(i) = KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
86 | } else {
87 | neighbors(i) = kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
88 | }
89 | }
90 | 
91 | // ratio of each minority sample
92 | val ratio: Array[Double] = new Array[Double](neighbors.length)
93 | neighbors.zipWithIndex.par.foreach(neighborsOfX => {
94 | ratio(neighborsOfX._2) = neighborsOfX._1.map(neighbor => {
95 | if (data.y(neighbor) != minorityClass) 1 else 0
96 | }).sum.toDouble / k
97 | })
98 | 
99 | // normalize the ratios
100 | val sumRatios: Double = ratio.sum
101 | ratio.indices.par.foreach(i => ratio(i) = ratio(i) / sumRatios)
102 | 
103 | // number of synthetic samples for each sample
104 | val g: Array[Int] = new Array[Int](ratio.length)
105 | ratio.zipWithIndex.par.foreach(ri => g(ri._2) = (ri._1 * G).toInt)
106 | 
107 | // output with a size of sum(Gi) samples
108 | val output: Array[Array[Double]] = Array.ofDim(g.sum, samples(0).length)
109 | 
110 | val r: Random = new Random(seed)
111 | // precompute the offset of each sample's block of synthetic instances in the output array
112 | 
113 | var counter: Int =
0
114 | val increment: Array[Int] = new Array[Int](g.length)
115 | var i = 0
116 | while (i < g.length) {
117 | increment(i) = counter
118 | counter += g(i)
119 | i += 1
120 | }
121 | 
122 | // for each minority class sample, create gi synthetic samples
123 | minorityClassIndex.indices.zip(increment).foreach(xi => {
124 | (0 until g(xi._1)).foreach(n => {
125 | // pick a single random neighbour xzi per synthetic sample, then interpolate every attribute
126 | val nn: Int = neighbors(xi._1)(r.nextInt(neighbors(xi._1).length))
127 | // compute synthetic sample si = (xzi - xi) * lambda + xi
128 | samples(0).indices.foreach(atrib => {
129 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(xi._1))(atrib)
130 | val gap: Float = r.nextFloat
131 | output(xi._2 + n)(atrib) = samples(minorityClassIndex(xi._1))(atrib) + gap * diff
132 | })
133 | })
134 | })
135 | 
136 | val finishTime: Long = System.nanoTime()
137 | 
138 | if (verbose) {
139 | println("ORIGINAL SIZE: %d".format(data.x.length))
140 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
141 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
142 | }
143 | 
144 | new Data(if (data.fileInfo.nominal.length == 0) {
145 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
146 | data.fileInfo.minAttribs) else output))
147 | } else {
148 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
149 | data.fileInfo.minAttribs) else output), data.nomToNum)
150 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
151 | }
152 | }
153 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/MDO.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.oversampling
18 | 
19 | import breeze.linalg.{DenseMatrix, DenseVector, eigSym, inv, sum}
20 | import soul.data.Data
21 | import soul.util.Utilities._
22 | 
23 | import scala.util.Random
24 | 
25 | /** MDO algorithm. Original paper: "To combat multi-class imbalanced problems by means of over-sampling and boosting
26 | * techniques" by Lida Abdi and Sattar Hashemi.
27 | *
28 | * @param data data to work with
29 | * @param seed seed to use.
If it is not provided, it will use the system time
30 | * @param normalize normalize the data or not
31 | * @param verbose choose to display information about the execution or not
32 | * @author David López Pretel
33 | */
34 | class MDO(data: Data, seed: Long = System.currentTimeMillis(), normalize: Boolean = false, verbose: Boolean = false) {
35 | 
36 | /** Compute the MDO algorithm
37 | *
38 | * @return synthetic samples generated
39 | */
40 | def compute(): Data = {
41 | val initTime: Long = System.nanoTime()
42 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
43 | // compute the minority class
44 | val minorityClassIndex: Array[Int] = minority(data.y)
45 | val minorityClass: Any = data.y(minorityClassIndex(0))
46 | // compute the majority class
47 | val majorityClassIndex: Array[Int] = samples.indices.diff(minorityClassIndex.toList).toArray
48 | 
49 | // compute the mean for the values of each attribute
50 | val mean: Array[Double] = (minorityClassIndex map samples).transpose.map(_.sum / minorityClassIndex.length)
51 | 
52 | // subtract the mean from every attribute and then compute the covariance matrix
53 | val Zi: DenseMatrix[Double] = DenseMatrix(minorityClassIndex map samples: _*) -:- DenseMatrix(minorityClassIndex.map(_ => mean): _*)
54 | val oneDividedByN: Array[Array[Double]] = Array.fill(Zi.cols, Zi.cols)(Zi.rows)
55 | val S: DenseMatrix[Double] = (Zi.t * Zi) /:/ DenseMatrix(oneDividedByN: _*)
56 | // compute the eigenvectors and eigenvalues of S
57 | val eigen = eigSym(S)
58 | // the eigenvectors form the columns of the matrix that performs the change of basis
59 | val Ti: DenseMatrix[Double] = (eigen.eigenvectors * Zi.t).t
60 | // the diagonal holds the eigenvalues
61 | val V: DenseVector[Double] = eigen.eigenvalues
62 | 
63 | // compute the new samples
64 | val newSamples: Array[Array[Double]] = MDO_oversampling(Ti, mean, V, majorityClassIndex.length - minorityClassIndex.length, seed)
65 | 
66 | // transform the samples back to the original basis
67 | val newSamplesToOriginalSpace: DenseMatrix[Double] = (inv(eigen.eigenvectors) * DenseMatrix(newSamples: _*).t).t
68 | 
69 | // add the mean back
70 | val samplesWithMean: DenseMatrix[Double] = newSamplesToOriginalSpace +:+ DenseMatrix((0 until newSamplesToOriginalSpace.rows).map(_ => mean): _*)
71 | 
72 | // the output
73 | val output: Array[Array[Double]] = Array.range(0, samplesWithMean.rows).map(i => samplesWithMean(i, ::).t.toArray)
74 | 
75 | val finishTime: Long = System.nanoTime()
76 | 
77 | if (verbose) {
78 | println("ORIGINAL SIZE: %d".format(data.x.length))
79 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
80 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
81 | }
82 | 
83 | new Data(if (data.fileInfo.nominal.length == 0) {
84 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
85 | data.fileInfo.minAttribs) else output))
86 | } else {
87 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
88 | data.fileInfo.minAttribs) else output), data.nomToNum)
89 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
90 | }
91 | 
92 | /** create the new samples for the MDO algorithm
93 | *
94 | * @param Ti the samples after the change of basis
95 | * @param mean the mean of every attribute
96 | * @param V the vector of eigenvalues
97 | * @param Orate majoritySamples -
minoritySamples
98 | * @param seed seed to use. If it is not provided, it will use the system time
99 | * @return the new samples generated
100 | */
101 | def MDO_oversampling(Ti: DenseMatrix[Double], mean: Array[Double], V: DenseVector[Double], Orate: Int, seed: Long): Array[Array[Double]] = {
102 | // check the number of new samples to be created
103 | var I: Int = Ti.rows
104 | var N: Int = Orate / I
105 | if (I > Orate) {
106 | N = 1
107 | I = Orate
108 | }
109 | 
110 | val output: Array[Array[Double]] = Array.fill(Orate, Ti.cols)(0.0)
111 | var newIndex: Int = 0
112 | val rand: Random.type = scala.util.Random
113 | rand.setSeed(seed)
114 | 
115 | (0 until I).foreach(i => {
116 | // element-wise square of the sample
117 | val x: DenseVector[Double] = Ti(i, ::).t *:* Ti(i, ::).t
118 | // vector resulting from alpha * V, which forms the denominators of the ellipse equation
119 | val alpha: Double = sum(x /:/ V)
120 | val alphaV: DenseVector[Double] = V *:* alpha
121 | (0 until N).foreach(_ => {
122 | var s: Double = 0.0
123 | (0 until Ti.cols - 1).foreach(p => {
124 | // random number between -alphaV(p) / (Ti.cols - 1) and alphaV(p) / (Ti.cols - 1)
125 | val r: Double = -alphaV(p) / (Ti.cols - 1) + rand.nextFloat() * (alphaV(p) / (Ti.cols - 1) + alphaV(p) / (Ti.cols - 1))
126 | // this number is the value for the attribute p
127 | output(newIndex)(p) = r
128 | // accumulate the partial sum needed to compute the last attribute
129 | s = s + (r * r / alphaV(p))
130 | })
131 | // compute the last attribute from the remainder of the ellipse equation
132 | val lastFeaVal: Double = (1 - s) * alphaV(alphaV.length - 1)
133 | output(newIndex)(alphaV.size - 1) = if (rand.nextInt() % 2 == 0) -lastFeaVal else lastFeaVal
134 | newIndex += 1
135 | })
136 | })
137 | output
138 | }
139 | }
140 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/ClusterOSS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.undersampling
18 | 
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 | 
24 | /** ClusterOSS. Original paper: "ClusterOSS: a new undersampling method for imbalanced learning."
25 | * by Victor H Barella, Eduardo P Costa and André C P L F Carvalho.
26 | *
27 | * @param data data to work with
28 | * @param seed seed to use.
If it is not provided, it will use the system time 29 | * @param dist object of Distance enumeration representing the distance to be used 30 | * @param numClusters number of clusters to be created by KMeans algorithm 31 | * @param restarts number of times to relaunch KMeans algorithm 32 | * @param minDispersion stop KMeans core if dispersion is lower than this value 33 | * @param maxIterations number of iterations to be done in KMeans algorithm 34 | * @param normalize normalize the data or not 35 | * @param randomData iterate through the data randomly or not 36 | * @param verbose choose to display information about the execution or not 37 | * @author Néstor Rodríguez Vico 38 | */ 39 | class ClusterOSS(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 40 | numClusters: Int = 15, restarts: Int = 5, minDispersion: Double = 0.0001, maxIterations: Int = 100, 41 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 42 | 43 | /** Compute the ClusterOSS algorithm 44 | * 45 | * @return undersampled data structure 46 | */ 47 | def compute(): Data = { 48 | val initTime: Long = System.nanoTime() 49 | 50 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 51 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 52 | val random: scala.util.Random = new scala.util.Random(seed) 53 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 54 | val classesToWorkWith: Array[Any] = if (randomData) { 55 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 56 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 57 | (randomIndex map data.y).toArray 58 | } else { 59 | data.y 60 | } 61 | 62 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 63 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 64 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 65 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 66 | } else { 67 | (null, null, null) 68 | } 69 | 70 | val majElements: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label != untouchableClass => i } 71 | val (_, centroids, assignment) = kMeans(data = majElements map dataToWorkWith, nominal = data.fileInfo.nominal, 72 | numClusters = numClusters, restarts = restarts, minDispersion = minDispersion, maxIterations = maxIterations, seed = seed) 73 | 74 | val (closestInstances, restOfInstances) = assignment.par.map { cluster: (Int, Array[Int]) => 75 | val distances: Array[(Int, Double)] = cluster._2.map { instance: Int => 76 | (instance, euclidean(dataToWorkWith(instance), centroids(cluster._1))) 77 | } 78 | 79 | val closestInstance: Int = if (distances.isEmpty) -1 else distances.minBy(_._2)._1 80 | (closestInstance, cluster._2.diff(List(closestInstance))) 81 | }.toArray.unzip 82 | 83 | // Remove foo values 84 | val train: Array[Int] = closestInstances.diff(List(-1)) 85 | // Flatten all the clusters 86 | val test: Array[Int] = restOfInstances.flatten 87 | val neighbours: Array[Array[Double]] = train map dataToWorkWith 88 | val classes: Array[Any] = train map classesToWorkWith 89 | 90 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 91 | Some(new KDTree(neighbours, classes, dataToWorkWith(0).length)) 92 | } else { 93 | None 94 | } 95 | 96 | val 
calculatedLabels: Array[(Int, Any)] = test.zipWithIndex.map { i =>
97 | val label: Any = if (dist == Distance.EUCLIDEAN) {
98 | val labels = KDTree.get.nNeighbours(dataToWorkWith(i._1), 1)._2
99 | mode(labels.toArray)
100 | } else {
101 | nnRuleHVDM(neighbours, dataToWorkWith(i._1), -1, classes, 1, data.fileInfo.nominal, sds, attrCounter,
102 | attrClassesCounter, "nearest")._1
103 | }
104 | (i._1, label)
105 | }
106 | 
107 | // the instances MISCLASSIFIED by the 1-NN rule are kept, together with the cluster centres stored in train
108 | val misclassified: Array[Int] = calculatedLabels.collect { case (i, label) if label != classesToWorkWith(i) => i }
109 | val newDataIndex: Array[Int] = misclassified ++ train
110 | 
111 | // Construct a data object to be passed to Tomek Link
112 | val auxData: Data = new Data(x = toXData(newDataIndex map dataToWorkWith),
113 | y = newDataIndex map classesToWorkWith, fileInfo = data.fileInfo)
114 | auxData.processedData = newDataIndex map dataToWorkWith
115 | val tl = new TL(auxData, dist = dist, minorityClass = Some(untouchableClass))
116 | val resultTL: Data = tl.compute()
117 | // The final instances are the result of applying Tomek Link to the content of newDataIndex
118 | val finalIndex: Array[Int] = (resultTL.index.get.toList map newDataIndex).toArray
119 | val finishTime: Long = System.nanoTime()
120 | 
121 | if (verbose) {
122 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
123 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
124 | println("NEW DATA SIZE: %d".format(finalIndex.length))
125 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
126 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
127 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
128 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
129 | }
130 | 
131 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
132 | }
133 | }
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/CPM.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.undersampling
18 | 
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 | 
23 | import scala.collection.mutable.ArrayBuffer
24 | import scala.math.min
25 | 
26 | /** Class Purity Maximization. Original paper: "An Unsupervised Learning Approach to Resolving the
27 | * Data Imbalanced Issue in Supervised Learning Problems in Functional Genomics" by Kihoon Yoon and Stephen Kwek.
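* A minimal usage sketch (illustrative only, not from the original docs; it assumes a
* `Data` instance named `data` already populated, e.g. via `soul.io.Reader`):
* {{{
*   val cpm = new CPM(data, seed = 42L, dist = Distance.EUCLIDEAN, verbose = true)
*   val reduced: Data = cpm.compute()
* }}}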
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param dist object of Distance enumeration representing the distance to be used
32 | * @param normalize normalize the data or not
33 | * @param randomData iterate through the data randomly or not
34 | * @param verbose choose to display information about the execution or not
35 | * @author Néstor Rodríguez Vico
36 | */
37 | class CPM(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN,
38 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {
39 | 
40 | /** Compute the CPM algorithm.
41 | *
42 | * @return undersampled data structure
43 | */
44 | def compute(): Data = {
45 | val initTime: Long = System.nanoTime()
46 | 
47 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
48 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
49 | val random: scala.util.Random = new scala.util.Random(seed)
50 | val centers: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
51 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
52 | val classesToWorkWith: Array[Any] = if (randomData) {
53 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
54 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray
55 | (randomIndex map data.y).toArray
56 | } else {
57 | data.y
58 | }
59 | 
60 | val posElements: Int = counter.head._2
61 | val negElements: Int = counter.tail.values.sum
62 | val impurity: Double = posElements.toDouble / negElements.toDouble
63 | val cluster: Array[Int] = new Array[Int](dataToWorkWith.length).indices.toArray
64 | 
65 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
66 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
67 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
68 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
69 | } else {
70 | (null, null, null)
71 | }
72 | 
73 | def purityMaximization(parentImpurity: Double, parentCluster: Array[Int], center: Int): Unit = {
74 | val cluster1: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
75 | val cluster2: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
76 | val posElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
77 | val negElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
78 | 
79 | var center1: Int = 0
80 | var center2: Int = 0
81 | var pointer: Int = 0
82 | var impurity: Double = Double.PositiveInfinity
83 | var impurity1: Double = Double.PositiveInfinity
84 | var impurity2: Double = Double.PositiveInfinity
85 | 
86 | parentCluster.foreach((f: Int) => if (data.y(f) == untouchableClass) posElements += f else negElements += f)
87 | 
88 | val pairs: ArrayBuffer[(Int, Int)] = for {x <- negElements; y <- posElements} yield (x, y)
89 | while (parentImpurity <= impurity) {
90 | cluster1.clear(); cluster2.clear() // reset the candidate clusters for every new pair of centres
91 | if (pointer >= pairs.length) {
92 | centers += center
93 | return
94 | }
95 | 
96 | center1 = pairs(pointer)._1
97 | center2 = pairs(pointer)._2
98 | 
99 | parentCluster.foreach { element: Int =>
100 | val d1: Double = if (dist == Distance.EUCLIDEAN) {
101 | euclidean(dataToWorkWith(element), dataToWorkWith(center1))
102 | } else {
103 | HVDM(dataToWorkWith(element), dataToWorkWith(center1), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
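// Added commentary (not in the original sources): d1 and d2 are the distances from the
// current element to the two candidate centres; the element is assigned below to the
// cluster of the nearer centre before the impurities of both clusters are compared.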
104 | } 105 | 106 | val d2: Double = if (dist == Distance.EUCLIDEAN) { 107 | euclidean(dataToWorkWith(element), dataToWorkWith(center2)) 108 | } else { 109 | HVDM(dataToWorkWith(element), dataToWorkWith(center2), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 110 | } 111 | 112 | if (d1 < d2) 113 | cluster1 += element else cluster2 += element 114 | } 115 | 116 | if (cluster1.nonEmpty) 117 | impurity1 = cluster1.count((element: Int) => data.y(element) == untouchableClass).toDouble / cluster1.length 118 | else { 119 | centers += center2 120 | return 121 | } 122 | 123 | if (cluster2.nonEmpty) 124 | impurity2 = cluster2.count((element: Int) => data.y(element) == untouchableClass).toDouble / cluster2.length 125 | else { 126 | centers += center1 127 | return 128 | } 129 | 130 | impurity = min(impurity1, impurity2) 131 | pointer += 1 132 | } 133 | 134 | purityMaximization(impurity1, cluster1.toArray, center1) 135 | purityMaximization(impurity2, cluster2.toArray, center2) 136 | } 137 | 138 | purityMaximization(impurity, cluster, 0) 139 | 140 | val finishTime: Long = System.nanoTime() 141 | 142 | if (verbose) { 143 | val newCounter: Map[Any, Int] = (centers.toArray map classesToWorkWith).groupBy(identity).mapValues(_.length) 144 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 145 | println("NEW DATA SIZE: %d".format(centers.toArray.length)) 146 | println("REDUCTION PERCENTAGE: %s".format(100 - (centers.toArray.length.toFloat / dataToWorkWith.length) * 100)) 147 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 148 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 149 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 150 | } 151 | 152 | new Data(centers.toArray map data.x, centers.toArray map data.y, Some(centers.toArray), data.fileInfo) 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/SMOTERSB.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.Array._ 25 | import scala.collection.mutable.ArrayBuffer 26 | import scala.util.Random 27 | 28 | /** SMOTERSB algorithm. Original paper: "kNN Approach to Unbalanced Data Distribution: SMOTE-RSB: a hybrid preprocessing 29 | * approach based on oversampling and undersampling for high imbalanced data-sets using SMOTE and rough sets theory" 30 | * by Enislay Ramentol, Yailé Caballero, Rafael Bello and Francisco Herrera. 
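* A minimal usage sketch (illustrative only, not from the original docs; it assumes a
* `Data` instance named `data` already populated, e.g. via `soul.io.Reader`; note that
* percent must be a multiple of 100):
* {{{
*   val smoteRSB = new SMOTERSB(data, seed = 42L, percent = 300, k = 5)
*   val augmented: Data = smoteRSB.compute()
* }}}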
31 | *
32 | * @param data data to work with
33 | * @param seed seed to use. If it is not provided, it will use the system time
34 | * @param percent amount of SMOTE N%
35 | * @param k number of minority class nearest neighbors
36 | * @param dist object of Distance enumeration representing the distance to be used
37 | * @param normalize normalize the data or not
38 | * @param verbose choose to display information about the execution or not
39 | * @author David López Pretel
40 | */
41 | class SMOTERSB(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5,
42 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
43 | 
44 | /** Compute the SMOTERSB algorithm
45 | *
46 | * @return synthetic samples generated
47 | */
48 | def compute(): Data = {
49 | val initTime: Long = System.nanoTime()
50 | 
51 | if (percent > 100 && percent % 100 != 0) {
52 | throw new Exception("Percent must be a multiple of 100")
53 | }
54 | 
55 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
56 | val minorityClassIndex: Array[Int] = minority(data.y)
57 | val minorityClass: Any = data.y(minorityClassIndex(0))
58 | 
59 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
60 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
61 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
62 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
63 | } else {
64 | (null, null, null)
65 | }
66 | 
67 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
68 | Some(new KDTree(samples, data.y, samples(0).length))
69 | } else {
70 | None
71 | }
72 | 
73 | // check if the percent is correct
74 | var T: Int = minorityClassIndex.length
75 | var N: Int = percent
76 | 
77 | if (N < 100) {
78 | T = N * T / 100 // multiply before dividing: N < 100 keeps only N% of the minority samples
79 | N = 100
80 | }
81 | N = N / 100
82 | 
83 | // output with a size of T*N samples
84 | val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length)
85 | 
86 | val r: Random = new Random(seed)
87 | 
88 | // for each minority class sample
89 | minorityClassIndex.indices.par.foreach((i: Int) => {
90 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
91 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
92 | } else {
93 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
94 | }
95 | 
96 | // run the Populate step for the sample
97 | (0 until N).par.foreach((n: Int) => {
98 | val nn: Int = neighbors(r.nextInt(neighbors.length))
99 | // compute the attributes of the synthetic sample
100 | samples(0).indices.foreach((atrib: Int) => {
101 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib)
102 | val gap: Double = r.nextFloat()
103 | output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff
104 | })
105 | })
106 | })
107 | 
108 | // compute the majority class
109 | val majorityClassIndex: Array[Int] = samples.indices.diff(minorityClassIndex.toList).toArray
110 | 
111 | // minimum and maximum value for each attribute
112 | val maxMinValues: Array[(Double, Double)] = Array.concat(majorityClassIndex map samples, output).transpose.map(column => (column.max, column.min))
113 | 
114 | // compute the similarity matrix
115 | val similarityMatrix: Array[Array[Double]] = Array.ofDim(output.length, majorityClassIndex.length)
116 | 
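// Added commentary (not in the original sources): similarityMatrix(i)(j) will hold the mean
// per-attribute similarity between synthetic sample i and majority instance j; the rough-set
// step further below keeps a synthetic sample only while no majority instance exceeds the
// current similarity threshold, i.e. while the sample stays in the lower approximation.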
output.indices.par.foreach(i => { 117 | (majorityClassIndex map samples).zipWithIndex.par.foreach(j => { 118 | similarityMatrix(i)(j._2) = output(i).indices.map(k => { 119 | if (data.nomToNum(0).isEmpty) { 120 | 1 - (Math.abs(output(i)(k) - j._1(k)) / (maxMinValues(k)._1 - maxMinValues(k)._2)) // this expression should be weighted by wk 121 | } else { // but all the features are included, so every wk is 1 122 | if (output(i)(k) == j._1(k)) 1 else 0 123 | } 124 | }).sum / output(i).length 125 | }) 126 | }) 127 | 128 | var result: ArrayBuffer[Int] = ArrayBuffer() 129 | var similarityValue: Double = 0.4 130 | var lowerApproximation: Boolean = true 131 | while (similarityValue < 0.9) { 132 | output.indices.foreach(i => { 133 | lowerApproximation = true 134 | majorityClassIndex.indices.foreach(j => { 135 | if (similarityMatrix(i)(j) > similarityValue) 136 | lowerApproximation = false 137 | }) 138 | if (lowerApproximation) result += i 139 | }) 140 | similarityValue += 0.05 141 | } 142 | 143 | // if there are no synthetic samples in the lower approximation, return all the synthetic samples 144 | if (result.isEmpty) { 145 | result = ArrayBuffer.range(0, output.length) 146 | } else { 147 | result = result.distinct 148 | } 149 | 150 | val finishTime: Long = System.nanoTime() 151 | 152 | if (verbose) { 153 | println("ORIGINAL SIZE: %d".format(data.x.length)) 154 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 155 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 156 | } 157 | 158 | new Data(if (data.fileInfo.nominal.length == 0) { 159 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(result.toArray map output, data.fileInfo.maxAttribs, 160 | data.fileInfo.minAttribs) else result.toArray map output)) 161 | } else { 162 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(result.toArray map output, data.fileInfo.maxAttribs, 163 | data.fileInfo.minAttribs) else result.toArray map output), data.nomToNum) 164 | }, Array.concat(data.y, Array.fill((result.toArray map output).length)(minorityClass)), None, data.fileInfo) 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/NCL.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | 26 | /** Neighbourhood Cleaning Rule. Original paper: "Improving Identification of Difficult Small Classes by Balancing Class 27 | * Distribution" by J. Laurikkala. 
28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. If it is not provided, it will use the system time 31 | * @param dist object of Distance enumeration representing the distance to be used 32 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours) 33 | * @param threshold consider a class to be undersampled if the number of instances of this class is 34 | * greater than data.size * threshold 35 | * @param normalize normalize the data or not 36 | * @param randomData iterate through the data randomly or not 37 | * @param verbose choose to display information about the execution or not 38 | * @author Néstor Rodríguez Vico 39 | */ 40 | class NCL(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, k: Int = 3, 41 | threshold: Double = 0.5, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 42 | /** Compute the NCL algorithm. 43 | * 44 | * @return undersampled data structure 45 | */ 46 | def compute(): Data = { 47 | // Note: the notation used to refer to the subsets of data is the one used in the original paper. 48 | val initTime: Long = System.nanoTime() 49 | 50 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 51 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 52 | val random: scala.util.Random = new scala.util.Random(seed) 53 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 54 | val classesToWorkWith: Array[Any] = if (randomData) { 55 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 56 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 57 | (randomIndex map data.y).toArray 58 | } else { 59 | data.y 60 | } 61 | 62 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 63 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 64 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 65 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 66 | } else { 67 | (null, null, null) 68 | } 69 | 70 | val minorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 71 | val majorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 72 | 73 | var i = 0 74 | while (i < classesToWorkWith.length) { 75 | if (classesToWorkWith(i) == untouchableClass) minorityIndex += i else majorityIndex += i 76 | i += 1 77 | } 78 | 79 | // ENN cannot be applied when only one class is in the less important group 80 | val indexA1: Array[Int] = if (classesToWorkWith.distinct.length > 2) { 81 | val ennData = new Data(toXData((majorityIndex map dataToWorkWith).toArray), (majorityIndex map classesToWorkWith).toArray, None, data.fileInfo) 82 | ennData.processedData = (majorityIndex map dataToWorkWith).toArray 83 | val enn = new ENN(ennData, dist = dist, k = k) 84 | val resultENN: Data = enn.compute() 85 | classesToWorkWith.indices.diff(resultENN.index.get).toArray 86 | } else { 87 | new Array[Int](0) 88 | } 89 | 90 | val uniqueMajClasses = (majorityIndex map classesToWorkWith).distinct 91 | val ratio: Double = dataToWorkWith.length * threshold 92 | 93 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 94 | Some(new KDTree((minorityIndex map dataToWorkWith).toArray, (majorityIndex map classesToWorkWith).toArray, dataToWorkWith(0).length)) 95 | } else { 96 | None 97 | } 98 | 99 | def 
selectNeighbours(l: Int): ArrayBuffer[Int] = { 100 | var selectedElements = new ArrayBuffer[Int](0) 101 | val (_, labels, index) = KDTree.get.nNeighbours(dataToWorkWith(l), k) 102 | val label = mode(labels.toArray) 103 | 104 | if (label != classesToWorkWith(l)) { 105 | index.foreach { n => 106 | if (classesToWorkWith(n) != untouchableClass && counter(classesToWorkWith(n)) > ratio) { 107 | selectedElements += n 108 | } 109 | } 110 | } 111 | selectedElements 112 | } 113 | 114 | def selectNeighboursHVDM(l: Int): ArrayBuffer[Int] = { 115 | val selectedElements = new ArrayBuffer[Int]() 116 | val (label, nNeighbours, _) = nnRuleHVDM(dataToWorkWith, dataToWorkWith(l), l, classesToWorkWith, k, data.fileInfo.nominal, 117 | sds, attrCounter, attrClassesCounter, "nearest") 118 | 119 | if (label != classesToWorkWith(l)) { 120 | nNeighbours.foreach { n => 121 | val nNeighbourClass: Any = classesToWorkWith(n) 122 | if (nNeighbourClass != untouchableClass && counter(nNeighbourClass) > ratio) { 123 | selectedElements += n 124 | } 125 | } 126 | } 127 | selectedElements 128 | } 129 | 130 | var j = 0 131 | val indexA2 = new ArrayBuffer[Int](0) 132 | while (j < uniqueMajClasses.length) { 133 | val selectedNeighbours: Array[ArrayBuffer[Int]] = if (dist == Distance.EUCLIDEAN) { 134 | minorityIndex.par.map(l => selectNeighbours(l)).toArray 135 | } else { 136 | minorityIndex.par.map(l => selectNeighboursHVDM(l)).toArray 137 | } 138 | 139 | selectedNeighbours.flatten.distinct.foreach(e => indexA2 += e) 140 | j += 1 141 | } 142 | 143 | val finalIndex: Array[Int] = classesToWorkWith.indices.diff(indexA1.toList ++ indexA2.distinct).toArray 144 | val finishTime: Long = System.nanoTime() 145 | 146 | if (verbose) { 147 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 148 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 149 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 150 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 151 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 152 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 153 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 154 | } 155 | 156 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 157 | } 158 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/BC.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | 25 | /** Balance Cascade algorithm. Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, 26 | * Jianxin Wu and Zhi-Hua Zhou. 27 | * 28 | * @param data data to work with 29 | * @param seed seed to use. If it is not provided, it will use the system time 30 | * @param dist object of Distance enumeration representing the distance to be used 31 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours) 32 | * @param nMaxSubsets maximum number of subsets to generate 33 | * @param nFolds number of subsets to create when applying cross-validation 34 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1 so there 35 | * will be the same minority class examples as majority class examples. It will take 36 | * numMinorityInstances * ratio 37 | * @param normalize normalize the data or not 38 | * @param randomData iterate through the data randomly or not 39 | * @param verbose choose to display information about the execution or not 40 | * @author Néstor Rodríguez Vico 41 | */ 42 | class BC(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 43 | k: Int = 3, nMaxSubsets: Int = 5, nFolds: Int = 5, ratio: Double = 1.0, normalize: Boolean = false, 44 | randomData: Boolean = false, verbose: Boolean = false) { 45 | 46 | /** Compute the BC algorithm. 47 | * 48 | * @return undersampled data structure 49 | */ 50 | def compute(): Data = { 51 | val initTime: Long = System.nanoTime() 52 | 53 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 54 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 55 | val random: scala.util.Random = new scala.util.Random(seed) 56 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 57 | val classesToWorkWith: Array[Any] = if (randomData) { 58 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 59 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 60 | (randomIndex map data.y).toArray 61 | } else { 62 | data.y 63 | } 64 | 65 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 66 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 67 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 68 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 69 | } else { 70 | (null, null, null) 71 | } 72 | 73 | var search: Boolean = true 74 | var subsetsCounter: Int = 0 75 | val mask: Array[Boolean] = Array.fill(classesToWorkWith.length)(true) 76 | val subsets: ArrayBuffer[Array[Int]] = new ArrayBuffer[Array[Int]](0) 77 | val minorityElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 78 | val majorityElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 79 | 80 | while (search) { 81 | val indexToUnderSample: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 82 | val minorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 83 | val classesCounter: Map[Any, Int] = (boolToIndex(mask) map classesToWorkWith).groupBy(identity).mapValues(_.length) 84 | 85 | classesCounter.foreach { target: (Any, Int) => 86 | val indexClass: 
Array[Int] = classesToWorkWith.zipWithIndex.collect { case (c, i) if c == target._1 => i } 87 | if (target._1 != untouchableClass) { 88 | val sameClassBool: Array[Boolean] = mask.zipWithIndex.collect { case (c, i) if classesToWorkWith(i) == target._1 => c } 89 | val indexClassInterest: Array[Int] = boolToIndex(sameClassBool) map indexClass 90 | val indexTargetClass: List[Int] = random.shuffle((indexClassInterest map classesToWorkWith).indices.toList).take(counter(untouchableClass)) 91 | indexToUnderSample ++= (indexTargetClass map indexClassInterest) 92 | majorityElements ++= (indexTargetClass map indexClassInterest) 93 | } else { 94 | minorityIndex ++= indexClass 95 | minorityElements ++= indexClass 96 | } 97 | } 98 | 99 | subsetsCounter += 1 100 | val subset: Array[Int] = (indexToUnderSample ++ minorityIndex).toArray 101 | subsets += subset 102 | 103 | val classesToWorkWithSubset: Array[Any] = subset map classesToWorkWith 104 | val dataToWorkWithSubset: Array[Array[Double]] = subset map dataToWorkWith 105 | val prediction: Array[Any] = (if (dist == Distance.EUCLIDEAN) { 106 | kFoldPrediction(dataToWorkWithSubset, classesToWorkWithSubset, k, nFolds, "nearest") 107 | } else { 108 | kFoldPredictionHVDM(dataToWorkWithSubset, classesToWorkWithSubset, k, nFolds, data.fileInfo.nominal, sds, attrCounter, 109 | attrClassesCounter, "nearest") 110 | }).take(indexToUnderSample.length) 111 | 112 | val classifiedInstances: Array[Boolean] = ((indexToUnderSample.indices map classesToWorkWithSubset) 113 | zip prediction).map((e: (Any, Any)) => e._1 == e._2).toArray 114 | (boolToIndex(classifiedInstances) map indexToUnderSample).foreach((i: Int) => mask(i) = false) 115 | 116 | if (subsetsCounter == nMaxSubsets) search = false 117 | 118 | val finalTargetStats: Map[Any, Int] = (boolToIndex(mask) map classesToWorkWith).groupBy(identity).mapValues(_.length) 119 | classesToWorkWith.distinct.filter((c: Any) => c != untouchableClass).foreach { c: Any => 120 | if (finalTargetStats(c) < counter(untouchableClass)) search = false 121 | } 122 | } 123 | 124 | val majorityIndexHistogram: Array[(Int, Int)] = majorityElements.groupBy(identity).mapValues(_.length).toArray.sortBy(_._2).reverse 125 | val majorityIndex: Array[Int] = majorityIndexHistogram.take((minorityElements.distinct.length * ratio).toInt).map(_._1) 126 | val finalIndex: Array[Int] = minorityElements.distinct.toArray ++ majorityIndex 127 | val finishTime: Long = System.nanoTime() 128 | 129 | if (verbose) { 130 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 131 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 132 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 133 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 134 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 135 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 136 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 137 | } 138 | 139 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 140 | } 141 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/CNN.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 
3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.collection.mutable.ListBuffer 25 | 26 | /** Condensed Nearest Neighbor decision rule. Original paper: "The Condensed Nearest Neighbor Rule" by P. Hart. 27 | * 28 | * @param data data to work with 29 | * @param seed seed to use. If it is not provided, it will use the system time 30 | * @param dist object of Distance enumeration representing the distance to be used 31 | * @param normalize normalize the data or not 32 | * @param randomData iterate through the data randomly or not 33 | * @param verbose choose to display information about the execution or not 34 | * @author Néstor Rodríguez Vico 35 | */ 36 | class CNN(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 37 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 38 | 39 | /** Compute the CNN algorithm 40 | * 41 | * @return undersampled data structure 42 | */ 43 | def compute(): Data = { 44 | val initTime: Long = System.nanoTime() 45 | 46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 47 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 48 | val random: scala.util.Random = new scala.util.Random(seed) 49 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 50 | val classesToWorkWith: Array[Any] = if (randomData) { 51 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 52 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 53 | (randomIndex map data.y).toArray 54 | } else { 55 | data.y 56 | } 57 | 58 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 59 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 60 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 61 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 62 | } else { 63 | (null, null, null) 64 | } 65 | 66 | val finalIndex: Array[Int] = if (dist == Distance.HVDM) { 67 | // Indicate the corresponding group: 1 for store, 0 for unknown, -1 for grabbag 68 | val location: Array[Int] = List.fill(dataToWorkWith.length)(0).toArray 69 | // The first element is added to store 70 | location(0) = 1 71 | var changed = true 72 | 73 | // Iterate the data, x (except the first instance) 74 | dataToWorkWith.zipWithIndex.tail.foreach { element: (Array[Double], Int) => 75 | // and classify each element with the actual content of store 76 | val index: Array[Int] = location.zipWithIndex.collect { case (a, b) if a == 1 => b } 77 | val neighbours: Array[Array[Double]] = 
index map dataToWorkWith 78 | val classes: Array[Any] = index map classesToWorkWith 79 | val label: (Any, Array[Int], Array[Double]) = nnRuleHVDM(neighbours, element._1, -1, classes, 1, data.fileInfo.nominal, 80 | sds, attrCounter, attrClassesCounter, "nearest") 81 | 82 | // If it is misclassified or is an element of the untouchable class it is added to store; otherwise, it is added to grabbag 83 | location(element._2) = if (label._1 != classesToWorkWith(element._2)) 1 else -1 84 | } 85 | 86 | // After the first pass, iterate grabbag until it is exhausted, i.e. until: 87 | // 1. There is no element left in grabbag, or 88 | // 2. No element moved between grabbag and store during a full iteration 89 | while (location.count((z: Int) => z == -1) != 0 && changed) { 90 | changed = false 91 | // Now, instead of iterating x, we iterate grabbag 92 | location.zipWithIndex.filter((x: (Int, Int)) => x._1 == -1).foreach { element: (Int, Int) => 93 | val index: Array[Int] = location.zipWithIndex.collect { case (a, b) if a == 1 => b } 94 | val neighbours: Array[Array[Double]] = index map dataToWorkWith 95 | val classes: Array[Any] = index map classesToWorkWith 96 | val label: Any = nnRuleHVDM(neighbours, dataToWorkWith(element._2), -1, classes, 1, data.fileInfo.nominal, 97 | sds, attrCounter, attrClassesCounter, "nearest")._1 98 | // If it is misclassified or is an element of the untouchable class it is added to store; otherwise, it is added to grabbag 99 | location(element._2) = if (label != classesToWorkWith(element._2)) { 100 | changed = true 101 | 1 102 | } else -1 103 | } 104 | } 105 | 106 | location.zipWithIndex.filter((x: (Int, Int)) => x._1 == 1).collect { case (_, a) => a } 107 | } else { 108 | val store: KDTree = new KDTree(Array(dataToWorkWith(0)), Array(classesToWorkWith(0)), dataToWorkWith(0).length) 109 | var grabbag: ListBuffer[(Array[Double], Int)] = new ListBuffer[(Array[Double], Int)]() 110 | var newGrabbag: ListBuffer[(Array[Double], Int)] = new ListBuffer[(Array[Double], Int)]() 111 | 112 | // Iterate the data, x (except the first instance) 113 | dataToWorkWith.zipWithIndex.tail.foreach { instance: (Array[Double], Int) => 114 | val label = mode(store.nNeighbours(instance._1, k = 1, leaveOneOut = false)._2.toArray) 115 | if (label != classesToWorkWith(instance._2)) { 116 | store.addElement(instance._1, classesToWorkWith(instance._2)) 117 | } else { 118 | grabbag += instance 119 | } 120 | } 121 | 122 | var changed = true 123 | while (grabbag.nonEmpty && changed) { 124 | changed = false 125 | grabbag.foreach { instance => 126 | val label = mode(store.nNeighbours(instance._1, k = 1, leaveOneOut = false)._2.toArray) 127 | if (label != classesToWorkWith(instance._2)) { 128 | store.addElement(instance._1, classesToWorkWith(instance._2)) 129 | changed = true 130 | } else { 131 | newGrabbag += instance 132 | } 133 | } 134 | 135 | grabbag = newGrabbag 136 | newGrabbag = new ListBuffer[(Array[Double], Int)]() 137 | } 138 | 139 | store.kDTreeMap.values.unzip._2.toArray 140 | } 141 | 142 | val finishTime: Long = System.nanoTime() 143 | 144 | if (verbose) { 145 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 146 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 147 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 148 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 149 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 150 | println("NEW 
IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 151 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 152 | } 153 | 154 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/NM.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | import scala.util.Random 26 | 27 | /** NearMiss. Original paper: "kNN Approach to Unbalanced Data Distribution: A Case Study involving Information 28 | * Extraction" by Jianping Zhang and Inderjeet Mani. 29 | * 30 | * @param data data to work with 31 | * @param seed seed to use. If it is not provided, it will use the system time 32 | * @param dist object of Distance enumeration representing the distance to be used 33 | * @param version version of the core to execute 34 | * @param nNeighbours number of neighbours to take for each minority example (only used if version is set to 3) 35 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1 so there 36 | * will be the same minority class examples as majority class examples. It will take 37 | * numMinorityInstances * ratio 38 | * @param normalize normalize the data or not 39 | * @param randomData iterate through the data randomly or not 40 | * @param verbose choose to display information about the execution or not 41 | * @author Néstor Rodríguez Vico 42 | */ 43 | class NM(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, version: Int = 1, 44 | nNeighbours: Int = 3, ratio: Double = 1.0, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 45 | 46 | /** Compute the NM algorithm. 
47 | * 48 | * @return undersampled data structure 49 | */ 50 | def compute(): Data = { 51 | val initTime: Long = System.nanoTime() 52 | 53 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 54 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 55 | val random: scala.util.Random = new scala.util.Random(seed) 56 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 57 | val classesToWorkWith: Array[Any] = if (randomData) { 58 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 59 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 60 | (randomIndex map data.y).toArray 61 | } else { 62 | data.y 63 | } 64 | 65 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 66 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 67 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 68 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 69 | } else { 70 | (null, null, null) 71 | } 72 | 73 | val majElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 74 | val minElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 75 | classesToWorkWith.zipWithIndex.foreach(i => if (i._1 == untouchableClass) minElements += i._2 else majElements += i._2) 76 | val minNeighbours: Array[Array[Double]] = minElements.toArray map dataToWorkWith 77 | val majNeighbours: Array[Array[Double]] = majElements.toArray map dataToWorkWith 78 | val minClasses: Array[Any] = minElements.toArray map classesToWorkWith 79 | val majClasses: Array[Any] = majElements.toArray map classesToWorkWith 80 | 81 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 82 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length)) 83 | } else { 84 | None 85 | } 86 | 87 | val majorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 88 | Some(new KDTree(majNeighbours, majClasses, dataToWorkWith(0).length)) 89 | } else { 90 | None 91 | } 92 | 93 | val reverseKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 94 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length, which = "farthest")) 95 | } else { 96 | None 97 | } 98 | 99 | val selectedMajElements: Array[Int] = if (version == 1) { 100 | majElements.map { i: Int => 101 | if (dist == Distance.EUCLIDEAN) { 102 | val index = KDTree.get.nNeighbours(dataToWorkWith(i), 3)._3 103 | (i, index.map(j => euclidean(dataToWorkWith(i), dataToWorkWith(j))).sum / index.length) 104 | } else { 105 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i), -1, minClasses, 3, 106 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 107 | (i, (result._2 map result._3).sum / result._2.length) 108 | } 109 | }.toArray.sortBy(_._2).map(_._1) 110 | } else if (version == 2) { 111 | majElements.map { i: Int => 112 | if (dist == Distance.EUCLIDEAN) { 113 | val index = reverseKDTree.get.nNeighbours(dataToWorkWith(i), 3)._3 114 | (i, index.map(j => euclidean(dataToWorkWith(i), dataToWorkWith(j))).sum / index.length) 115 | } else { 116 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i), -1, minClasses, 3, 117 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "farthest") 118 | (i, (result._2 map result._3).sum / result._2.length) 119 | } 120 | 
}.toArray.sortBy(_._2).map(_._1) 121 | } else if (version == 3) { 122 | // We shuffle the data because, in the end, we are going to take at least minElements.length * ratio elements and, if 123 | // we didn't shuffle, we would only take majority class examples that are near the first minority class examples 124 | new Random(seed).shuffle(minElements.flatMap { i: Int => 125 | if (dist == Distance.EUCLIDEAN) { 126 | majorityKDTree.get.nNeighbours(dataToWorkWith(i), nNeighbours)._3 127 | } else { 128 | nnRuleHVDM(majNeighbours, dataToWorkWith(i), -1, majClasses, nNeighbours, data.fileInfo.nominal, sds, attrCounter, 129 | attrClassesCounter, "nearest")._2 130 | } 131 | }.distinct.toList).toArray 132 | } else { 133 | throw new Exception("Invalid argument: version should be: 1, 2 or 3") 134 | } 135 | 136 | val finalIndex: Array[Int] = minElements.toArray ++ selectedMajElements.take((minElements.length * ratio).toInt) 137 | val finishTime: Long = System.nanoTime() 138 | 139 | if (verbose) { 140 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 141 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 142 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 143 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 144 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 145 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 146 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 147 | } 148 | 149 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SOUL 2 | 3 | ### Scala Oversampling and Undersampling Library 4 | 5 | Included algorithms for oversampling: 6 | 7 | * **Random Oversampling.** Original paper: "A study of the behavior of several methods for balancing machine learning training data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 8 | 9 | * **SMOTE.** Original paper: "SMOTE: Synthetic Minority Over-sampling Technique" by Nitesh V. Chawla, Kevin W. Bowyer, Lawrence O. Hall and W. Philip Kegelmeyer. 10 | 11 | * **SMOTE + ENN.** Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 12 | 13 | * **SMOTE + TL.** Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 14 | 15 | * **Borderline-SMOTE.** Original paper: "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets Learning." by Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. 16 | 17 | * **ADASYN.** Original paper: "ADASYN: Adaptive Synthetic Sampling Approach for Imbalanced Learning" by Haibo He, Yang Bai, Edwardo A. Garcia, and Shutao Li. 18 | 19 | * **ADOMS.** Original paper: "The Generation Mechanism of Synthetic Minority Class Examples" by Sheng TANG and Si-ping CHEN. 
20 | 21 | * **SafeLevel-SMOTE.** Original paper: "Safe-Level-SMOTE: Safe-Level-Synthetic Minority Over-Sampling TEchnique for Handling the Class Imbalanced Problem" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran, and Chidchanok Lursinsap. 22 | 23 | * **Spider2.** Original paper: "Learning from Imbalanced Data in Presence of Noisy and Borderline Examples" by Krystyna Napierała, Jerzy Stefanowski and Szymon Wilk. 24 | 25 | * **DBSMOTE.** Original paper: "DBSMOTE: Density-Based Synthetic Minority Over-sampling Technique" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran and Chidchanok Lursinsap. 26 | 27 | * **SMOTE-RSB.** Original paper: "SMOTE-RSB*: a hybrid preprocessing approach based on oversampling and undersampling for high imbalanced data-sets using SMOTE and rough sets theory" by Enislay Ramentol, Yailé Caballero, Rafael Bello and Francisco Herrera. 28 | 29 | * **MWMOTE.** Original paper: "MWMOTE—Majority Weighted Minority Oversampling Technique for Imbalanced Data Set Learning" by Sukarna Barua, Md. Monirul Islam, Xin Yao and Kazuyuki Murase. 30 | 31 | * **MDO.** Original paper: "To combat multi-class imbalanced problems by means of over-sampling and boosting techniques" by Lida Abdi and Sattar Hashemi. 32 | 33 | Included algorithms for undersampling: 34 | 35 | * **Random Undersampling.** Original paper: "A study of the behavior of several methods for balancing machine learning training data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 36 | 37 | * **Condensed Nearest Neighbor decision rule.** Original paper: "The Condensed Nearest Neighbor Rule" by P. Hart. 38 | 39 | * **Edited Nearest Neighbour rule.** Original paper: "Asymptotic Properties of Nearest Neighbor Rules Using Edited Data" by Dennis L. Wilson. 40 | 41 | * **Tomek Link.** Original paper: "Two Modifications of CNN" by Ivan Tomek. 42 | 43 | * **One-Sided Selection.** Original paper: "Addressing the Curse of Imbalanced Training Sets: One-Sided Selection" by Miroslav Kubat and Stan Matwin. 44 | 45 | * **Neighbourhood Cleaning Rule.** Original paper: "Improving Identification of Difficult Small Classes by Balancing Class Distribution" by J. Laurikkala. 46 | 47 | * **NearMiss.** Original paper: "kNN Approach to Unbalanced Data Distribution: A Case Study involving Information Extraction" by Jianping Zhang and Inderjeet Mani. 48 | 49 | * **Class Purity Maximization algorithm.** Original paper: "An Unsupervised Learning Approach to Resolving the Data Imbalanced Issue in Supervised Learning Problems in Functional Genomics" by Kihoon Yoon and Stephen Kwek. 50 | 51 | * **Undersampling Based on Clustering.** Original paper: "Under-Sampling Approaches for Improving Prediction of the Minority Class in an Imbalanced Dataset" by Show-Jane Yen and Yue-Shi Lee. 52 | 53 | * **Balance Cascade.** Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, Jianxin Wu and Zhi-Hua Zhou. 54 | 55 | * **Easy Ensemble.** Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, Jianxin Wu and Zhi-Hua Zhou. 56 | 57 | * **Evolutionary Undersampling.** Original paper: "Evolutionary Under-Sampling for Classification with Imbalanced Data Sets: Proposals and Taxonomy" by Salvador Garcia and Francisco Herrera. 58 | 59 | * **Instance Hardness Threshold.** Original paper: "An Empirical Study of Instance Hardness" by Michael R. Smith, Tony Martinez and Christophe Giraud-Carrier. 60 | 61 | * **ClusterOSS.** Original paper: "ClusterOSS: a new undersampling method for imbalanced learning." by Victor H Barella, Eduardo P Costa and André C. P. L. F. Carvalho. 62 | 63 | * **Iterative Instance Adjustment for Imbalanced Domains.** Original paper: "Addressing imbalanced classification with instance generation techniques: IPADE-ID" by Victoria López, Isaac Triguero, Cristóbal J. Carmona, Salvador García and Francisco Herrera. 64 | 65 | ### How-to use it 66 | 67 | If you are going to use this library from another `sbt` project, you just need to clone the original repository, execute `sbt publishLocal` in the root folder of the cloned repository and add the following dependency to the `build.sbt` file of your project: 68 | 69 | ```scala 70 | libraryDependencies += "com.github.soul" %% "soul" % "1.0.0" 71 | ``` 72 | 73 | To read a data file you only need to do this: 74 | 75 | ```scala 76 | import soul.io.Reader 77 | import soul.data.Data 78 | 79 | /* Read a csv file or any delimited text file */ 80 | val csvData: Data = Reader.readDelimitedText(file = "path/to/file.csv") 81 | /* Read a WEKA arff file */ 82 | val arffData: Data = Reader.readArff(file = "path/to/file.arff") 83 | ``` 84 | 85 | Now we're going to run an undersampling algorithm: 86 | 87 | ```scala 88 | import soul.algorithm.undersampling.NCL 89 | import soul.data.Data 90 | 91 | val nclCSV = new NCL(csvData) 92 | val resultCSV: Data = nclCSV.compute() 93 | 94 | val nclARFF = new NCL(arffData) 95 | val resultARFF: Data = nclARFF.compute() 96 | ``` 97 | 98 | In this example we've used an undersampling algorithm, but the workflow is the same for an oversampling one, as shown below. All of the algorithms' parameters have default values, so you don't need to specify any of them. 99 | 
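For instance, running SMOTE instead looks like this (a minimal sketch: we assume here that `SMOTE` exposes the same default-valued parameters listed for it in the Experiments section below):

```scala
import soul.algorithm.oversampling.SMOTE
import soul.data.Data

// all parameters have defaults; e.g. new SMOTE(csvData, seed = 0, percent = 500, k = 5) also works
val smoteCSV = new SMOTE(csvData)
val oversampledCSV: Data = smoteCSV.compute()
```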
100 | Finally, we only need to save the result to a file: 101 | 102 | ```scala 103 | import soul.io.Writer 104 | 105 | Writer.writeDelimitedText(file = "path/to/output.csv", data = resultCSV) 106 | Writer.writeArff(file = "path/to/output.arff", data = resultARFF) 107 | ``` 108 | 109 | ### Experiments 110 | 111 | With the objective of showing the capabilities of **SOUL**, we have generated a two dimension synthetic imbalanced dataset with 1,871 instances. Among them, 1,600 instances belong to the majority class and the remaining 271 belong to the minority class, leading to about 14.5% of minority instances in the whole dataset (IR=5.9). The representation of this dataset can be found below, where we may observe a clear overlapping between the classes, as well as a cluster of minority instances in the middle of the majority instances. 112 | 113 | Next, we have used the following parameters of the algorithms to perform an experiment with some relevant oversampling and undersampling approaches (a code sketch follows the list): 114 | 115 | 116 | * **MWMOTE**: *seed*: 0, *N*: 1400, *k1*: 5, *k2*: 5, *k3*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false. 117 | 118 | * **SMOTE**: *seed*: 0, *percent*: 500, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false. 119 | 120 | * **ADASYN**: *seed*: 0, *d*: 1, *B*: 1, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false. 121 | 122 | * **SafeLevelSMOTE**: *seed*: 0, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false. 123 | 124 | * **IHTS**: *seed*: 0, *nFolds*: 5, *normalize*: false, *randomData*: false, *verbose*: false. 125 | 126 | * **IPADE**: *seed*: 0, *iterations*: 100, *strategy*: 1, *randomChoice*: true, *normalize*: false, *randomData*: false, *verbose*: false. 127 | 128 | * **NCL**: *seed*: 0, *dist*: euclidean, *k*: 3, *threshold*: 0.5, *normalize*: false, *randomData*: false, *verbose*: false. 129 | 130 | * **SBC**: *seed*: 0, *method*: "NearMiss1", *m*: 1.0, *k*: 3, *numClusters*: 50, *restarts*: 1, *minDispersion*: 0.0001, *maxIterations*: 200, *dist*: euclidean, *normalize*: false, *randomData*: false, *verbose*: false. 131 | 132 | 
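As a rough sketch of how these configurations translate into code (reusing the `csvData` object from the previous section; the constructors are the ones defined in this repository, and every parameter not shown keeps its default):

```scala
import soul.algorithm.oversampling.MWMOTE
import soul.algorithm.undersampling.NCL
import soul.util.Utilities.Distance

// parameters as in the experiment above
val mwmote = new MWMOTE(csvData, seed = 0, N = 1400, k1 = 5, k2 = 5, k3 = 5, dist = Distance.EUCLIDEAN)
val ncl = new NCL(csvData, seed = 0, dist = Distance.EUCLIDEAN, k = 3, threshold = 0.5)

val oversampled = mwmote.compute()
val undersampled = ncl.compute()
```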
133 | ![Original](images/original.png) 134 | 135 | 136 | | ![ADASYN](images/ADASYN.png) | ![SafeLevelSMOTE](images/SafeLevelSMOTE.png) | 137 | | ------------- | ------------- | 138 | | ![MWMOTE](images/MWMOTE.png) | ![SMOTE](images/SMOTE.png) | 139 | 140 | 141 | | ![IHTS](images/IHTS.png) | ![IPADE](images/IPADE.png) | 142 | | ------------- | ------------- | 143 | | ![NCL](images/NCL.png) | ![SBC](images/SBC.png) | 144 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/Spider2.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | 25 | /** Spider2 algorithm. Original paper: "Learning from Imbalanced Data in Presence of Noisy and Borderline Examples" by 26 | * Krystyna Napierała, Jerzy Stefanowski and Szymon Wilk. 27 | * 28 | * @param data data to work with 29 | * @param seed seed to use. 
If it is not provided, it will use the system time 30 | * @param relabel relabeling option 31 | * @param ampl amplification option 32 | * @param k number of minority class nearest neighbors 33 | * @param dist object of Distance enumeration representing the distance to be used 34 | * @param normalize normalize the data or not 35 | * @param verbose choose to display information about the execution or not 36 | * @author David López Pretel 37 | */ 38 | class Spider2(data: Data, seed: Long = System.currentTimeMillis(), relabel: String = "yes", ampl: String = "weak", k: Int = 5, 39 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 40 | 41 | /** Compute the Spider2 algorithm 42 | * 43 | * @return synthetic samples generated 44 | */ 45 | def compute(): Data = { 46 | val initTime: Long = System.nanoTime() 47 | 48 | if (relabel != "no" && relabel != "yes") { 49 | throw new Exception("relabel must be yes or no.") 50 | } 51 | 52 | if (ampl != "weak" && ampl != "strong" && ampl != "no") { 53 | throw new Exception("amplification must be weak or strong or no.") 54 | } 55 | 56 | var minorityClassIndex: Array[Int] = minority(data.y) 57 | val minorityClass: Any = data.y(minorityClassIndex(0)) 58 | var majorityClassIndex: Array[Int] = data.processedData.indices.diff(minorityClassIndex.toList).toArray 59 | val output: ArrayBuffer[Array[Double]] = ArrayBuffer() 60 | var resultClasses: Array[Any] = new Array[Any](0) 61 | 62 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 63 | 64 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 65 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 66 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 67 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 68 | } else { 69 | (null, null, null) 70 | } 71 | 72 | def flagged(c: Array[Int], f: Array[Boolean]): Array[Int] = { 73 | c.map(classes => { 74 | if (!f(classes)) Some(classes) else None 75 | }).filterNot(_.forall(_ == None)).map(_.get) 76 | } 77 | 78 | def amplify(x: Int, k: Int): Unit = { 79 | // compute the neighborhood for the majority and minority class 80 | val majNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 81 | kNeighbors(majorityClassIndex map output, output(x), k) 82 | } else { 83 | kNeighborsHVDM(majorityClassIndex map output, output(x), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 84 | } 85 | 86 | val minNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 87 | kNeighbors(minorityClassIndex map output, output(x), k) 88 | } else { 89 | kNeighborsHVDM(minorityClassIndex map output, output(x), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 90 | } 91 | 92 | // compute the number of copies to create 93 | val S: Int = Math.abs(majNeighbors.length - minNeighbors.length) + 1 94 | // need to know the size of the output to save the randomIndex of the elements inserted 95 | val outputSize: Int = output.length 96 | (0 until S).foreach(_ => { 97 | output ++= Traversable(output(x)) 98 | }) 99 | // add n copies to the output 100 | if (resultClasses(x) == minorityClass) { 101 | minorityClassIndex = minorityClassIndex ++ (outputSize until outputSize + S) 102 | } else { 103 | majorityClassIndex = majorityClassIndex ++ (outputSize until outputSize + S) 104 | } 105 | resultClasses = resultClasses ++ 
Array.fill(S)(resultClasses(x)) 106 | } 107 | 108 | def correct(x: Int, k: Int, out: Boolean): Boolean = { 109 | // compute the neighbors 110 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 111 | kNeighbors(if (out) samples else output.toArray, if (out) samples(x) else output(x), k) 112 | } else { 113 | kNeighborsHVDM(if (out) samples else output.toArray, if (out) samples(x) else output(x), k, 114 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 115 | } 116 | val classes: scala.collection.mutable.Map[Any, Int] = scala.collection.mutable.Map() 117 | // compute the number of samples for each class in the neighborhood 118 | neighbors.foreach(neighbor => classes += data.y(neighbor) -> 0) 119 | neighbors.foreach(neighbor => classes(data.y(neighbor)) += 1) 120 | 121 | // return true if the most frequent class in the neighborhood matches the class of the sample 122 | if (classes.reduceLeft((x: (Any, Int), y: (Any, Int)) => if (x._2 > y._2) x else y)._1 == data.y(x)) 123 | true 124 | else 125 | false 126 | } 127 | 128 | // array with the index of each sample 129 | var DS: Array[Int] = Array.range(0, samples.length) 130 | // at the beginning there are no safe samples 131 | var safeSamples: Array[Boolean] = Array.fill(samples.length)(false) 132 | 133 | // for each sample in the majority class, check if its neighbors have the same class 134 | majorityClassIndex.foreach(index => if (correct(index, k, out = true)) safeSamples(index) = true) 135 | 136 | // return a subset of samples that are not safe and belong to the majority class 137 | val RS: Array[Int] = flagged(majorityClassIndex, safeSamples) 138 | if (relabel == "yes") { 139 | // add the RS samples to the minority set 140 | minorityClassIndex = minorityClassIndex ++ RS 141 | resultClasses = data.y 142 | RS.foreach(resultClasses(_) = minorityClass) 143 | } else { 144 | // eliminate the samples from the initial set; first we recalculate the index for the minority and majority classes 145 | var newIndex: Int = 0 146 | minorityClassIndex = minorityClassIndex.map(minor => { 147 | newIndex = minor 148 | RS.foreach(index => if (index < minor) newIndex -= 1) 149 | newIndex 150 | }) 151 | majorityClassIndex = majorityClassIndex.map(major => { 152 | newIndex = major 153 | RS.foreach(index => if (index < major) newIndex -= 1) 154 | newIndex 155 | }) 156 | DS = DS.diff(RS) 157 | safeSamples = DS map safeSamples 158 | resultClasses = DS map data.y 159 | } 160 | 161 | // the output starts as DS, even if ampl is neither weak nor strong 162 | output ++= (DS map samples) 163 | 164 | // if the neighbors of each sample in the minority class belong to it, flag it as safe 165 | minorityClassIndex.foreach(index => if (correct(index, k, out = false)) safeSamples(index) = true) 166 | if (ampl == "weak") { 167 | // for each sample returned by flagged, amplify the data creating n copies (n calculated in amplify) 168 | flagged(minorityClassIndex, safeSamples).foreach(amplify(_, k)) 169 | } else if (ampl == "strong") { 170 | // if the sample is correctly classified amplify with k, else amplify with k + 2 (k is not n) 171 | flagged(minorityClassIndex, safeSamples).foreach(x => { 172 | if (correct(x, k + 2, out = false)) amplify(x, k) else amplify(x, k + 2) 173 | }) 174 | } 175 | 176 | val finishTime: Long = System.nanoTime() 177 | 178 | if (verbose) { 179 | println("ORIGINAL SIZE: %d".format(data.x.length)) 180 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 181 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 182 | } 183 | 184 | new Data(if 
(data.fileInfo.nominal.length == 0) { 185 | to2Decimals(if (normalize) zeroOneDenormalization(output.toArray, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output.toArray) 186 | } else { 187 | toNominal(if (normalize) zeroOneDenormalization(output.toArray, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output.toArray, data.nomToNum) 188 | }, resultClasses, None, data.fileInfo) 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/MWMOTE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | import scala.util.Random 25 | 26 | /** MWMOTE algorithm. Original paper: "MWMOTE—Majority Weighted Minority Oversampling Technique for Imbalanced Data Set 27 | * Learning" by Sukarna Barua, Md. Monirul Islam, Xin Yao and Kazuyuki Murase. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. 
If it is not provided, it will use the system time 31 | * @param N number of synthetic samples to be generated 32 | * @param k1 number of neighbors used for predicting noisy minority class samples 33 | * @param k2 number of majority neighbors used for constructing informative minority set 34 | * @param k3 number of minority neighbors used for constructing informative minority set 35 | * @param dist object of Distance enumeration representing the distance to be used 36 | * @param normalize normalize the data or not 37 | * @param verbose choose to display information about the execution or not 38 | * @author David López Pretel 39 | */ 40 | class MWMOTE(data: Data, seed: Long = System.currentTimeMillis(), N: Int = 500, k1: Int = 5, k2: Int = 5, k3: Int = 5, 41 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 42 | 43 | /** Compute the MWMOTE algorithm 44 | * 45 | * @return synthetic samples generated 46 | */ 47 | def compute(): Data = { 48 | val initTime: Long = System.nanoTime() 49 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 50 | 51 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 52 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 53 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 54 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 55 | } else { 56 | (null, null, null) 57 | } 58 | 59 | def f(value: Double, cut: Double): Double = { 60 | if (value < cut) value else cut 61 | } 62 | 63 | def Cf(y: (Int, Int), x: Int, Nmin: Array[Array[Int]]): Double = { 64 | val cut: Double = 5 // values used in the paper 65 | val CMAX: Double = 2 66 | 67 | if (!Nmin(y._2).contains(x)) { 68 | val D: Double = if (dist == Distance.EUCLIDEAN) { 69 | euclidean(samples(y._1), samples(x)) 70 | } else { 71 | HVDM(samples(y._1), samples(x), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 72 | } 73 | f(samples(0).length / D, cut) * CMAX 74 | } else 75 | 0.0 76 | } 77 | 78 | def Iw(y: (Int, Int), x: Int, Nmin: Array[Array[Int]], Simin: Array[Int]): Double = { 79 | val cf = Cf(y, x, Nmin) 80 | val df = cf / Simin.map(Cf(y, _, Nmin)).sum 81 | cf + df 82 | } 83 | 84 | def clusterDistance(cluster1: Array[Int], cluster2: Array[Int]): Double = { 85 | val centroid1: Array[Double] = (cluster1 map samples).transpose.map(_.sum / cluster1.length) 86 | val centroid2: Array[Double] = (cluster2 map samples).transpose.map(_.sum / cluster2.length) 87 | 88 | if (dist == Distance.EUCLIDEAN) { 89 | euclidean(centroid1, centroid2) 90 | } else { 91 | HVDM(centroid1, centroid2, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 92 | } 93 | } 94 | 95 | def minDistance(cluster: ArrayBuffer[ArrayBuffer[Int]]): (Int, Int, Double) = { 96 | var minDist: (Int, Int, Double) = (0, 0, 99999999) 97 | var i, j: Int = 0 98 | while (i < cluster.length) { 99 | j = 0 100 | while (j < cluster.length) { 101 | if (i != j) { 102 | val dist = clusterDistance(cluster(i).toArray, cluster(j).toArray) 103 | if (dist < minDist._3) minDist = (i, j, dist) 104 | } 105 | j += 1 106 | } 107 | i += 1 108 | } 109 | minDist 110 | } 111 | 112 | def cluster(Sminf: Array[Int]): Array[Array[Int]] = { 113 | val distances: Array[Array[Double]] = Array.fill(Sminf.length, Sminf.length)(9999999.0) 114 | var i, j: Int = 0 115 | while (i < Sminf.length) { 116 | j = 0 117 | while (j < 
Sminf.length) { 118 | if (i != j) { 119 | distances(i)(j) = if (dist == Distance.EUCLIDEAN) { 120 | euclidean(samples(Sminf(i)), samples(Sminf(j))) 121 | } else { 122 | HVDM(samples(Sminf(i)), samples(Sminf(j)), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 123 | } 124 | } 125 | j += 1 126 | } 127 | i += 1 128 | } 129 | 130 | val Cp: Double = 3 // used in paper 131 | val Th: Double = distances.map(_.min).sum / Sminf.length * Cp 132 | var minDist: (Int, Int, Double) = (0, 0, 0.0) 133 | val clusters: ArrayBuffer[ArrayBuffer[Int]] = Sminf.map(ArrayBuffer(_)).to[ArrayBuffer] 134 | while (minDist._3 < Th) { 135 | //compute the min distance between each cluster 136 | minDist = minDistance(clusters) 137 | //merge the two more proximal clusters 138 | clusters(minDist._1) ++= clusters(minDist._2) 139 | clusters -= clusters(minDist._2) 140 | } 141 | 142 | clusters.map(_.toArray).toArray 143 | } 144 | 145 | // compute minority class 146 | val minorityClassIndex: Array[Int] = minority(data.y) 147 | val minorityClass: Any = data.y(minorityClassIndex(0)) 148 | // compute majority class 149 | val majorityClassIndex: Array[Int] = samples.indices.par.diff(minorityClassIndex.toList).toArray 150 | 151 | // construct the filtered minority set 152 | val Sminf: Array[Int] = minorityClassIndex.par.map(index => { 153 | val neighbors = if (dist == Distance.EUCLIDEAN) { 154 | kNeighbors(samples, index, k1) 155 | } else { 156 | kNeighborsHVDM(samples, index, k1, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 157 | } 158 | if (neighbors map data.y contains data.y(minorityClassIndex(0))) { 159 | Some(index) 160 | } else { 161 | None 162 | } 163 | }).filterNot(_.forall(_ == None)).map(_.get).toArray 164 | 165 | //for each sample in Sminf compute the nearest majority set 166 | val Sbmaj: Array[Int] = Sminf.par.flatMap { x => 167 | if (dist == Distance.EUCLIDEAN) { 168 | kNeighbors(majorityClassIndex map samples, samples(x), k2) 169 | } else { 170 | kNeighborsHVDM(majorityClassIndex map samples, samples(x), k2, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 171 | } 172 | }.distinct.par.map(majorityClassIndex(_)).toArray 173 | 174 | // for each majority example in Sbmaj , compute the nearest minority set 175 | val Nmin: Array[Array[Int]] = Sbmaj.par.map { x => 176 | (if (dist == Distance.EUCLIDEAN) { 177 | kNeighbors(minorityClassIndex map samples, samples(x), k3) 178 | } else { 179 | kNeighborsHVDM(minorityClassIndex map samples, samples(x), k3, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 180 | }).par.map(minorityClassIndex(_)).toArray 181 | }.toArray 182 | 183 | // find the informative minority set (union of all Nmin) 184 | val Simin: Array[Int] = Nmin.par.flatten.distinct.toArray 185 | // for each sample in Simin compute the selection weight 186 | val Sw: Array[Double] = Simin.par.map(x => Sbmaj.zipWithIndex.par.map(y => Iw(y, x, Nmin, Simin)).sum).toArray 187 | val sumSw: Double = Sw.sum 188 | // convert each Sw into probability 189 | val Sp: Array[(Double, Int)] = Sw.par.map(_ / sumSw).toArray.zip(Simin).sortWith(_._1 > _._1) 190 | 191 | // compute the clusters 192 | val clusters: Array[Array[Int]] = cluster(minorityClassIndex) // cluster => index to processedData 193 | val clustersIndex: Map[Int, Int] = clusters.zipWithIndex.flatMap(c => { 194 | clusters(c._2).map(index => (index, c._2)) 195 | }).toMap // index to processedData => cluster 196 | 197 | //output data 198 | val output: Array[Array[Double]] = Array.ofDim(N, samples(0).length) 199 | 200 | val 
probsSum: Double = Sp.map(_._1).sum 201 | val r: Random = new Random(seed) 202 | 203 | (0 until N).par.foreach(i => { 204 | // select a sample, then select another one randomly from the cluster that contains it 205 | val x = chooseByProb(Sp, probsSum, r) 206 | val y = clusters(clustersIndex(x))(r.nextInt(clusters(clustersIndex(x)).length)) 207 | // compute the attributes of the synthetic sample 208 | samples(0).indices.foreach(atrib => { 209 | val diff: Double = samples(y)(atrib) - samples(x)(atrib) 210 | val gap: Float = r.nextFloat 211 | output(i)(atrib) = samples(x)(atrib) + gap * diff 212 | }) 213 | }) 214 | 215 | val finishTime: Long = System.nanoTime() 216 | 217 | if (verbose) { 218 | println("ORIGINAL SIZE: %d".format(data.x.length)) 219 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 220 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 221 | } 222 | 223 | new Data(if (data.fileInfo.nominal.length == 0) { 224 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output)) 225 | } else { 226 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output), data.nomToNum) 227 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /src/main/scala/soul/io/Reader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.io 18 | 19 | import java.io.{BufferedReader, FileInputStream, InputStreamReader} 20 | import java.text.ParseException 21 | 22 | import soul.data.{Data, FileInfo} 23 | import soul.util.Utilities.processData 24 | 25 | import scala.collection.mutable 26 | import scala.collection.mutable.ArrayBuffer 27 | 28 | /** Class to read data files 29 | * 30 | * @author Néstor Rodríguez Vico 31 | */ 32 | object Reader { 33 | /** Parse an ARFF file 34 | * 35 | * @param file file containing the data 36 | * @param columnClass indicates which column represents the class in the file. 
If it's set to -1, it will take the last column 37 | * @return a data object containing all the relevant information 38 | */ 39 | def readArff(file: String, columnClass: Int = -1): Data = { 40 | val reader: BufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(file))) 41 | var line: String = reader.readLine() 42 | var relationName: String = "" 43 | // index -> attributeName 44 | val attributes: mutable.Map[Int, String] = collection.mutable.Map[Int, String]() 45 | // attributeName -> type (if it's nominal, possible values instead of type) 46 | val attributesValues: mutable.Map[String, String] = collection.mutable.Map[String, String]() 47 | 48 | var dataDetected: Boolean = false 49 | var counter: Int = 0 50 | 51 | while (line != null && !dataDetected) { 52 | // ignore comments/description lines 53 | if (line.isEmpty || line.startsWith("%")) { 54 | line = reader.readLine 55 | } else { 56 | // take care if the relation name has commas, tabs, multiple spaces... 57 | val parts: Array[String] = line.replaceAll("\t", " ").replaceAll("\\s{2,}", " ").split(" ", 3) 58 | if (parts(0).equalsIgnoreCase("@relation")) { 59 | // drop the identifier and group all the possible parts separated by a space 60 | relationName = parts.drop(1).mkString(" ") 61 | } else if (parts(0).equalsIgnoreCase("@attribute")) { 62 | attributes += (counter -> parts(1)) 63 | attributesValues += (parts(1) -> parts(2)) 64 | counter += 1 65 | } else if (parts(0).equalsIgnoreCase("@data")) { 66 | dataDetected = true 67 | } 68 | 69 | line = reader.readLine 70 | } 71 | } 72 | 73 | if (columnClass >= attributes.size) 74 | throw new ParseException("Invalid response variable index: " + columnClass, columnClass) 75 | 76 | val response: Int = if (columnClass == -1) attributes.size - 1 else columnClass 77 | val readData: ArrayBuffer[Array[String]] = new ArrayBuffer[Array[String]](0) 78 | 79 | // Now we have the attributes, let's save the data 80 | while (line != null) { 81 | if (line.isEmpty || line.startsWith("%")) { 82 | line = reader.readLine 83 | } else { 84 | val parts: Array[String] = line.replaceAll("\t", " ").replaceAll("\\s{2,}", " ").split(",") 85 | // there are no quotation marks 86 | if (parts.length == attributes.size) { 87 | readData += parts 88 | } else { 89 | // if there are quotation marks, they are going to be in pairs 90 | val subParts: Array[Array[Int]] = parts.zipWithIndex.filter((x: (String, Int)) => x._1.contains("\"")).collect { case (_, a) => a }.grouped(2).toArray 91 | // separators indicates the indices of the elements that need to be merged into one value 92 | val separators = new ArrayBuffer[Array[Int]](0) 93 | for (quotationMarks <- subParts) 94 | separators += (quotationMarks(0) to quotationMarks(1)).toArray 95 | 96 | val separatedValues: ArrayBuffer[String] = new ArrayBuffer[String]() 97 | // append all the parts into one value 98 | for (pair <- subParts) 99 | separatedValues += ((pair(0) to pair(1)).toArray map parts).mkString(",") 100 | 101 | val nonSeparatedValuesIndex: Array[Int] = parts.indices.diff(separators.flatten.toList).toArray 102 | val nonSeparatedValues: Array[String] = nonSeparatedValuesIndex map parts 103 | // append all the data 104 | val values: Array[String] = (separatedValues ++ nonSeparatedValues).toArray 105 | // make an index array merging all the indices: take care with the separators because there is more than one 106 | // index for each value, so we compute the mean of all the numbers associated to one value 107 | val index: 
Array[Double] = separators.map((a: Array[Int]) => a.sum.toDouble / a.length).toArray ++ nonSeparatedValuesIndex.map(_.toDouble) 108 | // finally, construct an array to sort the values 109 | val indexForMap: Array[Int] = index.zipWithIndex.sortBy((pair: (Double, Int)) => pair._1).map((pair: (Double, Int)) => pair._2) 110 | // get the final values 111 | val finalValues: Array[String] = indexForMap map values 112 | if (finalValues.length != attributes.size) 113 | throw new ParseException("%d columns, expected %d".format(finalValues.length, attributes.size), finalValues.length) 114 | 115 | readData += finalValues 116 | } 117 | line = reader.readLine 118 | } 119 | } 120 | 121 | val finalData: ArrayBuffer[Array[Any]] = new ArrayBuffer[Array[Any]](0) 122 | val readClasses: ArrayBuffer[Any] = new ArrayBuffer[Any](0) 123 | val readNominal: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 124 | 125 | for (row <- readData) { 126 | val r = new ArrayBuffer[Any](0) 127 | for (e <- row.zipWithIndex) { 128 | if (e._2 == response) 129 | readClasses += e._1 130 | else if (e._1.matches("-?\\d+(\\.\\d+)?")) 131 | r += e._1.toDouble 132 | else { 133 | if (e._1 == "?" || e._1 == "'?'") 134 | r += "soul_NA" 135 | else { 136 | r += e._1 137 | readNominal += (if (e._2 >= response) e._2 - 1 else e._2) 138 | } 139 | } 140 | } 141 | 142 | finalData += r.toArray 143 | } 144 | 145 | val fileInfo = new FileInfo(_file = file, _comment = "%", _columnClass = response, _delimiter = null, _missing = "?", _header = null, 146 | _relationName = relationName, _attributes = attributes, _attributesValues = attributesValues, nominal = readNominal.distinct.toArray) 147 | val data: Data = new Data(x = finalData.toArray, y = readClasses.toArray, fileInfo = fileInfo) 148 | val (processedData, nomToNum) = processData(data) 149 | data.processedData = processedData 150 | data.nomToNum = nomToNum 151 | data 152 | } 153 | 154 | /** Parse a delimited text data file 155 | * 156 | * @param file file containing the data 157 | * @param comment string indicating that a line is a comment 158 | * @param delimiter string separating two elements 159 | * @param missing string indicating an element is missing 160 | * @param header indicates if the file contains a header or not 161 | * @param columnClass indicates which column represents the class in the file. 
If it's set to -1, it will take the last column 162 | * @return a data object containing all the relevant information 163 | */ 164 | def readDelimitedText(file: String, comment: String = "#", delimiter: String = ",", missing: String = "?", header: Boolean = true, columnClass: Int = -1): Data = { 165 | val reader: BufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(file))) 166 | reader.mark(100) 167 | val firstLine: String = reader.readLine 168 | if (columnClass >= firstLine.split(delimiter).length) throw new ParseException("Invalid response variable index: " + columnClass, columnClass) 169 | val response: Int = if (columnClass == -1) firstLine.split(delimiter).length - 1 else columnClass 170 | reader.reset() 171 | 172 | val headerArray: Array[String] = if (header) reader.readLine.split(delimiter) else null 173 | var line: String = reader.readLine 174 | val readData: ArrayBuffer[Array[Any]] = new ArrayBuffer[Array[Any]](0) 175 | val readClasses: ArrayBuffer[Any] = new ArrayBuffer[Any](0) 176 | val readNominal: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 177 | 178 | while (line != null) { 179 | if (line.isEmpty || line.startsWith(comment)) { 180 | line = reader.readLine 181 | } else { 182 | val elements: Array[String] = line.split(delimiter) 183 | 184 | if (elements.length != firstLine.split(delimiter).length) 185 | throw new ParseException("%d columns, expected %d".format(elements.length, firstLine.split(delimiter).length), elements.length) 186 | 187 | val row = new ArrayBuffer[Any](0) 188 | for (e <- elements.zipWithIndex) { 189 | if (e._2 == response) 190 | readClasses += e._1 191 | else if (e._1.replaceAll("\\s", "").matches("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?")) 192 | row += e._1.replaceAll("\\s", "").toDouble 193 | else { 194 | if (e._1 == missing) 195 | row += "soul_NA" 196 | else { 197 | row += e._1 198 | readNominal += (if (e._2 >= response) e._2 - 1 else e._2) 199 | } 200 | } 201 | } 202 | 203 | readData += row.toArray 204 | line = reader.readLine 205 | } 206 | } 207 | 208 | val attributesValues: mutable.Map[String, String] = collection.mutable.Map[String, String]() 209 | attributesValues += ("Class" -> readClasses.distinct.mkString(",")) 210 | 211 | val fileInfo = new FileInfo(_file = file, _comment = comment, _columnClass = response, _delimiter = delimiter, _missing = missing, _header = headerArray, _relationName = null, 212 | _attributes = null, _attributesValues = attributesValues, nominal = readNominal.distinct.toArray) 213 | val data: Data = new Data(x = readData.toArray, y = readClasses.toArray, fileInfo = fileInfo) 214 | val (processedData, nomToNum) = processData(data) 215 | data.processedData = processedData 216 | data.nomToNum = nomToNum 217 | data 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/EUS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 
13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | import scala.math.{abs, sqrt} 25 | 26 | /** Evolutionary Under-Sampling. Original paper: "Evolutionary Under-Sampling for Classification with Imbalanced Data 27 | * Sets: Proposals and Taxonomy" by Salvador Garcia and Francisco Herrera. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. If it is not provided, it will use the system time 31 | * @param populationSize number of chromosomes to generate 32 | * @param maxEvaluations maximum number of fitness evaluations 33 | * @param algorithm version of the algorithm to execute. One of: EBUSGSGM, EBUSMSGM, EBUSGSAUC, EBUSMSAUC, 34 | * EUSCMGSGM, EUSCMMSGM, EUSCMGSAUC or EUSCMMSAUC 35 | * @param dist object of Distance enumeration representing the distance to be used 36 | * @param probHUX probability of changing a gene from 0 to 1 (used in crossover) 37 | * @param recombination recombination threshold (used in reinitialization) 38 | * @param prob0to1 probability of changing a gene from 0 to 1 (used in reinitialization) 39 | * @param normalize normalize the data or not 40 | * @param randomData iterate through the data randomly or not 41 | * @param verbose choose to display information about the execution or not 42 | * @author Néstor Rodríguez Vico 43 | */ 44 | class EUS(data: Data, seed: Long = System.currentTimeMillis(), populationSize: Int = 50, maxEvaluations: Int = 1000, 45 | algorithm: String = "EBUSMSGM", dist: Distance = Distance.EUCLIDEAN, probHUX: Double = 0.25, 46 | recombination: Double = 0.35, prob0to1: Double = 0.05, normalize: Boolean = false, randomData: Boolean = false, 47 | verbose: Boolean = false) { 48 | 49 | /** Compute the EUS algorithm. 
50 | * 51 | * @return undersampled data structure 52 | */ 53 | def compute(): Data = { 54 | val initTime: Long = System.nanoTime() 55 | 56 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 57 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 58 | val random: scala.util.Random = new scala.util.Random(seed) 59 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 60 | val classesToWorkWith: Array[Any] = if (randomData) { 61 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 62 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 63 | (randomIndex map data.y).toArray 64 | } else { 65 | data.y 66 | } 67 | 68 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 69 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 70 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 71 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 72 | } else { 73 | (null, null, null) 74 | } 75 | 76 | val majoritySelection: Boolean = algorithm.contains("MS") 77 | val targetInstances: Array[Int] = classesToWorkWith.indices.toArray 78 | val minorityElements: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (c, i) if c == untouchableClass => i } 79 | 80 | def fitnessFunction(instance: Array[Int]): Double = { 81 | val index: Array[Int] = zeroOneToIndex(instance) map targetInstances 82 | val neighbours: Array[Array[Double]] = index map dataToWorkWith 83 | val classes: Array[Any] = index map classesToWorkWith 84 | val predicted: Array[Any] = dataToWorkWith.indices.map { e: Int => 85 | if (dist == Distance.EUCLIDEAN) { 86 | nnRule(neighbours, dataToWorkWith(e), index.indexOf(e), classes, 1, "nearest")._1 87 | } else { 88 | nnRuleHVDM(neighbours, dataToWorkWith(e), index.indexOf(e), classes, 1, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")._1 89 | } 90 | }.toArray 91 | 92 | val matrix: (Int, Int, Int, Int) = confusionMatrix(originalLabels = index map classesToWorkWith, 93 | predictedLabels = predicted, minorityClass = untouchableClass) 94 | 95 | val tp: Int = matrix._1 96 | val fp: Int = matrix._2 97 | val fn: Int = matrix._3 98 | val tn: Int = matrix._4 99 | 100 | val nPositives: Int = (index map classesToWorkWith).count(_ == untouchableClass) 101 | val nNegatives: Int = (index map classesToWorkWith).length - nPositives 102 | 103 | val tpr: Double = tp / ((tp + fn) + 0.00000001) 104 | val fpr: Double = fp / ((fp + tn) + 0.00000001) 105 | val auc: Double = (1.0 + tpr - fpr) / 2.0 106 | val tnr: Double = tn / ((tn + fp) + 0.00000001) 107 | val g: Double = sqrt(tpr * tnr) 108 | 109 | val fitness: Double = if (algorithm == "EBUSGSGM") { 110 | g - abs(1 - (nPositives.toFloat / nNegatives)) * 20 111 | } else if (algorithm == "EBUSMSGM") { 112 | g - abs(1 - (counter(untouchableClass).toFloat / nNegatives)) * 20 113 | } else if (algorithm == "EUSCMGSGM") { 114 | g 115 | } else if (algorithm == "EUSCMMSGM") { 116 | g 117 | } else if (algorithm == "EBUSGSAUC") { 118 | auc - abs(1 - (nPositives.toFloat / nNegatives)) * 0.2 119 | } else if (algorithm == "EBUSMSAUC") { 120 | auc - abs(1 - (counter(untouchableClass).toFloat / nNegatives)) * 0.2 121 | } else if (algorithm == "EUSCMGSAUC") { 122 | auc 123 | } else if (algorithm == "EUSCMMSAUC") { 124 | auc 125 | } else { 126 | Double.NaN 127 | 
} 128 | 129 | if (fitness.isNaN) 130 | throw new Exception("Invalid argument: algorithm should be: EBUSGSGM, EBUSMSGM, EBUSGSAUC, EBUSMSAUC, EUSCMGSGM, " + 131 | "EUSCMMSGM, EUSCMGSAUC or EUSCMMSAUC") 132 | 133 | fitness 134 | } 135 | 136 | val population: Array[Array[Int]] = new Array[Array[Int]](populationSize) 137 | (0 until populationSize).foreach { i: Int => 138 | val individual: Array[Int] = targetInstances.indices.map(_ => random.nextInt(2)).toArray 139 | if (majoritySelection) { 140 | minorityElements.foreach((m: Int) => individual(m) = 1) 141 | } 142 | population(i) = individual 143 | } 144 | 145 | val evaluations: Array[Double] = new Array[Double](population.length) 146 | population.zipWithIndex.foreach { chromosome: (Array[Int], Int) => 147 | evaluations(chromosome._2) = fitnessFunction(chromosome._1) 148 | } 149 | 150 | var incestThreshold: Int = targetInstances.length / 4 151 | var actualEvaluations: Int = populationSize 152 | 153 | while (actualEvaluations < maxEvaluations) { 154 | val randomPopulation: Array[Array[Int]] = random.shuffle(population.indices.toList).toArray map population 155 | val newPopulation: ArrayBuffer[Array[Int]] = new ArrayBuffer[Array[Int]](0) 156 | 157 | (randomPopulation.indices by 2).foreach { i: Int => 158 | val hammingDistance: Int = (randomPopulation(i) zip randomPopulation(i + 1)).count((pair: (Int, Int)) => pair._1 != pair._2) 159 | 160 | if ((hammingDistance / 2) > incestThreshold) { 161 | val desc1: Array[Int] = randomPopulation(i).clone 162 | val desc2: Array[Int] = randomPopulation(i + 1).clone 163 | 164 | desc1.indices.foreach { g: Int => 165 | if (desc1(g) != desc2(g) && random.nextFloat < 0.5) { 166 | desc1(g) = if (desc1(g) == 1) 0 else if (random.nextFloat < probHUX) 1 else desc1(g) 167 | desc2(g) = if (desc2(g) == 1) 0 else if (random.nextFloat < probHUX) 1 else desc2(g) 168 | 169 | if (majoritySelection) { 170 | minorityElements.foreach((m: Int) => desc1(m) = 1) 171 | minorityElements.foreach((m: Int) => desc2(m) = 1) 172 | } 173 | } 174 | } 175 | 176 | newPopulation += desc1 177 | newPopulation += desc2 178 | } 179 | } 180 | 181 | val newEvaluations: Array[Double] = new Array[Double](newPopulation.length) 182 | newPopulation.zipWithIndex.foreach { chromosome: (Array[Int], Int) => 183 | newEvaluations(chromosome._2) = fitnessFunction(chromosome._1) 184 | } 185 | 186 | actualEvaluations += newPopulation.length 187 | 188 | // We order the population. 
The best ones (greater fitness value) come first 189 | val populationOrder: Array[(Double, Int, String)] = evaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).map((e: (Double, Int)) => (e._1, e._2, "OLD")) 190 | val newPopulationOrder: Array[(Double, Int, String)] = newEvaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).map((e: (Double, Int)) => (e._1, e._2, "NEW")) 191 | 192 | if (newPopulationOrder.length == 0 || populationOrder.last._1 > newPopulationOrder.head._1) { 193 | incestThreshold -= 1 194 | } else { 195 | val finalOrder: Array[(Double, Int, String)] = (populationOrder ++ newPopulationOrder).sortBy(_._1)(Ordering[Double].reverse).take(populationSize) 196 | val (populationCopy, evaluationsCopy) = (population.clone, evaluations.clone) // snapshot so the in-place updates below never read an already overwritten entry 197 | finalOrder.zipWithIndex.foreach { e: ((Double, Int, String), Int) => 198 | population(e._2) = if (e._1._3 == "OLD") populationCopy(e._1._2) else newPopulation(e._1._2) 199 | evaluations(e._2) = if (e._1._3 == "OLD") evaluationsCopy(e._1._2) else newEvaluations(e._1._2) 200 | } 201 | } 202 | 203 | if (incestThreshold <= 0) { 204 | population.indices.tail.foreach { i: Int => 205 | val individual: Array[Int] = population(i).indices.map((g: Int) => if (random.nextFloat < recombination) 206 | if (random.nextFloat < prob0to1) 1 else 0 else population(0)(g)).toArray 207 | 208 | if (majoritySelection) { 209 | minorityElements.foreach((m: Int) => individual(m) = 1) 210 | } 211 | 212 | population(i) = individual 213 | } 214 | 215 | population.zipWithIndex.tail.par.foreach { e: (Array[Int], Int) => 216 | evaluations(e._2) = fitnessFunction(e._1) 217 | } 218 | 219 | actualEvaluations += (population.length - 1) 220 | 221 | incestThreshold = (recombination * (1.0 - recombination) * targetInstances.length.toFloat).toInt 222 | } 223 | } 224 | 225 | val bestChromosome: Array[Int] = population(evaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).head._2) 226 | val finalIndex: Array[Int] = zeroOneToIndex(bestChromosome) map targetInstances 227 | val finishTime: Long = System.nanoTime() 228 | 229 | if (verbose) { 230 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 231 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 232 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 233 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 234 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 235 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 236 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 237 | } 238 | 239 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/DBSMOTE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | import scala.util.Random 25 | 26 | /** DBSMOTE algorithm. Original paper: "DBSMOTE: Density-Based Synthetic Minority Over-sampling Technique" by 27 | * Chumphol Bunkhumpornpat, Krung Sinapiromsaran and Chidchanok Lursinsap. 28 | * 29 | * @param data data to work with 30 | * @param eps neighborhood radius: the maximum distance between two points for them to be neighbors. If set to -1, it is estimated as the mean pairwise distance of the data 31 | * @param k number of neighbors 32 | * @param dist object of Distance enumeration representing the distance to be used 33 | * @param seed seed to use. If it is not provided, it will use the system time 34 | * @param normalize normalize the data or not 35 | * @param verbose choose to display information about the execution or not 36 | * @author David López Pretel 37 | */ 38 | class DBSMOTE(data: Data, eps: Double = -1, k: Int = 5, dist: Distance = Distance.EUCLIDEAN, 39 | seed: Long = System.currentTimeMillis(), normalize: Boolean = false, verbose: Boolean = false) { 40 | 41 | /** Compute the DBSMOTE algorithm 42 | * 43 | * @return synthetic samples generated 44 | */ 45 | def compute(): Data = { 46 | val initTime: Long = System.nanoTime() 47 | val minorityClassIndex: Array[Int] = minority(data.y) 48 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 49 | 50 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 51 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 52 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 53 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 54 | } else { 55 | (null, null, null) 56 | } 57 | 58 | def regionQuery(point: Int, eps: Double): Array[Int] = { 59 | (minorityClassIndex map samples).indices.map(sample => { 60 | val D: Double = if (dist == Distance.EUCLIDEAN) { 61 | euclidean(samples(minorityClassIndex(point)), samples(minorityClassIndex(sample))) 62 | } else { 63 | HVDM(samples(minorityClassIndex(point)), samples(minorityClassIndex(sample)), data.fileInfo.nominal, sds, 64 | attrCounter, attrClassesCounter) 65 | } 66 | if (D <= eps) { 67 | Some(sample) 68 | } else { 69 | None 70 | } 71 | }).filterNot(_.forall(_ == None)).map(_.get).toArray 72 | } 73 | 74 | def expandCluster(point: Int, clusterId: Int, clusterIds: Array[Int], eps: Double, minPts: Int): Boolean = { 75 | val neighbors: ArrayBuffer[Int] = ArrayBuffer(regionQuery(point, eps): _*) 76 | if (neighbors.length < minPts) { 77 | clusterIds(point) = -2 // noise 78 | return false 79 | } else { 80 | neighbors.foreach(clusterIds(_) = clusterId) 81 | clusterIds(point) = clusterId 82 | 83 | var current: Int = 0 84 | while (current < neighbors.length) { // index-based loop so that neighbors appended below are processed too 85 | val neighborsOfCurrent: Array[Int] = regionQuery(neighbors(current), eps) 86 | current += 1 87 | if (neighborsOfCurrent.length >= minPts) { 88 | neighborsOfCurrent.foreach(neighbor => { 89 | if (clusterIds(neighbor) == -1 || clusterIds(neighbor) == -2) { // noise or unclassified 90 | if (clusterIds(neighbor) == -1) { // unclassified: enqueue it for later expansion 91 | neighbors += neighbor 92 | } 93 | clusterIds(neighbor) = clusterId 94
| } 95 | }) 96 | } 97 | } 98 | } 99 | 100 | true 101 | } 102 | 103 | def dbscan(eps: Double, minPts: Int): Array[Array[Int]] = { 104 | var clusterId: Int = 0 105 | val clusterIds: Array[Int] = Array.fill(minorityClassIndex.length)(-1) 106 | minorityClassIndex.indices.foreach(point => { 107 | if (clusterIds(point) == -1) { 108 | if (expandCluster(point, clusterId, clusterIds, eps, minPts)) { 109 | clusterId += 1 110 | } 111 | } 112 | }) 113 | 114 | if (clusterId != 0) { 115 | val clusters: Array[Array[Int]] = Array.fill(clusterId)(Array()) 116 | (0 until clusterId).foreach(i => { 117 | clusters(i) = clusterIds.zipWithIndex.filter(_._1 == i).map(_._2) 118 | }) 119 | clusters 120 | } else { // all the data forms a single cluster 121 | Array(Array.range(0, minorityClassIndex.length)) 122 | } 123 | } 124 | 125 | def buildGraph(cluster: Array[Int], eps: Double, minPts: Int): Array[Array[Boolean]] = { 126 | val graph: Array[Array[Boolean]] = Array.fill(cluster.length, cluster.length)(false) 127 | // distance between each pair of nodes 128 | val distances: Array[Array[Double]] = cluster.map { i => 129 | cluster.map { j => 130 | if (dist == Distance.EUCLIDEAN) { 131 | euclidean(samples(minorityClassIndex(i)), samples(minorityClassIndex(j))) 132 | } else { 133 | HVDM(samples(minorityClassIndex(i)), samples(minorityClassIndex(j)), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 134 | } 135 | 136 | } 137 | } 138 | 139 | // number of neighbours of each node, i.e. nodes satisfying distance(a, b) <= eps 140 | val NNq: Array[Int] = distances.map(row => row.map(d => if (d <= eps) 1 else 0)).map(_.sum) 141 | 142 | // build the graph 143 | cluster.indices.foreach(i => { 144 | if (cluster.length >= minPts + 1) { 145 | distances(i).zipWithIndex.foreach(pair => { 146 | if (pair._1 <= eps && pair._1 > 0 && NNq(pair._2) >= minPts) { 147 | graph(i)(pair._2) = true 148 | } 149 | }) 150 | } else { 151 | distances(i).zipWithIndex.foreach(pair => { 152 | if (pair._1 <= eps && pair._1 > 0) { 153 | graph(i)(pair._2) = true 154 | } 155 | }) 156 | } 157 | }) 158 | graph 159 | } 160 | 161 | def dijkstra(graph: Array[Array[Boolean]], source: Int, target: Int, cluster: Array[Int]): Array[Int] = { 162 | // distance from source to node, prev node, node visited or not 163 | val nodeInfo: Array[(Double, Int, Boolean)] = Array.fill(graph.length)((9999999.0, -1, false)) 164 | nodeInfo(source) = (0.0, source, false) 165 | 166 | val findMin = (x: ((Double, Int, Boolean), Int), y: ((Double, Int, Boolean), Int)) => 167 | if ((x._1._1 < y._1._1 && !x._1._3) || (!x._1._3 && y._1._3)) x else y 168 | 169 | nodeInfo.indices.foreach(_ => { 170 | val u: Int = nodeInfo.zipWithIndex.reduceLeft(findMin)._2 // vertex with min distance 171 | nodeInfo(u) = (nodeInfo(u)._1, nodeInfo(u)._2, true) 172 | if (u == target) { // return shortest path 173 | val shortestPath: ArrayBuffer[Int] = ArrayBuffer() 174 | var current = target 175 | while (current != source) { 176 | shortestPath += current 177 | current = nodeInfo(current)._2 178 | } 179 | shortestPath += current 180 | return shortestPath.toArray 181 | } 182 | graph(u).indices.foreach(v => { 183 | if (graph(u)(v) && !nodeInfo(v)._3) { 184 | val d: Double = if (dist == Distance.EUCLIDEAN) { 185 | euclidean(samples(minorityClassIndex(cluster(u))), 186 | samples(minorityClassIndex(cluster(v)))) 187 | } else { 188 | HVDM(samples(minorityClassIndex(cluster(u))), samples(minorityClassIndex(cluster(v))), data.fileInfo.nominal, 189 | sds, attrCounter, attrClassesCounter) 190 | } 191 | val alt = 
nodeInfo(u)._1 + d 192 | if (alt < nodeInfo(v)._1) nodeInfo(v) = (alt, u, nodeInfo(v)._3) 193 | } 194 | }) 195 | }) 196 | 197 | throw new Exception("Path not found") 198 | } 199 | 200 | val minorityClass: Any = data.y(minorityClassIndex(0)) 201 | // check if the user passed the epsilon parameter 202 | var eps2 = eps 203 | if (eps == -1) { // estimate eps as the mean pairwise distance of the data 204 | eps2 = samples.map { i => 205 | samples.map { j => 206 | if (dist == Distance.EUCLIDEAN) { 207 | euclidean(i, j) 208 | } else { 209 | HVDM(i, j, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 210 | } 211 | }.sum 212 | }.sum / (samples.length * samples.length) 213 | } 214 | 215 | // compute the clusters using dbscan 216 | val clusters: Array[Array[Int]] = dbscan(eps2, k) 217 | 218 | // the output of the algorithm 219 | val output: Array[Array[Double]] = Array.fill(clusters.map(_.length).sum, samples(0).length)(0) 220 | 221 | val r: Random = new Random(seed) 222 | var newIndex: Int = 0 // index into the output array, shared by all the clusters 223 | // for each cluster 224 | clusters.foreach(c => { 225 | // build a graph with the data of each cluster 226 | val graph: Array[Array[Boolean]] = buildGraph(c, eps2, k) 227 | 228 | // compute the centroid: the mean of the cluster samples 229 | val centroid = (c map samples).transpose.map(_.sum / c.length) 230 | var pseudoCentroid: (Int, Double) = (0, 99999999.0) 231 | // the pseudo-centroid is the sample that is closest to the centroid 232 | (c map samples).zipWithIndex.foreach(sample => { 233 | val d: Double = if (dist == Distance.EUCLIDEAN) { 234 | euclidean(sample._1, centroid) 235 | } else { 236 | HVDM(sample._1, centroid, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 237 | } 238 | if (d < pseudoCentroid._2) pseudoCentroid = (sample._2, d) 239 | }) 240 | 241 | c.indices.foreach(p => { 242 | // compute the shortest path between each sample and the pseudo-centroid 243 | val shortestPath: Array[Int] = dijkstra(graph, p, pseudoCentroid._1, c) 244 | // pick a random node in the path 245 | val e = r.nextInt(shortestPath.length) 246 | // get the nodes connected to the chosen one; only the first two will be used 247 | val v1_v2: Array[(Boolean, Int)] = graph(shortestPath(e)).zipWithIndex.filter(_._1) 248 | samples(0).indices.foreach(attrib => { 249 | // v1(attrib) - v2(attrib) 250 | val dif: Double = samples(minorityClassIndex(c(v1_v2(1)._2)))(attrib) - samples(minorityClassIndex(c(v1_v2(0)._2)))(attrib) 251 | val gap: Double = r.nextFloat() 252 | // v1(attrib) + gap * dif 253 | output(newIndex)(attrib) = samples(minorityClassIndex(c(v1_v2(0)._2)))(attrib) + gap * dif 254 | }) 255 | newIndex += 1 256 | }) 257 | }) 258 | 259 | val finishTime: Long = System.nanoTime() 260 | 261 | if (verbose) { 262 | println("ORIGINAL SIZE: %d".format(data.x.length)) 263 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 264 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 265 | } 266 | 267 | new Data(if (data.fileInfo.nominal.length == 0) { 268 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 269 | data.fileInfo.minAttribs) else output)) 270 | } else { 271 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 272 | data.fileInfo.minAttribs) else output), data.nomToNum) 273 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 274 | } 275 | } 276 | 
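A minimal usage sketch for the class above. This is a hypothetical example: the file name, parameter values and object name are assumptions for illustration, not part of the library; Reader.readArff and the DBSMOTE constructor are the ones defined in this repository.

import soul.algorithm.oversampling.DBSMOTE
import soul.io.Reader

object DBSMOTEExample {
  def main(args: Array[String]): Unit = {
    // readArff parses the ARFF file and, by default, takes the last column as the class
    val data = Reader.readArff("imbalanced.arff")
    // eps = -1 estimates the neighborhood radius from the data; verbose prints the size statistics
    val augmented = new DBSMOTE(data, eps = -1, k = 5, verbose = true).compute()
  }
}
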
-------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/SBC.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.math.{max, min} 25 | 26 | /** Undersampling Based on Clustering. Original paper: "Under-Sampling Approaches for Improving Prediction of the 27 | * Minority Class in an Imbalanced Dataset" by Show-Jane Yen and Yue-Shi Lee. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. If it is not provided, it will use the system time 31 | * @param method selection method to apply. Possible options: random, NearMiss1, NearMiss2, NearMiss3, MostDistant and MostFar 32 | * @param m ratio used in the SSize calculation 33 | * @param k number of neighbours to use when computing the k-NN rule (normally 3 neighbours) 34 | * @param numClusters number of clusters to be created by the KMeans algorithm 35 | * @param restarts number of times to relaunch the KMeans algorithm 36 | * @param minDispersion stop the KMeans algorithm if the dispersion is lower than this value 37 | * @param maxIterations maximum number of iterations for the KMeans algorithm 38 | * @param dist object of Distance enumeration representing the distance to be used 39 | * @param normalize normalize the data or not 40 | * @param randomData iterate through the data randomly or not 41 | * @param verbose choose to display information about the execution or not 42 | * @author Néstor Rodríguez Vico 43 | */ 44 | class SBC(data: Data, seed: Long = System.currentTimeMillis(), method: String = "NearMiss1", m: Double = 1.0, k: Int = 3, numClusters: Int = 50, 45 | restarts: Int = 1, minDispersion: Double = 0.0001, maxIterations: Int = 200, val dist: Distance = Distance.EUCLIDEAN, 46 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 47 | 48 | /** Compute the SBC algorithm. 
49 | * 50 | * @return undersampled data structure 51 | */ 52 | def compute(): Data = { 53 | val initTime: Long = System.nanoTime() 54 | 55 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 56 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 57 | val random: scala.util.Random = new scala.util.Random(seed) 58 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 59 | val classesToWorkWith: Array[Any] = if (randomData) { 60 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 61 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 62 | (randomIndex map data.y).toArray 63 | } else { 64 | data.y 65 | } 66 | 67 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 68 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 69 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 70 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 71 | } else { 72 | (null, null, null) 73 | } 74 | 75 | val (_, centroids, assignment) = kMeans(dataToWorkWith, data.fileInfo.nominal, numClusters, restarts, minDispersion, maxIterations, seed) 76 | val minMajElements: List[(Int, Int)] = (0 until numClusters).toList.map { cluster: Int => 77 | val elements = assignment(cluster) 78 | val minElements: Int = (elements map classesToWorkWith).count((c: Any) => c == untouchableClass) 79 | (minElements, elements.length - minElements) 80 | } 81 | val nPos: Double = minMajElements.unzip._2.sum.toDouble 82 | val sizeK: Double = minMajElements.map((pair: (Int, Int)) => pair._2.toDouble / max(pair._1, 1)).sum 83 | val sSizes: Array[(Int, Int)] = assignment.map { element: (Int, Array[Int]) => 84 | val ratio: (Int, Int) = minMajElements(element._1) 85 | // The min is to prevent infinity values if no minority elements are added to the cluster 86 | (element._1, min(m * nPos * ((ratio._2.toDouble / (ratio._1 + 1)) / sizeK), ratio._2).toInt) 87 | }.toArray 88 | val minorityElements: Array[Int] = assignment.flatMap((element: (Int, Array[Int])) => element._2.filter((index: Int) => 89 | classesToWorkWith(index) == untouchableClass)).toArray 90 | 91 | val majorityElements: Array[Int] = if (method.equals("random")) { 92 | sSizes.filter(_._2 != 0).flatMap { clusterIdSize: (Int, Int) => 93 | random.shuffle(assignment(clusterIdSize._1).toList).filter((e: Int) => 94 | classesToWorkWith(e) != untouchableClass).take(clusterIdSize._2) 95 | } 96 | } else { 97 | sSizes.filter(_._2 != 0).flatMap { clusteridSize: (Int, Int) => 98 | val majorityElementsIndex: Array[(Int, Int)] = assignment(clusteridSize._1).zipWithIndex.filter((e: (Int, Int)) => 99 | classesToWorkWith(e._1) != untouchableClass) 100 | 101 | // If no minority class elements are assigned to the cluster 102 | if (majorityElementsIndex.length == assignment(clusteridSize._1).length) { 103 | // Use the centroid as "minority class" element 104 | val distances: Array[Double] = assignment(clusteridSize._1).map { instance: Int => 105 | euclidean(dataToWorkWith(instance), centroids(clusteridSize._1)) 106 | } 107 | // keep the majority samples closest to the centroid 108 | distances.zipWithIndex.sortBy(_._1).take(clusteridSize._2).map(_._2) map assignment(clusteridSize._1) 109 | } else { 110 | val minorityElementsIndex: Array[(Int, Int)] = assignment(clusteridSize._1).zipWithIndex.filter((e: (Int, Int)) => 111 | classesToWorkWith(e._1) == 
untouchableClass) 112 | val majorityElementsIndex: Array[(Int, Int)] = assignment(clusteridSize._1).zipWithIndex.filter((e: (Int, Int)) => 113 | classesToWorkWith(e._1) != untouchableClass) 114 | 115 | val minNeighbours: Array[Array[Double]] = minorityElementsIndex.unzip._1 map dataToWorkWith 116 | val majNeighbours: Array[Array[Double]] = majorityElementsIndex.unzip._1 map dataToWorkWith 117 | val minClasses: Array[Any] = minorityElementsIndex.unzip._1 map classesToWorkWith 118 | val majClasses: Array[Any] = majorityElementsIndex.unzip._1 map classesToWorkWith 119 | 120 | val minorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 121 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length)) 122 | } else { 123 | None 124 | } 125 | 126 | val majorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 127 | Some(new KDTree(majNeighbours, majClasses, dataToWorkWith(0).length)) 128 | } else { 129 | None 130 | } 131 | 132 | val reverseKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 133 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length, which = "farthest")) 134 | } else { 135 | None 136 | } 137 | 138 | if (method.equals("NearMiss1")) { 139 | // selects the majority class samples whose average distances to k nearest minority class samples in the ith cluster are the smallest. 140 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 141 | if (dist == Distance.EUCLIDEAN) { 142 | val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3 143 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 144 | } else { 145 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k, 146 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 147 | (i._1, (result._2 map result._3).sum / result._2.length) 148 | } 149 | } 150 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1) 151 | } else if (method.equals("NearMiss2")) { 152 | // selects the majority class samples whose average distances to k farthest minority class samples in the ith cluster are the smallest. 153 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 154 | if (dist == Distance.EUCLIDEAN) { 155 | val index = reverseKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3 156 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 157 | } else { 158 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k, 159 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "farthest") 160 | (i._1, (result._2 map result._3).sum / result._2.length) 161 | } 162 | } 163 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1) 164 | } else if (method.equals("NearMiss3")) { 165 | // selects the majority class samples whose average distances to k nearest majority class samples in the ith cluster are the smallest. 
166 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 167 | if (dist == Distance.EUCLIDEAN) { 168 | val index = majorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3 169 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 170 | } else { 171 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(majNeighbours, dataToWorkWith(i._1), -1, majClasses, k, 172 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 173 | (i._1, (result._2 map result._3).sum / result._2.length) 174 | } 175 | } 176 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1) 177 | } else if (method.equals("MostDistant")) { 178 | // selects the majority class samples whose average distances to M closest minority class samples in the ith cluster are the farthest. 179 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 180 | if (dist == Distance.EUCLIDEAN) { 181 | val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3 182 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 183 | } else { 184 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k, 185 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 186 | (i._1, (result._2 map result._3).sum / result._2.length) 187 | } 188 | } 189 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).reverse.take(clusteridSize._2).map(_._1) 190 | } else if (method.equals("MostFar")) { 191 | // selects the majority class samples whose average distances to all minority class samples in the cluster are the farthest. 192 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 193 | if (dist == Distance.EUCLIDEAN) { 194 | val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), minorityElementsIndex.length)._3 195 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 196 | } else { 197 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, 198 | minorityElementsIndex.length, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 199 | (i._1, (result._2 map result._3).sum / result._2.length) 200 | } 201 | } 202 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).reverse.take(clusteridSize._2).map(_._1) 203 | } else { 204 | throw new Exception("Invalid argument: method should be: random, NearMiss1, NearMiss2, NearMiss3, MostDistant or MostFar") 205 | } 206 | } 207 | } 208 | 209 | 210 | val finalIndex: Array[Int] = minorityElements.distinct ++ majorityElements.distinct 211 | val finishTime: Long = System.nanoTime() 212 | 213 | if (verbose) { 214 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 215 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 216 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 217 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 218 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 219 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 220 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 221 | } 222 | 223 | new Data(finalIndex map data.x, finalIndex map data.y, 
Some(finalIndex), data.fileInfo) 224 | } 225 | } --------------------------------------------------------------------------------
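A minimal usage sketch for SBC, closing the tour of the library. This is a hypothetical example: the CSV path, parameter choices and object name are assumptions for illustration; Reader.readDelimitedText and the SBC constructor are the ones defined in this repository.

import soul.algorithm.undersampling.SBC
import soul.io.Reader

object SBCExample {
  def main(args: Array[String]): Unit = {
    // readDelimitedText defaults: "#" marks comments, "," separates fields, a header is present and the last column is the class
    val data = Reader.readDelimitedText("imbalanced.csv")
    // cluster with KMeans and select the majority samples per cluster with the NearMiss1 rule; verbose prints the reduction statistics
    val reduced = new SBC(data, method = "NearMiss1", k = 3, numClusters = 50, verbose = true).compute()
  }
}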