├── project
│   └── build.properties
├── images
│   ├── NCL.png
│   ├── SBC.png
│   ├── ADASYN.png
│   ├── IHTS.png
│   ├── IPADE.png
│   ├── MWMOTE.png
│   ├── SMOTE.png
│   ├── original.png
│   └── SafeLevelSMOTE.png
├── src
│   └── main
│       └── scala
│           └── soul
│               ├── data
│               │   ├── Data.scala
│               │   └── FileInfo.scala
│               ├── algorithm
│               │   ├── oversampling
│               │   │   ├── RO.scala
│               │   │   ├── SMOTE.scala
│               │   │   ├── SMOTEENN.scala
│               │   │   ├── SMOTETL.scala
│               │   │   ├── SafeLevelSMOTE.scala
│               │   │   ├── ADOMS.scala
│               │   │   ├── BorderlineSMOTE.scala
│               │   │   ├── ADASYN.scala
│               │   │   ├── MDO.scala
│               │   │   ├── SMOTERSB.scala
│               │   │   ├── Spider2.scala
│               │   │   ├── MWMOTE.scala
│               │   │   └── DBSMOTE.scala
│               │   └── undersampling
│               │       ├── RU.scala
│               │       ├── EE.scala
│               │       ├── ENN.scala
│               │       ├── OSS.scala
│               │       ├── IHTS.scala
│               │       ├── TL.scala
│               │       ├── ClusterOSS.scala
│               │       ├── CPM.scala
│               │       ├── NCL.scala
│               │       ├── BC.scala
│               │       ├── CNN.scala
│               │       ├── NM.scala
│               │       ├── EUS.scala
│               │       └── SBC.scala
│               ├── io
│               │   ├── Writer.scala
│               │   └── Reader.scala
│               └── util
│                   └── KDTree.scala
└── README.md
/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.2.3 2 | -------------------------------------------------------------------------------- /images/NCL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/NCL.png -------------------------------------------------------------------------------- /images/SBC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SBC.png -------------------------------------------------------------------------------- /images/ADASYN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/ADASYN.png -------------------------------------------------------------------------------- /images/IHTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/IHTS.png -------------------------------------------------------------------------------- /images/IPADE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/IPADE.png -------------------------------------------------------------------------------- /images/MWMOTE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/MWMOTE.png -------------------------------------------------------------------------------- /images/SMOTE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SMOTE.png -------------------------------------------------------------------------------- /images/original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/original.png -------------------------------------------------------------------------------- /images/SafeLevelSMOTE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NestorRV/SOUL/HEAD/images/SafeLevelSMOTE.png -------------------------------------------------------------------------------- /src/main/scala/soul/data/Data.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.data 18 | 19 | import scala.collection.mutable 20 | 21 | /** Data structure used by the algorithms 22 | * 23 | * @param x data associated to the file (x) 24 | * @param y classes associated to the file (y) 25 | * @param index indices representing the kept elements 26 | * @param fileInfo object with the information needed to save the data into a file 27 | * @author Néstor Rodríguez Vico 28 | */ 29 | class Data private[soul](private[soul] val x: Array[Array[Any]], private[soul] val y: Array[Any], 30 | private[soul] val index: Option[Array[Int]] = None, private[soul] val fileInfo: FileInfo) { 31 | 32 | private[soul] var processedData: Array[Array[Double]] = new Array[Array[Double]](0) 33 | private[soul] var nomToNum: Array[mutable.Map[Double, Any]] = new Array[mutable.Map[Double, Any]](0) 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/soul/data/FileInfo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.data 18 | 19 | import scala.collection.mutable 20 | 21 | /** Data structure used by the arff classes 22 | * 23 | * @param _file file containing the data 24 | * @param _comment string indicating that a line is a comment 25 | * @param _columnClass indicates which column represents the class in the file 26 | * @param _delimiter string separating two elements 27 | * @param _missing string indicating an element is missing 28 | * @param _header header of the file.
If it is _, there was no header 29 | * @param _attributes map with the form: index -> attributeName 30 | * @param _attributesValues map with the form attributeName -> type (if it's nominal, possible values instead of type) 31 | * @param nominal array to know which attributes are nominal 32 | * @author Néstor Rodríguez Vico 33 | */ 34 | class FileInfo private[soul](private[soul] val _file: String, private[soul] val _comment: String, 35 | private[soul] val _columnClass: Int = -1, 36 | private[soul] val _delimiter: String, private[soul] val _missing: String, 37 | private[soul] val _header: Array[String], private[soul] val _relationName: String, 38 | private[soul] val _attributes: mutable.Map[Int, String], 39 | private[soul] val _attributesValues: mutable.Map[String, String], 40 | private[soul] val nominal: Array[Int]) { 41 | 42 | // data necessary to denormalize the data 43 | private[soul] var maxAttribs: Array[Double] = _ 44 | private[soul] var minAttribs: Array[Double] = _ 45 | 46 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/RO.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities._ 21 | 22 | import scala.util.Random 23 | 24 | /** Random Oversampling algorithm. Original paper: "A study of the behavior of several methods for balancing machine 25 | * learning training data" by Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina. 26 | * 27 | * @param data data to work with 28 | * @param seed seed to use.
If it is not provided, it will use the system time 29 | * @param percent number of samples to create 30 | * @param verbose choose to display information about the execution or not 31 | * @author David López Pretel 32 | */ 33 | class RO(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, verbose: Boolean = false) { 34 | 35 | /** Compute the RO algorithm 36 | * 37 | * @return synthetic samples generated 38 | */ 39 | def compute(): Data = { 40 | val initTime: Long = System.nanoTime() 41 | 42 | if (percent < 0) { 43 | throw new Exception("Percent must be greater than or equal to 0") 44 | } 45 | 46 | val minorityClassIndex: Array[Int] = minority(data.y) 47 | val minorityClass: Any = data.y(minorityClassIndex(0)) 48 | 49 | // output with a size of percent samples 50 | val output: Array[Array[Double]] = Array.ofDim[Double](percent, data.processedData(0).length) 51 | 52 | val r: Random = new Random(seed) 53 | 54 | // each synthetic sample is a copy of a randomly chosen minority class sample 55 | (0 until percent).par.foreach((i: Int) => { 56 | output(i) = data.processedData(minorityClassIndex(r.nextInt(minorityClassIndex.length))) 57 | }) 58 | 59 | val finishTime: Long = System.nanoTime() 60 | 61 | if (verbose) { 62 | println("ORIGINAL SIZE: %d".format(data.x.length)) 63 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 64 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 65 | } 66 | 67 | new Data(if (data.fileInfo.nominal.length == 0) { 68 | to2Decimals(Array.concat(data.processedData, output)) 69 | } else { 70 | toNominal(Array.concat(data.processedData, output), data.nomToNum) 71 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 72 | } 73 | } -------------------------------------------------------------------------------- /src/main/scala/soul/io/Writer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
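// A minimal usage sketch for the RO class above. Reader.readArff is an assumed helper
// from soul.io (the Reader API is not reproduced in this dump), so the exact loading
// call may differ; everything else uses the constructor and compute() shown above.
import soul.algorithm.oversampling.RO
import soul.data.Data
import soul.io.{Reader, Writer}

object ROExample {
  def main(args: Array[String]): Unit = {
    val data: Data = Reader.readArff("imbalanced.arff") // hypothetical loading call
    val ro = new RO(data, seed = 42L, percent = 500, verbose = true)
    val oversampled: Data = ro.compute() // adds 500 copies of random minority samples
    Writer.writeArff("oversampled.arff", oversampled)
  }
}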
16 | */ 17 | package soul.io 18 | 19 | import java.io.{File, PrintWriter} 20 | 21 | import soul.data.Data 22 | 23 | import scala.collection.immutable.ListMap 24 | 25 | /** Class to write data files 26 | * 27 | * @author Néstor Rodríguez Vico 28 | */ 29 | object Writer { 30 | /** Store data into an arff file 31 | * 32 | * @param file filename where to store the data 33 | * @param data data to save to the file 34 | */ 35 | def writeArff(file: String, data: Data): Unit = { 36 | val pr = new PrintWriter(new File(file)) 37 | pr.write("@relation %s\n".format(data.fileInfo._relationName)) 38 | 39 | if (data.fileInfo._attributes == null || data.fileInfo._attributesValues == null) 40 | throw new Exception("Unable to write arff: missing information") 41 | 42 | val orderedAttributes: Map[Int, String] = ListMap(data.fileInfo._attributes.toSeq.sortBy(_._1): _*) 43 | 44 | for (attribute <- orderedAttributes) { 45 | pr.write("@attribute %s %s\n".format(attribute._2, data.fileInfo._attributesValues(attribute._2))) 46 | } 47 | 48 | pr.write("@data\n") 49 | 50 | for (row <- data.x zip data.y) { 51 | val naIndex: Array[Int] = row._1.zipWithIndex.filter(_._1 == "soul_NA").map(_._2) 52 | val newRow: Array[Any] = row._1.clone() 53 | for (index <- naIndex) { 54 | newRow(index) = "?" 55 | } 56 | 57 | pr.write(newRow.mkString(",") + "," + row._2 + "\n") 58 | } 59 | 60 | pr.close() 61 | } 62 | 63 | /** Store data into a delimited text file 64 | * 65 | * @param file filename where to store the data 66 | * @param data data to save to the file 67 | */ 68 | def writeDelimitedText(file: String, data: Data): Unit = { 69 | val delimiter: String = if (data.fileInfo._delimiter == null) "," else data.fileInfo._delimiter 70 | val missing: String = if (data.fileInfo._missing == null) "?" else data.fileInfo._missing 71 | 72 | val pr = new PrintWriter(new File(file)) 73 | if (data.fileInfo._header != null) 74 | pr.write(data.fileInfo._header.mkString(delimiter) + "\n") 75 | 76 | for (row <- data.x zip data.y) { 77 | val naIndex: Array[Int] = row._1.zipWithIndex.filter(_._1 == "soul_NA").map(_._2) 78 | val newRow: Array[Any] = row._1.clone() 79 | for (index <- naIndex) { 80 | newRow(index) = missing 81 | } 82 | 83 | pr.write(newRow.mkString(delimiter) + delimiter + row._2 + "\n") 84 | } 85 | 86 | pr.close() 87 | } 88 | } -------------------------------------------------------------------------------- /src/main/scala/soul/util/KDTree.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
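// A short sketch of the two Writer entry points defined above; `result` is assumed to
// be a soul.data.Data value produced by a Reader plus some algorithm's compute().
Writer.writeArff("result.arff", result)         // ARFF output, missing values written as "?"
Writer.writeDelimitedText("result.csv", result) // delimited output, using fileInfo._delimiter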
16 | */ 17 | package soul.util 18 | 19 | import com.thesamet.spatial.{DimensionalOrdering, KDTreeMap, Metric} 20 | 21 | import scala.language.implicitConversions 22 | import scala.math.sqrt 23 | 24 | /** Wrapper of a com.thesamet.spatial.KDTreeMap adapted for Arrays of Doubles 25 | * 26 | * @param x data 27 | * @param y labels 28 | * @param dimensions number of dimensions 29 | * @param which if it is set to "nearest", return the nearest neighbours; if it is set to "farthest", return the farthest ones 30 | * @author Néstor Rodríguez Vico 31 | */ 32 | class KDTree(x: Array[Array[Double]], y: Array[Any], dimensions: Int, which: String = "nearest") { 33 | 34 | private[soul] var kDTreeMap: KDTreeMap[Array[Double], (Any, Int)] = if (which == "nearest") { 35 | KDTreeMap.fromSeq((x zip y.zipWithIndex).map(f => f._1 -> (f._2._1, f._2._2)))(dimensionalOrderingForArray[Array[Double], Double](dimensions)) 36 | } else { 37 | KDTreeMap.fromSeq((x zip y.zipWithIndex).map(f => f._1 -> (f._2._1, f._2._2)))(dimensionalReverseOrderingForArray[Array[Double], Double](dimensions)) 38 | } 39 | 40 | def nNeighbours(instance: Array[Double], k: Int, leaveOneOut: Boolean = true): (Seq[Array[Double]], Seq[Any], Seq[Int]) = { 41 | val realK: Int = if (leaveOneOut) k + 1 else k 42 | val drop: Int = if (leaveOneOut) 1 else 0 43 | val instances: (Seq[Array[Double]], Seq[(Any, Int)]) = kDTreeMap.findNearest(instance, realK).drop(drop).unzip 44 | val (labels, index) = instances._2.unzip 45 | (instances._1, labels, index) 46 | } 47 | 48 | def apply(x: Array[Double]): (Any, Int) = kDTreeMap(x) 49 | 50 | def addElement(x: Array[Double], y: Any): Unit = { 51 | kDTreeMap = kDTreeMap + (x -> (y, kDTreeMap.size + 1)) 52 | } 53 | 54 | def dimensionalOrderingForArray[T <: Array[A], A](dim: Int)(implicit ord: Ordering[A]): DimensionalOrdering[T] = 55 | new DimensionalOrdering[T] { 56 | val dimensions: Int = dim 57 | 58 | def compareProjection(d: Int)(x: T, y: T): Int = ord.compare(x(d), y(d)) 59 | } 60 | 61 | def dimensionalReverseOrderingForArray[T <: Array[A], A](dim: Int)(implicit ord: Ordering[A]): DimensionalOrdering[T] = 62 | new DimensionalOrdering[T] { 63 | val dimensions: Int = dim 64 | 65 | def compareProjection(d: Int)(x: T, y: T): Int = ord.compare(y(d), x(d)) 66 | } 67 | 68 | implicit def metricFromArray(implicit n: Numeric[Double]): Metric[Array[Double], Double] = new Metric[Array[Double], Double] { 69 | override def distance(x: Array[Double], y: Array[Double]): Double = sqrt(x.zip(y).map { z => 70 | val d = z._1 - z._2 71 | d * d 72 | }.sum) 73 | 74 | override def planarDistance(dimension: Int)(x: Array[Double], y: Array[Double]): Double = { 75 | val dd = x(dimension) - y(dimension) 76 | dd * dd 77 | } 78 | } 79 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/RU.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details.
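// A self-contained sketch of the KDTree API defined above, since it is the workhorse
// behind the Distance.EUCLIDEAN branches of the algorithms that follow.
import soul.util.KDTree

object KDTreeExample {
  def main(args: Array[String]): Unit = {
    val x: Array[Array[Double]] = Array(Array(0.0, 0.0), Array(1.0, 1.0), Array(0.9, 1.1), Array(5.0, 5.0))
    val y: Array[Any] = Array("neg", "pos", "pos", "neg")
    val tree = new KDTree(x, y, dimensions = 2)
    // leaveOneOut defaults to true, so a query point stored in the tree skips itself
    val (points, labels, indices) = tree.nNeighbours(Array(1.0, 1.0), k = 2)
    println(labels.mkString(", ")) // labels of the two closest other points
  }
}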
13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities._ 21 | 22 | /** Random Undersampling algorithm. Original paper: "A study of the behavior of several methods for balancing machine 23 | * learning training data" by Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria Carolina. 24 | * 25 | * @param data data to work with 26 | * @param seed seed to use. If it is not provided, it will use the system time 27 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1, so as many 28 | * majority class examples as minority class examples will be kept. It will take 29 | * numMinorityInstances * ratio 30 | * @param replacement whether to sample with replacement or not. false by default 31 | * @param verbose choose to display information about the execution or not 32 | * @author Néstor Rodríguez Vico 33 | */ 34 | class RU(data: Data, seed: Long = System.currentTimeMillis(), ratio: Double = 1.0, replacement: Boolean = false, verbose: Boolean = false) { 35 | 36 | /** Compute the RU algorithm. 37 | * 38 | * @return undersampled data structure 39 | */ 40 | def compute(): Data = { 41 | val initTime: Long = System.nanoTime() 42 | 43 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 44 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 45 | val random: scala.util.Random = new scala.util.Random(seed) 46 | 47 | val minorityIndex: Array[Int] = data.y.zipWithIndex.collect { case (label, i) if label == untouchableClass => i } 48 | val majorityIndex: Array[Int] = random.shuffle(data.y.zipWithIndex.collect { case (label, i) 49 | if label != untouchableClass => i 50 | }.toList).toArray 51 | val selectedMajorityIndex: Array[Int] = if (!replacement) majorityIndex.take((minorityIndex.length * ratio).toInt) else 52 | (0 until (minorityIndex.length * ratio).toInt).map(_ => random.nextInt(majorityIndex.length)).toArray map majorityIndex 53 | val finalIndex: Array[Int] = minorityIndex ++ selectedMajorityIndex 54 | val finishTime: Long = System.nanoTime() 55 | 56 | if (verbose) { 57 | val newCounter: Map[Any, Int] = (finalIndex map data.y).groupBy(identity).mapValues(_.length) 58 | println("ORIGINAL SIZE: %d".format(data.x.length)) 59 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 60 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / data.x.length) * 100)) 61 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 62 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 63 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 64 | } 65 | 66 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 67 | } 68 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/EE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License.
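// To make the ratio parameter of RU concrete: with 100 minority and 900 majority
// examples, ratio = 1.0 keeps 100 randomly chosen majority examples and ratio = 2.0
// keeps 200, plus all minority examples in both cases. A sketch, where `data` is
// assumed to be an already-loaded soul.data.Data:
import soul.algorithm.undersampling.RU

val ru = new RU(data, seed = 0L, ratio = 2.0, replacement = false)
val undersampled = ru.compute() // all minority examples + numMinorityInstances * 2 majority examples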
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities._ 21 | 22 | /** Easy Ensemble algorithm. Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, 23 | * Jianxin Wu and Zhi-Hua Zhou. 24 | * 25 | * @param data data to work with 26 | * @param seed seed to use. If it is not provided, it will use the system time 27 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1, so as many 28 | * majority class examples as minority class examples will be kept. It will take 29 | * numMinorityInstances * ratio 30 | * @param replacement whether to sample with replacement or not. false by default 31 | * @param nTimes times to perform the random undersampling 32 | * @param normalize normalize the data or not 33 | * @param randomData iterate through the data randomly or not 34 | * @param verbose choose to display information about the execution or not 35 | * @author Néstor Rodríguez Vico 36 | */ 37 | class EE(data: Data, seed: Long = System.currentTimeMillis(), ratio: Double = 1.0, replacement: Boolean = false, nTimes: Int = 5, 38 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 39 | 40 | /** Compute the EE algorithm. 41 | * 42 | * @return undersampled data structure 43 | */ 44 | def compute(): Data = { 45 | val initTime: Long = System.nanoTime() 46 | 47 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 48 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 49 | val random: scala.util.Random = new scala.util.Random(seed) 50 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 51 | val classesToWorkWith: Array[Any] = if (randomData) { 52 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 53 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 54 | (randomIndex map data.y).toArray 55 | } else { 56 | data.y 57 | } 58 | 59 | val minorityIndex: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label == untouchableClass => i } 60 | val majIndex: List[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label != untouchableClass => i }.toList 61 | val majElements: Array[Int] = (0 until nTimes).flatMap { _: Int => 62 | val majorityIndex: Array[Int] = random.shuffle(majIndex).toArray 63 | if (!replacement) majorityIndex.take((minorityIndex.length * ratio).toInt) else majorityIndex.indices.map(_ => 64 | random.nextInt(majorityIndex.length)).toArray map majorityIndex 65 | }.toArray 66 | 67 | // Make a histogram and select the majority class examples that have been selected the most times 68 | val majorityIndexHistogram: Array[(Int, Int)] = majElements.groupBy(identity).mapValues(_.length).toArray.sortBy(_._2).reverse 69 | val majorityIndex: Array[Int] = majorityIndexHistogram.take((minorityIndex.length * ratio).toInt).map(_._1) 70 | val finalIndex: Array[Int] = minorityIndex ++ majorityIndex 71 | val finishTime: Long = System.nanoTime() 72 | 73 | if (verbose) { 74 | val
newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 75 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 76 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 77 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 78 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 79 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 80 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 81 | } 82 | 83 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 84 | } 85 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/SMOTE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.util.Random 25 | 26 | /** SMOTE algorithm. Original paper: "SMOTE: Synthetic Minority Over-sampling Technique" by Nitesh V. Chawla, Kevin W. 27 | * Bowyer, Lawrence O. Hall and W. Philip Kegelmeyer. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. 
If it is not provided, it will use the system time 31 | * @param percent amount of SMOTE N% 32 | * @param k number of minority class nearest neighbors 33 | * @param dist object of Distance enumeration representing the distance to be used 34 | * @param normalize normalize the data or not 35 | * @param verbose choose to display information about the execution or not 36 | * @author David López Pretel 37 | */ 38 | class SMOTE(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5, 39 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 40 | 41 | /** Compute the SMOTE algorithm 42 | * 43 | * @return synthetic samples generated 44 | */ 45 | def compute(): Data = { 46 | val initTime: Long = System.nanoTime() 47 | 48 | if (percent > 100 && percent % 100 != 0) { 49 | throw new Exception("Percent must be a multiple of 100") 50 | } 51 | 52 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 53 | val minorityClassIndex: Array[Int] = minority(data.y) 54 | val minorityClass: Any = data.y(minorityClassIndex(0)) 55 | 56 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 57 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 58 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 59 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 60 | } else { 61 | (null, null, null) 62 | } 63 | 64 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 65 | Some(new KDTree(samples, data.y, samples(0).length)) 66 | } else { 67 | None 68 | } 69 | 70 | // check if the percent is correct 71 | var T: Int = minorityClassIndex.length 72 | var N: Int = percent 73 | 74 | if (N < 100) { 75 | T = N / 100 * T 76 | N = 100 77 | } 78 | N = N / 100 79 | 80 | // output with a size of T*N samples 81 | val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length) 82 | 83 | val r: Random = new Random(seed) 84 | 85 | // for each minority class sample 86 | minorityClassIndex.indices.par.foreach((i: Int) => { 87 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 88 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray 89 | } else { 90 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 91 | } 92 | 93 | // compute populate for the sample 94 | (0 until N).par.foreach((n: Int) => { 95 | val nn: Int = neighbors(r.nextInt(neighbors.length)) 96 | // compute attributes of the sample 97 | samples(0).indices.foreach((atrib: Int) => { 98 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib) 99 | val gap: Double = r.nextFloat() 100 | output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff 101 | }) 102 | }) 103 | }) 104 | 105 | val finishTime: Long = System.nanoTime() 106 | 107 | if (verbose) { 108 | println("ORIGINAL SIZE: %d".format(data.x.length)) 109 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 110 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 111 | } 112 | 113 | new Data(if (data.fileInfo.nominal.length == 0) { 114 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 115 | data.fileInfo.minAttribs) else output)) 116 | } else { 117 | 
toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 118 | data.fileInfo.minAttribs) else output), data.nomToNum) 119 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 120 | } 121 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/ENN.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | 26 | /** Edited Nearest Neighbour rule. Original paper: "Asymptotic Properties of Nearest Neighbor Rules Using Edited Data" 27 | * by Dennis L. Wilson. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. If it is not provided, it will use the system time 31 | * @param dist object of Distance enumeration representing the distance to be used 32 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours) 33 | * @param normalize normalize the data or not 34 | * @param randomData iterate through the data randomly or not 35 | * @param verbose choose to display information about the execution or not 36 | * @author Néstor Rodríguez Vico 37 | */ 38 | class ENN(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 39 | k: Int = 3, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 40 | 41 | /** Compute the ENN algorithm. 
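// Stepping back to SMOTE, which ends just above: the heart of its loop is linear
// interpolation between a minority sample and one of its k nearest neighbours.
// A standalone illustration of that single step, independent of the library:
import scala.util.Random

object SmoteStep {
  def main(args: Array[String]): Unit = {
    val r = new Random(42)
    val sample = Array(1.0, 2.0)
    val neighbour = Array(3.0, 4.0)
    // each attribute moves a random fraction of the way towards the neighbour,
    // so the synthetic point lies inside the box spanned by the two points
    val synthetic = sample.indices.map { a =>
      val gap = r.nextFloat() // in [0, 1)
      sample(a) + gap * (neighbour(a) - sample(a))
    }.toArray
    println(synthetic.mkString("[", ", ", "]"))
  }
}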
42 | * 43 | * @return undersampled data structure 44 | */ 45 | def compute(): Data = { 46 | val initTime: Long = System.nanoTime() 47 | 48 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 49 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 50 | val random: scala.util.Random = new scala.util.Random(seed) 51 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 52 | val classesToWorkWith: Array[Any] = if (randomData) { 53 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 54 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 55 | (randomIndex map data.y).toArray 56 | } else { 57 | data.y 58 | } 59 | 60 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 61 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 62 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 63 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 64 | } else { 65 | (null, null, null) 66 | } 67 | 68 | val finalIndex = new ArrayBuffer[Int]() 69 | val uniqueClasses = classesToWorkWith.distinct 70 | 71 | var j = 0 72 | val majorityClassIndex = new ArrayBuffer[Int]() 73 | while (j < classesToWorkWith.length) { 74 | if (classesToWorkWith(j) == untouchableClass) finalIndex += j else majorityClassIndex += j 75 | j += 1 76 | } 77 | 78 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 79 | Some(new KDTree(dataToWorkWith, classesToWorkWith, dataToWorkWith(0).length)) 80 | } else { 81 | None 82 | } 83 | 84 | var i = 0 85 | while (i < uniqueClasses.length) { 86 | val targetClass = uniqueClasses(i) 87 | val selected: Array[(Int, Boolean)] = if (targetClass != untouchableClass) { 88 | majorityClassIndex.par.map { j => 89 | val label = if (dist == Distance.EUCLIDEAN) { 90 | mode(KDTree.get.nNeighbours(dataToWorkWith(j), k)._2.toArray) 91 | } else { 92 | nnRuleHVDM(dataToWorkWith, dataToWorkWith(j), j, classesToWorkWith, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")._1 93 | } 94 | 95 | (j, label == targetClass) 96 | }.toArray 97 | } else { 98 | new Array[(Int, Boolean)](0) 99 | } 100 | 101 | selected.foreach(e => if (e._2) finalIndex += e._1) 102 | 103 | i += 1 104 | } 105 | 106 | val finishTime: Long = System.nanoTime() 107 | 108 | if (verbose) { 109 | val newCounter: Map[Any, Int] = (finalIndex.toArray map classesToWorkWith).groupBy(identity).mapValues(_.length) 110 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 111 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 112 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 113 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 114 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 115 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 116 | } 117 | 118 | new Data(finalIndex.toArray map data.x, finalIndex.toArray map data.y, Some(finalIndex.toArray), data.fileInfo) 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/OSS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and 
Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | /** One-Sided Selection. Original paper: "Addressing the Curse of Imbalanced 25 | * Training Sets: One-Sided Selection" by Miroslav Kubat and Stan Matwin. 26 | * 27 | * @param data data to work with 28 | * @param seed seed to use. If it is not provided, it will use the system time 29 | * @param dist object of Distance enumeration representing the distance to be used 30 | * @param normalize normalize the data or not 31 | * @param randomData iterate through the data randomly or not 32 | * @param verbose choose to display information about the execution or not 33 | * @author Néstor Rodríguez Vico 34 | */ 35 | class OSS(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 36 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 37 | 38 | /** Compute the OSS algorithm. 39 | * 40 | * @return undersampled data structure 41 | */ 42 | def compute(): Data = { 43 | // Note: the notation used to refer to the subsets of data is the one used in the original paper.
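// Outline of the steps below: C starts as all minority ("positive") examples plus one
// randomly chosen majority example; every instance is then classified with the 1-NN
// rule using C as the reference set, the misclassified instances are added to C, and
// finally the TL (Tomek links) step cleans the majority side of the resulting subset.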
44 | val initTime: Long = System.nanoTime() 45 | 46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 47 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 48 | val random: scala.util.Random = new scala.util.Random(seed) 49 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 50 | val classesToWorkWith: Array[Any] = if (randomData) { 51 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 52 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 53 | (randomIndex map data.y).toArray 54 | } else { 55 | data.y 56 | } 57 | 58 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 59 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 60 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 61 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 62 | } else { 63 | (null, null, null) 64 | } 65 | 66 | val positives: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label == untouchableClass => i } 67 | val randomElement: Int = classesToWorkWith.indices.diff(positives)(new util.Random(seed).nextInt(classesToWorkWith.length - positives.length)) 68 | val c: Array[Int] = positives ++ Array(randomElement) 69 | 70 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 71 | Some(new KDTree(c map dataToWorkWith, c map classesToWorkWith, dataToWorkWith(0).length)) 72 | } else { 73 | None 74 | } 75 | 76 | val labels: Seq[(Int, Any)] = if (dist == Distance.EUCLIDEAN) { 77 | dataToWorkWith.indices.map(i => (i, mode(KDTree.get.nNeighbours(dataToWorkWith(i), 1)._2.toArray))) 78 | } else { 79 | val neighbours = c map dataToWorkWith 80 | val classes = c map classesToWorkWith 81 | 82 | dataToWorkWith.indices.map(i => (i, nnRuleHVDM(neighbours, dataToWorkWith(i), c.indexOf(i), classes, 1, data.fileInfo.nominal, 83 | sds, attrCounter, attrClassesCounter, "nearest")._1)) 84 | } 85 | val misclassified: Array[Int] = labels.collect { case (i, label) if label != classesToWorkWith(i) => i }.toArray 86 | val finalC: Array[Int] = (misclassified ++ c).distinct 87 | 88 | val auxData: Data = new Data(x = toXData(finalC map dataToWorkWith), y = finalC map classesToWorkWith, fileInfo = data.fileInfo) 89 | auxData.processedData = finalC map dataToWorkWith 90 | val tl = new TL(auxData, dist = dist, minorityClass = Some(untouchableClass)) 91 | val resultTL: Data = tl.compute() 92 | val finalIndex: Array[Int] = (resultTL.index.get.toList map finalC).toArray 93 | val finishTime: Long = System.nanoTime() 94 | 95 | if (verbose) { 96 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 97 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 98 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 99 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 100 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 101 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 102 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 103 | } 104 | 105 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 106 | } 107 | } 108 | 
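A hedged usage sketch for OSS, assuming `data` was loaded beforehand through soul.io.Reader (whose API is not shown in this section):

import soul.algorithm.undersampling.OSS

val oss = new OSS(data, seed = 1L, verbose = true)
val reduced = oss.compute()
// the returned Data records the indices of the kept instances internally (its index field)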
-------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/SMOTEENN.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.algorithm.undersampling.ENN 20 | import soul.data.Data 21 | import soul.util.KDTree 22 | import soul.util.Utilities.Distance.Distance 23 | import soul.util.Utilities._ 24 | 25 | import scala.util.Random 26 | 27 | /** SMOTEENN algorithm. Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning 28 | * Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 29 | * 30 | * @param data data to work with 31 | * @param seed seed to use. If it is not provided, it will use the system time 32 | * @param percent amount of Smote N% 33 | * @param k number of minority class nearest neighbors 34 | * @param dist object of Distance enumeration representing the distance to be used 35 | * @param normalize normalize the data or not 36 | * @param verbose choose to display information about the execution or not 37 | * @author David López Pretel 38 | */ 39 | class SMOTEENN(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5, 40 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 41 | 42 | /** Compute the SMOTEENN algorithm 43 | * 44 | * @return synthetic samples generated 45 | */ 46 | def compute(): Data = { 47 | val initTime: Long = System.nanoTime() 48 | 49 | if (percent > 100 && percent % 100 != 0) { 50 | throw new Exception("Percent must be a multiple of 100") 51 | } 52 | 53 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 54 | val minorityClassIndex: Array[Int] = minority(data.y) 55 | val minorityClass: Any = data.y(minorityClassIndex(0)) 56 | 57 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 58 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 59 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 60 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 61 | } else { 62 | (null, null, null) 63 | } 64 | 65 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 66 | Some(new KDTree(samples, data.y, samples(0).length)) 67 | } else { 68 | None 69 | } 70 | 71 | // check if the percent is correct 72 | var T: Int = minorityClassIndex.length 73 | var N: Int = percent 74 | 75 | if (N < 100) { 76 | T = N / 100 * T 77 | N = 100 78 | } 79 | N = N / 100 80 | 81 | // output with a size of T*N samples 82 | val output: Array[Array[Double]] = Array.ofDim[Double](N * T, 
samples(0).length) 83 | 84 | val r: Random = new Random(seed) 85 | 86 | // for each minority class sample 87 | minorityClassIndex.indices.par.foreach((i: Int) => { 88 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 89 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray 90 | } else { 91 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 92 | } 93 | 94 | // compute populate for the sample 95 | (0 until N).par.foreach((n: Int) => { 96 | val nn: Int = neighbors(r.nextInt(neighbors.length)) 97 | // compute attributes of the sample 98 | samples(0).indices.foreach((atrib: Int) => { 99 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib) 100 | val gap: Double = r.nextFloat() 101 | output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff 102 | }) 103 | }) 104 | }) 105 | 106 | val result: Array[Array[Double]] = Array.concat(samples, output) 107 | val resultClasses: Array[Any] = Array.concat(data.y, Array.fill(output.length)(minorityClass)) 108 | 109 | val ennData: Data = new Data(x = toXData(result), y = resultClasses, fileInfo = data.fileInfo) 110 | ennData.processedData = result 111 | val enn = new ENN(ennData, dist = dist) 112 | val resultENN: Data = enn.compute() 113 | val finalIndex: Array[Int] = result.indices.diff(resultENN.index.get).toArray 114 | 115 | val finishTime: Long = System.nanoTime() 116 | 117 | if (verbose) { 118 | println("ORIGINAL SIZE: %d".format(data.x.length)) 119 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 120 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 121 | } 122 | 123 | new Data(if (data.nomToNum(0).isEmpty) { 124 | to2Decimals(zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs)) 125 | } else { 126 | toNominal(zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs), data.nomToNum) 127 | }, finalIndex map resultClasses, None, data.fileInfo) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/IHTS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities._ 21 | import weka.classifiers.trees.J48 22 | import weka.core.Instances 23 | 24 | 25 | /** Instance Hardness Threshold. Original paper: "An Empirical Study of Instance Hardness" by Michael R. Smith, 26 | * Tony Martinez and Christophe Giraud-Carrier. 27 | * 28 | * @param data data to work with 29 | * @param seed seed to use. 
If it is not provided, it will use the system time 30 | * @param nFolds number of subsets to create when applying cross-validation 31 | * @param normalize normalize the data or not 32 | * @param randomData iterate through the data randomly or not 33 | * @param verbose choose to display information about the execution or not 34 | * @author Néstor Rodríguez Vico 35 | */ 36 | class IHTS(data: Data, seed: Long = System.currentTimeMillis(), nFolds: Int = 5, 37 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 38 | 39 | /** Compute the IHTS algorithm. 40 | * 41 | * @return undersampled data structure 42 | */ 43 | def compute(): Data = { 44 | val initTime: Long = System.nanoTime() 45 | 46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 47 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 48 | val random: scala.util.Random = new scala.util.Random(seed) 49 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 50 | val classesToWorkWith: Array[Any] = if (randomData) { 51 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 52 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 53 | (randomIndex map data.y).toArray 54 | } else { 55 | data.y 56 | } 57 | 58 | // Each element holds the indices of the test elements of one fold 59 | val indices: Array[Array[Int]] = random.shuffle(classesToWorkWith.indices.toList).toArray.grouped((classesToWorkWith.length.toFloat / nFolds).ceil.toInt).toArray 60 | val probabilities: Array[Double] = new Array[Double](classesToWorkWith.length) 61 | 62 | indices.foreach { testIndex: Array[Int] => 63 | val trainIndex: Array[Int] = classesToWorkWith.indices.diff(testIndex).toArray 64 | 65 | val j48: J48 = new J48 66 | j48.setOptions(Array("-U", "-M", "1")) 67 | 68 | val trainInstances: Instances = buildInstances(data = trainIndex map dataToWorkWith, 69 | classes = trainIndex map classesToWorkWith, fileInfo = data.fileInfo) 70 | val testInstances: Instances = buildInstances(data = testIndex map dataToWorkWith, 71 | classes = testIndex map classesToWorkWith, fileInfo = data.fileInfo) 72 | 73 | j48.buildClassifier(trainInstances) 74 | 75 | val probs: Array[Array[Double]] = testIndex.indices.map((i: Int) => j48.distributionForInstance(testInstances.instance(i))).toArray 76 | val classes: Array[Any] = (testIndex map classesToWorkWith).distinct 77 | val values: Array[Double] = (testIndex map classesToWorkWith).zipWithIndex.map((e: (Any, Int)) => probs(e._2)(classes.indexOf(e._1))) 78 | 79 | (testIndex zip values).foreach((i: (Int, Double)) => probabilities(i._1) = i._2) 80 | } 81 | 82 | val finalIndex: Array[Int] = classesToWorkWith.distinct.flatMap { targetClass: Any => 83 | val indexTargetClass: Array[Int] = if (targetClass != untouchableClass) { 84 | val nSamples: Int = counter(untouchableClass) 85 | val targetIndex: Array[Int] = boolToIndex(classesToWorkWith.map((c: Any) => c == targetClass)) 86 | val targetProbabilities: Array[Double] = targetIndex map probabilities 87 | val percentile: Double = (1.0 - (nSamples.toDouble / counter(targetClass))) * 100.0 88 | val threshold: Double = targetProbabilities.sorted.apply(math.ceil((targetProbabilities.length - 1) * (percentile / 100.0)).toInt) 89 | boolToIndex((targetIndex map probabilities).map((e: Double) => e >= threshold)) 90 | } 91 | else { 92 | classesToWorkWith.zipWithIndex.collect { case (c, i) if c == targetClass => i } 93 | } 94 | 95 | indexTargetClass 96 | } 97 | 98 |
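// Worked example of the thresholding above: with 100 minority and 400 majority
// examples, percentile = (1.0 - 100.0 / 400) * 100.0 = 75.0, so the threshold sits at
// the 75th percentile of the majority class probabilities and only the ~25% of
// majority instances classified correctly with the highest confidence (the "easiest"
// ones, i.e. lowest instance hardness) are kept.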
val finishTime: Long = System.nanoTime() 99 | 100 | if (verbose) { 101 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 102 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 103 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 104 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 105 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 106 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 107 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 108 | } 109 | 110 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/SMOTETL.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.algorithm.undersampling.TL 20 | import soul.data.Data 21 | import soul.util.KDTree 22 | import soul.util.Utilities.Distance.Distance 23 | import soul.util.Utilities._ 24 | 25 | import scala.util.Random 26 | 27 | /** SMOTETL algorithm. Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning 28 | * Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 29 | * 30 | * @param data data to work with 31 | * @param seed seed to use. 
If it is not provided, it will use the system time 32 | * @param percent Amount of Smote N% 33 | * @param k Number of minority class nearest neighbors 34 | * @param dist object of Distance enumeration representing the distance to be used 35 | * @param normalize normalize the data or not 36 | * @param verbose choose to display information about the execution or not 37 | * @author David López Pretel 38 | */ 39 | class SMOTETL(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5, 40 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 41 | 42 | /** Compute the SMOTETL algorithm 43 | * 44 | * @return synthetic samples generated 45 | */ 46 | def compute(): Data = { 47 | val initTime: Long = System.nanoTime() 48 | 49 | if (percent > 100 && percent % 100 != 0) { 50 | throw new Exception("Percent must be a multiple of 100") 51 | } 52 | 53 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 54 | // compute minority class 55 | val minorityClassIndex: Array[Int] = minority(data.y) 56 | val minorityClass: Any = data.y(minorityClassIndex(0)) 57 | 58 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 59 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 60 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 61 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 62 | } else { 63 | (null, null, null) 64 | } 65 | 66 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 67 | Some(new KDTree(samples, data.y, samples(0).length)) 68 | } else { 69 | None 70 | } 71 | 72 | // check if the percent is correct 73 | var T: Int = minorityClassIndex.length 74 | var N: Int = percent 75 | 76 | if (N < 100) { 77 | T = N / 100 * T 78 | N = 100 79 | } 80 | N = N / 100 81 | 82 | // output with a size of T*N samples 83 | val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length) 84 | 85 | val r: Random = new Random(seed) 86 | 87 | // for each minority class sample 88 | minorityClassIndex.indices.par.foreach((i: Int) => { 89 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 90 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray 91 | } else { 92 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 93 | } 94 | 95 | // compute populate for the sample 96 | (0 until N).par.foreach((n: Int) => { 97 | val nn: Int = neighbors(r.nextInt(neighbors.length)) 98 | // compute attributes of the sample 99 | samples(0).indices.foreach((atrib: Int) => { 100 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib) 101 | val gap: Double = r.nextFloat() 102 | output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff 103 | }) 104 | }) 105 | }) 106 | val result: Array[Array[Double]] = Array.concat(samples, output) 107 | val resultClasses: Array[Any] = Array.concat(data.y, Array.fill(output.length)(minorityClass)) 108 | 109 | val tlData: Data = new Data(x = toXData(result), y = resultClasses, fileInfo = data.fileInfo) 110 | tlData.processedData = result 111 | val tl = new TL(tlData, dist = dist, ratio = "all") 112 | val resultTL: Data = tl.compute() 113 | val finalIndex: Array[Int] = result.indices.diff(resultTL.index.get).toArray 114 | 115 | val finishTime: Long = System.nanoTime() 116 | 117 | 
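// `finalIndex` now selects the subset of `result` (original plus synthetic samples)
// remaining after the Tomek-link post-processing performed by TL with ratio = "all".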
119 | if (verbose) {
120 | println("ORIGINAL SIZE: %d".format(data.x.length))
121 | println("NEW DATA SIZE: %d".format(finalIndex.length))
122 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
123 | }
124 | 
125 | new Data(if (data.nomToNum(0).isEmpty) {
126 | to2Decimals(if (normalize) zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else finalIndex map result)
127 | } else {
128 | toNominal(if (normalize) zeroOneDenormalization(finalIndex map result, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else finalIndex map result, data.nomToNum)
129 | }, finalIndex map resultClasses, None, data.fileInfo)
130 | }
131 | }
132 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/SafeLevelSMOTE.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.oversampling
18 | 
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 | 
24 | import scala.util.Random
25 | 
26 | /** SafeLevel-SMOTE algorithm. Original paper: "Safe-Level-SMOTE: Safe-Level-Synthetic Minority Over-Sampling Technique
27 | * for Handling the Class Imbalanced Problem" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran, and Chidchanok Lursinsap.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use.
If it is not provided, it will use the system time
31 | * @param k number of nearest neighbors
32 | * @param dist object of Distance enumeration representing the distance to be used
33 | * @param normalize normalize the data or not
34 | * @param verbose choose to display information about the execution or not
35 | * @author David López Pretel
36 | */
37 | class SafeLevelSMOTE(data: Data, seed: Long = System.currentTimeMillis(), k: Int = 5,
38 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
39 | 
40 | /** Compute the SafeLevelSMOTE algorithm
41 | *
42 | * @return synthetic samples generated
43 | */
44 | def compute(): Data = {
45 | val initTime: Long = System.nanoTime()
46 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
47 | // compute the minority class
48 | val minorityClassIndex: Array[Int] = minority(data.y)
49 | val minorityClass: Any = data.y(minorityClassIndex(0))
50 | 
51 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
52 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
53 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
54 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
55 | } else {
56 | (null, null, null)
57 | }
58 | 
59 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
60 | Some(new KDTree(samples, data.y, samples(0).length))
61 | } else {
62 | None
63 | }
64 | 
65 | val r: Random = new Random(seed)
66 | 
67 | val output: Array[Array[Double]] = minorityClassIndex.indices.par.map(i => {
68 | // compute the k neighbours of p and count how many of them belong to the minority class
69 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
70 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
71 | } else {
72 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
73 | }
74 | val n: Int = neighbors(r.nextInt(neighbors.length))
75 | val slp: Int = neighbors.count(neighbor => data.y(neighbor) == minorityClass)
76 | // compute the k neighbours of n and count how many of them belong to the minority class
77 | val selectedNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
78 | KDTree.get.nNeighbours(samples(n), k)._3.toArray
79 | } else {
80 | kNeighborsHVDM(samples, n, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
81 | }
82 | val sln: Int = selectedNeighbors.count(neighbor => data.y(neighbor) == minorityClass)
83 | // the safe level ratio is kept local to each sample so the parallel map stays race-free;
84 | // it is infinite when n has no minority class neighbours
85 | val slRatio: Double = if (sln != 0) slp.toDouble / sln else Double.PositiveInfinity
86 | if (slRatio.isPosInfinity && slp == 0) {
87 | // case 1: do not create a synthetic instance
88 | None
89 | } else {
90 | // calculate the synthetic sample
91 | Some(samples(minorityClassIndex(i)).indices.map(atrib => {
92 | val gap: Double = if (slRatio.isPosInfinity) {
93 | 0.0 // case 2: replicate p
94 | } else if (slRatio == 1) {
95 | r.nextFloat() // case 3: gap in [0, 1]
96 | } else if (slRatio > 1) {
97 | r.nextFloat() * (1 / slRatio) // case 4: gap in [0, 1 / slRatio], closer to p
98 | } else {
99 | 1 - slRatio + r.nextFloat() * slRatio // case 5: gap in [1 - slRatio, 1], closer to n
100 | }
101 | val diff: Double = samples(n)(atrib) - samples(minorityClassIndex(i))(atrib)
102 | samples(minorityClassIndex(i))(atrib) + gap * diff
103 | }).toArray)
104 | }
105 | }).flatten.toArray
106 | 
107 | val finishTime: Long = System.nanoTime()
108 | 
109 | if (verbose) {
110 | println("ORIGINAL SIZE: %d".format(data.x.length))
111 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
112 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
113 | }
114 | 
115 | new Data(if (data.fileInfo.nominal.length == 0) {
116 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
117 | data.fileInfo.minAttribs) else output))
118 | } else {
119 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
120 | data.fileInfo.minAttribs) else output), data.nomToNum)
121 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
122 | }
123 | }
124 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/ADOMS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.oversampling
18 | 
19 | import breeze.linalg.{DenseMatrix, eigSym}
20 | import soul.data.Data
21 | import soul.util.KDTree
22 | import soul.util.Utilities.Distance.Distance
23 | import soul.util.Utilities._
24 | 
25 | import scala.util.Random
26 | 
27 | /** ADOMS algorithm. Original paper: "The Generation Mechanism of Synthetic Minority Class Examples" by Sheng TANG
28 | * and Si-ping CHEN.
29 | *
30 | * @param data data to work with
31 | * @param seed seed to use.
If it is not provided, it will use the system time
32 | * @param percent amount of samples N%
33 | * @param k number of neighbors
34 | * @param dist object of Distance enumeration representing the distance to be used
35 | * @param normalize normalize the data or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author David López Pretel
38 | */
39 | class ADOMS(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 300, k: Int = 5,
40 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
41 | 
42 | /** Compute the first principal component axis
43 | *
44 | * @param A the data
45 | * @return the first principal component axis
46 | */
47 | private def PCA(A: Array[Array[Double]]): Array[Double] = {
48 | val mean: Array[Double] = A.transpose.map(_.sum / A.length)
49 | // subtract the mean from the data
50 | val dataNoMean: DenseMatrix[Double] = DenseMatrix(A: _*) -:- DenseMatrix(A.map(_ => mean): _*)
51 | // get the covariance matrix: divide Z^T * Z element-wise by N
52 | val oneDividedByN: Array[Array[Double]] = Array.fill(dataNoMean.cols, dataNoMean.cols)(dataNoMean.rows)
53 | val S: DenseMatrix[Double] = (dataNoMean.t * dataNoMean) /:/ DenseMatrix(oneDividedByN: _*)
54 | // compute the eigenvectors and eigenvalues of S
55 | val eigen = eigSym(S)
56 | // eigSym returns the eigenvalues in ascending order with the eigenvectors as columns, so the
57 | // first principal component axis is the column paired with the largest eigenvalue
58 | eigen.eigenvectors(::, eigen.eigenvalues.length - 1).toArray
59 | }
60 | 
61 | /** Compute the ADOMS algorithm
62 | *
63 | * @return synthetic samples generated
64 | */
65 | def compute(): Data = {
66 | val initTime: Long = System.nanoTime()
67 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
68 | val minorityClassIndex: Array[Int] = minority(data.y)
69 | val minorityClass: Any = data.y(minorityClassIndex(0))
70 | // output with a size of T*N samples
71 | val output: Array[Array[Double]] = Array.ofDim(minorityClassIndex.length * percent / 100, samples(0).length)
72 | // random generator used to pick the neighbour for each synthetic sample
73 | val r: Random = new Random(seed)
74 | 
75 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
76 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
77 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
78 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
79 | } else {
80 | (null, null, null)
81 | }
82 | 
83 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
84 | Some(new KDTree(samples, data.y, samples(0).length))
85 | } else {
86 | None
87 | }
88 | 
89 | val N: Int = percent / 100
90 | 
91 | (0 until N).par.foreach(nn => {
92 | // for each minority class sample
93 | minorityClassIndex.zipWithIndex.par.foreach(i => {
94 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
95 | KDTree.get.nNeighbours(samples(i._1), k)._3.toArray
96 | } else {
97 | kNeighborsHVDM(samples, i._1, k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
98 | }
99 | 
100 | val n: Int = r.nextInt(neighbors.length)
101 | 
102 | // calculate the first principal component axis of the local data distribution
103 | val l2: Array[Double] = PCA(neighbors map samples)
104 | // dotMN: scalar product of (x - n) with l2, the numerator of the projection coefficient
105 | val dotMN: Double = l2.indices.map(j => {
106 | samples(i._1)(j) - samples(neighbors(n))(j)
107 | }).toArray.zipWithIndex.map(j => {
108 | j._1 * l2(j._2)
109 | }).sum
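// Added commentary (not in the original sources): dotMN is the scalar product of (x - n)
// with the axis l2 and dotMM below is |l2|^2, so dotMN / dotMM * l2 shifts x along the
// first principal axis; the second assignment then pulls the projected point a random
// fraction of the way back towards x.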
110 | val dotMM: Double = l2.map(x => x * x).sum 111 | // create synthetic sample 112 | output(nn * minorityClassIndex.length + i._2) = l2.indices.map(j => samples(i._1)(j) + dotMN / dotMM * l2(j)).toArray 113 | output(nn * minorityClassIndex.length + i._2) = output(nn * minorityClassIndex.length + i._2).indices.map(j => output(nn * minorityClassIndex.length + i._2)(j) + (samples(i._1)(j) - output(nn * minorityClassIndex.length + i._2)(j)) * r.nextFloat()).toArray 114 | }) 115 | }) 116 | 117 | val finishTime: Long = System.nanoTime() 118 | 119 | if (verbose) { 120 | println("ORIGINAL SIZE: %d".format(data.x.length)) 121 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 122 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 123 | } 124 | 125 | new Data(if (data.fileInfo.nominal.length == 0) { 126 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 127 | data.fileInfo.minAttribs) else output)) 128 | } else { 129 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 130 | data.fileInfo.minAttribs) else output), data.nomToNum) 131 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/BorderlineSMOTE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.util.Random 25 | 26 | /** Borderline-SMOTE algorithm. Original paper: "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets 27 | * Learning." by Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. 
If it is not provided, it will use the system time
31 | * @param m number of nearest neighbors
32 | * @param k number of minority class nearest neighbors
33 | * @param dist object of Distance enumeration representing the distance to be used
34 | * @param normalize normalize the data or not
35 | * @param verbose choose to display information about the execution or not
36 | * @author David López Pretel
37 | */
38 | class BorderlineSMOTE(data: Data, seed: Long = System.currentTimeMillis(), m: Int = 10, k: Int = 5,
39 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
40 | 
41 | /** Compute the BorderlineSMOTE algorithm
42 | *
43 | * @return synthetic samples generated
44 | */
45 | def compute(): Data = {
46 | val initTime: Long = System.nanoTime()
47 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
48 | val minorityClassIndex: Array[Int] = minority(data.y)
49 | val minorityClass: Any = data.y(minorityClassIndex(0))
50 | 
51 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
52 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
53 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
54 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
55 | } else {
56 | (null, null, null)
57 | }
58 | 
59 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
60 | Some(new KDTree(samples, data.y, samples(0).length))
61 | } else {
62 | None
63 | }
64 | 
65 | val KDTreeMinority: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
66 | Some(new KDTree(minorityClassIndex map samples, minorityClassIndex map data.y, samples(0).length))
67 | } else {
68 | None
69 | }
70 | 
71 | // compute the m nearest neighbours of each minority class sample
72 | val minorityClassNeighbors: Array[Array[Int]] = new Array[Array[Int]](minorityClassIndex.length)
73 | if (dist == Distance.EUCLIDEAN) {
74 | minorityClassIndex.indices.par.foreach(i => minorityClassNeighbors(i) = KDTree.get.nNeighbours(samples(minorityClassIndex(i)), m)._3.toArray)
75 | } else {
76 | minorityClassIndex.indices.par.foreach(i => minorityClassNeighbors(i) = kNeighborsHVDM(samples, minorityClassIndex(i), m, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter))
77 | }
78 | 
79 | // compute the nodes in the borderline
80 | val DangerNodes: Array[Int] = minorityClassNeighbors.map(neighbors => {
81 | var counter = 0
82 | neighbors.foreach(neighbor => {
83 | if (data.y(neighbor) != minorityClass) {
84 | counter += 1
85 | }
86 | })
87 | counter
88 | }).zipWithIndex.map(nNonMinorityClass => {
89 | if (nNonMinorityClass._1 >= (m / 2) && nNonMinorityClass._1 < m) {
90 | Some(nNonMinorityClass._2)
91 | } else {
92 | None
93 | }
94 | }).flatten.map(minorityClassIndex(_))
95 | 
96 | val r: Random = new Random(seed)
97 | val s: Int = r.nextInt(k) + 1
98 | 
99 | // output with a size of s * DangerNodes.length samples
100 | val output: Array[Array[Double]] = Array.ofDim(s * DangerNodes.length, samples(0).length)
101 | 
102 | // for each danger node
103 | DangerNodes.zipWithIndex.par.foreach(i => {
104 | val neighbors = if (dist == Distance.EUCLIDEAN) {
105 | KDTreeMinority.get.nNeighbours(samples(i._1), k)._3.toArray
106 | } else {
107 | // both branches return indices into the minority class subset
108 | kNeighborsHVDM(minorityClassIndex map samples, minorityClassIndex.indexOf(i._1), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
109 | }
110 | val sNeighbors:
Array[Int] = (0 until s).map(_ => r.nextInt(neighbors.length)).toArray
111 | // run the Populate step for the sample
112 | (sNeighbors map neighbors).zipWithIndex.par.foreach(j => {
113 | // calculate the attributes of the sample
114 | samples(i._1).indices.foreach(attrib => {
115 | val diff: Double = samples(minorityClassIndex(j._1))(attrib) - samples(i._1)(attrib)
116 | val gap: Float = r.nextFloat
117 | output(i._2 * s + j._2)(attrib) = samples(i._1)(attrib) + gap * diff
118 | })
119 | })
120 | })
121 | 
122 | val finishTime: Long = System.nanoTime()
123 | 
124 | if (verbose) {
125 | println("ORIGINAL SIZE: %d".format(data.x.length))
126 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
127 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
128 | }
129 | 
130 | new Data(if (data.fileInfo.nominal.length == 0) {
131 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
132 | data.fileInfo.minAttribs) else output))
133 | } else {
134 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
135 | data.fileInfo.minAttribs) else output), data.nomToNum)
136 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
137 | }
138 | }
139 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/TL.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.undersampling
18 | 
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 | 
23 | /** Tomek Link. Original paper: "Two Modifications of CNN" by Ivan Tomek.
24 | *
25 | * @param data data to work with
26 | * @param seed seed to use. If it is not provided, it will use the system time
27 | * @param dist object of Distance enumeration representing the distance to be used
28 | * @param ratio indicates the instances of the Tomek Links that are going to be removed. "all" will remove all instances,
29 | * "minority" will remove instances of the minority class and "not minority" will remove all the instances
30 | * except the ones of the minority class.
31 | * @param minorityClass minority class.
If set to None, it will be computed 32 | * @param normalize normalize the data or not 33 | * @param randomData iterate through the data randomly or not 34 | * @param verbose choose to display information about the execution or not 35 | * @author Néstor Rodríguez Vico 36 | */ 37 | class TL(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, ratio: String = "not minority", 38 | val minorityClass: Option[Any] = None, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 39 | 40 | /** Compute the TL algorithm. 41 | * 42 | * @return undersampled data structure 43 | */ 44 | def compute(): Data = { 45 | val initTime: Long = System.nanoTime() 46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 47 | val untouchableClass: Any = if (minorityClass.isDefined) minorityClass.get else counter.minBy((c: (Any, Int)) => c._2)._1 48 | val random: scala.util.Random = new scala.util.Random(seed) 49 | 50 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 51 | val classesToWorkWith: Array[Any] = if (randomData) { 52 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 53 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 54 | (randomIndex map data.y).toArray 55 | } else { 56 | data.y 57 | } 58 | 59 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 60 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 61 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 62 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 63 | } else { 64 | (null, null, null) 65 | } 66 | 67 | val candidates: Map[Any, Array[Int]] = classesToWorkWith.distinct.map { 68 | c: Any => 69 | c -> classesToWorkWith.zipWithIndex.collect { 70 | case (a, b) if a != c => b 71 | } 72 | }.toMap 73 | 74 | val distances: Array[Array[Double]] = Array.fill[Array[Double]](dataToWorkWith.length)(new Array[Double](dataToWorkWith.length)) 75 | 76 | if (dist == Distance.EUCLIDEAN) { 77 | dataToWorkWith.indices.par.foreach { i: Int => 78 | dataToWorkWith.indices.drop(i).par.foreach { j: Int => 79 | distances(i)(j) = euclidean(dataToWorkWith(i), dataToWorkWith(j)) 80 | distances(j)(i) = distances(i)(j) 81 | } 82 | } 83 | } else { 84 | dataToWorkWith.indices.par.foreach { i: Int => 85 | dataToWorkWith.indices.drop(i).par.foreach { j: Int => 86 | distances(i)(j) = HVDM(dataToWorkWith(i), dataToWorkWith(j), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 87 | distances(j)(i) = distances(i)(j) 88 | } 89 | } 90 | } 91 | 92 | // Look for the nearest neighbour in the rest of the classes 93 | val nearestNeighbour: Array[Int] = distances.zipWithIndex.map((row: (Array[Double], Int)) => row._1.indexOf((candidates(classesToWorkWith(row._2)) map row._1).min)) 94 | // For each instance, I: If my nearest neighbour is J and the nearest neighbour of J it's me, I, I and J form a Tomek link 95 | val tomekLinks: Array[(Int, Int)] = nearestNeighbour.zipWithIndex.filter((pair: (Int, Int)) => nearestNeighbour(pair._1) == pair._2) 96 | val targetInstances: Array[Int] = tomekLinks.flatMap((x: (Int, Int)) => List(x._1, x._2)).distinct 97 | // but the user can choose which of them should be removed 98 | val removedInstances: Array[Int] = if (ratio == "all") targetInstances else if (ratio == "minority") 99 | 
targetInstances.collect {
100 | case i if classesToWorkWith(i) == untouchableClass => i
101 | } else if (ratio == "not minority")
102 | targetInstances.collect {
103 | case i if classesToWorkWith(i) != untouchableClass => i
104 | } else
105 | throw new Exception("Incorrect value of ratio. Possible options: all, minority, not minority")
106 | val finalIndex: Array[Int] = dataToWorkWith.indices.diff(removedInstances).toArray
107 | val finishTime: Long = System.nanoTime()
108 | 
109 | if (verbose) {
110 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
111 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
112 | println("NEW DATA SIZE: %d".format(finalIndex.length))
113 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
114 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
115 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
116 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
117 | println("REMOVED INSTANCES: %s".format(ratio))
118 | }
119 | 
120 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
121 | }
122 | }
123 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/ADASYN.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.oversampling
18 | 
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 | 
24 | import scala.util.Random
25 | 
26 | /** ADASYN algorithm. Original paper: "ADASYN: Adaptive Synthetic Sampling Approach for Imbalanced Learning" by Haibo He,
27 | * Yang Bai, Edwardo A. Garcia, and Shutao Li.
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use.
If it is not provided, it will use the system time
31 | * @param d preset threshold for the maximum tolerated degree of class imbalance ratio
32 | * @param B balance level after generation of synthetic data
33 | * @param k number of neighbors
34 | * @param dist object of Distance enumeration representing the distance to be used
35 | * @param normalize normalize the data or not
36 | * @param verbose choose to display information about the execution or not
37 | * @author David López Pretel
38 | */
39 | class ADASYN(data: Data, seed: Long = System.currentTimeMillis(), d: Double = 1, B: Double = 1, k: Int = 5,
40 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
41 | 
42 | /** Compute the ADASYN algorithm
43 | *
44 | * @return synthetic samples generated
45 | */
46 | def compute(): Data = {
47 | val initTime: Long = System.nanoTime()
48 | 
49 | if (B > 1 || B < 0) {
50 | throw new Exception("B must be between 0 and 1, both included")
51 | }
52 | 
53 | if (d > 1 || d <= 0) {
54 | throw new Exception("d must be between 0 and 1, zero not included")
55 | }
56 | 
57 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
58 | 
59 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
60 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
61 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
62 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
63 | } else {
64 | (null, null, null)
65 | }
66 | 
67 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
68 | Some(new KDTree(samples, data.y, samples(0).length))
69 | } else {
70 | None
71 | }
72 | 
73 | val minorityClassIndex: Array[Int] = minority(data.y)
74 | val minorityClass: Any = data.y(minorityClassIndex(0))
75 | 
76 | // calculate the size of the output
77 | val ms: Int = minorityClassIndex.length
78 | val ml: Int = data.y.length - ms
79 | val G: Int = ((ml - ms) * B).toInt
80 | 
81 | // k neighbors of each minority sample
82 | val neighbors: Array[Array[Int]] = new Array[Array[Int]](minorityClassIndex.length)
83 | minorityClassIndex.indices.par.foreach { i =>
84 | if (dist == Distance.EUCLIDEAN) {
85 | neighbors(i) = KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
86 | } else {
87 | neighbors(i) = kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
88 | }
89 | }
90 | 
91 | // ratio of each minority sample
92 | val ratio: Array[Double] = new Array[Double](neighbors.length)
93 | neighbors.zipWithIndex.par.foreach(neighborsOfX => {
94 | ratio(neighborsOfX._2) = neighborsOfX._1.map(neighbor => {
95 | if (data.y(neighbor) != minorityClass) 1 else 0
96 | }).sum.toDouble / k
97 | })
98 | 
99 | // normalize the ratios
100 | val sumRatios: Double = ratio.sum
101 | ratio.indices.par.foreach(i => ratio(i) = ratio(i) / sumRatios)
102 | 
103 | // number of synthetic samples for each sample
104 | val g: Array[Int] = new Array[Int](ratio.length)
105 | ratio.zipWithIndex.par.foreach(ri => g(ri._2) = (ri._1 * G).toInt)
106 | 
107 | // output with a size of sum(Gi) samples
108 | val output: Array[Array[Double]] = Array.ofDim(g.sum, samples(0).length)
109 | 
110 | val r: Random = new Random(seed)
111 | // precompute the offset of each sample's block of synthetic instances in the output array
112 | 
113 | var counter: Int =
0
114 | val increment: Array[Int] = new Array[Int](g.length)
115 | var i = 0
116 | while (i < g.length) {
117 | increment(i) = counter
118 | counter += g(i)
119 | i += 1
120 | }
121 | 
122 | // for each minority class sample, create gi synthetic samples
123 | minorityClassIndex.indices.zip(increment).foreach(xi => {
124 | (0 until g(xi._1)).foreach(n => {
125 | // pick a single random neighbour xzi per synthetic sample, then interpolate every attribute
126 | val nn: Int = neighbors(xi._1)(r.nextInt(neighbors(xi._1).length))
127 | // compute synthetic sample si = (xzi - xi) * lambda + xi
128 | samples(0).indices.foreach(atrib => {
129 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(xi._1))(atrib)
130 | val gap: Float = r.nextFloat
131 | output(xi._2 + n)(atrib) = samples(minorityClassIndex(xi._1))(atrib) + gap * diff
132 | })
133 | })
134 | })
135 | 
136 | val finishTime: Long = System.nanoTime()
137 | 
138 | if (verbose) {
139 | println("ORIGINAL SIZE: %d".format(data.x.length))
140 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
141 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
142 | }
143 | 
144 | new Data(if (data.fileInfo.nominal.length == 0) {
145 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
146 | data.fileInfo.minAttribs) else output))
147 | } else {
148 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
149 | data.fileInfo.minAttribs) else output), data.nomToNum)
150 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
151 | }
152 | }
153 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/oversampling/MDO.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.oversampling
18 | 
19 | import breeze.linalg.{DenseMatrix, DenseVector, eigSym, inv, sum}
20 | import soul.data.Data
21 | import soul.util.Utilities._
22 | 
23 | import scala.util.Random
24 | 
25 | /** MDO algorithm. Original paper: "To combat multi-class imbalanced problems by means of over-sampling and boosting
26 | * techniques" by Lida Abdi and Sattar Hashemi.
27 | *
28 | * @param data data to work with
29 | * @param seed seed to use.
If it is not provided, it will use the system time
30 | * @param normalize normalize the data or not
31 | * @param verbose choose to display information about the execution or not
32 | * @author David López Pretel
33 | */
34 | class MDO(data: Data, seed: Long = System.currentTimeMillis(), normalize: Boolean = false, verbose: Boolean = false) {
35 | 
36 | /** Compute the MDO algorithm
37 | *
38 | * @return synthetic samples generated
39 | */
40 | def compute(): Data = {
41 | val initTime: Long = System.nanoTime()
42 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
43 | // compute the minority class
44 | val minorityClassIndex: Array[Int] = minority(data.y)
45 | val minorityClass: Any = data.y(minorityClassIndex(0))
46 | // compute the majority class
47 | val majorityClassIndex: Array[Int] = samples.indices.diff(minorityClassIndex.toList).toArray
48 | 
49 | // compute the mean for the values of each attribute
50 | val mean: Array[Double] = (minorityClassIndex map samples).transpose.map(_.sum / minorityClassIndex.length)
51 | 
52 | // subtract the mean from every attribute and then compute the covariance matrix
53 | val Zi: DenseMatrix[Double] = DenseMatrix(minorityClassIndex map samples: _*) -:- DenseMatrix(minorityClassIndex.map(_ => mean): _*)
54 | val oneDividedByN: Array[Array[Double]] = Array.fill(Zi.cols, Zi.cols)(Zi.rows)
55 | val S: DenseMatrix[Double] = (Zi.t * Zi) /:/ DenseMatrix(oneDividedByN: _*)
56 | // compute the eigenvectors and eigenvalues of S
57 | val eigen = eigSym(S)
58 | // the eigenvectors form the columns of the matrix that performs the change of basis
59 | val Ti: DenseMatrix[Double] = (eigen.eigenvectors * Zi.t).t
60 | // the diagonal holds the eigenvalues
61 | val V: DenseVector[Double] = eigen.eigenvalues
62 | 
63 | // compute the new samples
64 | val newSamples: Array[Array[Double]] = MDO_oversampling(Ti, mean, V, majorityClassIndex.length - minorityClassIndex.length, seed)
65 | 
66 | // transform the samples back to the original basis
67 | val newSamplesToOriginalSpace: DenseMatrix[Double] = (inv(eigen.eigenvectors) * DenseMatrix(newSamples: _*).t).t
68 | 
69 | // add the mean back
70 | val samplesWithMean: DenseMatrix[Double] = newSamplesToOriginalSpace +:+ DenseMatrix((0 until newSamplesToOriginalSpace.rows).map(_ => mean): _*)
71 | 
72 | // the output
73 | val output: Array[Array[Double]] = Array.range(0, samplesWithMean.rows).map(i => samplesWithMean(i, ::).t.toArray)
74 | 
75 | val finishTime: Long = System.nanoTime()
76 | 
77 | if (verbose) {
78 | println("ORIGINAL SIZE: %d".format(data.x.length))
79 | println("NEW DATA SIZE: %d".format(data.x.length + output.length))
80 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
81 | }
82 | 
83 | new Data(if (data.fileInfo.nominal.length == 0) {
84 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
85 | data.fileInfo.minAttribs) else output))
86 | } else {
87 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs,
88 | data.fileInfo.minAttribs) else output), data.nomToNum)
89 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo)
90 | }
91 | 
92 | /** create the new samples for the MDO algorithm
93 | *
94 | * @param Ti the samples after the change of basis
95 | * @param mean the mean of every attribute
96 | * @param V the vector of eigenvalues
97 | * @param Orate majoritySamples -
minoritySamples
98 | * @param seed seed to use. If it is not provided, it will use the system time
99 | * @return the new samples generated
100 | */
101 | def MDO_oversampling(Ti: DenseMatrix[Double], mean: Array[Double], V: DenseVector[Double], Orate: Int, seed: Long): Array[Array[Double]] = {
102 | // check the number of new samples to be created
103 | var I: Int = Ti.rows
104 | var N: Int = Orate / I
105 | if (I > Orate) {
106 | N = 1
107 | I = Orate
108 | }
109 | 
110 | val output: Array[Array[Double]] = Array.fill(Orate, Ti.cols)(0.0)
111 | var newIndex: Int = 0
112 | val rand: Random.type = scala.util.Random
113 | rand.setSeed(seed)
114 | 
115 | (0 until I).foreach(i => {
116 | // element-wise square of the sample
117 | val x: DenseVector[Double] = Ti(i, ::).t *:* Ti(i, ::).t
118 | // vector resulting from alpha * V, which forms the denominators of the ellipse equation
119 | val alpha: Double = sum(x /:/ V)
120 | val alphaV: DenseVector[Double] = V *:* alpha
121 | (0 until N).foreach(_ => {
122 | var s: Double = 0.0
123 | (0 until Ti.cols - 1).foreach(p => {
124 | // random number between -alphaV(p) / (Ti.cols - 1) and alphaV(p) / (Ti.cols - 1)
125 | val r: Double = -alphaV(p) / (Ti.cols - 1) + rand.nextFloat() * (alphaV(p) / (Ti.cols - 1) + alphaV(p) / (Ti.cols - 1))
126 | // this number is the value for the attribute p
127 | output(newIndex)(p) = r
128 | // accumulate the partial sum needed to compute the last attribute
129 | s = s + (r * r / alphaV(p))
130 | })
131 | // compute the last attribute from the remainder of the ellipse equation
132 | val lastFeaVal: Double = (1 - s) * alphaV(alphaV.length - 1)
133 | output(newIndex)(alphaV.size - 1) = if (rand.nextInt() % 2 == 0) -lastFeaVal else lastFeaVal
134 | newIndex += 1
135 | })
136 | })
137 | output
138 | }
139 | }
140 | 
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/ClusterOSS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.undersampling
18 | 
19 | import soul.data.Data
20 | import soul.util.KDTree
21 | import soul.util.Utilities.Distance.Distance
22 | import soul.util.Utilities._
23 | 
24 | /** ClusterOSS. Original paper: "ClusterOSS: a new undersampling method for imbalanced learning."
25 | * by Victor H Barella, Eduardo P Costa and André C P L F Carvalho.
26 | *
27 | * @param data data to work with
28 | * @param seed seed to use.
If it is not provided, it will use the system time 29 | * @param dist object of Distance enumeration representing the distance to be used 30 | * @param numClusters number of clusters to be created by KMeans algorithm 31 | * @param restarts number of times to relaunch KMeans algorithm 32 | * @param minDispersion stop KMeans core if dispersion is lower than this value 33 | * @param maxIterations number of iterations to be done in KMeans algorithm 34 | * @param normalize normalize the data or not 35 | * @param randomData iterate through the data randomly or not 36 | * @param verbose choose to display information about the execution or not 37 | * @author Néstor Rodríguez Vico 38 | */ 39 | class ClusterOSS(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 40 | numClusters: Int = 15, restarts: Int = 5, minDispersion: Double = 0.0001, maxIterations: Int = 100, 41 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 42 | 43 | /** Compute the ClusterOSS algorithm 44 | * 45 | * @return undersampled data structure 46 | */ 47 | def compute(): Data = { 48 | val initTime: Long = System.nanoTime() 49 | 50 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 51 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 52 | val random: scala.util.Random = new scala.util.Random(seed) 53 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 54 | val classesToWorkWith: Array[Any] = if (randomData) { 55 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 56 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 57 | (randomIndex map data.y).toArray 58 | } else { 59 | data.y 60 | } 61 | 62 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 63 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 64 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 65 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 66 | } else { 67 | (null, null, null) 68 | } 69 | 70 | val majElements: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (label, i) if label != untouchableClass => i } 71 | val (_, centroids, assignment) = kMeans(data = majElements map dataToWorkWith, nominal = data.fileInfo.nominal, 72 | numClusters = numClusters, restarts = restarts, minDispersion = minDispersion, maxIterations = maxIterations, seed = seed) 73 | 74 | val (closestInstances, restOfInstances) = assignment.par.map { cluster: (Int, Array[Int]) => 75 | val distances: Array[(Int, Double)] = cluster._2.map { instance: Int => 76 | (instance, euclidean(dataToWorkWith(instance), centroids(cluster._1))) 77 | } 78 | 79 | val closestInstance: Int = if (distances.isEmpty) -1 else distances.minBy(_._2)._1 80 | (closestInstance, cluster._2.diff(List(closestInstance))) 81 | }.toArray.unzip 82 | 83 | // Remove foo values 84 | val train: Array[Int] = closestInstances.diff(List(-1)) 85 | // Flatten all the clusters 86 | val test: Array[Int] = restOfInstances.flatten 87 | val neighbours: Array[Array[Double]] = train map dataToWorkWith 88 | val classes: Array[Any] = train map classesToWorkWith 89 | 90 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 91 | Some(new KDTree(neighbours, classes, dataToWorkWith(0).length)) 92 | } else { 93 | None 94 | } 95 | 96 | val 
calculatedLabels: Array[(Int, Any)] = test.zipWithIndex.map { i =>
97 | val label: Any = if (dist == Distance.EUCLIDEAN) {
98 | val labels = KDTree.get.nNeighbours(dataToWorkWith(i._1), 1)._2
99 | mode(labels.toArray)
100 | } else {
101 | nnRuleHVDM(neighbours, dataToWorkWith(i._1), -1, classes, 1, data.fileInfo.nominal, sds, attrCounter,
102 | attrClassesCounter, "nearest")._1
103 | }
104 | (i._1, label)
105 | }
106 | 
107 | // the instances MISCLASSIFIED by the 1-NN rule are kept, together with the cluster centres stored in train
108 | val misclassified: Array[Int] = calculatedLabels.collect { case (i, label) if label != classesToWorkWith(i) => i }
109 | val newDataIndex: Array[Int] = misclassified ++ train
110 | 
111 | // Construct a data object to be passed to Tomek Link
112 | val auxData: Data = new Data(x = toXData(newDataIndex map dataToWorkWith),
113 | y = newDataIndex map classesToWorkWith, fileInfo = data.fileInfo)
114 | auxData.processedData = newDataIndex map dataToWorkWith
115 | val tl = new TL(auxData, dist = dist, minorityClass = Some(untouchableClass))
116 | val resultTL: Data = tl.compute()
117 | // The final instances are the result of applying Tomek Link to the content of newDataIndex
118 | val finalIndex: Array[Int] = (resultTL.index.get.toList map newDataIndex).toArray
119 | val finishTime: Long = System.nanoTime()
120 | 
121 | if (verbose) {
122 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length)
123 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length))
124 | println("NEW DATA SIZE: %d".format(finalIndex.length))
125 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100))
126 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass)))
127 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass)))
128 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime)))
129 | }
130 | 
131 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo)
132 | }
133 | }
--------------------------------------------------------------------------------
/src/main/scala/soul/algorithm/undersampling/CPM.scala:
--------------------------------------------------------------------------------
1 | /*
2 | SOUL: Scala Oversampling and Undersampling Library.
3 | Copyright (C) 2019 Néstor Rodríguez, David López
4 | 
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation in version 3 of the License.
8 | 
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | */
17 | package soul.algorithm.undersampling
18 | 
19 | import soul.data.Data
20 | import soul.util.Utilities.Distance.Distance
21 | import soul.util.Utilities._
22 | 
23 | import scala.collection.mutable.ArrayBuffer
24 | import scala.math.min
25 | 
26 | /** Class Purity Maximization. Original paper: "An Unsupervised Learning Approach to Resolving the
27 | * Data Imbalanced Issue in Supervised Learning Problems in Functional Genomics" by Kihoon Yoon and Stephen Kwek.
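* A minimal usage sketch (illustrative only, not from the original docs; it assumes a
* `Data` instance named `data` already populated, e.g. via `soul.io.Reader`):
* {{{
*   val cpm = new CPM(data, seed = 42L, dist = Distance.EUCLIDEAN, verbose = true)
*   val reduced: Data = cpm.compute()
* }}}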
28 | *
29 | * @param data data to work with
30 | * @param seed seed to use. If it is not provided, it will use the system time
31 | * @param dist object of Distance enumeration representing the distance to be used
32 | * @param normalize normalize the data or not
33 | * @param randomData iterate through the data randomly or not
34 | * @param verbose choose to display information about the execution or not
35 | * @author Néstor Rodríguez Vico
36 | */
37 | class CPM(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN,
38 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) {
39 | 
40 | /** Compute the CPM algorithm.
41 | *
42 | * @return undersampled data structure
43 | */
44 | def compute(): Data = {
45 | val initTime: Long = System.nanoTime()
46 | 
47 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length)
48 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1
49 | val random: scala.util.Random = new scala.util.Random(seed)
50 | val centers: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
51 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
52 | val classesToWorkWith: Array[Any] = if (randomData) {
53 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList)
54 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray
55 | (randomIndex map data.y).toArray
56 | } else {
57 | data.y
58 | }
59 | 
60 | val posElements: Int = counter.head._2
61 | val negElements: Int = counter.tail.values.sum
62 | val impurity: Double = posElements.toDouble / negElements.toDouble
63 | val cluster: Array[Int] = new Array[Int](dataToWorkWith.length).indices.toArray
64 | 
65 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
66 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
67 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
68 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column)))
69 | } else {
70 | (null, null, null)
71 | }
72 | 
73 | def purityMaximization(parentImpurity: Double, parentCluster: Array[Int], center: Int): Unit = {
74 | val cluster1: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
75 | val cluster2: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
76 | val posElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
77 | val negElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0)
78 | 
79 | var center1: Int = 0
80 | var center2: Int = 0
81 | var pointer: Int = 0
82 | var impurity: Double = Double.PositiveInfinity
83 | var impurity1: Double = Double.PositiveInfinity
84 | var impurity2: Double = Double.PositiveInfinity
85 | 
86 | parentCluster.foreach((f: Int) => if (data.y(f) == untouchableClass) posElements += f else negElements += f)
87 | 
88 | val pairs: ArrayBuffer[(Int, Int)] = for {x <- negElements; y <- posElements} yield (x, y)
89 | while (parentImpurity <= impurity) {
90 | cluster1.clear(); cluster2.clear() // reset the candidate clusters for every new pair of centres
91 | if (pointer >= pairs.length) {
92 | centers += center
93 | return
94 | }
95 | 
96 | center1 = pairs(pointer)._1
97 | center2 = pairs(pointer)._2
98 | 
99 | parentCluster.foreach { element: Int =>
100 | val d1: Double = if (dist == Distance.EUCLIDEAN) {
101 | euclidean(dataToWorkWith(element), dataToWorkWith(center1))
102 | } else {
103 | HVDM(dataToWorkWith(element), dataToWorkWith(center1), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
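// Added commentary (not in the original sources): d1 and d2 are the distances from the
// current element to the two candidate centres; the element is assigned below to the
// cluster of the nearer centre before the impurities of both clusters are compared.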
104 | } 105 | 106 | val d2: Double = if (dist == Distance.EUCLIDEAN) { 107 | euclidean(dataToWorkWith(element), dataToWorkWith(center2)) 108 | } else { 109 | HVDM(dataToWorkWith(element), dataToWorkWith(center2), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 110 | } 111 | 112 | if (d1 < d2) 113 | cluster1 += element else cluster2 += element 114 | } 115 | 116 | if (cluster1.nonEmpty) 117 | impurity1 = cluster1.count((element: Int) => data.y(element) == untouchableClass).toDouble / cluster1.length 118 | else { 119 | centers += center2 120 | return 121 | } 122 | 123 | if (cluster2.nonEmpty) 124 | impurity2 = cluster2.count((element: Int) => data.y(element) == untouchableClass).toDouble / cluster2.length 125 | else { 126 | centers += center1 127 | return 128 | } 129 | 130 | impurity = min(impurity1, impurity2) 131 | pointer += 1 132 | } 133 | 134 | purityMaximization(impurity1, cluster1.toArray, center1) 135 | purityMaximization(impurity2, cluster2.toArray, center2) 136 | } 137 | 138 | purityMaximization(impurity, cluster, 0) 139 | 140 | val finishTime: Long = System.nanoTime() 141 | 142 | if (verbose) { 143 | val newCounter: Map[Any, Int] = (centers.toArray map classesToWorkWith).groupBy(identity).mapValues(_.length) 144 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 145 | println("NEW DATA SIZE: %d".format(centers.toArray.length)) 146 | println("REDUCTION PERCENTAGE: %s".format(100 - (centers.toArray.length.toFloat / dataToWorkWith.length) * 100)) 147 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 148 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 149 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 150 | } 151 | 152 | new Data(centers.toArray map data.x, centers.toArray map data.y, Some(centers.toArray), data.fileInfo) 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/SMOTERSB.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.Array._ 25 | import scala.collection.mutable.ArrayBuffer 26 | import scala.util.Random 27 | 28 | /** SMOTERSB algorithm. Original paper: "kNN Approach to Unbalanced Data Distribution: SMOTE-RSB: a hybrid preprocessing 29 | * approach based on oversampling and undersampling for high imbalanced data-sets using SMOTE and rough sets theory" 30 | * by Enislay Ramentol, Yailé Caballero, Rafael Bello and Francisco Herrera. 
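* A minimal usage sketch (illustrative only, not from the original docs; it assumes a
* `Data` instance named `data` already populated, e.g. via `soul.io.Reader`; note that
* percent must be a multiple of 100):
* {{{
*   val smoteRSB = new SMOTERSB(data, seed = 42L, percent = 300, k = 5)
*   val augmented: Data = smoteRSB.compute()
* }}}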
31 | *
32 | * @param data data to work with
33 | * @param seed seed to use. If it is not provided, it will use the system time
34 | * @param percent amount of SMOTE N%
35 | * @param k number of minority class nearest neighbors
36 | * @param dist object of Distance enumeration representing the distance to be used
37 | * @param normalize normalize the data or not
38 | * @param verbose choose to display information about the execution or not
39 | * @author David López Pretel
40 | */
41 | class SMOTERSB(data: Data, seed: Long = System.currentTimeMillis(), percent: Int = 500, k: Int = 5,
42 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) {
43 | 
44 | /** Compute the SMOTERSB algorithm
45 | *
46 | * @return synthetic samples generated
47 | */
48 | def compute(): Data = {
49 | val initTime: Long = System.nanoTime()
50 | 
51 | if (percent > 100 && percent % 100 != 0) {
52 | throw new Exception("Percent must be a multiple of 100")
53 | }
54 | 
55 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData
56 | val minorityClassIndex: Array[Int] = minority(data.y)
57 | val minorityClass: Any = data.y(minorityClassIndex(0))
58 | 
59 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) {
60 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)),
61 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)),
62 | samples.transpose.map((column: Array[Double]) => standardDeviation(column)))
63 | } else {
64 | (null, null, null)
65 | }
66 | 
67 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) {
68 | Some(new KDTree(samples, data.y, samples(0).length))
69 | } else {
70 | None
71 | }
72 | 
73 | // check if the percent is correct
74 | var T: Int = minorityClassIndex.length
75 | var N: Int = percent
76 | 
77 | if (N < 100) {
78 | T = N * T / 100 // multiply before dividing: N < 100 keeps only N% of the minority samples
79 | N = 100
80 | }
81 | N = N / 100
82 | 
83 | // output with a size of T*N samples
84 | val output: Array[Array[Double]] = Array.ofDim[Double](N * T, samples(0).length)
85 | 
86 | val r: Random = new Random(seed)
87 | 
88 | // for each minority class sample
89 | minorityClassIndex.indices.par.foreach((i: Int) => {
90 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) {
91 | KDTree.get.nNeighbours(samples(minorityClassIndex(i)), k)._3.toArray
92 | } else {
93 | kNeighborsHVDM(samples, minorityClassIndex(i), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter)
94 | }
95 | 
96 | // run the Populate step for the sample
97 | (0 until N).par.foreach((n: Int) => {
98 | val nn: Int = neighbors(r.nextInt(neighbors.length))
99 | // compute the attributes of the synthetic sample
100 | samples(0).indices.foreach((atrib: Int) => {
101 | val diff: Double = samples(nn)(atrib) - samples(minorityClassIndex(i))(atrib)
102 | val gap: Double = r.nextFloat()
103 | output(i * N + n)(atrib) = samples(minorityClassIndex(i))(atrib) + gap * diff
104 | })
105 | })
106 | })
107 | 
108 | // compute the majority class
109 | val majorityClassIndex: Array[Int] = samples.indices.diff(minorityClassIndex.toList).toArray
110 | 
111 | // minimum and maximum value for each attribute
112 | val maxMinValues: Array[(Double, Double)] = Array.concat(majorityClassIndex map samples, output).transpose.map(column => (column.max, column.min))
113 | 
114 | // compute the similarity matrix
115 | val similarityMatrix: Array[Array[Double]] = Array.ofDim(output.length, majorityClassIndex.length)
116 | 
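// Added commentary (not in the original sources): similarityMatrix(i)(j) will hold the mean
// per-attribute similarity between synthetic sample i and majority instance j; the rough-set
// step further below keeps a synthetic sample only while no majority instance exceeds the
// current similarity threshold, i.e. while the sample stays in the lower approximation.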
output.indices.par.foreach(i => { 117 | (majorityClassIndex map samples).zipWithIndex.par.foreach(j => { 118 | similarityMatrix(i)(j._2) = output(i).indices.map(k => { 119 | if (data.nomToNum(0).isEmpty) { 120 | 1 - (Math.abs(output(i)(k) - j._1(k)) / (maxMinValues(k)._1 - maxMinValues(k)._2)) // this expression should be weighted by wk 121 | } else { // but all the features are included, so every wk is 1 122 | if (output(i)(k) == j._1(k)) 1 else 0 123 | } 124 | }).sum / output(i).length 125 | }) 126 | }) 127 | 128 | var result: ArrayBuffer[Int] = ArrayBuffer() 129 | var similarityValue: Double = 0.4 130 | var lowerApproximation: Boolean = true 131 | while (similarityValue < 0.9) { 132 | output.indices.foreach(i => { 133 | lowerApproximation = true 134 | majorityClassIndex.indices.foreach(j => { 135 | if (similarityMatrix(i)(j) > similarityValue) 136 | lowerApproximation = false 137 | }) 138 | if (lowerApproximation) result += i 139 | }) 140 | similarityValue += 0.05 141 | } 142 | 143 | // if there are no synthetic samples in the lower approximation, return all the synthetic samples 144 | if (result.isEmpty) { 145 | result = ArrayBuffer.range(0, output.length) 146 | } else { 147 | result = result.distinct 148 | } 149 | 150 | val finishTime: Long = System.nanoTime() 151 | 152 | if (verbose) { 153 | println("ORIGINAL SIZE: %d".format(data.x.length)) 154 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 155 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 156 | } 157 | 158 | new Data(if (data.fileInfo.nominal.length == 0) { 159 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(result.toArray map output, data.fileInfo.maxAttribs, 160 | data.fileInfo.minAttribs) else result.toArray map output)) 161 | } else { 162 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(result.toArray map output, data.fileInfo.maxAttribs, 163 | data.fileInfo.minAttribs) else result.toArray map output), data.nomToNum) 164 | }, Array.concat(data.y, Array.fill((result.toArray map output).length)(minorityClass)), None, data.fileInfo) 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/NCL.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | 26 | /** Neighbourhood Cleaning Rule. Original paper: "Improving Identification of Difficult Small Classes by Balancing Class 27 | * Distribution" by J. Laurikkala. 
28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. If it is not provided, it will use the system time 31 | * @param dist object of Distance enumeration representing the distance to be used 32 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours) 33 | * @param threshold consider a class to be undersampled if the number of instances of this class is 34 | * greater than data.size * threshold 35 | * @param normalize normalize the data or not 36 | * @param randomData iterate through the data randomly or not 37 | * @param verbose choose to display information about the execution or not 38 | * @author Néstor Rodríguez Vico 39 | */ 40 | class NCL(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, k: Int = 3, 41 | threshold: Double = 0.5, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 42 | /** Compute the NCL algorithm. 43 | * 44 | * @return undersampled data structure 45 | */ 46 | def compute(): Data = { 47 | // Note: the notation used to refer to the subsets of data is the one used in the original paper. 48 | val initTime: Long = System.nanoTime() 49 | 50 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 51 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 52 | val random: scala.util.Random = new scala.util.Random(seed) 53 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 54 | val classesToWorkWith: Array[Any] = if (randomData) { 55 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 56 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 57 | (randomIndex map data.y).toArray 58 | } else { 59 | data.y 60 | } 61 | 62 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 63 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 64 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 65 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 66 | } else { 67 | (null, null, null) 68 | } 69 | 70 | val minorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 71 | val majorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 72 | 73 | var i = 0 74 | while (i < classesToWorkWith.length) { 75 | if (classesToWorkWith(i) == untouchableClass) minorityIndex += i else majorityIndex += i 76 | i += 1 77 | } 78 | 79 | // ENN cannot be applied when only one class is in the less important group 80 | val indexA1: Array[Int] = if (classesToWorkWith.distinct.length > 2) { 81 | val ennData = new Data(toXData((majorityIndex map dataToWorkWith).toArray), (majorityIndex map classesToWorkWith).toArray, None, data.fileInfo) 82 | ennData.processedData = (majorityIndex map dataToWorkWith).toArray 83 | val enn = new ENN(ennData, dist = dist, k = k) 84 | val resultENN: Data = enn.compute() 85 | classesToWorkWith.indices.diff(resultENN.index.get).toArray 86 | } else { 87 | new Array[Int](0) 88 | } 89 | 90 | val uniqueMajClasses = (majorityIndex map classesToWorkWith).distinct 91 | val ratio: Double = dataToWorkWith.length * threshold 92 | 93 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 94 | Some(new KDTree((minorityIndex map dataToWorkWith).toArray, (majorityIndex map classesToWorkWith).toArray, dataToWorkWith(0).length)) 95 | } else { 96 | None 97 | } 98 | 99 | def 
selectNeighbours(l: Int): ArrayBuffer[Int] = { 100 | var selectedElements = new ArrayBuffer[Int](0) 101 | val (_, labels, index) = KDTree.get.nNeighbours(dataToWorkWith(l), k) 102 | val label = mode(labels.toArray) 103 | 104 | if (label != classesToWorkWith(l)) { 105 | index.foreach { n => 106 | if (classesToWorkWith(n) != untouchableClass && counter(classesToWorkWith(n)) > ratio) { 107 | selectedElements += n 108 | } 109 | } 110 | } 111 | selectedElements 112 | } 113 | 114 | def selectNeighboursHVDM(l: Int): ArrayBuffer[Int] = { 115 | val selectedElements = new ArrayBuffer[Int]() 116 | val (label, nNeighbours, _) = nnRuleHVDM(dataToWorkWith, dataToWorkWith(l), l, classesToWorkWith, k, data.fileInfo.nominal, 117 | sds, attrCounter, attrClassesCounter, "nearest") 118 | 119 | if (label != classesToWorkWith(l)) { 120 | nNeighbours.foreach { n => 121 | val nNeighbourClass: Any = classesToWorkWith(n) 122 | if (nNeighbourClass != untouchableClass && counter(nNeighbourClass) > ratio) { 123 | selectedElements += n 124 | } 125 | } 126 | } 127 | selectedElements 128 | } 129 | 130 | var j = 0 131 | val indexA2 = new ArrayBuffer[Int](0) 132 | while (j < uniqueMajClasses.length) { 133 | val selectedNeighbours: Array[ArrayBuffer[Int]] = if (dist == Distance.EUCLIDEAN) { 134 | minorityIndex.par.map(l => selectNeighbours(l)).toArray 135 | } else { 136 | minorityIndex.par.map(l => selectNeighboursHVDM(l)).toArray 137 | } 138 | 139 | selectedNeighbours.flatten.distinct.foreach(e => indexA2 += e) 140 | j += 1 141 | } 142 | 143 | val finalIndex: Array[Int] = classesToWorkWith.indices.diff(indexA1.toList ++ indexA2.distinct).toArray 144 | val finishTime: Long = System.nanoTime() 145 | 146 | if (verbose) { 147 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 148 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 149 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 150 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 151 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 152 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 153 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 154 | } 155 | 156 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 157 | } 158 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/BC.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | 25 | /** Balance Cascade algorithm. Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, 26 | * Jianxin Wu and Zhi-Hua Zhou. 27 | * 28 | * @param data data to work with 29 | * @param seed seed to use. If it is not provided, it will use the system time 30 | * @param dist object of Distance enumeration representing the distance to be used 31 | * @param k number of neighbours to use when computing k-NN rule (normally 3 neighbours) 32 | * @param nMaxSubsets maximum number of subsets to generate 33 | * @param nFolds number of subsets to create when applying cross-validation 34 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1 so there 35 | * will be the same minority class examples as majority class examples. It will take 36 | * numMinorityInstances * ratio 37 | * @param normalize normalize the data or not 38 | * @param randomData iterate through the data randomly or not 39 | * @param verbose choose to display information about the execution or not 40 | * @author Néstor Rodríguez Vico 41 | */ 42 | class BC(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 43 | k: Int = 3, nMaxSubsets: Int = 5, nFolds: Int = 5, ratio: Double = 1.0, normalize: Boolean = false, 44 | randomData: Boolean = false, verbose: Boolean = false) { 45 | 46 | /** Compute the BC algorithm. 47 | * 48 | * @return undersampled data structure 49 | */ 50 | def compute(): Data = { 51 | val initTime: Long = System.nanoTime() 52 | 53 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 54 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 55 | val random: scala.util.Random = new scala.util.Random(seed) 56 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 57 | val classesToWorkWith: Array[Any] = if (randomData) { 58 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 59 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 60 | (randomIndex map data.y).toArray 61 | } else { 62 | data.y 63 | } 64 | 65 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 66 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 67 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 68 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 69 | } else { 70 | (null, null, null) 71 | } 72 | 73 | var search: Boolean = true 74 | var subsetsCounter: Int = 0 75 | val mask: Array[Boolean] = Array.fill(classesToWorkWith.length)(true) 76 | val subsets: ArrayBuffer[Array[Int]] = new ArrayBuffer[Array[Int]](0) 77 | val minorityElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 78 | val majorityElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 79 | 80 | while (search) { 81 | val indexToUnderSample: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 82 | val minorityIndex: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 83 | val classesCounter: Map[Any, Int] = (boolToIndex(mask) map classesToWorkWith).groupBy(identity).mapValues(_.length) 84 | 85 | classesCounter.foreach { target: (Any, Int) => 86 | val indexClass: 
Array[Int] = classesToWorkWith.zipWithIndex.collect { case (c, i) if c == target._1 => i } 87 | if (target._1 != untouchableClass) { 88 | val sameClassBool: Array[Boolean] = mask.zipWithIndex.collect { case (c, i) if classesToWorkWith(i) == target._1 => c } 89 | val indexClassInterest: Array[Int] = boolToIndex(sameClassBool) map indexClass 90 | val indexTargetClass: List[Int] = random.shuffle((indexClassInterest map classesToWorkWith).indices.toList).take(counter(untouchableClass)) 91 | indexToUnderSample ++= (indexTargetClass map indexClassInterest) 92 | majorityElements ++= (indexTargetClass map indexClassInterest) 93 | } else { 94 | minorityIndex ++= indexClass 95 | minorityElements ++= indexClass 96 | } 97 | } 98 | 99 | subsetsCounter += 1 100 | val subset: Array[Int] = (indexToUnderSample ++ minorityIndex).toArray 101 | subsets += subset 102 | 103 | val classesToWorkWithSubset: Array[Any] = subset map classesToWorkWith 104 | val dataToWorkWithSubset: Array[Array[Double]] = subset map dataToWorkWith 105 | val prediction: Array[Any] = (if (dist == Distance.EUCLIDEAN) { 106 | kFoldPrediction(dataToWorkWithSubset, classesToWorkWithSubset, k, nFolds, "nearest") 107 | } else { 108 | kFoldPredictionHVDM(dataToWorkWithSubset, classesToWorkWithSubset, k, nFolds, data.fileInfo.nominal, sds, attrCounter, 109 | attrClassesCounter, "nearest") 110 | }).take(indexToUnderSample.length) 111 | 112 | val classifiedInstances: Array[Boolean] = ((indexToUnderSample.indices map classesToWorkWithSubset) 113 | zip prediction).map((e: (Any, Any)) => e._1 == e._2).toArray 114 | (boolToIndex(classifiedInstances) map indexToUnderSample).foreach((i: Int) => mask(i) = false) 115 | 116 | if (subsetsCounter == nMaxSubsets) search = false 117 | 118 | val finalTargetStats: Map[Any, Int] = (boolToIndex(mask) map classesToWorkWith).groupBy(identity).mapValues(_.length) 119 | classesToWorkWith.distinct.filter((c: Any) => c != untouchableClass).foreach { c: Any => 120 | if (finalTargetStats(c) < counter(untouchableClass)) search = false 121 | } 122 | } 123 | 124 | val majorityIndexHistogram: Array[(Int, Int)] = majorityElements.groupBy(identity).mapValues(_.length).toArray.sortBy(_._2).reverse 125 | val majorityIndex: Array[Int] = majorityIndexHistogram.take((minorityElements.distinct.length * ratio).toInt).map(_._1) 126 | val finalIndex: Array[Int] = minorityElements.distinct.toArray ++ majorityIndex 127 | val finishTime: Long = System.nanoTime() 128 | 129 | if (verbose) { 130 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 131 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 132 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 133 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 134 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 135 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 136 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 137 | } 138 | 139 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 140 | } 141 | } -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/CNN.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 
3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.collection.mutable.ListBuffer 25 | 26 | /** Condensed Nearest Neighbor decision rule. Original paper: "The Condensed Nearest Neighbor Rule" by P. Hart. 27 | * 28 | * @param data data to work with 29 | * @param seed seed to use. If it is not provided, it will use the system time 30 | * @param dist object of Distance enumeration representing the distance to be used 31 | * @param normalize normalize the data or not 32 | * @param randomData iterate through the data randomly or not 33 | * @param verbose choose to display information about the execution or not 34 | * @author Néstor Rodríguez Vico 35 | */ 36 | class CNN(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, 37 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 38 | 39 | /** Compute the CNN algorithm 40 | * 41 | * @return undersampled data structure 42 | */ 43 | def compute(): Data = { 44 | val initTime: Long = System.nanoTime() 45 | 46 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 47 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 48 | val random: scala.util.Random = new scala.util.Random(seed) 49 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 50 | val classesToWorkWith: Array[Any] = if (randomData) { 51 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 52 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 53 | (randomIndex map data.y).toArray 54 | } else { 55 | data.y 56 | } 57 | 58 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 59 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 60 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 61 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 62 | } else { 63 | (null, null, null) 64 | } 65 | 66 | val finalIndex: Array[Int] = if (dist == Distance.HVDM) { 67 | // Indicate the corresponding group: 1 for store, 0 for unknown, -1 for grabbag 68 | val location: Array[Int] = List.fill(dataToWorkWith.length)(0).toArray 69 | // The first element is added to store 70 | location(0) = 1 71 | var changed = true 72 | 73 | // Iterate the data, x (except the first instance) 74 | dataToWorkWith.zipWithIndex.tail.foreach { element: (Array[Double], Int) => 75 | // and classify each element with the actual content of store 76 | val index: Array[Int] = location.zipWithIndex.collect { case (a, b) if a == 1 => b } 77 | val neighbours: Array[Array[Double]] = 
index map dataToWorkWith 78 | val classes: Array[Any] = index map classesToWorkWith 79 | val label: (Any, Array[Int], Array[Double]) = nnRuleHVDM(neighbours, element._1, -1, classes, 1, data.fileInfo.nominal, 80 | sds, attrCounter, attrClassesCounter, "nearest") 81 | 82 | // If it is misclassified or is an element of the untouchable class it is added to store; otherwise, it is added to grabbag 83 | location(element._2) = if (label._1 != classesToWorkWith(element._2)) 1 else -1 84 | } 85 | 86 | // After the first pass, iterate grabbag until it is exhausted, i.e. until: 87 | // 1. There is no element left in grabbag, or 88 | // 2. No element moved between grabbag and store during a full iteration 89 | while (location.count((z: Int) => z == -1) != 0 && changed) { 90 | changed = false 91 | // Now, instead of iterating x, we iterate grabbag 92 | location.zipWithIndex.filter((x: (Int, Int)) => x._1 == -1).foreach { element: (Int, Int) => 93 | val index: Array[Int] = location.zipWithIndex.collect { case (a, b) if a == 1 => b } 94 | val neighbours: Array[Array[Double]] = index map dataToWorkWith 95 | val classes: Array[Any] = index map classesToWorkWith 96 | val label: Any = nnRuleHVDM(neighbours, dataToWorkWith(element._2), -1, classes, 1, data.fileInfo.nominal, 97 | sds, attrCounter, attrClassesCounter, "nearest")._1 98 | // If it is misclassified or is an element of the untouchable class it is added to store; otherwise, it is added to grabbag 99 | location(element._2) = if (label != classesToWorkWith(element._2)) { 100 | changed = true 101 | 1 102 | } else -1 103 | } 104 | } 105 | 106 | location.zipWithIndex.filter((x: (Int, Int)) => x._1 == 1).collect { case (_, a) => a } 107 | } else { 108 | val store: KDTree = new KDTree(Array(dataToWorkWith(0)), Array(classesToWorkWith(0)), dataToWorkWith(0).length) 109 | var grabbag: ListBuffer[(Array[Double], Int)] = new ListBuffer[(Array[Double], Int)]() 110 | var newGrabbag: ListBuffer[(Array[Double], Int)] = new ListBuffer[(Array[Double], Int)]() 111 | 112 | // Iterate the data, x (except the first instance) 113 | dataToWorkWith.zipWithIndex.tail.foreach { instance: (Array[Double], Int) => 114 | val label = mode(store.nNeighbours(instance._1, k = 1, leaveOneOut = false)._2.toArray) 115 | if (label != classesToWorkWith(instance._2)) { 116 | store.addElement(instance._1, classesToWorkWith(instance._2)) 117 | } else { 118 | grabbag += instance 119 | } 120 | } 121 | 122 | var changed = true 123 | while (grabbag.nonEmpty && changed) { 124 | changed = false 125 | grabbag.foreach { instance => 126 | val label = mode(store.nNeighbours(instance._1, k = 1, leaveOneOut = false)._2.toArray) 127 | if (label != classesToWorkWith(instance._2)) { 128 | store.addElement(instance._1, classesToWorkWith(instance._2)) 129 | changed = true 130 | } else { 131 | newGrabbag += instance 132 | } 133 | } 134 | 135 | grabbag = newGrabbag 136 | newGrabbag = new ListBuffer[(Array[Double], Int)]() 137 | } 138 | 139 | store.kDTreeMap.values.unzip._2.toArray 140 | } 141 | 142 | val finishTime: Long = System.nanoTime() 143 | 144 | if (verbose) { 145 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 146 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 147 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 148 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 149 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 150 | println("NEW 
IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 151 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 152 | } 153 | 154 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/NM.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | import scala.util.Random 26 | 27 | /** NearMiss. Original paper: "kNN Approach to Unbalanced Data Distribution: A Case Study involving Information 28 | * Extraction" by Jianping Zhang and Inderjeet Mani. 29 | * 30 | * @param data data to work with 31 | * @param seed seed to use. If it is not provided, it will use the system time 32 | * @param dist object of Distance enumeration representing the distance to be used 33 | * @param version version of the core to execute 34 | * @param nNeighbours number of neighbours to take for each minority example (only used if version is set to 3) 35 | * @param ratio ratio to know how many majority class examples to preserve. By default it's set to 1 so there 36 | * will be the same minority class examples as majority class examples. It will take 37 | * numMinorityInstances * ratio 38 | * @param normalize normalize the data or not 39 | * @param randomData iterate through the data randomly or not 40 | * @param verbose choose to display information about the execution or not 41 | * @author Néstor Rodríguez Vico 42 | */ 43 | class NM(data: Data, seed: Long = System.currentTimeMillis(), dist: Distance = Distance.EUCLIDEAN, version: Int = 1, 44 | nNeighbours: Int = 3, ratio: Double = 1.0, normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 45 | 46 | /** Compute the NM algorithm. 
47 | * 48 | * @return undersampled data structure 49 | */ 50 | def compute(): Data = { 51 | val initTime: Long = System.nanoTime() 52 | 53 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 54 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 55 | val random: scala.util.Random = new scala.util.Random(seed) 56 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 57 | val classesToWorkWith: Array[Any] = if (randomData) { 58 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 59 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 60 | (randomIndex map data.y).toArray 61 | } else { 62 | data.y 63 | } 64 | 65 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 66 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 67 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 68 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 69 | } else { 70 | (null, null, null) 71 | } 72 | 73 | val majElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 74 | val minElements: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 75 | classesToWorkWith.zipWithIndex.foreach(i => if (i._1 == untouchableClass) minElements += i._2 else majElements += i._2) 76 | val minNeighbours: Array[Array[Double]] = minElements.toArray map dataToWorkWith 77 | val majNeighbours: Array[Array[Double]] = majElements.toArray map dataToWorkWith 78 | val minClasses: Array[Any] = minElements.toArray map classesToWorkWith 79 | val majClasses: Array[Any] = majElements.toArray map classesToWorkWith 80 | 81 | val KDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 82 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length)) 83 | } else { 84 | None 85 | } 86 | 87 | val majorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 88 | Some(new KDTree(majNeighbours, majClasses, dataToWorkWith(0).length)) 89 | } else { 90 | None 91 | } 92 | 93 | val reverseKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 94 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length, which = "farthest")) 95 | } else { 96 | None 97 | } 98 | 99 | val selectedMajElements: Array[Int] = if (version == 1) { 100 | majElements.map { i: Int => 101 | if (dist == Distance.EUCLIDEAN) { 102 | val index = KDTree.get.nNeighbours(dataToWorkWith(i), 3)._3 103 | (i, index.map(j => euclidean(dataToWorkWith(i), dataToWorkWith(j))).sum / index.length) 104 | } else { 105 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i), -1, minClasses, 3, 106 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 107 | (i, (result._2 map result._3).sum / result._2.length) 108 | } 109 | }.toArray.sortBy(_._2).map(_._1) 110 | } else if (version == 2) { 111 | majElements.map { i: Int => 112 | if (dist == Distance.EUCLIDEAN) { 113 | val index = reverseKDTree.get.nNeighbours(dataToWorkWith(i), 3)._3 114 | (i, index.map(j => euclidean(dataToWorkWith(i), dataToWorkWith(j))).sum / index.length) 115 | } else { 116 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i), -1, minClasses, 3, 117 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "farthest") 118 | (i, (result._2 map result._3).sum / result._2.length) 119 | } 120 | 
}.toArray.sortBy(_._2).map(_._1) 121 | } else if (version == 3) { 122 | // We shuffle the data because, in the end, we are going to take at least minElements.length * ratio elements and, if 123 | // we didn't shuffle, we would only take majority class examples that are near the first minority class examples 124 | new Random(seed).shuffle(minElements.flatMap { i: Int => 125 | if (dist == Distance.EUCLIDEAN) { 126 | majorityKDTree.get.nNeighbours(dataToWorkWith(i), nNeighbours)._3 127 | } else { 128 | nnRuleHVDM(majNeighbours, dataToWorkWith(i), -1, majClasses, nNeighbours, data.fileInfo.nominal, sds, attrCounter, 129 | attrClassesCounter, "nearest")._2 130 | } 131 | }.distinct.toList).toArray 132 | } else { 133 | throw new Exception("Invalid argument: version should be: 1, 2 or 3") 134 | } 135 | 136 | val finalIndex: Array[Int] = minElements.toArray ++ selectedMajElements.take((minElements.length * ratio).toInt) 137 | val finishTime: Long = System.nanoTime() 138 | 139 | if (verbose) { 140 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 141 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 142 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 143 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 144 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 145 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 146 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 147 | } 148 | 149 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SOUL 2 | 3 | ### Scala Oversampling and Undersampling Library 4 | 5 | Included algorithms for oversampling: 6 | 7 | * **Random Oversampling.** Original paper: "A study of the behavior of several methods for balancing machine learning training data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 8 | 9 | * **SMOTE.** Original paper: "SMOTE: Synthetic Minority Over-sampling Technique" by Nitesh V. Chawla, Kevin W. Bowyer, Lawrence O. Hall and W. Philip Kegelmeyer. 10 | 11 | * **SMOTE + ENN.** Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 12 | 13 | * **SMOTE + TL.** Original paper: "A Study of the Behavior of Several Methods for Balancing Machine Learning Training Data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 14 | 15 | * **Borderline-SMOTE.** Original paper: "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets Learning." by Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. 16 | 17 | * **ADASYN.** Original paper: "ADASYN: Adaptive Synthetic Sampling Approach for Imbalanced Learning" by Haibo He, Yang Bai, Edwardo A. Garcia, and Shutao Li. 18 | 19 | * **ADOMS.** Original paper: "The Generation Mechanism of Synthetic Minority Class Examples" by Sheng TANG and Si-ping CHEN. 
20 | 21 | * **SafeLevel-SMOTE.** Original paper: "Safe-Level-SMOTE: Safe-Level-Synthetic Minority Over-Sampling TEchnique for Handling the Class Imbalanced Problem" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran, and Chidchanok Lursinsap. 22 | 23 | * **Spider2.** Original paper: "Learning from Imbalanced Data in Presence of Noisy and Borderline Examples" by Krystyna Napierała, Jerzy Stefanowski and Szymon Wilk. 24 | 25 | * **DBSMOTE.** Original paper: "DBSMOTE: Density-Based Synthetic Minority Over-sampling Technique" by Chumphol Bunkhumpornpat, Krung Sinapiromsaran and Chidchanok Lursinsap. 26 | 27 | * **SMOTE-RSB.** Original paper: "SMOTE-RSB*: a hybrid preprocessing approach based on oversampling and undersampling for high imbalanced data-sets using SMOTE and rough sets theory" by Enislay Ramentol, Yailé Caballero, Rafael Bello and Francisco Herrera. 28 | 29 | * **MWMOTE.** Original paper: "MWMOTE—Majority Weighted Minority Oversampling Technique for Imbalanced Data Set Learning" by Sukarna Barua, Md. Monirul Islam, Xin Yao and Kazuyuki Murase. 30 | 31 | * **MDO.** Original paper: "To combat multi-class imbalanced problems by means of over-sampling and boosting techniques" by Lida Abdi and Sattar Hashemi. 32 | 33 | Included algorithms for undersampling: 34 | 35 | * **Random Undersampling.** Original paper: "A study of the behavior of several methods for balancing machine learning training data" by Gustavo E. A. P. A. Batista, Ronaldo C. Prati and Maria Carolina Monard. 36 | 37 | * **Condensed Nearest Neighbor decision rule.** Original paper: "The Condensed Nearest Neighbor Rule" by P. Hart. 38 | 39 | * **Edited Nearest Neighbour rule.** Original paper: "Asymptotic Properties of Nearest Neighbor Rules Using Edited Data" by Dennis L. Wilson. 40 | 41 | * **Tomek Link.** Original paper: "Two Modifications of CNN" by Ivan Tomek. 42 | 43 | * **One-Sided Selection.** Original paper: "Addressing the Curse of Imbalanced Training Sets: One-Sided Selection" by Miroslav Kubat and Stan Matwin. 44 | 45 | * **Neighbourhood Cleaning Rule.** Original paper: "Improving Identification of Difficult Small Classes by Balancing Class Distribution" by J. Laurikkala. 46 | 47 | * **NearMiss.** Original paper: "kNN Approach to Unbalanced Data Distribution: A Case Study involving Information Extraction" by Jianping Zhang and Inderjeet Mani. 48 | 49 | * **Class Purity Maximization algorithm.** Original paper: "An Unsupervised Learning Approach to Resolving the Data Imbalanced Issue in Supervised Learning Problems in Functional Genomics" by Kihoon Yoon and Stephen Kwek. 50 | 51 | * **Undersampling Based on Clustering.** Original paper: "Under-Sampling Approaches for Improving Prediction of the Minority Class in an Imbalanced Dataset" by Show-Jane Yen and Yue-Shi Lee. 52 | 53 | * **Balance Cascade.** Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, Jianxin Wu and Zhi-Hua Zhou. 54 | 55 | * **Easy Ensemble.** Original paper: "Exploratory Undersampling for Class-Imbalance Learning" by Xu-Ying Liu, Jianxin Wu and Zhi-Hua Zhou. 56 | 57 | * **Evolutionary Undersampling.** Original paper: "Evolutionary Under-Sampling for Classification with Imbalanced Data Sets: Proposals and Taxonomy" by Salvador Garcia and Francisco Herrera. 58 | 59 | * **Instance Hardness Threshold.** Original paper: "An Empirical Study of Instance Hardness" by Michael R. Smith, Tony Martinez and Christophe Giraud-Carrier. 60 | 61 | * **ClusterOSS.** Original paper: "ClusterOSS: a new undersampling method for imbalanced learning." by Victor H Barella, Eduardo P Costa and André C. P. L. F. Carvalho. 62 | 63 | * **Iterative Instance Adjustment for Imbalanced Domains.** Original paper: "Addressing imbalanced classification with instance generation techniques: IPADE-ID" by Victoria López, Isaac Triguero, Cristóbal J. Carmona, Salvador García and Francisco Herrera. 64 | 65 | ### How-to use it 66 | 67 | If you are going to use this library from another `sbt` project, you just need to clone the original repository, execute `sbt publishLocal` in the root folder of the cloned repository and add the following dependency to the `build.sbt` file of your project: 68 | 69 | ```scala 70 | libraryDependencies += "com.github.soul" %% "soul" % "1.0.0" 71 | ``` 72 | 73 | To read a data file you only need to do this: 74 | 75 | ```scala 76 | import soul.io.Reader 77 | import soul.data.Data 78 | 79 | /* Read a csv file or any delimited text file */ 80 | val csvData: Data = Reader.readDelimitedText(file = "path/to/file.csv") 81 | /* Read a WEKA arff file */ 82 | val arffData: Data = Reader.readArff(file = "path/to/file.arff") 83 | ``` 84 | 85 | Now we're going to run an undersampling algorithm: 86 | 87 | ```scala 88 | import soul.algorithm.undersampling.NCL 89 | import soul.data.Data 90 | 91 | val nclCSV = new NCL(csvData) 92 | val resultCSV: Data = nclCSV.compute() 93 | 94 | val nclARFF = new NCL(arffData) 95 | val resultARFF: Data = nclARFF.compute() 96 | ``` 97 | 98 | In this example we've used an undersampling algorithm, but the workflow is the same for an oversampling one, as shown below. All of the algorithms' parameters have default values, so you don't need to specify any of them. 99 | 
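For instance, running SMOTE instead looks like this (a minimal sketch: we assume here that `SMOTE` exposes the same default-valued parameters listed for it in the Experiments section below):

```scala
import soul.algorithm.oversampling.SMOTE
import soul.data.Data

// all parameters have defaults; e.g. new SMOTE(csvData, seed = 0, percent = 500, k = 5) also works
val smoteCSV = new SMOTE(csvData)
val oversampledCSV: Data = smoteCSV.compute()
```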
100 | Finally, we only need to save the result to a file: 101 | 102 | ```scala 103 | import soul.io.Writer 104 | 105 | Writer.writeDelimitedText(file = "path/to/output.csv", data = resultCSV) 106 | Writer.writeArff(file = "path/to/output.arff", data = resultARFF) 107 | ``` 108 | 109 | ### Experiments 110 | 111 | With the objective of showing the capabilities of **SOUL**, we have generated a two dimension synthetic imbalanced dataset with 1,871 instances. Among them, 1,600 instances belong to the majority class and the remaining 271 belong to the minority class, leading to about 14.5% of minority instances in the whole dataset (IR=5.9). The representation of this dataset can be found below, where we may observe a clear overlapping between the classes, as well as a cluster of minority instances in the middle of the majority instances. 112 | 113 | Next, we have used the following parameters of the algorithms to perform an experiment with some relevant oversampling and undersampling approaches (a code sketch follows the list): 114 | 115 | 116 | * **MWMOTE**: *seed*: 0, *N*: 1400, *k1*: 5, *k2*: 5, *k3*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false. 117 | 118 | * **SMOTE**: *seed*: 0, *percent*: 500, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false. 119 | 120 | * **ADASYN**: *seed*: 0, *d*: 1, *B*: 1, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false. 121 | 122 | * **SafeLevelSMOTE**: *seed*: 0, *k*: 5, *dist*: euclidean, *normalize*: false, *verbose*: false. 123 | 124 | * **IHTS**: *seed*: 0, *nFolds*: 5, *normalize*: false, *randomData*: false, *verbose*: false. 125 | 126 | * **IPADE**: *seed*: 0, *iterations*: 100, *strategy*: 1, *randomChoice*: true, *normalize*: false, *randomData*: false, *verbose*: false. 127 | 128 | * **NCL**: *seed*: 0, *dist*: euclidean, *k*: 3, *threshold*: 0.5, *normalize*: false, *randomData*: false, *verbose*: false. 129 | 130 | * **SBC**: *seed*: 0, *method*: "NearMiss1", *m*: 1.0, *k*: 3, *numClusters*: 50, *restarts*: 1, *minDispersion*: 0.0001, *maxIterations*: 200, *dist*: euclidean, *normalize*: false, *randomData*: false, *verbose*: false. 131 | 132 | 
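As a rough sketch of how these configurations translate into code (reusing the `csvData` object from the previous section; the constructors are the ones defined in this repository, and every parameter not shown keeps its default):

```scala
import soul.algorithm.oversampling.MWMOTE
import soul.algorithm.undersampling.NCL
import soul.util.Utilities.Distance

// parameters as in the experiment above
val mwmote = new MWMOTE(csvData, seed = 0, N = 1400, k1 = 5, k2 = 5, k3 = 5, dist = Distance.EUCLIDEAN)
val ncl = new NCL(csvData, seed = 0, dist = Distance.EUCLIDEAN, k = 3, threshold = 0.5)

val oversampled = mwmote.compute()
val undersampled = ncl.compute()
```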
133 | ![Original](images/original.png) 134 | 135 | 136 | | ![ADASYN](images/ADASYN.png) | ![SafeLevelSMOTE](images/SafeLevelSMOTE.png) | 137 | | ------------- | ------------- | 138 | | ![MWMOTE](images/MWMOTE.png) | ![SMOTE](images/SMOTE.png) | 139 | 140 | 141 | | ![IHTS](images/IHTS.png) | ![IPADE](images/IPADE.png) | 142 | | ------------- | ------------- | 143 | | ![NCL](images/NCL.png) | ![SBC](images/SBC.png) | 144 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/Spider2.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | 25 | /** Spider2 algorithm. Original paper: "Learning from Imbalanced Data in Presence of Noisy and Borderline Examples" by 26 | * Krystyna Napierała, Jerzy Stefanowski and Szymon Wilk. 27 | * 28 | * @param data data to work with 29 | * @param seed seed to use. 
If it is not provided, it will use the system time 30 | * @param relabel relabeling option 31 | * @param ampl amplification option 32 | * @param k number of minority class nearest neighbors 33 | * @param dist object of Distance enumeration representing the distance to be used 34 | * @param normalize normalize the data or not 35 | * @param verbose choose to display information about the execution or not 36 | * @author David López Pretel 37 | */ 38 | class Spider2(data: Data, seed: Long = System.currentTimeMillis(), relabel: String = "yes", ampl: String = "weak", k: Int = 5, 39 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 40 | 41 | /** Compute the Spider2 algorithm 42 | * 43 | * @return synthetic samples generated 44 | */ 45 | def compute(): Data = { 46 | val initTime: Long = System.nanoTime() 47 | 48 | if (relabel != "no" && relabel != "yes") { 49 | throw new Exception("relabel must be yes or no.") 50 | } 51 | 52 | if (ampl != "weak" && ampl != "strong" && ampl != "no") { 53 | throw new Exception("amplification must be weak or strong or no.") 54 | } 55 | 56 | var minorityClassIndex: Array[Int] = minority(data.y) 57 | val minorityClass: Any = data.y(minorityClassIndex(0)) 58 | var majorityClassIndex: Array[Int] = data.processedData.indices.diff(minorityClassIndex.toList).toArray 59 | val output: ArrayBuffer[Array[Double]] = ArrayBuffer() 60 | var resultClasses: Array[Any] = new Array[Any](0) 61 | 62 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 63 | 64 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 65 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 66 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 67 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 68 | } else { 69 | (null, null, null) 70 | } 71 | 72 | def flagged(c: Array[Int], f: Array[Boolean]): Array[Int] = { 73 | c.map(classes => { 74 | if (!f(classes)) Some(classes) else None 75 | }).filterNot(_.forall(_ == None)).map(_.get) 76 | } 77 | 78 | def amplify(x: Int, k: Int): Unit = { 79 | // compute the neighborhood for the majority and minority class 80 | val majNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 81 | kNeighbors(majorityClassIndex map output, output(x), k) 82 | } else { 83 | kNeighborsHVDM(majorityClassIndex map output, output(x), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 84 | } 85 | 86 | val minNeighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 87 | kNeighbors(minorityClassIndex map output, output(x), k) 88 | } else { 89 | kNeighborsHVDM(minorityClassIndex map output, output(x), k, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 90 | } 91 | 92 | // compute the number of copies to create 93 | val S: Int = Math.abs(majNeighbors.length - minNeighbors.length) + 1 94 | // need to know the size of the output to save the randomIndex of the elements inserted 95 | val outputSize: Int = output.length 96 | (0 until S).foreach(_ => { 97 | output ++= Traversable(output(x)) 98 | }) 99 | // add n copies to the output 100 | if (resultClasses(x) == minorityClass) { 101 | minorityClassIndex = minorityClassIndex ++ (outputSize until outputSize + S) 102 | } else { 103 | majorityClassIndex = majorityClassIndex ++ (outputSize until outputSize + S) 104 | } 105 | resultClasses = resultClasses ++ 
Array.fill(S)(resultClasses(x)) 106 | } 107 | 108 | def correct(x: Int, k: Int, out: Boolean): Boolean = { 109 | // compute the neighbors 110 | val neighbors: Array[Int] = if (dist == Distance.EUCLIDEAN) { 111 | kNeighbors(if (out) samples else output.toArray, if (out) samples(x) else output(x), k) 112 | } else { 113 | kNeighborsHVDM(if (out) samples else output.toArray, if (out) samples(x) else output(x), k, 114 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 115 | } 116 | val classes: scala.collection.mutable.Map[Any, Int] = scala.collection.mutable.Map() 117 | // compute the number of samples for each class in the neighborhood 118 | neighbors.foreach(neighbor => classes += data.y(neighbor) -> 0) 119 | neighbors.foreach(neighbor => classes(data.y(neighbor)) += 1) 120 | 121 | // return true if the most frequent class in the neighborhood matches the class of the sample 122 | if (classes.reduceLeft((x: (Any, Int), y: (Any, Int)) => if (x._2 > y._2) x else y)._1 == data.y(x)) 123 | true 124 | else 125 | false 126 | } 127 | 128 | // array with the index of each sample 129 | var DS: Array[Int] = Array.range(0, samples.length) 130 | // at the beginning there are no safe samples 131 | var safeSamples: Array[Boolean] = Array.fill(samples.length)(false) 132 | 133 | // for each sample in the majority class, check if its neighbors have the same class 134 | majorityClassIndex.foreach(index => if (correct(index, k, out = true)) safeSamples(index) = true) 135 | 136 | // return a subset of samples that are not safe and belong to the majority class 137 | val RS: Array[Int] = flagged(majorityClassIndex, safeSamples) 138 | if (relabel == "yes") { 139 | // add the RS samples to the minority set 140 | minorityClassIndex = minorityClassIndex ++ RS 141 | resultClasses = data.y 142 | RS.foreach(resultClasses(_) = minorityClass) 143 | } else { 144 | // eliminate the samples from the initial set; first we recalculate the index for the minority and majority classes 145 | var newIndex: Int = 0 146 | minorityClassIndex = minorityClassIndex.map(minor => { 147 | newIndex = minor 148 | RS.foreach(index => if (index < minor) newIndex -= 1) 149 | newIndex 150 | }) 151 | majorityClassIndex = majorityClassIndex.map(major => { 152 | newIndex = major 153 | RS.foreach(index => if (index < major) newIndex -= 1) 154 | newIndex 155 | }) 156 | DS = DS.diff(RS) 157 | safeSamples = DS map safeSamples 158 | resultClasses = DS map data.y 159 | } 160 | 161 | // the output starts as DS, even if ampl is neither weak nor strong 162 | output ++= (DS map samples) 163 | 164 | // if the neighbors of each sample in the minority class belong to it, flag it as safe 165 | minorityClassIndex.foreach(index => if (correct(index, k, out = false)) safeSamples(index) = true) 166 | if (ampl == "weak") { 167 | // for each sample returned by flagged, amplify the data creating n copies (n calculated in amplify) 168 | flagged(minorityClassIndex, safeSamples).foreach(amplify(_, k)) 169 | } else if (ampl == "strong") { 170 | // if the sample is correctly classified amplify with k, else amplify with k + 2 (k is not n) 171 | flagged(minorityClassIndex, safeSamples).foreach(x => { 172 | if (correct(x, k + 2, out = false)) amplify(x, k) else amplify(x, k + 2) 173 | }) 174 | } 175 | 176 | val finishTime: Long = System.nanoTime() 177 | 178 | if (verbose) { 179 | println("ORIGINAL SIZE: %d".format(data.x.length)) 180 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 181 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 182 | } 183 | 184 | new Data(if 
(data.fileInfo.nominal.length == 0) { 185 | to2Decimals(if (normalize) zeroOneDenormalization(output.toArray, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output.toArray) 186 | } else { 187 | toNominal(if (normalize) zeroOneDenormalization(output.toArray, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output.toArray, data.nomToNum) 188 | }, resultClasses, None, data.fileInfo) 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/MWMOTE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | import scala.util.Random 25 | 26 | /** MWMOTE algorithm. Original paper: "MWMOTE—Majority Weighted Minority Oversampling Technique for Imbalanced Data Set 27 | * Learning" by Sukarna Barua, Md. Monirul Islam, Xin Yao and Kazuyuki Murase. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. 
If it is not provided, it will use the system time 31 | * @param N number of synthetic samples to be generated 32 | * @param k1 number of neighbors used for predicting noisy minority class samples 33 | * @param k2 number of majority neighbors used for constructing informative minority set 34 | * @param k3 number of minority neighbors used for constructing informative minority set 35 | * @param dist object of Distance enumeration representing the distance to be used 36 | * @param normalize normalize the data or not 37 | * @param verbose choose to display information about the execution or not 38 | * @author David López Pretel 39 | */ 40 | class MWMOTE(data: Data, seed: Long = System.currentTimeMillis(), N: Int = 500, k1: Int = 5, k2: Int = 5, k3: Int = 5, 41 | dist: Distance = Distance.EUCLIDEAN, normalize: Boolean = false, verbose: Boolean = false) { 42 | 43 | /** Compute the MWMOTE algorithm 44 | * 45 | * @return synthetic samples generated 46 | */ 47 | def compute(): Data = { 48 | val initTime: Long = System.nanoTime() 49 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 50 | 51 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 52 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 53 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 54 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 55 | } else { 56 | (null, null, null) 57 | } 58 | 59 | def f(value: Double, cut: Double): Double = { 60 | if (value < cut) value else cut 61 | } 62 | 63 | def Cf(y: (Int, Int), x: Int, Nmin: Array[Array[Int]]): Double = { 64 | val cut: Double = 5 // values used in the paper 65 | val CMAX: Double = 2 66 | 67 | if (!Nmin(y._2).contains(x)) { 68 | val D: Double = if (dist == Distance.EUCLIDEAN) { 69 | euclidean(samples(y._1), samples(x)) 70 | } else { 71 | HVDM(samples(y._1), samples(x), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 72 | } 73 | f(samples(0).length / D, cut) * CMAX 74 | } else 75 | 0.0 76 | } 77 | 78 | def Iw(y: (Int, Int), x: Int, Nmin: Array[Array[Int]], Simin: Array[Int]): Double = { 79 | val cf = Cf(y, x, Nmin) 80 | val df = cf / Simin.map(Cf(y, _, Nmin)).sum 81 | cf + df 82 | } 83 | 84 | def clusterDistance(cluster1: Array[Int], cluster2: Array[Int]): Double = { 85 | val centroid1: Array[Double] = (cluster1 map samples).transpose.map(_.sum / cluster1.length) 86 | val centroid2: Array[Double] = (cluster2 map samples).transpose.map(_.sum / cluster2.length) 87 | 88 | if (dist == Distance.EUCLIDEAN) { 89 | euclidean(centroid1, centroid2) 90 | } else { 91 | HVDM(centroid1, centroid2, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 92 | } 93 | } 94 | 95 | def minDistance(cluster: ArrayBuffer[ArrayBuffer[Int]]): (Int, Int, Double) = { 96 | var minDist: (Int, Int, Double) = (0, 0, 99999999) 97 | var i, j: Int = 0 98 | while (i < cluster.length) { 99 | j = 0 100 | while (j < cluster.length) { 101 | if (i != j) { 102 | val dist = clusterDistance(cluster(i).toArray, cluster(j).toArray) 103 | if (dist < minDist._3) minDist = (i, j, dist) 104 | } 105 | j += 1 106 | } 107 | i += 1 108 | } 109 | minDist 110 | } 111 | 112 | def cluster(Sminf: Array[Int]): Array[Array[Int]] = { 113 | val distances: Array[Array[Double]] = Array.fill(Sminf.length, Sminf.length)(9999999.0) 114 | var i, j: Int = 0 115 | while (i < Sminf.length) { 116 | j = 0 117 | while (j < 
Sminf.length) { 118 | if (i != j) { 119 | distances(i)(j) = if (dist == Distance.EUCLIDEAN) { 120 | euclidean(samples(Sminf(i)), samples(Sminf(j))) 121 | } else { 122 | HVDM(samples(Sminf(i)), samples(Sminf(j)), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 123 | } 124 | } 125 | j += 1 126 | } 127 | i += 1 128 | } 129 | 130 | val Cp: Double = 3 // used in paper 131 | val Th: Double = distances.map(_.min).sum / Sminf.length * Cp 132 | var minDist: (Int, Int, Double) = (0, 0, 0.0) 133 | val clusters: ArrayBuffer[ArrayBuffer[Int]] = Sminf.map(ArrayBuffer(_)).to[ArrayBuffer] 134 | while (minDist._3 < Th) { 135 | //compute the min distance between each cluster 136 | minDist = minDistance(clusters) 137 | //merge the two more proximal clusters 138 | clusters(minDist._1) ++= clusters(minDist._2) 139 | clusters -= clusters(minDist._2) 140 | } 141 | 142 | clusters.map(_.toArray).toArray 143 | } 144 | 145 | // compute minority class 146 | val minorityClassIndex: Array[Int] = minority(data.y) 147 | val minorityClass: Any = data.y(minorityClassIndex(0)) 148 | // compute majority class 149 | val majorityClassIndex: Array[Int] = samples.indices.par.diff(minorityClassIndex.toList).toArray 150 | 151 | // construct the filtered minority set 152 | val Sminf: Array[Int] = minorityClassIndex.par.map(index => { 153 | val neighbors = if (dist == Distance.EUCLIDEAN) { 154 | kNeighbors(samples, index, k1) 155 | } else { 156 | kNeighborsHVDM(samples, index, k1, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 157 | } 158 | if (neighbors map data.y contains data.y(minorityClassIndex(0))) { 159 | Some(index) 160 | } else { 161 | None 162 | } 163 | }).filterNot(_.forall(_ == None)).map(_.get).toArray 164 | 165 | //for each sample in Sminf compute the nearest majority set 166 | val Sbmaj: Array[Int] = Sminf.par.flatMap { x => 167 | if (dist == Distance.EUCLIDEAN) { 168 | kNeighbors(majorityClassIndex map samples, samples(x), k2) 169 | } else { 170 | kNeighborsHVDM(majorityClassIndex map samples, samples(x), k2, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 171 | } 172 | }.distinct.par.map(majorityClassIndex(_)).toArray 173 | 174 | // for each majority example in Sbmaj , compute the nearest minority set 175 | val Nmin: Array[Array[Int]] = Sbmaj.par.map { x => 176 | (if (dist == Distance.EUCLIDEAN) { 177 | kNeighbors(minorityClassIndex map samples, samples(x), k3) 178 | } else { 179 | kNeighborsHVDM(minorityClassIndex map samples, samples(x), k3, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 180 | }).par.map(minorityClassIndex(_)).toArray 181 | }.toArray 182 | 183 | // find the informative minority set (union of all Nmin) 184 | val Simin: Array[Int] = Nmin.par.flatten.distinct.toArray 185 | // for each sample in Simin compute the selection weight 186 | val Sw: Array[Double] = Simin.par.map(x => Sbmaj.zipWithIndex.par.map(y => Iw(y, x, Nmin, Simin)).sum).toArray 187 | val sumSw: Double = Sw.sum 188 | // convert each Sw into probability 189 | val Sp: Array[(Double, Int)] = Sw.par.map(_ / sumSw).toArray.zip(Simin).sortWith(_._1 > _._1) 190 | 191 | // compute the clusters 192 | val clusters: Array[Array[Int]] = cluster(minorityClassIndex) // cluster => index to processedData 193 | val clustersIndex: Map[Int, Int] = clusters.zipWithIndex.flatMap(c => { 194 | clusters(c._2).map(index => (index, c._2)) 195 | }).toMap // index to processedData => cluster 196 | 197 | //output data 198 | val output: Array[Array[Double]] = Array.ofDim(N, samples(0).length) 199 | 200 | val 
probsSum: Double = Sp.map(_._1).sum 201 | val r: Random = new Random(seed) 202 | 203 | (0 until N).par.foreach(i => { 204 | // select a sample, then select another one randomly from the cluster that contains it 205 | val x = chooseByProb(Sp, probsSum, r) 206 | val y = clusters(clustersIndex(x))(r.nextInt(clusters(clustersIndex(x)).length)) 207 | // compute the attributes of the synthetic sample 208 | samples(0).indices.foreach(atrib => { 209 | val diff: Double = samples(y)(atrib) - samples(x)(atrib) 210 | val gap: Float = r.nextFloat 211 | output(i)(atrib) = samples(x)(atrib) + gap * diff 212 | }) 213 | }) 214 | 215 | val finishTime: Long = System.nanoTime() 216 | 217 | if (verbose) { 218 | println("ORIGINAL SIZE: %d".format(data.x.length)) 219 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 220 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 221 | } 222 | 223 | new Data(if (data.fileInfo.nominal.length == 0) { 224 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output)) 225 | } else { 226 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, data.fileInfo.minAttribs) else output), data.nomToNum) 227 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /src/main/scala/soul/io/Reader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.io 18 | 19 | import java.io.{BufferedReader, FileInputStream, InputStreamReader} 20 | import java.text.ParseException 21 | 22 | import soul.data.{Data, FileInfo} 23 | import soul.util.Utilities.processData 24 | 25 | import scala.collection.mutable 26 | import scala.collection.mutable.ArrayBuffer 27 | 28 | /** Class to read data files 29 | * 30 | * @author Néstor Rodríguez Vico 31 | */ 32 | object Reader { 33 | /** Parse an ARFF file 34 | * 35 | * @param file file containing the data 36 | * @param columnClass indicates which column represents the class in the file. 
If it's set to -1, it will take the last column 37 | * @return a data object containing all the relevant information 38 | */ 39 | def readArff(file: String, columnClass: Int = -1): Data = { 40 | val reader: BufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(file))) 41 | var line: String = reader.readLine() 42 | var relationName: String = "" 43 | // index -> attributeName 44 | val attributes: mutable.Map[Int, String] = collection.mutable.Map[Int, String]() 45 | // attributeName -> type (if it's nominal, possible values instead of type) 46 | val attributesValues: mutable.Map[String, String] = collection.mutable.Map[String, String]() 47 | 48 | var dataDetected: Boolean = false 49 | var counter: Int = 0 50 | 51 | while (line != null && !dataDetected) { 52 | // ignore comments/description lines 53 | if (line.isEmpty || line.startsWith("%")) { 54 | line = reader.readLine 55 | } else { 56 | // take care if the relation name has commas, tabs, multiple spaces... 57 | val parts: Array[String] = line.replaceAll("\t", " ").replaceAll("\\s{2,}", " ").split(" ", 3) 58 | if (parts(0).equalsIgnoreCase("@relation")) { 59 | // drop the identifier and group all the possible parts separated by a space 60 | relationName = parts.drop(1).mkString(" ") 61 | } else if (parts(0).equalsIgnoreCase("@attribute")) { 62 | attributes += (counter -> parts(1)) 63 | attributesValues += (parts(1) -> parts(2)) 64 | counter += 1 65 | } else if (parts(0).equalsIgnoreCase("@data")) { 66 | dataDetected = true 67 | } 68 | 69 | line = reader.readLine 70 | } 71 | } 72 | 73 | if (columnClass >= attributes.size) 74 | throw new ParseException("Invalid response variable index: " + columnClass, columnClass) 75 | 76 | val response: Int = if (columnClass == -1) attributes.size - 1 else columnClass 77 | val readData: ArrayBuffer[Array[String]] = new ArrayBuffer[Array[String]](0) 78 | 79 | // Now we have the attributes, let's save the data 80 | while (line != null) { 81 | if (line.isEmpty || line.startsWith("%")) { 82 | line = reader.readLine 83 | } else { 84 | val parts: Array[String] = line.replaceAll("\t", " ").replaceAll("\\s{2,}", " ").split(",") 85 | // there are no quotation marks 86 | if (parts.length == attributes.size) { 87 | readData += parts 88 | } else { 89 | // if there are quotation marks, they are going to be in pairs 90 | val subParts: Array[Array[Int]] = parts.zipWithIndex.filter((x: (String, Int)) => x._1.contains("\"")).collect { case (_, a) => a }.grouped(2).toArray 91 | // separators indicates the indices of the elements that need to be merged into one value 92 | val separators = new ArrayBuffer[Array[Int]](0) 93 | for (quotationMarks <- subParts) 94 | separators += (quotationMarks(0) to quotationMarks(1)).toArray 95 | 96 | val separatedValues: ArrayBuffer[String] = new ArrayBuffer[String]() 97 | // append all the parts into one value 98 | for (pair <- subParts) 99 | separatedValues += ((pair(0) to pair(1)).toArray map parts).mkString(",") 100 | 101 | val nonSeparatedValuesIndex: Array[Int] = parts.indices.diff(separators.flatten.toList).toArray 102 | val nonSeparatedValues: Array[String] = nonSeparatedValuesIndex map parts 103 | // append all the data 104 | val values: Array[String] = (separatedValues ++ nonSeparatedValues).toArray 105 | // make an index array merging all the indices: take care with the separators because there is more than one 106 | // index for each value, so we compute the mean of all the numbers associated to one value 107 | val index: 
Array[Double] = separators.map((a: Array[Int]) => a.sum.toDouble / a.length).toArray ++ nonSeparatedValuesIndex.map(_.toDouble) 108 | // finally, construct an array to sort the values 109 | val indexForMap: Array[Int] = index.zipWithIndex.sortBy((pair: (Double, Int)) => pair._1).map((pair: (Double, Int)) => pair._2) 110 | // get the final values 111 | val finalValues: Array[String] = indexForMap map values 112 | if (finalValues.length != attributes.size) 113 | throw new ParseException("%d columns, expected %d".format(finalValues.length, attributes.size), finalValues.length) 114 | 115 | readData += finalValues 116 | } 117 | line = reader.readLine 118 | } 119 | } 120 | 121 | val finalData: ArrayBuffer[Array[Any]] = new ArrayBuffer[Array[Any]](0) 122 | val readClasses: ArrayBuffer[Any] = new ArrayBuffer[Any](0) 123 | val readNominal: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 124 | 125 | for (row <- readData) { 126 | val r = new ArrayBuffer[Any](0) 127 | for (e <- row.zipWithIndex) { 128 | if (e._2 == response) 129 | readClasses += e._1 130 | else if (e._1.matches("-?\\d+(\\.\\d+)?")) 131 | r += e._1.toDouble 132 | else { 133 | if (e._1 == "?" || e._1 == "'?'") 134 | r += "soul_NA" 135 | else { 136 | r += e._1 137 | readNominal += (if (e._2 >= response) e._2 - 1 else e._2) 138 | } 139 | } 140 | } 141 | 142 | finalData += r.toArray 143 | } 144 | 145 | val fileInfo = new FileInfo(_file = file, _comment = "%", _columnClass = response, _delimiter = null, _missing = "?", _header = null, 146 | _relationName = relationName, _attributes = attributes, _attributesValues = attributesValues, nominal = readNominal.distinct.toArray) 147 | val data: Data = new Data(x = finalData.toArray, y = readClasses.toArray, fileInfo = fileInfo) 148 | val (processedData, nomToNum) = processData(data) 149 | data.processedData = processedData 150 | data.nomToNum = nomToNum 151 | data 152 | } 153 | 154 | /** Parse a delimited text data file 155 | * 156 | * @param file file containing the data 157 | * @param comment string indicating that a line is a comment 158 | * @param delimiter string separating two elements 159 | * @param missing string indicating an element is missing 160 | * @param header indicates if the file contains a header or not 161 | * @param columnClass indicates which column represents the class in the file. 
If it's set to -1, it will take the last column 162 | * @return a data object containing all the relevant information 163 | */ 164 | def readDelimitedText(file: String, comment: String = "#", delimiter: String = ",", missing: String = "?", header: Boolean = true, columnClass: Int = -1): Data = { 165 | val reader: BufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(file))) 166 | reader.mark(100) 167 | val firstLine: String = reader.readLine 168 | if (columnClass >= firstLine.split(delimiter).length) throw new ParseException("Invalid response variable index: " + columnClass, columnClass) 169 | val response: Int = if (columnClass == -1) firstLine.split(delimiter).length - 1 else columnClass 170 | reader.reset() 171 | 172 | val headerArray: Array[String] = if (header) reader.readLine.split(delimiter) else null 173 | var line: String = reader.readLine 174 | val readData: ArrayBuffer[Array[Any]] = new ArrayBuffer[Array[Any]](0) 175 | val readClasses: ArrayBuffer[Any] = new ArrayBuffer[Any](0) 176 | val readNominal: ArrayBuffer[Int] = new ArrayBuffer[Int](0) 177 | 178 | while (line != null) { 179 | if (line.isEmpty || line.startsWith(comment)) { 180 | line = reader.readLine 181 | } else { 182 | val elements: Array[String] = line.split(delimiter) 183 | 184 | if (elements.length != firstLine.split(delimiter).length) 185 | throw new ParseException("%d columns, expected %d".format(elements.length, firstLine.split(delimiter).length), elements.length) 186 | 187 | val row = new ArrayBuffer[Any](0) 188 | for (e <- elements.zipWithIndex) { 189 | if (e._2 == response) 190 | readClasses += e._1 191 | else if (e._1.replaceAll("\\s", "").matches("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?")) 192 | row += e._1.replaceAll("\\s", "").toDouble 193 | else { 194 | if (e._1 == missing) 195 | row += "soul_NA" 196 | else { 197 | row += e._1 198 | readNominal += (if (e._2 >= response) e._2 - 1 else e._2) 199 | } 200 | } 201 | } 202 | 203 | readData += row.toArray 204 | line = reader.readLine 205 | } 206 | } 207 | 208 | val attributesValues: mutable.Map[String, String] = collection.mutable.Map[String, String]() 209 | attributesValues += ("Class" -> readClasses.distinct.mkString(",")) 210 | 211 | val fileInfo = new FileInfo(_file = file, _comment = comment, _columnClass = response, _delimiter = delimiter, _missing = missing, _header = headerArray, _relationName = null, 212 | _attributes = null, _attributesValues = attributesValues, nominal = readNominal.distinct.toArray) 213 | val data: Data = new Data(x = readData.toArray, y = readClasses.toArray, fileInfo = fileInfo) 214 | val (processedData, nomToNum) = processData(data) 215 | data.processedData = processedData 216 | data.nomToNum = nomToNum 217 | data 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/EUS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 
13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | import scala.math.{abs, sqrt} 25 | 26 | /** Evolutionary Under-Sampling. Original paper: "Evolutionary Under-Sampling for Classification with Imbalanced Data 27 | * Sets: Proposals and Taxonomy" by Salvador Garcia and Francisco Herrera. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. If it is not provided, it will use the system time 31 | * @param populationSize number of chromosomes to generate 32 | * @param maxEvaluations maximum number of fitness evaluations 33 | * @param algorithm version of the algorithm to execute. One of: EBUSGSGM, EBUSMSGM, EBUSGSAUC, EBUSMSAUC, 34 | * EUSCMGSGM, EUSCMMSGM, EUSCMGSAUC or EUSCMMSAUC 35 | * @param dist object of Distance enumeration representing the distance to be used 36 | * @param probHUX probability of changing a gene from 0 to 1 (used in crossover) 37 | * @param recombination recombination threshold (used in reinitialization) 38 | * @param prob0to1 probability of changing a gene from 0 to 1 (used in reinitialization) 39 | * @param normalize normalize the data or not 40 | * @param randomData iterate through the data randomly or not 41 | * @param verbose choose to display information about the execution or not 42 | * @author Néstor Rodríguez Vico 43 | */ 44 | class EUS(data: Data, seed: Long = System.currentTimeMillis(), populationSize: Int = 50, maxEvaluations: Int = 1000, 45 | algorithm: String = "EBUSMSGM", dist: Distance = Distance.EUCLIDEAN, probHUX: Double = 0.25, 46 | recombination: Double = 0.35, prob0to1: Double = 0.05, normalize: Boolean = false, randomData: Boolean = false, 47 | verbose: Boolean = false) { 48 | 49 | /** Compute the EUS algorithm. 
50 | * 51 | * @return undersampled data structure 52 | */ 53 | def compute(): Data = { 54 | val initTime: Long = System.nanoTime() 55 | 56 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 57 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 58 | val random: scala.util.Random = new scala.util.Random(seed) 59 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 60 | val classesToWorkWith: Array[Any] = if (randomData) { 61 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 62 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 63 | (randomIndex map data.y).toArray 64 | } else { 65 | data.y 66 | } 67 | 68 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 69 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 70 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 71 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 72 | } else { 73 | (null, null, null) 74 | } 75 | 76 | val majoritySelection: Boolean = algorithm.contains("MS") 77 | val targetInstances: Array[Int] = classesToWorkWith.indices.toArray 78 | val minorityElements: Array[Int] = classesToWorkWith.zipWithIndex.collect { case (c, i) if c == untouchableClass => i } 79 | 80 | def fitnessFunction(instance: Array[Int]): Double = { 81 | val index: Array[Int] = zeroOneToIndex(instance) map targetInstances 82 | val neighbours: Array[Array[Double]] = index map dataToWorkWith 83 | val classes: Array[Any] = index map classesToWorkWith 84 | val predicted: Array[Any] = dataToWorkWith.indices.map { e: Int => 85 | if (dist == Distance.EUCLIDEAN) { 86 | nnRule(neighbours, dataToWorkWith(e), index.indexOf(e), classes, 1, "nearest")._1 87 | } else { 88 | nnRuleHVDM(neighbours, dataToWorkWith(e), index.indexOf(e), classes, 1, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest")._1 89 | } 90 | }.toArray 91 | 92 | val matrix: (Int, Int, Int, Int) = confusionMatrix(originalLabels = index map classesToWorkWith, 93 | predictedLabels = predicted, minorityClass = untouchableClass) 94 | 95 | val tp: Int = matrix._1 96 | val fp: Int = matrix._2 97 | val fn: Int = matrix._3 98 | val tn: Int = matrix._4 99 | 100 | val nPositives: Int = (index map classesToWorkWith).count(_ == untouchableClass) 101 | val nNegatives: Int = (index map classesToWorkWith).length - nPositives 102 | 103 | val tpr: Double = tp / ((tp + fn) + 0.00000001) 104 | val fpr: Double = fp / ((fp + tn) + 0.00000001) 105 | val auc: Double = (1.0 + tpr - fpr) / 2.0 106 | val tnr: Double = tn / ((tn + fp) + 0.00000001) 107 | val g: Double = sqrt(tpr * tnr) 108 | 109 | val fitness: Double = if (algorithm == "EBUSGSGM") { 110 | g - abs(1 - (nPositives.toFloat / nNegatives)) * 20 111 | } else if (algorithm == "EBUSMSGM") { 112 | g - abs(1 - (counter(untouchableClass).toFloat / nNegatives)) * 20 113 | } else if (algorithm == "EUSCMGSGM") { 114 | g 115 | } else if (algorithm == "EUSCMMSGM") { 116 | g 117 | } else if (algorithm == "EBUSGSAUC") { 118 | auc - abs(1 - (nPositives.toFloat / nNegatives)) * 0.2 119 | } else if (algorithm == "EBUSMSAUC") { 120 | auc - abs(1 - (counter(untouchableClass).toFloat / nNegatives)) * 0.2 121 | } else if (algorithm == "EUSCMGSAUC") { 122 | auc 123 | } else if (algorithm == "EUSCMMSAUC") { 124 | auc 125 | } else { 126 | Double.NaN 127 | 
} 128 | 129 | if (fitness.isNaN) 130 | throw new Exception("Invalid argument: algorithm should be: EBUSGSGM, EBUSMSGM, EBUSGSAUC, EBUSMSAUC, EUSCMGSGM, " + 131 | "EUSCMMSGM, EUSCMGSAUC or EUSCMMSAUC") 132 | 133 | fitness 134 | } 135 | 136 | val population: Array[Array[Int]] = new Array[Array[Int]](populationSize) 137 | (0 until populationSize).foreach { i: Int => 138 | val individual: Array[Int] = targetInstances.indices.map(_ => random.nextInt(2)).toArray 139 | if (majoritySelection) { 140 | minorityElements.foreach((m: Int) => individual(m) = 1) 141 | } 142 | population(i) = individual 143 | } 144 | 145 | val evaluations: Array[Double] = new Array[Double](population.length) 146 | population.zipWithIndex.foreach { chromosome: (Array[Int], Int) => 147 | evaluations(chromosome._2) = fitnessFunction(chromosome._1) 148 | } 149 | 150 | var incestThreshold: Int = targetInstances.length / 4 151 | var actualEvaluations: Int = populationSize 152 | 153 | while (actualEvaluations < maxEvaluations) { 154 | val randomPopulation: Array[Array[Int]] = random.shuffle(population.indices.toList).toArray map population 155 | val newPopulation: ArrayBuffer[Array[Int]] = new ArrayBuffer[Array[Int]](0) 156 | 157 | (randomPopulation.indices by 2).foreach { i: Int => 158 | val hammingDistance: Int = (randomPopulation(i) zip randomPopulation(i + 1)).count((pair: (Int, Int)) => pair._1 != pair._2) 159 | 160 | if ((hammingDistance / 2) > incestThreshold) { 161 | val desc1: Array[Int] = randomPopulation(i).clone 162 | val desc2: Array[Int] = randomPopulation(i + 1).clone 163 | 164 | desc1.indices.foreach { g: Int => 165 | if (desc1(g) != desc2(g) && random.nextFloat < 0.5) { 166 | desc1(g) = if (desc1(g) == 1) 0 else if (random.nextFloat < probHUX) 1 else desc1(g) 167 | desc2(g) = if (desc2(g) == 1) 0 else if (random.nextFloat < probHUX) 1 else desc2(g) 168 | 169 | if (majoritySelection) { 170 | minorityElements.foreach((m: Int) => desc1(m) = 1) 171 | minorityElements.foreach((m: Int) => desc2(m) = 1) 172 | } 173 | } 174 | } 175 | 176 | newPopulation += desc1 177 | newPopulation += desc2 178 | } 179 | } 180 | 181 | val newEvaluations: Array[Double] = new Array[Double](newPopulation.length) 182 | newPopulation.zipWithIndex.foreach { chromosome: (Array[Int], Int) => 183 | newEvaluations(chromosome._2) = fitnessFunction(chromosome._1) 184 | } 185 | 186 | actualEvaluations += newPopulation.length 187 | 188 | // We order the population. 
The best ones (greater fitness value) come first 189 | val populationOrder: Array[(Double, Int, String)] = evaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).map((e: (Double, Int)) => (e._1, e._2, "OLD")) 190 | val newPopulationOrder: Array[(Double, Int, String)] = newEvaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).map((e: (Double, Int)) => (e._1, e._2, "NEW")) 191 | 192 | if (newPopulationOrder.length == 0 || populationOrder.last._1 > newPopulationOrder.head._1) { 193 | incestThreshold -= 1 194 | } else { 195 | val finalOrder: Array[(Double, Int, String)] = (populationOrder ++ newPopulationOrder).sortBy(_._1)(Ordering[Double].reverse).take(populationSize) 196 | val (populationCopy, evaluationsCopy) = (population.clone, evaluations.clone) // snapshot so the in-place updates below never read an already overwritten entry 197 | finalOrder.zipWithIndex.foreach { e: ((Double, Int, String), Int) => 198 | population(e._2) = if (e._1._3 == "OLD") populationCopy(e._1._2) else newPopulation(e._1._2) 199 | evaluations(e._2) = if (e._1._3 == "OLD") evaluationsCopy(e._1._2) else newEvaluations(e._1._2) 200 | } 201 | } 202 | 203 | if (incestThreshold <= 0) { 204 | population.indices.tail.foreach { i: Int => 205 | val individual: Array[Int] = population(i).indices.map((g: Int) => if (random.nextFloat < recombination) 206 | if (random.nextFloat < prob0to1) 1 else 0 else population(0)(g)).toArray 207 | 208 | if (majoritySelection) { 209 | minorityElements.foreach((m: Int) => individual(m) = 1) 210 | } 211 | 212 | population(i) = individual 213 | } 214 | 215 | population.zipWithIndex.tail.par.foreach { e: (Array[Int], Int) => 216 | evaluations(e._2) = fitnessFunction(e._1) 217 | } 218 | 219 | actualEvaluations += (population.length - 1) 220 | 221 | incestThreshold = (recombination * (1.0 - recombination) * targetInstances.length.toFloat).toInt 222 | } 223 | } 224 | 225 | val bestChromosome: Array[Int] = population(evaluations.zipWithIndex.sortBy(_._1)(Ordering[Double].reverse).head._2) 226 | val finalIndex: Array[Int] = zeroOneToIndex(bestChromosome) map targetInstances 227 | val finishTime: Long = System.nanoTime() 228 | 229 | if (verbose) { 230 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 231 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 232 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 233 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 234 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 235 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 236 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 237 | } 238 | 239 | new Data(finalIndex map data.x, finalIndex map data.y, Some(finalIndex), data.fileInfo) 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/oversampling/DBSMOTE.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.oversampling 18 | 19 | import soul.data.Data 20 | import soul.util.Utilities.Distance.Distance 21 | import soul.util.Utilities._ 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | import scala.util.Random 25 | 26 | /** DBSMOTE algorithm. Original paper: "DBSMOTE: Density-Based Synthetic Minority Over-sampling Technique" by 27 | * Chumphol Bunkhumpornpat, Krung Sinapiromsaran and Chidchanok Lursinsap. 28 | * 29 | * @param data data to work with 30 | * @param eps neighborhood radius: the maximum distance between two points for them to be neighbors. If set to -1, it is estimated as the mean pairwise distance of the data 31 | * @param k number of neighbors 32 | * @param dist object of Distance enumeration representing the distance to be used 33 | * @param seed seed to use. If it is not provided, it will use the system time 34 | * @param normalize normalize the data or not 35 | * @param verbose choose to display information about the execution or not 36 | * @author David López Pretel 37 | */ 38 | class DBSMOTE(data: Data, eps: Double = -1, k: Int = 5, dist: Distance = Distance.EUCLIDEAN, 39 | seed: Long = System.currentTimeMillis(), normalize: Boolean = false, verbose: Boolean = false) { 40 | 41 | /** Compute the DBSMOTE algorithm 42 | * 43 | * @return synthetic samples generated 44 | */ 45 | def compute(): Data = { 46 | val initTime: Long = System.nanoTime() 47 | val minorityClassIndex: Array[Int] = minority(data.y) 48 | val samples: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 49 | 50 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 51 | (samples.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 52 | samples.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 53 | samples.transpose.map((column: Array[Double]) => standardDeviation(column))) 54 | } else { 55 | (null, null, null) 56 | } 57 | 58 | def regionQuery(point: Int, eps: Double): Array[Int] = { 59 | (minorityClassIndex map samples).indices.map(sample => { 60 | val D: Double = if (dist == Distance.EUCLIDEAN) { 61 | euclidean(samples(minorityClassIndex(point)), samples(minorityClassIndex(sample))) 62 | } else { 63 | HVDM(samples(minorityClassIndex(point)), samples(minorityClassIndex(sample)), data.fileInfo.nominal, sds, 64 | attrCounter, attrClassesCounter) 65 | } 66 | if (D <= eps) { 67 | Some(sample) 68 | } else { 69 | None 70 | } 71 | }).filterNot(_.forall(_ == None)).map(_.get).toArray 72 | } 73 | 74 | def expandCluster(point: Int, clusterId: Int, clusterIds: Array[Int], eps: Double, minPts: Int): Boolean = { 75 | val neighbors: ArrayBuffer[Int] = ArrayBuffer(regionQuery(point, eps): _*) 76 | if (neighbors.length < minPts) { 77 | clusterIds(point) = -2 // noise 78 | return false 79 | } else { 80 | neighbors.foreach(clusterIds(_) = clusterId) 81 | clusterIds(point) = clusterId 82 | 83 | var current: Int = 0 84 | while (current < neighbors.length) { // index-based loop so that neighbors appended below are processed too 85 | val neighborsOfCurrent: Array[Int] = regionQuery(neighbors(current), eps) 86 | current += 1 87 | if (neighborsOfCurrent.length >= minPts) { 88 | neighborsOfCurrent.foreach(neighbor => { 89 | if (clusterIds(neighbor) == -1 || clusterIds(neighbor) == -2) { // noise or unclassified 90 | if (clusterIds(neighbor) == -1) { // unclassified: enqueue it for later expansion 91 | neighbors += neighbor 92 | } 93 | clusterIds(neighbor) = clusterId 94
| } 95 | }) 96 | } 97 | } 98 | } 99 | 100 | true 101 | } 102 | 103 | def dbscan(eps: Double, minPts: Int): Array[Array[Int]] = { 104 | var clusterId: Int = 0 105 | val clusterIds: Array[Int] = Array.fill(minorityClassIndex.length)(-1) 106 | minorityClassIndex.indices.foreach(point => { 107 | if (clusterIds(point) == -1) { 108 | if (expandCluster(point, clusterId, clusterIds, eps, minPts)) { 109 | clusterId += 1 110 | } 111 | } 112 | }) 113 | 114 | if (clusterId != 0) { 115 | val clusters: Array[Array[Int]] = Array.fill(clusterId)(Array()) 116 | (0 until clusterId).foreach(i => { 117 | clusters(i) = clusterIds.zipWithIndex.filter(_._1 == i).map(_._2) 118 | }) 119 | clusters 120 | } else { // all the data forms a single cluster 121 | Array(Array.range(0, minorityClassIndex.length)) 122 | } 123 | } 124 | 125 | def buildGraph(cluster: Array[Int], eps: Double, minPts: Int): Array[Array[Boolean]] = { 126 | val graph: Array[Array[Boolean]] = Array.fill(cluster.length, cluster.length)(false) 127 | // distance between each pair of nodes 128 | val distances: Array[Array[Double]] = cluster.map { i => 129 | cluster.map { j => 130 | if (dist == Distance.EUCLIDEAN) { 131 | euclidean(samples(minorityClassIndex(i)), samples(minorityClassIndex(j))) 132 | } else { 133 | HVDM(samples(minorityClassIndex(i)), samples(minorityClassIndex(j)), data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 134 | } 135 | 136 | } 137 | } 138 | 139 | // number of neighbours of each node, i.e. nodes satisfying distance(a, b) <= eps 140 | val NNq: Array[Int] = distances.map(row => row.map(d => if (d <= eps) 1 else 0)).map(_.sum) 141 | 142 | // build the graph 143 | cluster.indices.foreach(i => { 144 | if (cluster.length >= minPts + 1) { 145 | distances(i).zipWithIndex.foreach(pair => { 146 | if (pair._1 <= eps && pair._1 > 0 && NNq(pair._2) >= minPts) { 147 | graph(i)(pair._2) = true 148 | } 149 | }) 150 | } else { 151 | distances(i).zipWithIndex.foreach(pair => { 152 | if (pair._1 <= eps && pair._1 > 0) { 153 | graph(i)(pair._2) = true 154 | } 155 | }) 156 | } 157 | }) 158 | graph 159 | } 160 | 161 | def dijkstra(graph: Array[Array[Boolean]], source: Int, target: Int, cluster: Array[Int]): Array[Int] = { 162 | // distance from source to node, prev node, node visited or not 163 | val nodeInfo: Array[(Double, Int, Boolean)] = Array.fill(graph.length)((9999999.0, -1, false)) 164 | nodeInfo(source) = (0.0, source, false) 165 | 166 | val findMin = (x: ((Double, Int, Boolean), Int), y: ((Double, Int, Boolean), Int)) => 167 | if ((x._1._1 < y._1._1 && !x._1._3) || (!x._1._3 && y._1._3)) x else y 168 | 169 | nodeInfo.indices.foreach(_ => { 170 | val u: Int = nodeInfo.zipWithIndex.reduceLeft(findMin)._2 // vertex with min distance 171 | nodeInfo(u) = (nodeInfo(u)._1, nodeInfo(u)._2, true) 172 | if (u == target) { // return shortest path 173 | val shortestPath: ArrayBuffer[Int] = ArrayBuffer() 174 | var current = target 175 | while (current != source) { 176 | shortestPath += current 177 | current = nodeInfo(current)._2 178 | } 179 | shortestPath += current 180 | return shortestPath.toArray 181 | } 182 | graph(u).indices.foreach(v => { 183 | if (graph(u)(v) && !nodeInfo(v)._3) { 184 | val d: Double = if (dist == Distance.EUCLIDEAN) { 185 | euclidean(samples(minorityClassIndex(cluster(u))), 186 | samples(minorityClassIndex(cluster(v)))) 187 | } else { 188 | HVDM(samples(minorityClassIndex(cluster(u))), samples(minorityClassIndex(cluster(v))), data.fileInfo.nominal, 189 | sds, attrCounter, attrClassesCounter) 190 | } 191 | val alt = 
nodeInfo(u)._1 + d 192 | if (alt < nodeInfo(v)._1) nodeInfo(v) = (alt, u, nodeInfo(v)._3) 193 | } 194 | }) 195 | }) 196 | 197 | throw new Exception("Path not found") 198 | } 199 | 200 | val minorityClass: Any = data.y(minorityClassIndex(0)) 201 | // check if the user passed the epsilon parameter 202 | var eps2 = eps 203 | if (eps == -1) { // estimate eps as the mean pairwise distance of the data 204 | eps2 = samples.map { i => 205 | samples.map { j => 206 | if (dist == Distance.EUCLIDEAN) { 207 | euclidean(i, j) 208 | } else { 209 | HVDM(i, j, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 210 | } 211 | }.sum 212 | }.sum / (samples.length * samples.length) 213 | } 214 | 215 | // compute the clusters using dbscan 216 | val clusters: Array[Array[Int]] = dbscan(eps2, k) 217 | 218 | // the output of the algorithm 219 | val output: Array[Array[Double]] = Array.fill(clusters.map(_.length).sum, samples(0).length)(0) 220 | 221 | val r: Random = new Random(seed) 222 | var newIndex: Int = 0 // index into the output array, shared by all the clusters 223 | // for each cluster 224 | clusters.foreach(c => { 225 | // build a graph with the data of each cluster 226 | val graph: Array[Array[Boolean]] = buildGraph(c, eps2, k) 227 | 228 | // compute the centroid: the mean of the cluster samples 229 | val centroid = (c map samples).transpose.map(_.sum / c.length) 230 | var pseudoCentroid: (Int, Double) = (0, 99999999.0) 231 | // the pseudo-centroid is the sample that is closest to the centroid 232 | (c map samples).zipWithIndex.foreach(sample => { 233 | val d: Double = if (dist == Distance.EUCLIDEAN) { 234 | euclidean(sample._1, centroid) 235 | } else { 236 | HVDM(sample._1, centroid, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter) 237 | } 238 | if (d < pseudoCentroid._2) pseudoCentroid = (sample._2, d) 239 | }) 240 | 241 | c.indices.foreach(p => { 242 | // compute the shortest path between each sample and the pseudo-centroid 243 | val shortestPath: Array[Int] = dijkstra(graph, p, pseudoCentroid._1, c) 244 | // pick a random node in the path 245 | val e = r.nextInt(shortestPath.length) 246 | // get the nodes connected to the chosen one; only the first two will be used 247 | val v1_v2: Array[(Boolean, Int)] = graph(shortestPath(e)).zipWithIndex.filter(_._1) 248 | samples(0).indices.foreach(attrib => { 249 | // v1(attrib) - v2(attrib) 250 | val dif: Double = samples(minorityClassIndex(c(v1_v2(1)._2)))(attrib) - samples(minorityClassIndex(c(v1_v2(0)._2)))(attrib) 251 | val gap: Double = r.nextFloat() 252 | // v1(attrib) + gap * dif 253 | output(newIndex)(attrib) = samples(minorityClassIndex(c(v1_v2(0)._2)))(attrib) + gap * dif 254 | }) 255 | newIndex += 1 256 | }) 257 | }) 258 | 259 | val finishTime: Long = System.nanoTime() 260 | 261 | if (verbose) { 262 | println("ORIGINAL SIZE: %d".format(data.x.length)) 263 | println("NEW DATA SIZE: %d".format(data.x.length + output.length)) 264 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 265 | } 266 | 267 | new Data(if (data.fileInfo.nominal.length == 0) { 268 | to2Decimals(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 269 | data.fileInfo.minAttribs) else output)) 270 | } else { 271 | toNominal(Array.concat(data.processedData, if (normalize) zeroOneDenormalization(output, data.fileInfo.maxAttribs, 272 | data.fileInfo.minAttribs) else output), data.nomToNum) 273 | }, Array.concat(data.y, Array.fill(output.length)(minorityClass)), None, data.fileInfo) 274 | } 275 | } 276 | 
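A minimal usage sketch for the class above. This is a hypothetical example: the file name, parameter values and object name are assumptions for illustration, not part of the library; Reader.readArff and the DBSMOTE constructor are the ones defined in this repository.

import soul.algorithm.oversampling.DBSMOTE
import soul.io.Reader

object DBSMOTEExample {
  def main(args: Array[String]): Unit = {
    // readArff parses the ARFF file and, by default, takes the last column as the class
    val data = Reader.readArff("imbalanced.arff")
    // eps = -1 estimates the neighborhood radius from the data; verbose prints the size statistics
    val augmented = new DBSMOTE(data, eps = -1, k = 5, verbose = true).compute()
  }
}
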
-------------------------------------------------------------------------------- /src/main/scala/soul/algorithm/undersampling/SBC.scala: -------------------------------------------------------------------------------- 1 | /* 2 | SOUL: Scala Oversampling and Undersampling Library. 3 | Copyright (C) 2019 Néstor Rodríguez, David López 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation in version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | */ 17 | package soul.algorithm.undersampling 18 | 19 | import soul.data.Data 20 | import soul.util.KDTree 21 | import soul.util.Utilities.Distance.Distance 22 | import soul.util.Utilities._ 23 | 24 | import scala.math.{max, min} 25 | 26 | /** Undersampling Based on Clustering. Original paper: "Under-Sampling Approaches for Improving Prediction of the 27 | * Minority Class in an Imbalanced Dataset" by Show-Jane Yen and Yue-Shi Lee. 28 | * 29 | * @param data data to work with 30 | * @param seed seed to use. If it is not provided, it will use the system time 31 | * @param method selection method to apply. Possible options: random, NearMiss1, NearMiss2, NearMiss3, MostDistant and MostFar 32 | * @param m ratio used in the SSize calculation 33 | * @param k number of neighbours to use when computing the k-NN rule (normally 3 neighbours) 34 | * @param numClusters number of clusters to be created by the KMeans algorithm 35 | * @param restarts number of times to relaunch the KMeans algorithm 36 | * @param minDispersion stop the KMeans algorithm if the dispersion is lower than this value 37 | * @param maxIterations maximum number of iterations for the KMeans algorithm 38 | * @param dist object of Distance enumeration representing the distance to be used 39 | * @param normalize normalize the data or not 40 | * @param randomData iterate through the data randomly or not 41 | * @param verbose choose to display information about the execution or not 42 | * @author Néstor Rodríguez Vico 43 | */ 44 | class SBC(data: Data, seed: Long = System.currentTimeMillis(), method: String = "NearMiss1", m: Double = 1.0, k: Int = 3, numClusters: Int = 50, 45 | restarts: Int = 1, minDispersion: Double = 0.0001, maxIterations: Int = 200, val dist: Distance = Distance.EUCLIDEAN, 46 | normalize: Boolean = false, randomData: Boolean = false, verbose: Boolean = false) { 47 | 48 | /** Compute the SBC algorithm. 
49 | * 50 | * @return undersampled data structure 51 | */ 52 | def compute(): Data = { 53 | val initTime: Long = System.nanoTime() 54 | 55 | val counter: Map[Any, Int] = data.y.groupBy(identity).mapValues(_.length) 56 | val untouchableClass: Any = counter.minBy((c: (Any, Int)) => c._2)._1 57 | val random: scala.util.Random = new scala.util.Random(seed) 58 | var dataToWorkWith: Array[Array[Double]] = if (normalize) zeroOneNormalization(data, data.processedData) else data.processedData 59 | val classesToWorkWith: Array[Any] = if (randomData) { 60 | val randomIndex: List[Int] = random.shuffle(data.y.indices.toList) 61 | dataToWorkWith = (randomIndex map dataToWorkWith).toArray 62 | (randomIndex map data.y).toArray 63 | } else { 64 | data.y 65 | } 66 | 67 | val (attrCounter, attrClassesCounter, sds) = if (dist == Distance.HVDM) { 68 | (dataToWorkWith.transpose.map((column: Array[Double]) => column.groupBy(identity).mapValues(_.length)), 69 | dataToWorkWith.transpose.map((attribute: Array[Double]) => occurrencesByValueAndClass(attribute, data.y)), 70 | dataToWorkWith.transpose.map((column: Array[Double]) => standardDeviation(column))) 71 | } else { 72 | (null, null, null) 73 | } 74 | 75 | val (_, centroids, assignment) = kMeans(dataToWorkWith, data.fileInfo.nominal, numClusters, restarts, minDispersion, maxIterations, seed) 76 | val minMajElements: List[(Int, Int)] = (0 until numClusters).toList.map { cluster: Int => 77 | val elements = assignment(cluster) 78 | val minElements: Int = (elements map classesToWorkWith).count((c: Any) => c == untouchableClass) 79 | (minElements, elements.length - minElements) 80 | } 81 | val nPos: Double = minMajElements.unzip._2.sum.toDouble 82 | val sizeK: Double = minMajElements.map((pair: (Int, Int)) => pair._2.toDouble / max(pair._1, 1)).sum 83 | val sSizes: Array[(Int, Int)] = assignment.map { element: (Int, Array[Int]) => 84 | val ratio: (Int, Int) = minMajElements(element._1) 85 | // The min is to prevent infinity values if no minority elements are added to the cluster 86 | (element._1, min(m * nPos * ((ratio._2.toDouble / (ratio._1 + 1)) / sizeK), ratio._2).toInt) 87 | }.toArray 88 | val minorityElements: Array[Int] = assignment.flatMap((element: (Int, Array[Int])) => element._2.filter((index: Int) => 89 | classesToWorkWith(index) == untouchableClass)).toArray 90 | 91 | val majorityElements: Array[Int] = if (method.equals("random")) { 92 | sSizes.filter(_._2 != 0).flatMap { clusterIdSize: (Int, Int) => 93 | random.shuffle(assignment(clusterIdSize._1).toList).filter((e: Int) => 94 | classesToWorkWith(e) != untouchableClass).take(clusterIdSize._2) 95 | } 96 | } else { 97 | sSizes.filter(_._2 != 0).flatMap { clusteridSize: (Int, Int) => 98 | val majorityElementsIndex: Array[(Int, Int)] = assignment(clusteridSize._1).zipWithIndex.filter((e: (Int, Int)) => 99 | classesToWorkWith(e._1) != untouchableClass) 100 | 101 | // If no minority class elements are assigned to the cluster 102 | if (majorityElementsIndex.length == assignment(clusteridSize._1).length) { 103 | // Use the centroid as "minority class" element 104 | val distances: Array[Double] = assignment(clusteridSize._1).map { instance: Int => 105 | euclidean(dataToWorkWith(instance), centroids(clusteridSize._1)) 106 | } 107 | // keep the majority samples closest to the centroid 108 | distances.zipWithIndex.sortBy(_._1).take(clusteridSize._2).map(_._2) map assignment(clusteridSize._1) 109 | } else { 110 | val minorityElementsIndex: Array[(Int, Int)] = assignment(clusteridSize._1).zipWithIndex.filter((e: (Int, Int)) => 111 | classesToWorkWith(e._1) == 
untouchableClass) 112 | val majorityElementsIndex: Array[(Int, Int)] = assignment(clusteridSize._1).zipWithIndex.filter((e: (Int, Int)) => 113 | classesToWorkWith(e._1) != untouchableClass) 114 | 115 | val minNeighbours: Array[Array[Double]] = minorityElementsIndex.unzip._1 map dataToWorkWith 116 | val majNeighbours: Array[Array[Double]] = majorityElementsIndex.unzip._1 map dataToWorkWith 117 | val minClasses: Array[Any] = minorityElementsIndex.unzip._1 map classesToWorkWith 118 | val majClasses: Array[Any] = majorityElementsIndex.unzip._1 map classesToWorkWith 119 | 120 | val minorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 121 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length)) 122 | } else { 123 | None 124 | } 125 | 126 | val majorityKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 127 | Some(new KDTree(majNeighbours, majClasses, dataToWorkWith(0).length)) 128 | } else { 129 | None 130 | } 131 | 132 | val reverseKDTree: Option[KDTree] = if (dist == Distance.EUCLIDEAN) { 133 | Some(new KDTree(minNeighbours, minClasses, dataToWorkWith(0).length, which = "farthest")) 134 | } else { 135 | None 136 | } 137 | 138 | if (method.equals("NearMiss1")) { 139 | // selects the majority class samples whose average distances to k nearest minority class samples in the ith cluster are the smallest. 140 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 141 | if (dist == Distance.EUCLIDEAN) { 142 | val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3 143 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 144 | } else { 145 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k, 146 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 147 | (i._1, (result._2 map result._3).sum / result._2.length) 148 | } 149 | } 150 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1) 151 | } else if (method.equals("NearMiss2")) { 152 | // selects the majority class samples whose average distances to k farthest minority class samples in the ith cluster are the smallest. 153 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 154 | if (dist == Distance.EUCLIDEAN) { 155 | val index = reverseKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3 156 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 157 | } else { 158 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k, 159 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "farthest") 160 | (i._1, (result._2 map result._3).sum / result._2.length) 161 | } 162 | } 163 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1) 164 | } else if (method.equals("NearMiss3")) { 165 | // selects the majority class samples whose average distances to k nearest majority class samples in the ith cluster are the smallest. 
166 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 167 | if (dist == Distance.EUCLIDEAN) { 168 | val index = majorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3 169 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 170 | } else { 171 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(majNeighbours, dataToWorkWith(i._1), -1, majClasses, k, 172 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 173 | (i._1, (result._2 map result._3).sum / result._2.length) 174 | } 175 | } 176 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).take(clusteridSize._2).map(_._1) 177 | } else if (method.equals("MostDistant")) { 178 | // selects the majority class samples whose average distances to M closest minority class samples in the ith cluster are the farthest. 179 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 180 | if (dist == Distance.EUCLIDEAN) { 181 | val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), k)._3 182 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 183 | } else { 184 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, k, 185 | data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 186 | (i._1, (result._2 map result._3).sum / result._2.length) 187 | } 188 | } 189 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).reverse.take(clusteridSize._2).map(_._1) 190 | } else if (method.equals("MostFar")) { 191 | // selects the majority class samples whose average distances to all minority class samples in the cluster are the farthest. 192 | val meanDistances: Array[(Int, Double)] = majorityElementsIndex.map { i: (Int, Int) => 193 | if (dist == Distance.EUCLIDEAN) { 194 | val index = minorityKDTree.get.nNeighbours(dataToWorkWith(i._1), minorityElementsIndex.length)._3 195 | (i._1, index.map(j => euclidean(dataToWorkWith(i._1), dataToWorkWith(j))).sum / index.length) 196 | } else { 197 | val result: (Any, Array[Int], Array[Double]) = nnRuleHVDM(minNeighbours, dataToWorkWith(i._1), -1, minClasses, 198 | minorityElementsIndex.length, data.fileInfo.nominal, sds, attrCounter, attrClassesCounter, "nearest") 199 | (i._1, (result._2 map result._3).sum / result._2.length) 200 | } 201 | } 202 | meanDistances.sortBy((pair: (Int, Double)) => pair._2).reverse.take(clusteridSize._2).map(_._1) 203 | } else { 204 | throw new Exception("Invalid argument: method should be: random, NearMiss1, NearMiss2, NearMiss3, MostDistant or MostFar") 205 | } 206 | } 207 | } 208 | 209 | 210 | val finalIndex: Array[Int] = minorityElements.distinct ++ majorityElements.distinct 211 | val finishTime: Long = System.nanoTime() 212 | 213 | if (verbose) { 214 | val newCounter: Map[Any, Int] = (finalIndex map classesToWorkWith).groupBy(identity).mapValues(_.length) 215 | println("ORIGINAL SIZE: %d".format(dataToWorkWith.length)) 216 | println("NEW DATA SIZE: %d".format(finalIndex.length)) 217 | println("REDUCTION PERCENTAGE: %s".format(100 - (finalIndex.length.toFloat / dataToWorkWith.length) * 100)) 218 | println("ORIGINAL IMBALANCED RATIO: %s".format(imbalancedRatio(counter, untouchableClass))) 219 | println("NEW IMBALANCED RATIO: %s".format(imbalancedRatio(newCounter, untouchableClass))) 220 | println("TOTAL ELAPSED TIME: %s".format(nanoTimeToString(finishTime - initTime))) 221 | } 222 | 223 | new Data(finalIndex map data.x, finalIndex map data.y, 
Some(finalIndex), data.fileInfo) 224 | } 225 | } --------------------------------------------------------------------------------
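A minimal usage sketch for SBC, closing the tour of the library. This is a hypothetical example: the CSV path, parameter choices and object name are assumptions for illustration; Reader.readDelimitedText and the SBC constructor are the ones defined in this repository.

import soul.algorithm.undersampling.SBC
import soul.io.Reader

object SBCExample {
  def main(args: Array[String]): Unit = {
    // readDelimitedText defaults: "#" marks comments, "," separates fields, a header is present and the last column is the class
    val data = Reader.readDelimitedText("imbalanced.csv")
    // cluster with KMeans and select the majority samples per cluster with the NearMiss1 rule; verbose prints the reduction statistics
    val reduced = new SBC(data, method = "NearMiss1", k = 3, numClusters = 50, verbose = true).compute()
  }
}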