├── .gitignore ├── README.md ├── build.sbt ├── project └── build.properties └── src └── main ├── java └── de │ └── hpi │ └── WindowConverter.java ├── loading.sc ├── resources ├── application.conf ├── log4j.properties ├── logback.xml └── stopwords ├── scala └── de │ └── hpi │ └── anlp │ ├── Main.scala │ ├── POSEvaluator.scala │ ├── POSHMM.scala │ ├── POSMLP.scala │ ├── POSRBM.scala │ ├── conll │ ├── ConLLDataSet.scala │ └── ConLLFile.scala │ ├── hmm │ ├── ConstantSmoothedHMM.scala │ ├── HMM.scala │ └── TrainedHMM.scala │ ├── mlp │ ├── MLP.scala │ ├── MLPConfig.scala │ ├── MLPConnection.scala │ ├── MLPLayer.scala │ ├── MLPModel.scala │ └── MLPTasks.scala │ ├── nnpos │ └── POSMLP.scala │ └── utils │ ├── FileUtils.scala │ ├── LCCFileReader.scala │ ├── Model.scala │ ├── ScalaMLTypes.scala │ ├── SentenceUtils.scala │ ├── TagDictionary.scala │ ├── Word2VecDataSetIterator.scala │ └── WordDictionary.scala └── test-space.sc /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | output/ 15 | project-ivy-repo/ 16 | word2vec-index/ 17 | .idea/ 18 | 19 | # Scala-IDE specific 20 | .scala_dependencies 21 | .worksheet 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deep-nlp-scala 2 | Using deep learning to POS tag sentences with Scala + DL4J. 3 | 4 | This is a showcase repository intended to evaluate different algorithms on the task of POS tagging German sentences. There is a multilayer perceptron (from scratch), a hidden Markov model (from scratch) and an RBM-based deep network (built on DL4J) implementation. 5 | 6 | ## Installation 7 | To execute the project, make sure sbt 0.13 and Java >= 1.6 are installed. 8 | 9 | Information about how to install sbt on your system can be found at http://www.scala-sbt.org/release/tutorial/Setup.html 10 | 11 | ## Assets 12 | Assets need to be placed into the assets/ directory. 13 | 14 | There should be labeled training data: de-train.tt, de-test.tt and de-eval.tt. 15 | 16 | There should also be unlabeled training data for the word2vec training in the folder assets/deu_news_2010_1M-text. The 17 | data set can be downloaded from http://corpora.uni-leipzig.de/download.html 18 | 19 | 20 | ## Running 21 | Running the different methods requires passing either 'mlp', 'rbm', 'hmm' or 'hmm-s' to the executable. Make sure to allow the JVM to use as much memory as possible (e.g. using "-Xmx7G").
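For example, one way to combine the memory flag with the model argument, assuming the standard sbt launcher script (which honors the SBT_OPTS environment variable) — treat this as a sketch and adapt the flags to your setup:

```
SBT_OPTS="-Xmx7G" sbt "run mlp"
```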
22 | 23 | The program can be started using "sbt run mlp" 24 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "deep-nlp-scala" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= List( 8 | "log4j" % "log4j" % "1.2.15" exclude("javax.jms", "jms"), 9 | "org.deeplearning4j" % "deeplearning4j-core" % "0.0.3.3.2.alpha1", 10 | "org.deeplearning4j" % "deeplearning4j-nlp" % "0.0.3.3.2.alpha1", 11 | "org.nd4j" % "canova-parent" % "0.0.0.1", 12 | "org.nd4j" % "nd4j-api" % "0.0.3.5.5.2", 13 | "org.nd4j" % "nd4j-netlib-blas" % "0.0.3.5.5.2", 14 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1", // 3.4.1 last version with java 7 support 15 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1" classifier "models" 16 | ) 17 | 18 | resolvers ++= Seq( 19 | "JBoss repository" at "https://repository.jboss.org/nexus/content/groups/public", 20 | Resolver.mavenLocal, 21 | Resolver.file("project-ivy-repo", file("project-ivy-repo")) 22 | ) 23 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.5 -------------------------------------------------------------------------------- /src/main/java/de/hpi/WindowConverter.java: -------------------------------------------------------------------------------- 1 | package de.hpi; 2 | 3 | import org.deeplearning4j.models.word2vec.Word2Vec; 4 | import org.deeplearning4j.text.movingwindow.Window; 5 | import org.nd4j.linalg.api.ndarray.INDArray; 6 | import org.nd4j.linalg.factory.Nd4j; 7 | 8 | import java.util.List; 9 | 10 | public class WindowConverter { 11 | public static double[] asExample(Window window,Word2Vec vec) { 12 | int length = vec.lookupTable().layerSize(); 13 | List words = window.getWords(); 14 | int windowSize = window.getWindowSize(); 15 | 16 | double[] example = new double[ length * windowSize]; 17 | int count = 0; 18 | for(int i = 0; i < words.size(); i++) { 19 | String word = words.get(i); 20 | INDArray n = vec.getWordVectorMatrixNormalized(word); 21 | INDArray vec2 = n == null ? 
vec.getWordVectorMatrix(Word2Vec.UNK) : vec.getWordVectorMatrix(word); 22 | if(vec2 == null) 23 | vec2 = vec.getWordVectorMatrix(Word2Vec.UNK); 24 | for(int j = 0; j < vec2.length(); j++) { 25 | example[count++] = vec2.getDouble(j); 26 | } 27 | 28 | 29 | } 30 | 31 | return example; 32 | } 33 | 34 | public static INDArray asExampleMatrix(Window window,Word2Vec vec) { 35 | return Nd4j.create(asExample(window, vec)); 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/loading.sc: -------------------------------------------------------------------------------- 1 | import java.io.FileReader 2 | 3 | import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} 4 | import edu.stanford.nlp.sequences.{SeqClassifierFlags, CoNLLDocumentReaderAndWriter} 5 | 6 | val conllreader = { 7 | val r = new CoNLLDocumentReaderAndWriter() 8 | r.init(new SeqClassifierFlags()) 9 | r 10 | } 11 | val it = conllreader.getIterator(new FileReader("/Users/tombocklisch/Documents/Studium/ANLP/deep-nlp-scala/assets/de-train.tt")) 12 | var numDocs = 0 13 | var numTokens = 0 14 | var lastAnsBase = "" 15 | var numEntities = 0 16 | while (it.hasNext) { 17 | val doc = it.next() 18 | numDocs += 1 19 | import scala.collection.JavaConversions._ 20 | for (fl <- doc) { 21 | if (fl.word != "XX") { 22 | val ans: String = fl.get(classOf[CoreAnnotations.AnswerAnnotation]) 23 | var ansBase: String = null 24 | var ansPrefix: String = null 25 | val bits: Array[String] = ans.split("-") 26 | if (bits.length == 1) { 27 | ansBase = bits(0) 28 | ansPrefix = "" 29 | } 30 | else { 31 | ansBase = bits(1) 32 | ansPrefix = bits(0) 33 | } 34 | numTokens += 1 35 | if (!(ansBase == "O")) { 36 | if (ansBase == lastAnsBase) { 37 | if (ansPrefix == "B") { 38 | numEntities += 1 39 | } 40 | } 41 | else { 42 | numEntities += 1 43 | } 44 | } 45 | } 46 | } 47 | } -------------------------------------------------------------------------------- /src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | 3 | extensions = ["akka.contrib.pattern.ClusterReceptionistExtension","akka.contrib.pattern.DistributedPubSubExtension"] 4 | loggers = ["akka.event.slf4j.Slf4jLogger"] 5 | loglevel = "INFO" 6 | 7 | actor { 8 | provider = "akka.cluster.ClusterActorRefProvider" 9 | serialize-messages = off 10 | serialize-creators = off 11 | 12 | worker-dispatcher { 13 | type = Dispatcher 14 | mailbox-capacity = 3000 15 | mailbox-push-timeout-time = 120s 16 | } 17 | 18 | deployment { 19 | 20 | 21 | serializers { 22 | java = "akka.serialization.JavaSerializer" 23 | proto = "akka.remote.serialization.ProtobufSerializer" 24 | } 25 | 26 | 27 | 28 | } 29 | 30 | } 31 | 32 | remote { 33 | transport = "akka.remote.netty.NettyRemoteTransport" 34 | log-remote-lifecycle-events = off 35 | 36 | netty.tcp { 37 | hostname = "localhost" 38 | port = 0 39 | maximum-frame-size = 99999999999b 40 | } 41 | 42 | transport-failure-detector { 43 | heartbeat-interval = 120 s 44 | acceptable-heartbeat-pause = 60 s 45 | } 46 | } 47 | 48 | cluster { 49 | failure-detector { 50 | threshold = 12 51 | acceptable-heartbeat-pause = 120s 52 | heartbeat-interval = 5s 53 | heartbeat-request { 54 | expected-response-after = 120s 55 | } 56 | } 57 | jmx.enabled = on 58 | enabled = on 59 | allow-local-routees = off 60 | 61 | auto-down-unreachable-after = off 62 | } 63 | } -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: 
-------------------------------------------------------------------------------- 1 | log4j.rootLogger=ERROR, Console 2 | log4j.logger.play=DEBUG 3 | log4j.appender.Console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.Console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.Console.layout.ConversionPattern=%d{ABSOLUTE} %-5p ~ %m%n 6 | 7 | log4j.appender.org.springframework=DEBUG 8 | log4j.appender.org.deeplearning4j=INFO 9 | log4j.appender.opennlp.uima=OFF 10 | log4j.appender.org.apache.uima=OFF 11 | log4j.appender.org.cleartk=OFF 12 | 13 | log4j.logger.org.springframework=INFO 14 | log4j.logger.org.deeplearning4j=INFO 15 | log4j.logger.opennlp.uima.util=OFF 16 | log4j.logger.org.apache.uima=OFF 17 | log4j.logger.org.cleartk=OFF -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | logs/application.log 4 | 5 | %date - [%level] - from %logger in %thread 6 | %n%message%n%xException%n 7 | 8 | 9 | 10 | 11 | 12 | %logger{15} - %message%n%xException{5} 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/main/resources/stopwords: -------------------------------------------------------------------------------- 1 | a 2 | ----s 3 | act 4 | "the 5 | "The 6 | about 7 | above 8 | after 9 | again 10 | against 11 | all 12 | am 13 | an 14 | and 15 | any 16 | are 17 | aren't 18 | as 19 | at 20 | be 21 | because 22 | been 23 | before 24 | being 25 | below 26 | between 27 | both 28 | but 29 | by 30 | can't 31 | cannot 32 | could 33 | couldn't 34 | did 35 | didn't 36 | do 37 | does 38 | doesn't 39 | doing 40 | don't 41 | down 42 | during 43 | each 44 | few 45 | for 46 | from 47 | further 48 | had 49 | hadn't 50 | has 51 | hasn't 52 | have 53 | haven't 54 | having 55 | he 56 | he'd 57 | he'll 58 | he's 59 | her 60 | here 61 | here's 62 | hers 63 | herself 64 | him 65 | himself 66 | his 67 | how 68 | how's 69 | i 70 | i'd 71 | i'll 72 | i'm 73 | i've 74 | if 75 | in 76 | into 77 | is 78 | isn't 79 | it 80 | it's 81 | its 82 | itself 83 | let's 84 | me 85 | more 86 | most 87 | mustn't 88 | my 89 | myself 90 | no 91 | nor 92 | not 93 | of 94 | off 95 | on 96 | once 97 | only 98 | or 99 | other 100 | ought 101 | our 102 | ours 103 | ourselves 104 | out 105 | over 106 | own 107 | put 108 | same 109 | shan't 110 | she 111 | she'd 112 | she'll 113 | she's 114 | should 115 | somebody 116 | something 117 | shouldn't 118 | so 119 | some 120 | such 121 | take 122 | than 123 | that 124 | that's 125 | the 126 | their 127 | theirs 128 | them 129 | themselves 130 | then 131 | there 132 | there's 133 | these 134 | they 135 | they'd 136 | they'll 137 | they're 138 | they've 139 | this 140 | those 141 | through 142 | to 143 | too 144 | under 145 | until 146 | up 147 | very 148 | was 149 | wasn't 150 | we 151 | we'd 152 | we'll 153 | we're 154 | we've 155 | were 156 | weren't 157 | what 158 | what's 159 | when 160 | when's 161 | where 162 | where's 163 | which 164 | while 165 | who 166 | who's 167 | whom 168 | why 169 | why's 170 | will 171 | with 172 | without 173 | won't 174 | would 175 | wouldn't 176 | you 177 | you'd 178 | you'll 179 | you're 180 | you've 181 | your 182 | yours 183 | yourself 184 | yourselves 185 | . 186 | ? 187 | ! 
188 | , 189 | + 190 | = 191 | also 192 | - 193 | 194 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/Main.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import de.hpi.anlp.conll.ConLLFileReader 4 | 5 | /** 6 | * Main console application 7 | */ 8 | object Main extends App { 9 | 10 | // Tags to use for POS tagging 11 | val states = List("NOUN", "ADV", "PRT", ".", "ADP", "DET", "PRON", "VERB", "X", "NUM", "CONJ", "ADJ") 12 | 13 | // Training documents 14 | val trainDocuments = new ConLLFileReader("assets/de-train.tt") 15 | 16 | // Test documents 17 | val testDocuments = new ConLLFileReader("assets/de-eval.tt") 18 | 19 | // Let's have a look at which arguments were passed in 20 | println("ARGS: " + args.mkString(" , ")) 21 | 22 | // Call the appropriate training according to the passed argument 23 | args.headOption match { 24 | case Some("mlp") => 25 | trainMLP() 26 | case Some("rbm") => 27 | trainRBM() 28 | case Some("hmm-s") => 29 | trainHMM(smoothed = true) 30 | case Some("hmm") => 31 | trainHMM(smoothed = false) 32 | case _ => 33 | throw new Exception("You need to specify the model to train. One of 'mlp', 'rbm', 'hmm-s', 'hmm'") 34 | } 35 | 36 | def trainHMM(smoothed: Boolean): Unit = { 37 | val hmm = POSHMM.train(trainDocuments, states, smoothed) 38 | 39 | println("\n--- Evaluation of: HMM.smoothed=" + smoothed) 40 | POSHMM.evaluate(hmm, testDocuments, states).printEvaluation() 41 | println("---") 42 | } 43 | 44 | def trainMLP() = { 45 | val mlp = POSMLP.train(trainDocuments, states) 46 | 47 | println("\n--- Evaluation of MLP ...") 48 | POSMLP.evaluate(mlp, testDocuments, states).printEvaluation() 49 | println("---") 50 | } 51 | 52 | def trainRBM() = { 53 | val (network, vec) = POSRBM.train(trainDocuments, states) 54 | 55 | println("\n--- Evaluation of RBM ...") 56 | POSRBM.evaluate(network, testDocuments, vec, states).printEvaluation() 57 | println("---") 58 | } 59 | } -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/POSEvaluator.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import java.util.Locale 4 | import scala.collection.breakOut 5 | import scala.collection.mutable 6 | 7 | /** 8 | * Given the guessed tags of a trained model and a data set's gold standard, this class calculates tag-based precision, 9 | recall and F1 score as well as overall accuracy 10 | */ 11 | class POSEvaluator(tags: List[String]) { 12 | /** 13 | * Underlying counter for occurrences 14 | */ 15 | val tagCounter: Map[String, mutable.Map[String, Int]] = tags.map { tag => 16 | tag -> mutable.HashMap("system" -> 0, "gold" -> 0, "both" -> 0) 17 | }(breakOut) 18 | 19 | /** 20 | * Add another (guessed, gold) annotation pair to the evaluator 21 | */ 22 | def add(tagged: String, gold: String): Unit = { 23 | tagCounter(tagged)("system") += 1 24 | tagCounter(gold)("gold") += 1 25 | if (tagged == gold) 26 | tagCounter(gold)("both") += 1 27 | } 28 | 29 | def add(tagged: Seq[String], gold: Seq[String]): Unit = { 30 | tagged.zip(gold).map { 31 | case (systemTag, goldTag) => 32 | add(systemTag, goldTag) 33 | } 34 | } 35 | 36 | /** 37 | * Prints the evaluation to standard out 38 | */ 39 | def printEvaluation(): Unit = { 40 | val overall = tagCounter.values.map(_("system")).sum 41 | val correct = tagCounter.values.map(_("both")).sum 42 | 43 | println("%5s, %6s, %6s, %6s".format("", "Prec",
"Rec", "F1")) 44 | 45 | tagCounter.foreach { 46 | case (tag, counts) => 47 | val p = precision(counts) 48 | val r = recall(counts) 49 | val f1Score = f1(p, r) 50 | println("%5s, %.4f, %.4f, %.4f".formatLocal(Locale.ENGLISH, tag, p, r, f1Score)) 51 | } 52 | 53 | println("\nAccuracy: %.4f".format(correct.toDouble / overall)) 54 | } 55 | 56 | def precision(counts: mutable.Map[String, Int]) = { 57 | if (counts("system") == 0) 58 | Double.NaN 59 | else 60 | counts("both").toDouble / counts("system") 61 | } 62 | 63 | def recall(counts: mutable.Map[String, Int]) = { 64 | if (counts("gold") == 0) 65 | Double.NaN 66 | else 67 | counts("both").toDouble / counts("gold") 68 | } 69 | 70 | def f1(precision: Double, recall: Double) = { 71 | if (precision + recall == 0 || precision + recall == Double.NaN) 72 | Double.NaN 73 | else 74 | 2 * precision * recall / (precision + recall) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/POSHMM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import de.hpi.anlp.conll.AnnotatedToken 4 | import de.hpi.anlp.hmm.{TrainedHMM, ConstantSmoothedHMM, HMM} 5 | 6 | /** 7 | * Configure, train and evaluate a HMM based model* 8 | */ 9 | object POSHMM { 10 | 11 | /** 12 | * Train a new HMM model. The model can either use smoothing or not 13 | */ 14 | def train(trainDocuments: Iterable[List[AnnotatedToken]], states: List[String], smoothed: Boolean) = { 15 | val hmm = 16 | if (smoothed) 17 | new ConstantSmoothedHMM(states, smoothingConstant = 1) 18 | else 19 | new HMM(states) 20 | 21 | hmm.train(trainDocuments) 22 | } 23 | 24 | /** 25 | * Evaluate a given HMM on a test data set and its gold standart 26 | */ 27 | def evaluate(hmm: TrainedHMM, testDocuments: Iterable[List[AnnotatedToken]], states: List[String]) = { 28 | val evaluator = new POSEvaluator(states) 29 | 30 | testDocuments.foreach { sentence => 31 | val unannotated = sentence.map(_.token) 32 | val (prob, tags) = hmm.mostProbablePath(unannotated) 33 | evaluator.add(tagged = tags, gold = sentence.map(_.tag)) 34 | } 35 | 36 | evaluator 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/POSMLP.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import de.hpi.anlp.mlp.MLPConfig 4 | import de.hpi.anlp.conll.AnnotatedToken 5 | import de.hpi.anlp.nnpos.POSMLPModel 6 | 7 | /** 8 | * Configure, train and evaluate a MLP model 9 | */ 10 | object POSMLP { 11 | 12 | /** 13 | * Train a new MLP model using the given training data and states. The configuration can be adjusted in this function. 
14 | */ 15 | def train(trainDocuments: Iterable[List[AnnotatedToken]], states: List[String]) = { 16 | val NUM_EPOCHS = 1000 17 | val EPS = 0.00001 18 | val learningRate = 0.01 19 | val hiddenLayers = Array[Int]() 20 | val momentum = 0.1 21 | val activationF = (x: Double) => 1.0 / (1.0 + Math.exp(-0.8 * x)) 22 | 23 | val config = MLPConfig(momentum, learningRate, hiddenLayers, NUM_EPOCHS, EPS, activationF) 24 | 25 | POSMLPModel.fit(config, states, trainDocuments, preW = 2, postW = 2) 26 | } 27 | 28 | /** 29 | * Evaluate a given MLP model on the test data set and its gold standart 30 | */ 31 | def evaluate(mlp: POSMLPModel, testDocuments: Iterable[List[AnnotatedToken]], states: List[String]) = { 32 | val evaluator = new POSEvaluator(states) 33 | 34 | testDocuments.foreach { sentence => 35 | val unannotated = sentence.map(_.token) 36 | val tags = mlp.output(unannotated) 37 | evaluator.add(tagged = tags, gold = sentence.map(_.tag)) 38 | } 39 | evaluator 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/POSRBM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import java.io.File 4 | import de.hpi.anlp.conll.AnnotatedToken 5 | import de.hpi.anlp.utils.Word2VecDataSetIterator 6 | import org.apache.commons.math3.random.MersenneTwister 7 | import org.deeplearning4j.eval.Evaluation 8 | import org.deeplearning4j.models.featuredetectors.rbm.RBM 9 | import org.deeplearning4j.models.word2vec.Word2Vec 10 | import org.deeplearning4j.nn.api.OptimizationAlgorithm 11 | import org.deeplearning4j.nn.conf.NeuralNetConfiguration 12 | import org.deeplearning4j.nn.layers.factory.LayerFactories 13 | import org.deeplearning4j.nn.multilayer.MultiLayerNetwork 14 | import org.deeplearning4j.nn.weights.WeightInit 15 | import org.deeplearning4j.text.inputsanitation.InputHomogenization 16 | import org.deeplearning4j.text.sentenceiterator.{SentencePreProcessor, FileSentenceIterator} 17 | import org.deeplearning4j.text.tokenization.tokenizerfactory.UimaTokenizerFactory 18 | import org.deeplearning4j.util.SerializationUtils 19 | import org.nd4j.linalg.api.activation.Activations 20 | import org.nd4j.linalg.api.ndarray.INDArray 21 | import org.nd4j.linalg.lossfunctions.LossFunctions 22 | import org.nd4j.linalg.netlib.SimpleNetlibBlas 23 | 24 | /** 25 | * Configure, train and evaluate RBM based POS taggers 26 | */ 27 | object POSRBM { 28 | // Size of the unsupervised news corpus to use. 
Should be either 1M, 10K or 300K 29 | val vecTrainSize = "1M" 30 | 31 | // Window size to train on 32 | val windowSize = 5 33 | 34 | // Word vector size used during the word2vec training 35 | val wordVecLayers = 50 36 | 37 | /** 38 | * Load a word2vec model from disc 39 | */ 40 | def loadWordVectorModel() = { 41 | SerializationUtils.readObject(new File(s"output/word2vec_$vecTrainSize.model")).asInstanceOf[Word2Vec] 42 | } 43 | 44 | /** 45 | * Load a neural network from disc 46 | */ 47 | def loadNeuralNetwork(fileName: String) = { 48 | SerializationUtils.readObject(new File(fileName)).asInstanceOf[MultiLayerNetwork] 49 | } 50 | 51 | /** 52 | * Store a word2vec instance to disc for later retrieval 53 | */ 54 | def storeWordVectorModel(model: Word2Vec) = { 55 | SerializationUtils.saveObject(model, new File(s"output/word2vec_$vecTrainSize.model")); 56 | } 57 | 58 | /** 59 | * Instanciate a default sentence preprocessor and apply standart input homogenization 60 | */ 61 | private def sentencePreprocessor = new SentencePreProcessor() { 62 | val sentenceFileRx = "(?s)^[0-9]+\\s(.*)$" r 63 | 64 | override def preProcess(sentenceLine: String): String = { 65 | sentenceLine match { 66 | case sentenceFileRx(sentence) => 67 | new InputHomogenization(sentence).transform() 68 | case _ => 69 | throw new Exception("Invalid input line.") 70 | } 71 | } 72 | } 73 | 74 | /** 75 | * Train a new word2vec model on the given news corpus 76 | */ 77 | private def trainWordVectorModel() = { 78 | val file = new File(s"assets/deu_news_2010_$vecTrainSize-text/deu_news_2010_$vecTrainSize-sentences.txt") 79 | 80 | val sentenceIterator = new FileSentenceIterator(sentencePreprocessor, file) 81 | 82 | val t = new UimaTokenizerFactory() 83 | val vec = new Word2Vec.Builder() 84 | .minWordFrequency(5) 85 | .windowSize(windowSize) 86 | .layerSize(wordVecLayers) 87 | .iterate(sentenceIterator) 88 | .tokenizerFactory(t) 89 | .build() 90 | 91 | vec.fit() 92 | vec 93 | } 94 | 95 | /** 96 | * Train a RBM on the training data set either using an existing word2vec model or creating a new one. 
THis is the 97 | * place to configure the RBM 98 | */ 99 | def train(trainDocuments: Iterable[List[AnnotatedToken]], states: List[String]) = { 100 | val vec = trainWordVectorModel() 101 | storeWordVectorModel(vec) 102 | // val vec = loadWordVectorModel() 103 | 104 | println("Finished Word2Vec!") 105 | 106 | printf("Sim('fernsehen', 'familie') = %f\n", vec.similarity("fernsehen", "familie")); 107 | 108 | val fetcher = new Word2VecDataSetIterator(vec, trainDocuments, states, batch = 10) 109 | val gen = new MersenneTwister(123); 110 | 111 | val layerFactory = LayerFactories.getFactory(classOf[RBM]) 112 | val conf = new NeuralNetConfiguration.Builder() 113 | .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT) 114 | .iterations(100) 115 | .rng(gen) 116 | .weightInit(WeightInit.NORMALIZED) 117 | .learningRate(0.001f) 118 | .nIn(wordVecLayers * windowSize) 119 | .nOut(states.size) 120 | .lossFunction(LossFunctions.LossFunction.MCXENT) 121 | .visibleUnit(RBM.VisibleUnit.SOFTMAX) 122 | .hiddenUnit(RBM.HiddenUnit.RECTIFIED) 123 | .layerFactory(layerFactory) 124 | .list(2) 125 | .`override`(new NeuralNetConfiguration.ConfOverride() { 126 | override def `override`(i: Int, builder: NeuralNetConfiguration.Builder) { 127 | if (i == 1) { 128 | builder.weightInit(WeightInit.ZERO); 129 | builder.activationFunction(Activations.softMaxRows()); 130 | } 131 | } 132 | }) 133 | .hiddenLayerSizes(50) 134 | .build() 135 | 136 | val network = new MultiLayerNetwork(conf) 137 | 138 | println("Started fitting network...") 139 | 140 | network.fit(fetcher) 141 | 142 | println("Finished fitting Network!") 143 | 144 | SerializationUtils.saveObject(network, new File(s"output/network_$vecTrainSize.model6")) 145 | 146 | (network, vec) 147 | } 148 | 149 | 150 | private def labelForArray(a: INDArray, statesIndex: Map[Int, String]) = { 151 | val m = SimpleNetlibBlas.iamax(a) 152 | statesIndex(m) 153 | } 154 | 155 | /** 156 | * Evaluate a given RBM model on the test data set and its gold standard. Beside the implemented evaluation of 157 | * POSEvaluator this will also execute the model specific evaluation implemented in the dl4j library. 
158 | */ 159 | def evaluate(network: MultiLayerNetwork, testDocuments: Iterable[List[AnnotatedToken]], vec: Word2Vec, states: List[String]) = { 160 | 161 | println("Started evaluating Network!") 162 | 163 | val testData = new Word2VecDataSetIterator(vec, testDocuments, states, batch = 20000).next() 164 | val predicted = network.output(testData.getFeatureMatrix) 165 | 166 | val statesIndex = states.zipWithIndex.map { 167 | case (el, i) => i -> el 168 | }.toMap 169 | 170 | val evaluator = new POSEvaluator(states) 171 | val buildInEval = new Evaluation() 172 | 173 | val predictedLabels: Seq[String] = (0 until predicted.length()).map { i => 174 | val guessRow: INDArray = predicted.getRow(i) 175 | labelForArray(guessRow, statesIndex) 176 | } 177 | 178 | val goldLabels = (0 until testData.numExamples()).map { i => 179 | val currRow: INDArray = testData.getLabels.getRow(i) 180 | labelForArray(currRow, statesIndex) 181 | } 182 | 183 | evaluator.add(predictedLabels, goldLabels) 184 | buildInEval.eval(testData.getLabels, predicted) 185 | 186 | System.out.println(buildInEval.stats()) 187 | 188 | evaluator 189 | } 190 | 191 | } 192 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/conll/ConLLDataSet.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.conll 2 | 3 | import org.deeplearning4j.datasets.fetchers.BaseDataFetcher 4 | import org.deeplearning4j.models.word2vec.Word2Vec 5 | import org.deeplearning4j.text.movingwindow.{WindowConverter, Windows} 6 | import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory 7 | import org.nd4j.linalg.dataset.DataSet 8 | import org.nd4j.linalg.util.FeatureUtil 9 | 10 | import scala.collection.JavaConversions._ 11 | 12 | /** 13 | * Data fetcher to enumerate word vectors from ConLL files 14 | */ 15 | class ConLLWordVectorDataFetcher(val vec: Word2Vec, val labels: List[String], val conLLFile: ConLLFileReader) extends BaseDataFetcher { 16 | 17 | // Iterator over the files contents 18 | val iter = conLLFile.iterator 19 | 20 | // Label index 21 | val labelIdx = labels.zipWithIndex.toMap 22 | 23 | // Tokenizer to improve tokens 24 | val factory = new DefaultTokenizerFactory() 25 | 26 | // If the requested number of examples doesn't align with the tokens in a sentence we need to save left over tokens 27 | var leftOver = Vector.empty[DataSet] 28 | 29 | /** 30 | * Fetch the next numExample tokens 31 | * @param numExamples Number of tokens 32 | */ 33 | override def fetch(numExamples: Int): Unit = { 34 | 35 | if (leftOver.size >= numExamples) { 36 | curr = DataSet.merge(leftOver.take(numExamples)) 37 | leftOver = leftOver.drop(numExamples) 38 | cursor += curr.numExamples() 39 | } else if (!iter.hasNext) { 40 | if (!leftOver.isEmpty) { 41 | curr = DataSet.merge(leftOver) 42 | leftOver = Vector.empty 43 | cursor += curr.numExamples() 44 | } 45 | } else { 46 | val list = iter.take(numExamples).flatMap { example => 47 | val words = example.map(_.token) 48 | val labels = example.map(_.tag) 49 | Windows.windows(words, vec.getWindow()).zip(labels).map { 50 | case (window, label) => 51 | val wordVector = WindowConverter.asExampleArray(window, vec, false) 52 | val labelVector = FeatureUtil.toOutcomeVector(labelIdx(label), labels.size) 53 | new DataSet(wordVector, labelVector) 54 | } 55 | } 56 | 57 | val merge = (list ++ leftOver).take(numExamples).toList 58 | 59 | curr = DataSet.merge(merge) 60 | cursor += curr.numExamples() 61 | 62 | if 
(list.hasNext) 63 | leftOver ++= list 64 | } 65 | } 66 | 67 | override def inputColumns() = 68 | vec.lookupTable().layerSize() * vec.getWindow() 69 | 70 | override def totalOutcomes() = 71 | labels.size 72 | 73 | override def hasMore() = 74 | iter.hasNext || leftOver.size > 0 75 | } 76 | 77 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/conll/ConLLFile.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.conll 2 | 3 | import java.io.{File, PrintWriter} 4 | import scala.io.Source 5 | 6 | /** 7 | * A token and its tag 8 | */ 9 | case class AnnotatedToken(token: String, tag: String) 10 | 11 | /** 12 | * Writer helper to write a sentence and its annotated tags into a ConLL file format. This allows external evaluators 13 | * based on that format to read the output 14 | * 15 | */ 16 | class ConLLFileWriter(fileName: String) { 17 | var openedWriter: Option[PrintWriter] = Some(new PrintWriter(new File(fileName))) 18 | 19 | /** 20 | * Write sentence and its tag to file. One token and tag per line 21 | */ 22 | def write(sentence: Seq[String], annotations: Seq[String]): Boolean = { 23 | openedWriter.map { writer => 24 | sentence.zip(annotations).map { 25 | case (word, annotation) => 26 | writer.println(word + "\t" + annotation) 27 | } 28 | writer.println("") // add an empty line to complete the sentence 29 | 30 | true 31 | } getOrElse false 32 | } 33 | 34 | def close() = { 35 | openedWriter.map { writer => 36 | writer.flush() 37 | writer.close() 38 | } 39 | openedWriter = None 40 | } 41 | } 42 | 43 | /** 44 | * Helper class to iterate through a ConLL data set file. 45 | */ 46 | class ConLLFileReader(fileName: String) extends Iterable[List[AnnotatedToken]] { 47 | override def iterator = new Iterator[List[AnnotatedToken]] { 48 | val lineIt = Source.fromFile(fileName).getLines 49 | var nextVal = readNextFromInput() 50 | var readLines = 0 51 | 52 | override def hasNext: Boolean = nextVal.isDefined 53 | 54 | override def next(): List[AnnotatedToken] = { 55 | nextVal match { 56 | case Some(value) => 57 | nextVal = readNextFromInput() 58 | value 59 | case _ => 60 | throw new NoSuchElementException("next on empty iterator") 61 | } 62 | } 63 | 64 | private def readNextFromInput(): Option[List[AnnotatedToken]] = { 65 | val currentTokens = lineIt.takeWhile(_.trim != "") 66 | if (currentTokens.isEmpty) 67 | None 68 | else { 69 | val cs = currentTokens.toList 70 | val annotated = cs.map { current => 71 | readLines += 1 72 | current.split('\t') match { 73 | case Array(token, label) => 74 | AnnotatedToken(token, label) 75 | case _ => 76 | throw new Exception(s"Invalid line #$readLines in ConLL file. Line content: '$current'") 77 | } 78 | } 79 | readLines += 1 80 | Some(annotated) 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/hmm/ConstantSmoothedHMM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.hmm 2 | 3 | import scala.collection.mutable 4 | 5 | /** 6 | * Extension of the base HMM implementation to use constant smoothing. 
This is especially useful for words and tag word 7 | * combinations that were not seen during training 8 | */ 9 | class ConstantSmoothedHMM(states: List[String], n: Int = 2, smoothingConstant: Int = 1) extends HMM(states, n) { 10 | override def calculateStartProbabilities(starts: Array[Double]) = { 11 | val sum = starts.sum + states.size * smoothingConstant 12 | starts.map(startCount => (startCount + smoothingConstant)/ sum) 13 | } 14 | 15 | override def calculateTransitionProbabilities(transitions: Array[Double]) = { 16 | val sum = transitions.sum + states.size * smoothingConstant 17 | transitions.map(transitionCount => (transitionCount + smoothingConstant) / sum) 18 | } 19 | 20 | override def calculateEmissionProbabilities(emissions: Array[mutable.Map[String, Int]]) = { 21 | emissions.map { emissionsForTag => 22 | val sum = emissionsForTag.values.sum + states.size * smoothingConstant 23 | emissionsForTag.mapValues { tokenFreq => 24 | (tokenFreq + smoothingConstant).toDouble / sum 25 | }.toMap.withDefaultValue(smoothingConstant.toDouble / sum) 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/hmm/HMM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.hmm 2 | 3 | import de.hpi.anlp.conll.AnnotatedToken 4 | import scala.collection.mutable 5 | 6 | /** 7 | * HMM to be configured. After train is called a new trainedHMM instance is created 8 | */ 9 | class HMM(states: List[String], n: Int = 2) { 10 | val stateIdx = states.zipWithIndex.toMap 11 | 12 | /** 13 | * Train traverses the input data, collects statistics and calculates probabilities for hidden states and outputs. 14 | * Those can then be used in the HMM to predict tags for unseen sentences 15 | */ 16 | def train(annotatedData: Iterable[List[AnnotatedToken]]) = { 17 | val transitions = new Array[Double](math.pow(states.size, n).toInt) 18 | 19 | val emissions = Array.fill(states.size)(mutable.HashMap.empty[String, Int].withDefaultValue(0)) 20 | 21 | val starts = new Array[Double](states.size) 22 | 23 | annotatedData.foreach { annotated => 24 | annotated.headOption.map { first => 25 | val sIdx = stateIdx(first.tag) 26 | starts(sIdx) += 1 27 | } 28 | 29 | val idxs = annotated.map { 30 | case AnnotatedToken(token, tag) => 31 | val idx = stateIdx(tag) 32 | emissions(idx).update(token, emissions(idx)(token) + 1) 33 | idx 34 | } 35 | 36 | idxs.sliding(n, 1).foreach { window => 37 | if (window.size == n) { 38 | val idx = window.foldLeft(0)((p, s) => p * states.size + s) 39 | transitions(idx) = transitions(idx) + 1 40 | } 41 | } 42 | } 43 | 44 | val emissionProbs = calculateEmissionProbabilities(emissions) 45 | val transitionProbs = calculateTransitionProbabilities(transitions) 46 | val startProbs = calculateStartProbabilities(starts) 47 | new TrainedHMM(states, n, emissionProbs, transitionProbs, startProbs) 48 | } 49 | 50 | /** 51 | * Given the number of seen starts, calculate the start probabilities 52 | */ 53 | def calculateStartProbabilities(starts: Array[Double]) = { 54 | val sum = starts.sum 55 | starts.map(_ / sum) 56 | } 57 | 58 | /** 59 | * Given the transition statistics, calculate transition probabilities 60 | */ 61 | def calculateTransitionProbabilities(transitions: Array[Double]) = { 62 | val sum = transitions.sum 63 | transitions.map(_ / sum) 64 | } 65 | 66 | /** 67 | * Given emission statistics calculate emission probabilities for each hidden state 68 | */ 69 | def 
calculateEmissionProbabilities(emissions: Array[mutable.Map[String, Int]]) = { 70 | emissions.map { emissionsForTag => 71 | val sum = emissionsForTag.values.sum 72 | emissionsForTag.mapValues { tokenFreq => 73 | tokenFreq.toDouble / sum 74 | }.toMap.withDefaultValue(0.0) 75 | } 76 | } 77 | } 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/hmm/TrainedHMM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.hmm 2 | 3 | import scala.collection.breakOut 4 | 5 | /** 6 | * A trained HMM which can be used to predict tags on a given sentence. Its a result of the training of a configured HMM 7 | */ 8 | case class TrainedHMM(states: List[String], 9 | n: Int, 10 | underlyingEmissionProbs: Array[Map[String, Double]], 11 | underlyingTransitionProbs: Array[Double], 12 | underlyingStartProbs: Array[Double]) { 13 | 14 | def incommingProbabilities(trellis: Vector[Array[Double]], sidx: Int, trellisLevel: Int, emissionPs: Array[Double]): Map[Int, Double] = { 15 | (0 until states.size).map { prevStateIdx => 16 | val probability = trellis(trellisLevel - 1)(prevStateIdx) + 17 | math.log(transitionProbability(prevStateIdx, sidx)) + 18 | math.log(emissionPs(sidx)) 19 | 20 | prevStateIdx -> probability 21 | }(breakOut) 22 | } 23 | 24 | /** 25 | * Viterbi implementation for graph traversal. Used to find most probable hidden states for the observed outputs. 26 | */ 27 | def viterbi(observations: List[String]) = { 28 | observations match { 29 | case Nil => 30 | 0.0 -> Vector.empty 31 | case firstObservation :: remainingObservations => 32 | var paths = states.toArray.map(state => Vector(state)) 33 | val initialNodeLevel = states.zipWithIndex.toArray.map { 34 | case (state, idx) => 35 | math.log(startProbability(idx)) + math.log(emissionProbability(idx, firstObservation)) 36 | } 37 | 38 | var trellis = Vector(initialNodeLevel) 39 | 40 | remainingObservations.zipWithIndex.map { 41 | case (observation, trellisLevel) => 42 | val nextNodeLevel = new Array[Double](states.size) 43 | val updatedPaths = new Array[Vector[String]](states.size) 44 | var emissionPs: Array[Double] = (0 until states.size).map { idx => 45 | emissionProbability(idx, observation) 46 | }(breakOut) 47 | 48 | if (emissionPs.forall(_ == 0)) 49 | emissionPs = Array.fill(states.size)(1.0) 50 | 51 | states.zipWithIndex.map { 52 | case (state, sidx) => 53 | val (bestState, bestProb) = incommingProbabilities(trellis, sidx, trellisLevel + 1, emissionPs).maxBy(_._2) 54 | nextNodeLevel.update(sidx, bestProb) 55 | updatedPaths.update(sidx, paths(bestState) :+ state) 56 | } 57 | 58 | trellis :+= nextNodeLevel 59 | paths = updatedPaths 60 | } 61 | 62 | val (bestProb, bestState) = trellis.last.zipWithIndex.maxBy(_._1) 63 | bestProb -> paths(bestState) 64 | } 65 | } 66 | 67 | def emissionProbability(sidx: Int, observation: String): Double = 68 | underlyingEmissionProbs(sidx)(observation) 69 | 70 | def transitionProbability(from: Int, to: Int): Double = 71 | underlyingTransitionProbs(from * states.size + to) 72 | 73 | def startProbability(sidx: Int): Double = 74 | underlyingStartProbs(sidx) 75 | 76 | def mostProbablePath(observations: List[String]) = viterbi(observations) 77 | } -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLP.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import 
scala.util.{Try, Success, Failure} 4 | import org.apache.log4j.Logger 5 | import de.hpi.anlp.utils.ScalaMLTypes._ 6 | 7 | class MLP( 8 | config: MLPConfig, 9 | xt: Array[Array[Double]], 10 | labels: DblMatrix) 11 | (implicit mlpObjective: MLPTask) { 12 | 13 | private val logger = Logger.getLogger("MLP") 14 | 15 | // Flag that indicates that the training converged toward a definite model 16 | private[this] var converged = false 17 | 18 | /** 19 | * Model for the Multi-layer Perceptron of type MLPModel 20 | */ 21 | val model: Option[MLPModel] = train match { 22 | case Success(_model) => 23 | Some(_model) 24 | case Failure(e) => 25 | logger.error("MLP.model ", e) 26 | None 27 | } 28 | 29 | /** 30 | * Test whether the model has converged 31 | */ 32 | final def hasConverged: Boolean = converged 33 | 34 | /** 35 | * Define the predictive function of the classifier or regression 36 | */ 37 | def output: PartialFunction[Array[Double], DblVector] = { 38 | case x: Array[Double] if (!x.isEmpty && model != None && x.size == xt(0).size) => { 39 | 40 | Try(model.get.getOutput(x)) match { 41 | case Success(y) => y 42 | case Failure(e) => { 43 | logger.error("MLP ", e) 44 | Array.empty[Double] 45 | } 46 | } 47 | } 48 | } 49 | 50 | 51 | /** 52 | * Computes the accuracy of the training session. The accuracy is estimated 53 | * as the percentage of the training data points for which the square root of 54 | * the sum of squares error, normalized by the size of the training set exceed a 55 | * predefined threshold 56 | */ 57 | final def accuracy(threshold: Double): Option[Double] = model.map(m => { 58 | 59 | // counts the number of data points for were correctly classified 60 | val nCorrects = xt.zip(labels) 61 | .foldLeft(0)((s, xtl) => { 62 | 63 | // Get the output layer for this input xt. 64 | val output = model.get.getOutput(xtl._1) 65 | 66 | // Compute the sum of squared error while excluding bias element 67 | val _sse = xtl._2.zip(output.drop(1)) 68 | .foldLeft(0.0)((err, tp) => { 69 | val diff = tp._1 - tp._2 70 | err + diff * diff 71 | }) * 0.5 72 | 73 | // Compute the least square error and adjusts it for the number of output variables. 
74 | val error = Math.sqrt(_sse) / (output.size - 1) 75 | if (error < threshold) s + 1 else s 76 | }) 77 | 78 | // returns the percentage of observations correctly classified 79 | nCorrects.toDouble / xt.size 80 | }) 81 | 82 | /** 83 | * Training method for the Multi-layer perceptron 84 | */ 85 | private def train: Try[MLPModel] = { 86 | Try { 87 | val _model = new MLPModel(config, xt(0).size, labels(0).size)(mlpObjective) 88 | 89 | // Scaling or normalization factor for the sum of the squared error 90 | val errScale = 1.0 / (labels(0).size * xt.size) 91 | 92 | // Apply the exit condition for this online training strategy 93 | // The convergence criteria selected is the reconstruction error 94 | // generated during an epoch adjusted to the scaling factor and compare 95 | // to the predefined criteria config.eps 96 | converged = Range(0, config.numEpochs).find(epoch => { 97 | val e = xt.toArray.zip(labels).foldLeft(0.0)((s, xtlbl) => 98 | s + _model.trainEpoch(xtlbl._1, xtlbl._2) 99 | ) * errScale 100 | if (epoch % 10 == 0) 101 | println("SSE: " + e) 102 | e < config.eps 103 | }) != None 104 | _model 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPConfig.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | /** 4 | * Configuration of MLP. If params are out of range an exception is thrown 5 | * @param momentum Momentum parameter used to adjust the value of the gradient of the weights 6 | * with previous value (smoothing) 7 | * @param learningRate Learning rate ]0, 1] used in the computation of the gradient of the weights 8 | * during training 9 | * @param hidLayers Sequence of number of neurons for the hidden layers 10 | * @param numEpochs Number of epochs or iterations allowed to train the weights/model 11 | * @param eps Convergence criteria used as exit condition of the convergence toward optimum 12 | * weights that minimize the sum of squared error 13 | * @param activation Activation function (sigmoid or tanh) that computes the output of hidden 14 | * layers during forward propagation 15 | * 16 | */ 17 | case class MLPConfig( 18 | momentum: Double, 19 | learningRate: Double, 20 | hidLayers: Array[Int], 21 | numEpochs: Int, 22 | eps: Double = 1e-17, 23 | activation: Double => Double) { 24 | 25 | /** 26 | * Id of output layer 27 | */ 28 | final def outLayerId: Int = 29 | if (hidLayers.isEmpty) 30 | 1 31 | else 32 | hidLayers.size + 1 33 | 34 | /** 35 | * # hidden layers in network 36 | */ 37 | def nHiddens = 38 | if (hidLayers.isEmpty) 39 | 0 40 | else 41 | hidLayers.size 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPConnection.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import de.hpi.anlp.utils.ScalaMLTypes.MLPTask 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Class that defines the connection between two consecutive (or sequential layers) 9 | * in a Multi-layer Perceptron. 
The connections is composed of all the synapses between 10 | * any neuron or variable of each layer.The Synapse is defined as a nested tuple(Double, Double) 11 | * tuple (weights, deltaWeights) 12 | */ 13 | class MLPConnection( 14 | config: MLPConfig, 15 | src: MLPLayer, 16 | dst: MLPLayer) 17 | (implicit mlpObjective: MLPTask) { 18 | 19 | private val BETA = 0.01 20 | 21 | /** 22 | * Synapse defined as a tuple of [weight, gradient(weights)] 23 | */ 24 | type MLPSynapse = (Double, Double) 25 | 26 | /* 27 | * Initialize the matrix (Array of Array) of Synapse by generating 28 | * a random value between 0 and BETA 29 | */ 30 | private[this] val synapses: Array[Array[MLPSynapse]] = Array.tabulate(dst.len)(n => 31 | if (n > 0) 32 | Array.fill(src.len)((Random.nextDouble * BETA, 0.0)) 33 | else 34 | Array.fill(src.len)((1.0, 0.0))) 35 | 36 | /** 37 | * Implement the forward propagation of input value. The output 38 | * value depends on the conversion selected for the output. If the output or destination 39 | * layer is a hidden layer, then the activation function is applied to the dot product of 40 | * weights and values. If the destination is the output layer, the output value is just 41 | * the dot product weights and values 42 | */ 43 | def connectionForwardPropagation: Unit = { 44 | // Iterates over all the synapsed except the first or bian selement 45 | val _output = synapses.drop(1).map(x => { 46 | // Compute the dot product 47 | val sum = x.zip(src.output).foldLeft(0.0)((s, xy) => s + xy._1._1 * xy._2) 48 | 49 | // Applies the activation function if this is a hidden layer (not output) 50 | if (!isOutLayer) config.activation(sum) else sum 51 | }) 52 | 53 | // Apply the objective function (SoftMax,...) to the output layer 54 | val out = if (isOutLayer) mlpObjective(_output) else _output 55 | out.copyToArray(dst.output, 1) 56 | } 57 | 58 | /** 59 | * Access the identifier for the source and destination layers 60 | */ 61 | @inline 62 | final def getLayerIds: (Int, Int) = (src.id, dst.id) 63 | 64 | @inline 65 | final def getSynapses: Array[Array[MLPSynapse]] = synapses 66 | 67 | /** 68 | * Implement the back propagation of output error (target - output). The method uses 69 | * the derivative of the logistic function to compute the delta value for the output of 70 | * the source layer 71 | */ 72 | def connectionBackpropagation: Unit = 73 | Range(1, src.len).foreach(i => { 74 | val err = Range(1, dst.len).foldLeft(0.0)((s, j) => 75 | s + synapses(j)(i)._1 * dst.delta(j)) 76 | 77 | // The delta value is computed as the derivative of the 78 | // output value adjusted for the back-propagated error, err 79 | src.delta(i) = src.output(i) * (1.0 - src.output(i)) * err 80 | }) 81 | 82 | 83 | /** 84 | * Implement the update of the synapse (weight, grad weight) following the 85 | * back propagation of output error. This method is called during training. 
86 | */ 87 | def connectionUpdate: Unit = 88 | // Iterates through all element of the destination layer except the bias element 89 | Range(1, dst.len).foreach(i => { 90 | val delta = dst.delta(i) 91 | 92 | // Compute all the synapses (weight, gradient weight) between 93 | // the destination elements (index i) and the source elements (index j) 94 | Range(0, src.len).foreach(j => { 95 | val _output = src.output(j) 96 | val oldSynapse = synapses(i)(j) 97 | // Compute the gradient with the delta 98 | val grad = config.learningRate * delta * _output 99 | // Apply the gradient adjustment formula 100 | val deltaWeight = grad + config.momentum * oldSynapse._2 101 | // Update the synapse 102 | synapses(i)(j) = (oldSynapse._1 + deltaWeight, grad) 103 | }) 104 | }) 105 | 106 | /** 107 | * Convenient method to update the values of a synapse while 108 | * maintaining immutability 109 | */ 110 | private def update(i: Int, j: Int, x: Double, dx: Double): Unit = { 111 | val old = synapses(i)(j) 112 | synapses(i)(j) = (old._1 + x, dx) 113 | } 114 | 115 | private def isOutLayer: Boolean = dst.id == config.outLayerId 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPLayer.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import de.hpi.anlp.utils.ScalaMLTypes.DblVector 4 | 5 | /** 6 | * A MLP layer is built using the input vector and add an extra element to account for the bias w0 7 | */ 8 | class MLPLayer(val id: Int, val len: Int) { 9 | 10 | /** 11 | * Values of the output vector 12 | */ 13 | val output = new DblVector(len) 14 | 15 | /** 16 | * Difference for the propagated error on the source or upstream 17 | */ 18 | val delta = new DblVector(len) 19 | output.update(0, 1.0) 20 | 21 | /** 22 | * Initialize the value of the input for this MLP layer 23 | */ 24 | def set(_x: DblVector): Unit = { 25 | _x.copyToArray(output, 1) 26 | } 27 | 28 | /** 29 | * Compute the sum of squared error of the elements of this MLP layer 30 | */ 31 | final def sse(labels: DblVector): Double = { 32 | var _sse = 0.0 33 | output.drop(1).zipWithIndex.foreach { 34 | case (on, idx) => { 35 | val err = labels(idx) - on 36 | delta.update(idx + 1, on * (1.0 - on) * err) 37 | _sse += err * err 38 | } 39 | } 40 | _sse * 0.5 // normalized C 41 | } 42 | 43 | /** 44 | * Is this layer the output layer 45 | */ 46 | final def isOutput(lastId: Int): Boolean = id == lastId 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPModel.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import de.hpi.anlp.utils.Model 4 | import de.hpi.anlp.utils.ScalaMLTypes.{DblVector, MLPTask} 5 | 6 | /** 7 | * MLP model represents a MLP configuration and instance. 
A MLP model consists of MLPLayer s (layer of the MLP model), 8 | * MLPSynapse s (connection between two elements) and MLPConnections (container of synapses of a layer) 9 | */ 10 | class MLPModel( 11 | config: MLPConfig, 12 | nInputs: Int, 13 | nOutputs: Int)( 14 | implicit mlpObjective: MLPTask) extends Model { 15 | 16 | val topology = 17 | if (config.nHiddens == 0) 18 | Array[Int](nInputs, nOutputs) // if no hidden layer is set, there is only an output layer 19 | else 20 | Array[Int](nInputs) ++ config.hidLayers ++ Array[Int](nOutputs) 21 | 22 | /* 23 | * Aarrays of layers for the topology 24 | */ 25 | val layers: Array[MLPLayer] = topology.zipWithIndex 26 | .map { 27 | case (t, idx) => 28 | new MLPLayer(idx, t + 1) 29 | } 30 | 31 | /* 32 | * Create a array of connection between layer. A connection is 33 | * made of multiple synapses. 34 | */ 35 | val connections = Range(0, layers.size - 1).map(n => 36 | new MLPConnection(config, layers(n), layers(n + 1))(mlpObjective)).toArray 37 | 38 | /** 39 | * Alias for the input or first layer in the network 40 | */ 41 | @inline 42 | def inLayer: MLPLayer = layers.head 43 | 44 | /** 45 | * Alias for the last layer (output layer) in the network 46 | */ 47 | @inline 48 | def outLayer: MLPLayer = layers.last 49 | 50 | /** 51 | * Training cycle: Forward propagation of input, back propagation of error and the re-computation of the weight and 52 | * gradient of the elements. 53 | */ 54 | def trainEpoch(x: DblVector, y: DblVector): Double = { 55 | // Initialize the input layer 56 | inLayer.set(x) 57 | // Apply the forward progapation of input to all the connections 58 | // starting with the input layer 59 | connections.foreach(_.connectionForwardPropagation) 60 | 61 | // Compute the sum of squared errors 62 | val _sse = sse(y) 63 | 64 | // Create a back iterator 65 | val bckIterator = connections.reverseIterator 66 | 67 | // Apply the error back propagation to all the connections 68 | // starting with the output lauer 69 | bckIterator.foreach(_.connectionBackpropagation) 70 | 71 | // Finally update the connections (weigths and grad weights) of synapses 72 | connections.foreach(_.connectionUpdate) 73 | _sse 74 | } 75 | 76 | 77 | /** 78 | * Compute the mean squares error for the network as the sum 79 | * of the mean squares error for each output value. 80 | */ 81 | @inline 82 | final def sse(label: DblVector): Double = 83 | outLayer.sse(label) 84 | 85 | /** 86 | * Compute the output values for the network using the forward propagation 87 | */ 88 | def getOutput(x: DblVector): DblVector = { 89 | inLayer.set(x) 90 | 91 | connections.foreach(_.connectionForwardPropagation) 92 | 93 | outLayer.output 94 | } 95 | 96 | /** 97 | * Write the content of this model (weights) into a file 98 | */ 99 | override def saveToFile: Boolean = { 100 | val content = new StringBuilder(s"$nInputs,") 101 | if (config.nHiddens != 0) 102 | content.append(config.hidLayers.mkString(",")) 103 | 104 | content.append(s"$nOutputs\n") 105 | connections.foreach(c => { 106 | content.append(s"${c.getLayerIds._1},${c.getLayerIds._2}:") 107 | content.append(c.getSynapses.map(s => s"${s.mkString(",")}\n")) 108 | }) 109 | write(content.toString) 110 | } 111 | 112 | /** 113 | * Textual description of the model for Multi-layer Perceptron. The representation 114 | * include the description of the connections and layers. 
115 | */ 116 | override def toString: String = { 117 | val buf = new StringBuilder 118 | connections.foreach(buf.append(_)) 119 | layers.foreach(buf.append(_)) 120 | buf.toString 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPTasks.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import de.hpi.anlp.utils.ScalaMLTypes._ 4 | 5 | /** 6 | * Class for the Regression objective for the MLP. This implementation uses softmax 7 | */ 8 | object MLPTasks { 9 | def MLPMultiClassifier(y: DblVector): DblVector = { 10 | val softmaxValues = new DblVector(y.size) 11 | val expY = y.map(Math.exp(_)) 12 | val expYSum = expY.sum 13 | expY.map(_ / expYSum).copyToArray(softmaxValues, 1) 14 | softmaxValues 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/nnpos/POSMLP.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.nnpos 2 | 3 | import de.hpi.anlp.mlp.{MLP, MLPConfig, MLPTasks} 4 | import de.hpi.anlp.conll.AnnotatedToken 5 | import de.hpi.anlp.utils.{SentenceUtils, WordDictionary, TagDictionary} 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | case class POSMLPModel(mlp: MLP, tags: TagDictionary, dict: WordDictionary, preW: Int, postW: Int) { 9 | def output(sentence: List[String]): Seq[String] = { 10 | SentenceUtils.slidingWindow(sentence, preW, postW).map { window => 11 | val features = dict.words2vec(window) 12 | labelForArray(mlp.output(features)) 13 | }.toSeq 14 | } 15 | 16 | private def imax(a: Array[Double]): Int = { 17 | var mv: Option[Double] = None 18 | var mi: Option[Int] = None 19 | var i = 1 20 | while (i < a.size) { 21 | if (mv.isEmpty || a(i) > mv.get) { 22 | mi = Some(i) 23 | mv = Some(a(i)) 24 | } 25 | i += 1 26 | } 27 | mi getOrElse 0 28 | } 29 | 30 | private def labelForArray(a: Array[Double]): String = { 31 | tags.revIdx(imax(a) - 1) 32 | } 33 | } 34 | 35 | object POSMLPModel { 36 | 37 | private def calculateXY(annotatedData: Iterable[List[AnnotatedToken]], preW: Int, postW: Int, dict: WordDictionary, tags: TagDictionary) = { 38 | val X = new ArrayBuffer[Array[Double]]() 39 | val y = new ArrayBuffer[Array[Double]]() 40 | 41 | annotatedData.foreach { annotated => 42 | SentenceUtils.slidingWindow(annotated.view.map(_.token), preW, postW).foreach { window => 43 | if (window.size == preW + 1 + postW) { 44 | X.append(dict.words2vec(window)) 45 | } 46 | } 47 | 48 | annotated.foreach { 49 | case AnnotatedToken(_, tag) => 50 | y.append(tags.tag2vec(tag)) 51 | } 52 | } 53 | (X.toArray, y.toArray) 54 | } 55 | 56 | def fit(mlpCfg: MLPConfig, states: List[String], annotatedData: Iterable[List[AnnotatedToken]], preW: Int = 2, postW: Int = 2) = { 57 | 58 | val tags = TagDictionary(states) 59 | val dict = WordDictionary.build(tags, annotatedData) 60 | println("Finished creation word dictionary. Size: " + dict.underlying.size) 61 | val (features, labels) = calculateXY(annotatedData, preW, postW, dict, tags) 62 | println("Finished calculating features. 
Size: " + features.size) 63 | 64 | val mlp = new MLP(mlpCfg, features, labels)(MLPTasks.MLPMultiClassifier _) 65 | 66 | POSMLPModel(mlp, tags, dict, preW, postW) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/FileUtils.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | import org.apache.log4j.Logger 4 | import scala.io.Source._ 5 | import scala.util.{Failure, Success, Try} 6 | 7 | /** 8 | * Read and write content from and to a file 9 | */ 10 | object FileUtils { 11 | private val logger = Logger.getLogger("FileUtils") 12 | 13 | /** 14 | * Read the content of a file as a String 15 | */ 16 | def read(toFile: String, className: String): Option[String] = 17 | Try(fromFile(toFile).mkString) match { 18 | case Success(content) => 19 | Some(content) 20 | case Failure(e) => 21 | logger.error(s"Reading $className failed. File $toFile", e) 22 | None 23 | } 24 | 25 | /** 26 | * Write the content into a file. The content is defined as a string. 27 | */ 28 | def write(content: String, pathName: String, className: String): Boolean = { 29 | import java.io.PrintWriter 30 | 31 | var printWriter: Option[PrintWriter] = None 32 | var status = false 33 | Try { 34 | printWriter = Some(new PrintWriter(pathName)) 35 | printWriter.map(_.write(content)) 36 | status = true 37 | } 38 | match { 39 | // Catch and display exception description and return false 40 | case Failure(e) => { 41 | logger.error(s"$className.write failed for $pathName", e) 42 | 43 | if (printWriter != None) { 44 | Try(printWriter.map(_.close)) match { 45 | case Success(res) => res 46 | case Failure(e) => 47 | logger.error(s"$className.write Failed for $pathName", e) 48 | } 49 | } 50 | } 51 | case Success(s) => {} 52 | } 53 | status 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/LCCFileReader.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | import scala.io.Source 4 | 5 | /** 6 | * File reader to process data sets from the NLP institute of the university of Leipzig 7 | */ 8 | class LCCFileReader(fileName: String) extends Iterable[String] { 9 | /** 10 | * Every sentence is preceeded by its id. We are going to strip the id since we are only interested in the sentence 11 | */ 12 | val sentenceFileRx = "(?s)^[0-9]+\\s(.*)$" r 13 | 14 | override def iterator = new Iterator[String] { 15 | val lineIt = Source.fromFile(fileName).getLines 16 | var nextVal = readNextFromInput() 17 | var readLines = 0 18 | 19 | override def hasNext: Boolean = nextVal.isDefined 20 | 21 | override def next(): String = { 22 | nextVal match { 23 | case Some(value) => 24 | nextVal = readNextFromInput() 25 | value 26 | case _ => 27 | throw new NoSuchElementException("next on empty iterator") 28 | } 29 | } 30 | 31 | private def readNextFromInput(): Option[String] = { 32 | if (lineIt.hasNext) { 33 | // Read next sentence and strip the id from it 34 | val current = lineIt.next() 35 | current match { 36 | case sentenceFileRx(sentence) => 37 | readLines += 1 38 | Some(sentence) 39 | case _ => 40 | throw new Exception(s"Invalid line #$readLines in LCC file. 
Line content: '$current'") 41 | } 42 | } 43 | else 44 | None 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/Model.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | trait Model { 4 | /** 5 | * Write the model parameters associated with this object into a file 6 | */ 7 | protected def write(content: String): Boolean = 8 | FileUtils.write(content, Model.RELATIVE_PATH + getClass.getSimpleName, getClass.getSimpleName) 9 | 10 | /** 11 | * This method has to be overridden for a model to be saved into a file 12 | */ 13 | def saveToFile: Boolean = 14 | false 15 | } 16 | 17 | object Model { 18 | private val RELATIVE_PATH = "models/" 19 | 20 | /** 21 | * Read the model parameters from the file named after the given `className` 22 | */ 23 | def read(className: String): Option[String] = 24 | FileUtils.read(RELATIVE_PATH + className, className) 25 | } -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/ScalaMLTypes.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | /** 4 | * Type aliases mapping ML types to native Scala types 5 | */ 6 | object ScalaMLTypes { 7 | type DblMatrix = Array[Array[Double]] 8 | type DblVector = Array[Double] 9 | type MLPTask = DblVector => DblVector 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/SentenceUtils.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | /** 4 | * Helper object to construct sliding windows over a given sentence. Windows at the border of the sentence are padded with 5 | * a border token 6 | */ 7 | object SentenceUtils { 8 | val SENTENCE_BORDER = "<=BORDER=>" 9 | 10 | 11 | def slidingWindow(sentence: Seq[String], preW: Int, postW: Int) = { 12 | val pre = List.fill(preW)(SENTENCE_BORDER) 13 | 14 | val post = List.fill(postW)(SENTENCE_BORDER) 15 | 16 | (pre ++ sentence ++ post).sliding(preW + 1 + postW) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/TagDictionary.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | /** 4 | * A wrapper around tag-related helpers. 
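Given the list of possible states it assigns every tag a fixed index (e.g. with the illustrative states List("NN", "VB"), "NN" gets index 0 and "VB" index 1). 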
Creates a reverse tag index and can be used to convert a tag to a one-hot vector 5 | */ 6 | case class TagDictionary(states: List[String]) { 7 | val stateIdx = states.zipWithIndex.toMap 8 | 9 | val revIdx = states.zipWithIndex.map { 10 | case (el, i) => i -> el 11 | }.toMap 12 | 13 | val size = states.size 14 | 15 | val tag2vec: Map[String, Array[Double]] = states.zipWithIndex.map { 16 | case (state, idx) => 17 | val a = Array.fill(size)(0.0) 18 | a(idx) = 1 19 | state -> a 20 | }.toMap.withDefaultValue(Array.fill(size)(0.0)) 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/Word2VecDataSetIterator.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | import de.hpi.WindowConverter 4 | import de.hpi.anlp.conll.AnnotatedToken 5 | import org.deeplearning4j.datasets.iterator.{DataSetIterator, DataSetPreProcessor} 6 | import org.deeplearning4j.models.word2vec.Word2Vec 7 | import org.deeplearning4j.text.inputsanitation.InputHomogenization 8 | import org.deeplearning4j.text.movingwindow.Windows 9 | import org.nd4j.linalg.dataset.DataSet 10 | import org.nd4j.linalg.factory.Nd4j 11 | import org.nd4j.linalg.util.FeatureUtil 12 | 13 | /** 14 | * Allows for customization of all of the params of the iterator 15 | * @param vec the word2vec model to use 16 | * @param sentenceIter the sentence iterator to use 17 | * @param labels the possible labels 18 | * @param batch the batch size 19 | */ 20 | class Word2VecDataSetIterator(vec: Word2Vec, sentenceIter: Iterable[List[AnnotatedToken]], labels: List[String], val batch: Int = 10) extends DataSetIterator { 21 | 22 | /** 23 | * Underlying active window iterator 24 | */ 25 | var iter = windowIter() 26 | 27 | /** 28 | * Index to lookup label ids 29 | */ 30 | val labelIdx = labels.zipWithIndex.toMap 31 | 32 | /** 33 | * Data set preprocessor 34 | */ 35 | var preProcessor: Option[DataSetPreProcessor] = None 36 | 37 | /** 38 | * Returns an iterate-once collection holding windows of the given size 39 | */ 40 | def windowIter() = { 41 | var counter = 0 42 | sentenceIter.flatMap { sentence => 43 | import scala.collection.JavaConversions._ 44 | val words = sentence.map(s => new InputHomogenization(s.token).transform()) 45 | val wordLabels = sentence.map(_.tag) 46 | counter += 1 47 | if (counter % 3500 == 0) 48 | println("Processing sentence " + counter) 49 | Windows.windows(words, vec.getWindow()).zip(wordLabels).map { 50 | case (window, label) => 51 | window.setLabel(label) 52 | window 53 | } 54 | }.toList 55 | } 56 | 57 | /** 58 | * Like the standard next method but allows a 59 | * customizable number of examples returned 60 | * 61 | * @param num the number of examples 62 | * @return the next data applyTransformToDestination 63 | */ 64 | override def next(num: Int): DataSet = { 65 | synchronized { 66 | try { 67 | val windows = iter.take(num).toList 68 | 69 | iter = iter.drop(num) 70 | 71 | if (windows.isEmpty) 72 | null 73 | else { 74 | val inputs = Nd4j.create(windows.size, inputColumns()) 75 | val labelOutput = Nd4j.create(windows.size, labels.size) 76 | 77 | // Iterate over all windows to convert them to matrix format 78 | windows.zipWithIndex.foreach { 79 | case (window, row) => 80 | inputs.putRow(row, WindowConverter.asExampleMatrix(window, vec)) 81 | labelOutput.putRow(row, FeatureUtil.toOutcomeVector(labelIdx(window.getLabel), labels.size)) 82 | } 83 | 84 | val ds = new DataSet(inputs, labelOutput) 85 | 86 | 
preProcessor.foreach { pp => 87 | pp.preProcess(ds) 88 | } 89 | 90 | ds 91 | } 92 | } catch { 93 | case e: Exception => 94 | println("Exception raised: " + e.getMessage) 95 | e.printStackTrace() 96 | throw e 97 | } 98 | } 99 | } 100 | 101 | override def totalExamples(): Int = { 102 | throw new UnsupportedOperationException() 103 | } 104 | 105 | override def inputColumns(): Int = { 106 | vec.lookupTable().layerSize() * vec.getWindow() 107 | } 108 | 109 | override def totalOutcomes(): Int = { 110 | labels.size 111 | } 112 | 113 | override def reset() = { 114 | iter = windowIter() 115 | } 116 | 117 | override def cursor(): Int = { 118 | 0 119 | } 120 | 121 | 122 | override def numExamples(): Int = { 123 | 0 124 | } 125 | 126 | /** 127 | * Returns {true} if the iteration has more elements. 128 | * (In other words, returns {true} if {#next} would 129 | * return an element rather than throwing an exception.) 130 | * 131 | * @return {true} if the iteration has more elements 132 | */ 133 | override def hasNext(): Boolean = { 134 | iter.nonEmpty 135 | } 136 | 137 | /** 138 | * Returns the next element in the iteration. 139 | * 140 | * @return the next element in the iteration 141 | */ 142 | override def next(): DataSet = { 143 | next(batch) 144 | } 145 | 146 | /** 147 | * Removes from the underlying collection the last element returned 148 | * by this iterator (optional operation). This method can be called 149 | * only once per call to {@link #next}. The behavior of an iterator 150 | * is unspecified if the underlying collection is modified while the 151 | * iteration is in progress in any way other than by calling this 152 | * method. 153 | */ 154 | override def remove(): Unit = { 155 | throw new UnsupportedOperationException() 156 | } 157 | 158 | override def setPreProcessor(dataSetPreprocessor: DataSetPreProcessor): Unit = { 159 | preProcessor = Some(dataSetPreprocessor) 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/WordDictionary.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | import de.hpi.anlp.conll.AnnotatedToken 4 | import scala.collection.mutable 5 | 6 | /** 7 | * A word dictionary is a lookup table for seen words. There is a fallback for unseen words 8 | */ 9 | case class WordDictionary(underlying: scala.collection.Map[String, Array[Double]], numStates: Int) { 10 | /** 11 | * Value that gets returned if a sentence border is reached 12 | */ 13 | val nullVec = Array.fill(numStates)(0.0) 14 | 15 | /** 16 | * Value that gets returned if a word hasn't been seen during training 17 | */ 18 | val uniformVec = Array.fill(numStates)(1.0 / numStates) 19 | 20 | /** 21 | * Retrieve the vector representation of a word 22 | */ 23 | def word2vec(word: String): Array[Double] = underlying.get(word) match { 24 | case Some(vec) => vec 25 | case _ if word == SentenceUtils.SENTENCE_BORDER => nullVec 26 | case _ => uniformVec 27 | } 28 | 29 | /** 30 | * Retrieve the concatenated vector representation of a list of words 31 | */ 32 | def words2vec(words: List[String]) = 33 | Array.concat(words.map(word2vec): _*) 34 | } 35 | 36 | /** 37 | * Helper to construct a word dictionary given an input data set 38 | */ 39 | object WordDictionary { 40 | 41 | /** 42 | * Use the given annotated data to build up a word dictionary containing the probabilities of a word occurring with 43 | * each tag. 
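Each entry is the add-one smoothed relative frequency of the tag given the word, estimated from the training data. 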
Implements the Tag-Prob word representation 44 | */ 45 | def build(tags: TagDictionary, annotatedData: Iterable[List[AnnotatedToken]]): WordDictionary = { 46 | 47 | val emissions = mutable.HashMap.empty[String, Array[Double]] 48 | val counter = mutable.HashMap.empty[String, Int].withDefaultValue(0) 49 | 50 | // Iterate over the data set to count tag <-> token occurrences 51 | annotatedData.foreach { annotated => 52 | annotated.foreach { 53 | case AnnotatedToken(token, tag) => 54 | val idx = tags.stateIdx(tag) 55 | counter.update(token, counter(token) + 1) 56 | emissions.get(token) match { 57 | case Some(a) => 58 | a(idx) += 1 59 | case _ => 60 | val a = Array.fill(tags.size)(0.0) 61 | a(idx) = 1 62 | emissions += (token -> a) 63 | } 64 | } 65 | } 66 | 67 | // Calculate the emission probabilities for each word and tag combination 68 | emissions.foreach { 69 | case (token, freqs) => 70 | (0 until freqs.length).foreach { i => 71 | freqs.update(i, (freqs(i) + 1) / (counter(token) + freqs.length)) 72 | } 73 | } 74 | 75 | WordDictionary(emissions, tags.size) 76 | } 77 | } -------------------------------------------------------------------------------- /src/main/test-space.sc: -------------------------------------------------------------------------------- 1 | import java.io.File 2 | import java.net.{URL, URLClassLoader} 3 | 4 | import org.apache.commons.io.IOUtils 5 | import org.springframework.core.io.ClassPathResource 6 | 7 | def addPath(s: String){ 8 | val f = new File(s) 9 | println(f.exists()) 10 | val u = f.toURI() 11 | val urlClassLoader = ClassLoader.getSystemClassLoader().asInstanceOf[URLClassLoader] 12 | val urlClass = classOf[URLClassLoader] 13 | val method = urlClass.getDeclaredMethod("addURL", classOf[URL]) 14 | method.setAccessible(true) 15 | method.invoke(urlClassLoader, u.toURL()) 16 | } 17 | 18 | addPath("/Users/tombocklisch/Documents/Studium/ANLP/deep-nlp-scala/src/main/resources") 19 | 20 | import java.io.File 21 | 22 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 23 | import org.apache.commons.math3.random.MersenneTwister 24 | import org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator 25 | import org.deeplearning4j.distributions.Distributions 26 | import org.deeplearning4j.eval.Evaluation 27 | import org.deeplearning4j.models.featuredetectors.rbm.RBM 28 | import org.deeplearning4j.models.word2vec.Word2Vec 29 | import org.deeplearning4j.nn.conf.{MultiLayerConfiguration, NeuralNetConfiguration} 30 | import org.deeplearning4j.nn.multilayer.MultiLayerNetwork 31 | import org.deeplearning4j.nn.weights.WeightInit 32 | import org.deeplearning4j.text.inputsanitation.InputHomogenization 33 | import org.deeplearning4j.text.sentenceiterator.{SentencePreProcessor, FileSentenceIterator} 34 | import org.deeplearning4j.text.tokenization.tokenizerfactory.UimaTokenizerFactory 35 | import org.nd4j.linalg.api.activation.Activations 36 | import org.nd4j.linalg.lossfunctions.LossFunctions 37 | 38 | println("running") 39 | val sample = "This is a sample text." 
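// Hypothetical sanity check for the Stanford tagger created below: the sample sentence
// could be tagged in a single call, e.g. println(tagged.tagString(sample)), assuming
// MaxentTagger#tagString is available in the CoreNLP version used here (3.4.1).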
40 | val tagged = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger") 41 | def reuters = { 42 | val reutersFile = "assets/reuters21578/" 43 | val file = new File(reutersFile) 44 | 45 | new FileSentenceIterator(new SentencePreProcessor() { 46 | 47 | override def preProcess(sentence: String): String = 48 | new InputHomogenization(sentence).transform() 49 | 50 | },file) 51 | 52 | } 53 | 54 | val iter = reuters 55 | val t = new UimaTokenizerFactory() 56 | val vec = new Word2Vec.Builder() 57 | .windowSize(5) 58 | .layerSize(300) 59 | .iterate(iter) 60 | .tokenizerFactory(t) 61 | .build() 62 | 63 | vec.fit() 64 | 65 | val oil = "oil" 66 | 67 | printf("%f\n", vec.similarity(oil, oil)) 68 | 69 | printf("%f\n", vec.similarity(oil, "fish")); 70 | 71 | 72 | def deep() = { 73 | val gen = new MersenneTwister(123); 74 | val conf = new NeuralNetConfiguration.Builder() 75 | .hiddenUnit(RBM.HiddenUnit.RECTIFIED) 76 | .momentum(5e-1f) //this expresses decimals as floats. Remember e? 77 | .visibleUnit(RBM.VisibleUnit.GAUSSIAN) 78 | .regularization(true) 79 | .dist(Distributions.uniform(gen)) 80 | .activationFunction(Activations.tanh()) 81 | .iterations(10000) 82 | .weightInit(WeightInit.DISTRIBUTION) 83 | .lossFunction(LossFunctions.LossFunction.RECONSTRUCTION_CROSSENTROPY) 84 | .rng(gen) 85 | .learningRate(1e-3f) 86 | .nIn(4) 87 | .nOut(3) 88 | .build() 89 | val d = new MultiLayerNetwork(conf.asInstanceOf[MultiLayerConfiguration]) 90 | val iter = new IrisDataSetIterator(150, 150); 91 | val next = iter.next(); 92 | next.normalizeZeroMeanZeroUnitVariance(); 93 | next.shuffle(); 94 | val testAndTrain = next.splitTestAndTrain(110); 95 | val train = testAndTrain.getTrain(); 96 | d.fit(train); 97 | val test = testAndTrain.getTest(); 98 | 99 | val eval = new Evaluation(); 100 | val output = d.output(test.getFeatureMatrix()); 101 | eval.eval(test.getLabels(),output); 102 | println("Score " + eval.stats()); 103 | } --------------------------------------------------------------------------------