├── .gitignore ├── README.md ├── build.sbt ├── project └── build.properties └── src └── main ├── java └── de │ └── hpi │ └── WindowConverter.java ├── loading.sc ├── resources ├── application.conf ├── log4j.properties ├── logback.xml └── stopwords ├── scala └── de │ └── hpi │ └── anlp │ ├── Main.scala │ ├── POSEvaluator.scala │ ├── POSHMM.scala │ ├── POSMLP.scala │ ├── POSRBM.scala │ ├── conll │ ├── ConLLDataSet.scala │ └── ConLLFile.scala │ ├── hmm │ ├── ConstantSmoothedHMM.scala │ ├── HMM.scala │ └── TrainedHMM.scala │ ├── mlp │ ├── MLP.scala │ ├── MLPConfig.scala │ ├── MLPConnection.scala │ ├── MLPLayer.scala │ ├── MLPModel.scala │ └── MLPTasks.scala │ ├── nnpos │ └── POSMLP.scala │ └── utils │ ├── FileUtils.scala │ ├── LCCFileReader.scala │ ├── Model.scala │ ├── ScalaMLTypes.scala │ ├── SentenceUtils.scala │ ├── TagDictionary.scala │ ├── Word2VecDataSetIterator.scala │ └── WordDictionary.scala └── test-space.sc /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | output/ 15 | project-ivy-repo/ 16 | word2vec-index/ 17 | .idea/ 18 | 19 | # Scala-IDE specific 20 | .scala_dependencies 21 | .worksheet 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deep-nlp-scala 2 | Using deep learning to POS tag sentences with Scala + DL4J. 3 | 4 | This is a showcase repository intended to evaluate different algorithms on the task of POS tagging German sentences. There is a multilayer perceptron (from scratch), a hidden Markov model (from scratch) and an RBM-based deep network (built on DL4J) implementation. 5 | 6 | ## Installation 7 | To execute the project, make sure sbt 0.13 and Java >= 1.6 are installed. 8 | 9 | Information about how to install sbt on your system can be found at http://www.scala-sbt.org/release/tutorial/Setup.html 10 | 11 | ## Assets 12 | Assets need to be placed into the assets/ directory. 13 | 14 | There should be labeled training data: de-train.tt, de-test.tt and de-eval.tt. 15 | 16 | There should also be unlabeled training data for the word2vec training in the folder assets/deu_news_2010_1M-text. The 17 | data set can be downloaded from http://corpora.uni-leipzig.de/download.html 18 | 19 | 20 | ## Running 21 | Running the different methods requires passing either 'mlp', 'rbm', 'hmm' or 'hmm-s' to the executable. Make sure to allow the JVM to use as much memory as possible (e.g. using "-Xmx7G").
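For example, one way to combine the memory flag with the model argument, assuming the standard sbt launcher script (which honors the SBT_OPTS environment variable) — treat this as a sketch and adapt the flags to your setup:

```
SBT_OPTS="-Xmx7G" sbt "run mlp"
```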
22 | 23 | The program can be started using "sbt run mlp" 24 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "deep-nlp-scala" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies ++= List( 8 | "log4j" % "log4j" % "1.2.15" exclude("javax.jms", "jms"), 9 | "org.deeplearning4j" % "deeplearning4j-core" % "0.0.3.3.2.alpha1", 10 | "org.deeplearning4j" % "deeplearning4j-nlp" % "0.0.3.3.2.alpha1", 11 | "org.nd4j" % "canova-parent" % "0.0.0.1", 12 | "org.nd4j" % "nd4j-api" % "0.0.3.5.5.2", 13 | "org.nd4j" % "nd4j-netlib-blas" % "0.0.3.5.5.2", 14 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1", // 3.4.1 last version with java 7 support 15 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1" classifier "models" 16 | ) 17 | 18 | resolvers ++= Seq( 19 | "JBoss repository" at "https://repository.jboss.org/nexus/content/groups/public", 20 | Resolver.mavenLocal, 21 | Resolver.file("project-ivy-repo", file("project-ivy-repo")) 22 | ) 23 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.5 -------------------------------------------------------------------------------- /src/main/java/de/hpi/WindowConverter.java: -------------------------------------------------------------------------------- 1 | package de.hpi; 2 | 3 | import org.deeplearning4j.models.word2vec.Word2Vec; 4 | import org.deeplearning4j.text.movingwindow.Window; 5 | import org.nd4j.linalg.api.ndarray.INDArray; 6 | import org.nd4j.linalg.factory.Nd4j; 7 | 8 | import java.util.List; 9 | 10 | public class WindowConverter { 11 | public static double[] asExample(Window window,Word2Vec vec) { 12 | int length = vec.lookupTable().layerSize(); 13 | List words = window.getWords(); 14 | int windowSize = window.getWindowSize(); 15 | 16 | double[] example = new double[ length * windowSize]; 17 | int count = 0; 18 | for(int i = 0; i < words.size(); i++) { 19 | String word = words.get(i); 20 | INDArray n = vec.getWordVectorMatrixNormalized(word); 21 | INDArray vec2 = n == null ? 
vec.getWordVectorMatrix(Word2Vec.UNK) : vec.getWordVectorMatrix(word); 22 | if(vec2 == null) 23 | vec2 = vec.getWordVectorMatrix(Word2Vec.UNK); 24 | for(int j = 0; j < vec2.length(); j++) { 25 | example[count++] = vec2.getDouble(j); 26 | } 27 | 28 | 29 | } 30 | 31 | return example; 32 | } 33 | 34 | public static INDArray asExampleMatrix(Window window,Word2Vec vec) { 35 | return Nd4j.create(asExample(window, vec)); 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/loading.sc: -------------------------------------------------------------------------------- 1 | import java.io.FileReader 2 | 3 | import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} 4 | import edu.stanford.nlp.sequences.{SeqClassifierFlags, CoNLLDocumentReaderAndWriter} 5 | 6 | val conllreader = { 7 | val r = new CoNLLDocumentReaderAndWriter() 8 | r.init(new SeqClassifierFlags()) 9 | r 10 | } 11 | val it = conllreader.getIterator(new FileReader("/Users/tombocklisch/Documents/Studium/ANLP/deep-nlp-scala/assets/de-train.tt")) 12 | var numDocs = 0 13 | var numTokens = 0 14 | var lastAnsBase = "" 15 | var numEntities = 0 16 | while (it.hasNext) { 17 | val doc = it.next() 18 | numDocs += 1 19 | import scala.collection.JavaConversions._ 20 | for (fl <- doc) { 21 | if (fl.word != "XX") { 22 | val ans: String = fl.get(classOf[CoreAnnotations.AnswerAnnotation]) 23 | var ansBase: String = null 24 | var ansPrefix: String = null 25 | val bits: Array[String] = ans.split("-") 26 | if (bits.length == 1) { 27 | ansBase = bits(0) 28 | ansPrefix = "" 29 | } 30 | else { 31 | ansBase = bits(1) 32 | ansPrefix = bits(0) 33 | } 34 | numTokens += 1 35 | if (!(ansBase == "O")) { 36 | if (ansBase == lastAnsBase) { 37 | if (ansPrefix == "B") { 38 | numEntities += 1 39 | } 40 | } 41 | else { 42 | numEntities += 1 43 | } 44 | } 45 | } 46 | } 47 | } -------------------------------------------------------------------------------- /src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | akka { 2 | 3 | extensions = ["akka.contrib.pattern.ClusterReceptionistExtension","akka.contrib.pattern.DistributedPubSubExtension"] 4 | loggers = ["akka.event.slf4j.Slf4jLogger"] 5 | loglevel = "INFO" 6 | 7 | actor { 8 | provider = "akka.cluster.ClusterActorRefProvider" 9 | serialize-messages = off 10 | serialize-creators = off 11 | 12 | worker-dispatcher { 13 | type = Dispatcher 14 | mailbox-capacity = 3000 15 | mailbox-push-timeout-time = 120s 16 | } 17 | 18 | deployment { 19 | 20 | 21 | serializers { 22 | java = "akka.serialization.JavaSerializer" 23 | proto = "akka.remote.serialization.ProtobufSerializer" 24 | } 25 | 26 | 27 | 28 | } 29 | 30 | } 31 | 32 | remote { 33 | transport = "akka.remote.netty.NettyRemoteTransport" 34 | log-remote-lifecycle-events = off 35 | 36 | netty.tcp { 37 | hostname = "localhost" 38 | port = 0 39 | maximum-frame-size = 99999999999b 40 | } 41 | 42 | transport-failure-detector { 43 | heartbeat-interval = 120 s 44 | acceptable-heartbeat-pause = 60 s 45 | } 46 | } 47 | 48 | cluster { 49 | failure-detector { 50 | threshold = 12 51 | acceptable-heartbeat-pause = 120s 52 | heartbeat-interval = 5s 53 | heartbeat-request { 54 | expected-response-after = 120s 55 | } 56 | } 57 | jmx.enabled = on 58 | enabled = on 59 | allow-local-routees = off 60 | 61 | auto-down-unreachable-after = off 62 | } 63 | } -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: 
-------------------------------------------------------------------------------- 1 | log4j.rootLogger=ERROR, Console 2 | log4j.logger.play=DEBUG 3 | log4j.appender.Console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.Console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.Console.layout.ConversionPattern=%d{ABSOLUTE} %-5p ~ %m%n 6 | 7 | log4j.appender.org.springframework=DEBUG 8 | log4j.appender.org.deeplearning4j=INFO 9 | log4j.appender.opennlp.uima=OFF 10 | log4j.appender.org.apache.uima=OFF 11 | log4j.appender.org.cleartk=OFF 12 | 13 | log4j.logger.org.springframework=INFO 14 | log4j.logger.org.deeplearning4j=INFO 15 | log4j.logger.opennlp.uima.util=OFF 16 | log4j.logger.org.apache.uima=OFF 17 | log4j.logger.org.cleartk=OFF -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | logs/application.log 4 | 5 | %date - [%level] - from %logger in %thread 6 | %n%message%n%xException%n 7 | 8 | 9 | 10 | 11 | 12 | %logger{15} - %message%n%xException{5} 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/main/resources/stopwords: -------------------------------------------------------------------------------- 1 | a 2 | ----s 3 | act 4 | "the 5 | "The 6 | about 7 | above 8 | after 9 | again 10 | against 11 | all 12 | am 13 | an 14 | and 15 | any 16 | are 17 | aren't 18 | as 19 | at 20 | be 21 | because 22 | been 23 | before 24 | being 25 | below 26 | between 27 | both 28 | but 29 | by 30 | can't 31 | cannot 32 | could 33 | couldn't 34 | did 35 | didn't 36 | do 37 | does 38 | doesn't 39 | doing 40 | don't 41 | down 42 | during 43 | each 44 | few 45 | for 46 | from 47 | further 48 | had 49 | hadn't 50 | has 51 | hasn't 52 | have 53 | haven't 54 | having 55 | he 56 | he'd 57 | he'll 58 | he's 59 | her 60 | here 61 | here's 62 | hers 63 | herself 64 | him 65 | himself 66 | his 67 | how 68 | how's 69 | i 70 | i'd 71 | i'll 72 | i'm 73 | i've 74 | if 75 | in 76 | into 77 | is 78 | isn't 79 | it 80 | it's 81 | its 82 | itself 83 | let's 84 | me 85 | more 86 | most 87 | mustn't 88 | my 89 | myself 90 | no 91 | nor 92 | not 93 | of 94 | off 95 | on 96 | once 97 | only 98 | or 99 | other 100 | ought 101 | our 102 | ours 103 | ourselves 104 | out 105 | over 106 | own 107 | put 108 | same 109 | shan't 110 | she 111 | she'd 112 | she'll 113 | she's 114 | should 115 | somebody 116 | something 117 | shouldn't 118 | so 119 | some 120 | such 121 | take 122 | than 123 | that 124 | that's 125 | the 126 | their 127 | theirs 128 | them 129 | themselves 130 | then 131 | there 132 | there's 133 | these 134 | they 135 | they'd 136 | they'll 137 | they're 138 | they've 139 | this 140 | those 141 | through 142 | to 143 | too 144 | under 145 | until 146 | up 147 | very 148 | was 149 | wasn't 150 | we 151 | we'd 152 | we'll 153 | we're 154 | we've 155 | were 156 | weren't 157 | what 158 | what's 159 | when 160 | when's 161 | where 162 | where's 163 | which 164 | while 165 | who 166 | who's 167 | whom 168 | why 169 | why's 170 | will 171 | with 172 | without 173 | won't 174 | would 175 | wouldn't 176 | you 177 | you'd 178 | you'll 179 | you're 180 | you've 181 | your 182 | yours 183 | yourself 184 | yourselves 185 | . 186 | ? 187 | ! 
188 | , 189 | + 190 | = 191 | also 192 | - 193 | 194 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/Main.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import de.hpi.anlp.conll.ConLLFileReader 4 | 5 | /** 6 | * Main console application 7 | */ 8 | object Main extends App { 9 | 10 | // Tags to use for POS tagging 11 | val states = List("NOUN", "ADV", "PRT", ".", "ADP", "DET", "PRON", "VERB", "X", "NUM", "CONJ", "ADJ") 12 | 13 | // Training documents 14 | val trainDocuments = new ConLLFileReader("assets/de-train.tt") 15 | 16 | // Test documents 17 | val testDocuments = new ConLLFileReader("assets/de-eval.tt") 18 | 19 | // Let's have a look at which arguments were passed in 20 | println("ARGS: " + args.mkString(" , ")) 21 | 22 | // Call the appropriate training according to the passed argument 23 | args.headOption match { 24 | case Some("mlp") => 25 | trainMLP() 26 | case Some("rbm") => 27 | trainRBM() 28 | case Some("hmm-s") => 29 | trainHMM(smoothed = true) 30 | case Some("hmm") => 31 | trainHMM(smoothed = false) 32 | case _ => 33 | throw new Exception("You need to specify the model to train. One of 'mlp', 'rbm', 'hmm-s', 'hmm'") 34 | } 35 | 36 | def trainHMM(smoothed: Boolean): Unit = { 37 | val hmm = POSHMM.train(trainDocuments, states, smoothed) 38 | 39 | println("\n--- Evaluation of: HMM.smoothed=" + smoothed) 40 | POSHMM.evaluate(hmm, testDocuments, states).printEvaluation() 41 | println("---") 42 | } 43 | 44 | def trainMLP() = { 45 | val mlp = POSMLP.train(trainDocuments, states) 46 | 47 | println("\n--- Evaluation of MLP ...") 48 | POSMLP.evaluate(mlp, testDocuments, states).printEvaluation() 49 | println("---") 50 | } 51 | 52 | def trainRBM() = { 53 | val (network, vec) = POSRBM.train(trainDocuments, states) 54 | 55 | println("\n--- Evaluation of RBM ...") 56 | POSRBM.evaluate(network, testDocuments, vec, states).printEvaluation() 57 | println("---") 58 | } 59 | } -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/POSEvaluator.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import java.util.Locale 4 | import scala.collection.breakOut 5 | import scala.collection.mutable 6 | 7 | /** 8 | * Given the guessed tags of a trained model and a data set's gold standard, this class calculates tag-based precision, 9 | recall and F1 score as well as overall accuracy 10 | */ 11 | class POSEvaluator(tags: List[String]) { 12 | /** 13 | * Underlying counter for occurrences 14 | */ 15 | val tagCounter: Map[String, mutable.Map[String, Int]] = tags.map { tag => 16 | tag -> mutable.HashMap("system" -> 0, "gold" -> 0, "both" -> 0) 17 | }(breakOut) 18 | 19 | /** 20 | * Add another (guessed, gold) annotation pair to the evaluator 21 | */ 22 | def add(tagged: String, gold: String): Unit = { 23 | tagCounter(tagged)("system") += 1 24 | tagCounter(gold)("gold") += 1 25 | if (tagged == gold) 26 | tagCounter(gold)("both") += 1 27 | } 28 | 29 | def add(tagged: Seq[String], gold: Seq[String]): Unit = { 30 | tagged.zip(gold).map { 31 | case (systemTag, goldTag) => 32 | add(systemTag, goldTag) 33 | } 34 | } 35 | 36 | /** 37 | * Prints the evaluation to standard out 38 | */ 39 | def printEvaluation(): Unit = { 40 | val overall = tagCounter.values.map(_("system")).sum 41 | val correct = tagCounter.values.map(_("both")).sum 42 | 43 | println("%5s, %6s, %6s, %6s".format("", "Prec",
"Rec", "F1")) 44 | 45 | tagCounter.foreach { 46 | case (tag, counts) => 47 | val p = precision(counts) 48 | val r = recall(counts) 49 | val f1Score = f1(p, r) 50 | println("%5s, %.4f, %.4f, %.4f".formatLocal(Locale.ENGLISH, tag, p, r, f1Score)) 51 | } 52 | 53 | println("\nAccuracy: %.4f".format(correct.toDouble / overall)) 54 | } 55 | 56 | def precision(counts: mutable.Map[String, Int]) = { 57 | if (counts("system") == 0) 58 | Double.NaN 59 | else 60 | counts("both").toDouble / counts("system") 61 | } 62 | 63 | def recall(counts: mutable.Map[String, Int]) = { 64 | if (counts("gold") == 0) 65 | Double.NaN 66 | else 67 | counts("both").toDouble / counts("gold") 68 | } 69 | 70 | def f1(precision: Double, recall: Double) = { 71 | if (precision + recall == 0 || precision + recall == Double.NaN) 72 | Double.NaN 73 | else 74 | 2 * precision * recall / (precision + recall) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/POSHMM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import de.hpi.anlp.conll.AnnotatedToken 4 | import de.hpi.anlp.hmm.{TrainedHMM, ConstantSmoothedHMM, HMM} 5 | 6 | /** 7 | * Configure, train and evaluate a HMM based model* 8 | */ 9 | object POSHMM { 10 | 11 | /** 12 | * Train a new HMM model. The model can either use smoothing or not 13 | */ 14 | def train(trainDocuments: Iterable[List[AnnotatedToken]], states: List[String], smoothed: Boolean) = { 15 | val hmm = 16 | if (smoothed) 17 | new ConstantSmoothedHMM(states, smoothingConstant = 1) 18 | else 19 | new HMM(states) 20 | 21 | hmm.train(trainDocuments) 22 | } 23 | 24 | /** 25 | * Evaluate a given HMM on a test data set and its gold standart 26 | */ 27 | def evaluate(hmm: TrainedHMM, testDocuments: Iterable[List[AnnotatedToken]], states: List[String]) = { 28 | val evaluator = new POSEvaluator(states) 29 | 30 | testDocuments.foreach { sentence => 31 | val unannotated = sentence.map(_.token) 32 | val (prob, tags) = hmm.mostProbablePath(unannotated) 33 | evaluator.add(tagged = tags, gold = sentence.map(_.tag)) 34 | } 35 | 36 | evaluator 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/POSMLP.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import de.hpi.anlp.mlp.MLPConfig 4 | import de.hpi.anlp.conll.AnnotatedToken 5 | import de.hpi.anlp.nnpos.POSMLPModel 6 | 7 | /** 8 | * Configure, train and evaluate a MLP model 9 | */ 10 | object POSMLP { 11 | 12 | /** 13 | * Train a new MLP model using the given training data and states. The configuration can be adjusted in this function. 
14 | */ 15 | def train(trainDocuments: Iterable[List[AnnotatedToken]], states: List[String]) = { 16 | val NUM_EPOCHS = 1000 17 | val EPS = 0.00001 18 | val learningRate = 0.01 19 | val hiddenLayers = Array[Int]() 20 | val momentum = 0.1 21 | val activationF = (x: Double) => 1.0 / (1.0 + Math.exp(-0.8 * x)) 22 | 23 | val config = MLPConfig(momentum, learningRate, hiddenLayers, NUM_EPOCHS, EPS, activationF) 24 | 25 | POSMLPModel.fit(config, states, trainDocuments, preW = 2, postW = 2) 26 | } 27 | 28 | /** 29 | * Evaluate a given MLP model on the test data set and its gold standart 30 | */ 31 | def evaluate(mlp: POSMLPModel, testDocuments: Iterable[List[AnnotatedToken]], states: List[String]) = { 32 | val evaluator = new POSEvaluator(states) 33 | 34 | testDocuments.foreach { sentence => 35 | val unannotated = sentence.map(_.token) 36 | val tags = mlp.output(unannotated) 37 | evaluator.add(tagged = tags, gold = sentence.map(_.tag)) 38 | } 39 | evaluator 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/POSRBM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp 2 | 3 | import java.io.File 4 | import de.hpi.anlp.conll.AnnotatedToken 5 | import de.hpi.anlp.utils.Word2VecDataSetIterator 6 | import org.apache.commons.math3.random.MersenneTwister 7 | import org.deeplearning4j.eval.Evaluation 8 | import org.deeplearning4j.models.featuredetectors.rbm.RBM 9 | import org.deeplearning4j.models.word2vec.Word2Vec 10 | import org.deeplearning4j.nn.api.OptimizationAlgorithm 11 | import org.deeplearning4j.nn.conf.NeuralNetConfiguration 12 | import org.deeplearning4j.nn.layers.factory.LayerFactories 13 | import org.deeplearning4j.nn.multilayer.MultiLayerNetwork 14 | import org.deeplearning4j.nn.weights.WeightInit 15 | import org.deeplearning4j.text.inputsanitation.InputHomogenization 16 | import org.deeplearning4j.text.sentenceiterator.{SentencePreProcessor, FileSentenceIterator} 17 | import org.deeplearning4j.text.tokenization.tokenizerfactory.UimaTokenizerFactory 18 | import org.deeplearning4j.util.SerializationUtils 19 | import org.nd4j.linalg.api.activation.Activations 20 | import org.nd4j.linalg.api.ndarray.INDArray 21 | import org.nd4j.linalg.lossfunctions.LossFunctions 22 | import org.nd4j.linalg.netlib.SimpleNetlibBlas 23 | 24 | /** 25 | * Configure, train and evaluate RBM based POS taggers 26 | */ 27 | object POSRBM { 28 | // Size of the unsupervised news corpus to use. 
Should be either 1M, 10K or 300K 29 | val vecTrainSize = "1M" 30 | 31 | // Window size to train on 32 | val windowSize = 5 33 | 34 | // Word vector size used during the word2vec training 35 | val wordVecLayers = 50 36 | 37 | /** 38 | * Load a word2vec model from disc 39 | */ 40 | def loadWordVectorModel() = { 41 | SerializationUtils.readObject(new File(s"output/word2vec_$vecTrainSize.model")).asInstanceOf[Word2Vec] 42 | } 43 | 44 | /** 45 | * Load a neural network from disc 46 | */ 47 | def loadNeuralNetwork(fileName: String) = { 48 | SerializationUtils.readObject(new File(fileName)).asInstanceOf[MultiLayerNetwork] 49 | } 50 | 51 | /** 52 | * Store a word2vec instance to disc for later retrieval 53 | */ 54 | def storeWordVectorModel(model: Word2Vec) = { 55 | SerializationUtils.saveObject(model, new File(s"output/word2vec_$vecTrainSize.model")); 56 | } 57 | 58 | /** 59 | * Instanciate a default sentence preprocessor and apply standart input homogenization 60 | */ 61 | private def sentencePreprocessor = new SentencePreProcessor() { 62 | val sentenceFileRx = "(?s)^[0-9]+\\s(.*)$" r 63 | 64 | override def preProcess(sentenceLine: String): String = { 65 | sentenceLine match { 66 | case sentenceFileRx(sentence) => 67 | new InputHomogenization(sentence).transform() 68 | case _ => 69 | throw new Exception("Invalid input line.") 70 | } 71 | } 72 | } 73 | 74 | /** 75 | * Train a new word2vec model on the given news corpus 76 | */ 77 | private def trainWordVectorModel() = { 78 | val file = new File(s"assets/deu_news_2010_$vecTrainSize-text/deu_news_2010_$vecTrainSize-sentences.txt") 79 | 80 | val sentenceIterator = new FileSentenceIterator(sentencePreprocessor, file) 81 | 82 | val t = new UimaTokenizerFactory() 83 | val vec = new Word2Vec.Builder() 84 | .minWordFrequency(5) 85 | .windowSize(windowSize) 86 | .layerSize(wordVecLayers) 87 | .iterate(sentenceIterator) 88 | .tokenizerFactory(t) 89 | .build() 90 | 91 | vec.fit() 92 | vec 93 | } 94 | 95 | /** 96 | * Train a RBM on the training data set either using an existing word2vec model or creating a new one. 
THis is the 97 | * place to configure the RBM 98 | */ 99 | def train(trainDocuments: Iterable[List[AnnotatedToken]], states: List[String]) = { 100 | val vec = trainWordVectorModel() 101 | storeWordVectorModel(vec) 102 | // val vec = loadWordVectorModel() 103 | 104 | println("Finished Word2Vec!") 105 | 106 | printf("Sim('fernsehen', 'familie') = %f\n", vec.similarity("fernsehen", "familie")); 107 | 108 | val fetcher = new Word2VecDataSetIterator(vec, trainDocuments, states, batch = 10) 109 | val gen = new MersenneTwister(123); 110 | 111 | val layerFactory = LayerFactories.getFactory(classOf[RBM]) 112 | val conf = new NeuralNetConfiguration.Builder() 113 | .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT) 114 | .iterations(100) 115 | .rng(gen) 116 | .weightInit(WeightInit.NORMALIZED) 117 | .learningRate(0.001f) 118 | .nIn(wordVecLayers * windowSize) 119 | .nOut(states.size) 120 | .lossFunction(LossFunctions.LossFunction.MCXENT) 121 | .visibleUnit(RBM.VisibleUnit.SOFTMAX) 122 | .hiddenUnit(RBM.HiddenUnit.RECTIFIED) 123 | .layerFactory(layerFactory) 124 | .list(2) 125 | .`override`(new NeuralNetConfiguration.ConfOverride() { 126 | override def `override`(i: Int, builder: NeuralNetConfiguration.Builder) { 127 | if (i == 1) { 128 | builder.weightInit(WeightInit.ZERO); 129 | builder.activationFunction(Activations.softMaxRows()); 130 | } 131 | } 132 | }) 133 | .hiddenLayerSizes(50) 134 | .build() 135 | 136 | val network = new MultiLayerNetwork(conf) 137 | 138 | println("Started fitting network...") 139 | 140 | network.fit(fetcher) 141 | 142 | println("Finished fitting Network!") 143 | 144 | SerializationUtils.saveObject(network, new File(s"output/network_$vecTrainSize.model6")) 145 | 146 | (network, vec) 147 | } 148 | 149 | 150 | private def labelForArray(a: INDArray, statesIndex: Map[Int, String]) = { 151 | val m = SimpleNetlibBlas.iamax(a) 152 | statesIndex(m) 153 | } 154 | 155 | /** 156 | * Evaluate a given RBM model on the test data set and its gold standard. Beside the implemented evaluation of 157 | * POSEvaluator this will also execute the model specific evaluation implemented in the dl4j library. 
158 | */ 159 | def evaluate(network: MultiLayerNetwork, testDocuments: Iterable[List[AnnotatedToken]], vec: Word2Vec, states: List[String]) = { 160 | 161 | println("Started evaluating Network!") 162 | 163 | val testData = new Word2VecDataSetIterator(vec, testDocuments, states, batch = 20000).next() 164 | val predicted = network.output(testData.getFeatureMatrix) 165 | 166 | val statesIndex = states.zipWithIndex.map { 167 | case (el, i) => i -> el 168 | }.toMap 169 | 170 | val evaluator = new POSEvaluator(states) 171 | val buildInEval = new Evaluation() 172 | 173 | val predictedLabels: Seq[String] = (0 until predicted.length()).map { i => 174 | val guessRow: INDArray = predicted.getRow(i) 175 | labelForArray(guessRow, statesIndex) 176 | } 177 | 178 | val goldLabels = (0 until testData.numExamples()).map { i => 179 | val currRow: INDArray = testData.getLabels.getRow(i) 180 | labelForArray(currRow, statesIndex) 181 | } 182 | 183 | evaluator.add(predictedLabels, goldLabels) 184 | buildInEval.eval(testData.getLabels, predicted) 185 | 186 | System.out.println(buildInEval.stats()) 187 | 188 | evaluator 189 | } 190 | 191 | } 192 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/conll/ConLLDataSet.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.conll 2 | 3 | import org.deeplearning4j.datasets.fetchers.BaseDataFetcher 4 | import org.deeplearning4j.models.word2vec.Word2Vec 5 | import org.deeplearning4j.text.movingwindow.{WindowConverter, Windows} 6 | import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory 7 | import org.nd4j.linalg.dataset.DataSet 8 | import org.nd4j.linalg.util.FeatureUtil 9 | 10 | import scala.collection.JavaConversions._ 11 | 12 | /** 13 | * Data fetcher to enumerate word vectors from ConLL files 14 | */ 15 | class ConLLWordVectorDataFetcher(val vec: Word2Vec, val labels: List[String], val conLLFile: ConLLFileReader) extends BaseDataFetcher { 16 | 17 | // Iterator over the files contents 18 | val iter = conLLFile.iterator 19 | 20 | // Label index 21 | val labelIdx = labels.zipWithIndex.toMap 22 | 23 | // Tokenizer to improve tokens 24 | val factory = new DefaultTokenizerFactory() 25 | 26 | // If the requested number of examples doesn't align with the tokens in a sentence we need to save left over tokens 27 | var leftOver = Vector.empty[DataSet] 28 | 29 | /** 30 | * Fetch the next numExample tokens 31 | * @param numExamples Number of tokens 32 | */ 33 | override def fetch(numExamples: Int): Unit = { 34 | 35 | if (leftOver.size >= numExamples) { 36 | curr = DataSet.merge(leftOver.take(numExamples)) 37 | leftOver = leftOver.drop(numExamples) 38 | cursor += curr.numExamples() 39 | } else if (!iter.hasNext) { 40 | if (!leftOver.isEmpty) { 41 | curr = DataSet.merge(leftOver) 42 | leftOver = Vector.empty 43 | cursor += curr.numExamples() 44 | } 45 | } else { 46 | val list = iter.take(numExamples).flatMap { example => 47 | val words = example.map(_.token) 48 | val labels = example.map(_.tag) 49 | Windows.windows(words, vec.getWindow()).zip(labels).map { 50 | case (window, label) => 51 | val wordVector = WindowConverter.asExampleArray(window, vec, false) 52 | val labelVector = FeatureUtil.toOutcomeVector(labelIdx(label), labels.size) 53 | new DataSet(wordVector, labelVector) 54 | } 55 | } 56 | 57 | val merge = (list ++ leftOver).take(numExamples).toList 58 | 59 | curr = DataSet.merge(merge) 60 | cursor += curr.numExamples() 61 | 62 | if 
(list.hasNext) 63 | leftOver ++= list 64 | } 65 | } 66 | 67 | override def inputColumns() = 68 | vec.lookupTable().layerSize() * vec.getWindow() 69 | 70 | override def totalOutcomes() = 71 | labels.size 72 | 73 | override def hasMore() = 74 | iter.hasNext || leftOver.size > 0 75 | } 76 | 77 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/conll/ConLLFile.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.conll 2 | 3 | import java.io.{File, PrintWriter} 4 | import scala.io.Source 5 | 6 | /** 7 | * A token and its tag 8 | */ 9 | case class AnnotatedToken(token: String, tag: String) 10 | 11 | /** 12 | * Writer helper to write a sentence and its annotated tags into a ConLL file format. This allows external evaluators 13 | * based on that format to read the output 14 | * 15 | */ 16 | class ConLLFileWriter(fileName: String) { 17 | var openedWriter: Option[PrintWriter] = Some(new PrintWriter(new File(fileName))) 18 | 19 | /** 20 | * Write sentence and its tag to file. One token and tag per line 21 | */ 22 | def write(sentence: Seq[String], annotations: Seq[String]): Boolean = { 23 | openedWriter.map { writer => 24 | sentence.zip(annotations).map { 25 | case (word, annotation) => 26 | writer.println(word + "\t" + annotation) 27 | } 28 | writer.println("") // add an empty line to complete the sentence 29 | 30 | true 31 | } getOrElse false 32 | } 33 | 34 | def close() = { 35 | openedWriter.map { writer => 36 | writer.flush() 37 | writer.close() 38 | } 39 | openedWriter = None 40 | } 41 | } 42 | 43 | /** 44 | * Helper class to iterate through a ConLL data set file. 45 | */ 46 | class ConLLFileReader(fileName: String) extends Iterable[List[AnnotatedToken]] { 47 | override def iterator = new Iterator[List[AnnotatedToken]] { 48 | val lineIt = Source.fromFile(fileName).getLines 49 | var nextVal = readNextFromInput() 50 | var readLines = 0 51 | 52 | override def hasNext: Boolean = nextVal.isDefined 53 | 54 | override def next(): List[AnnotatedToken] = { 55 | nextVal match { 56 | case Some(value) => 57 | nextVal = readNextFromInput() 58 | value 59 | case _ => 60 | throw new NoSuchElementException("next on empty iterator") 61 | } 62 | } 63 | 64 | private def readNextFromInput(): Option[List[AnnotatedToken]] = { 65 | val currentTokens = lineIt.takeWhile(_.trim != "") 66 | if (currentTokens.isEmpty) 67 | None 68 | else { 69 | val cs = currentTokens.toList 70 | val annotated = cs.map { current => 71 | readLines += 1 72 | current.split('\t') match { 73 | case Array(token, label) => 74 | AnnotatedToken(token, label) 75 | case _ => 76 | throw new Exception(s"Invalid line #$readLines in ConLL file. Line content: '$current'") 77 | } 78 | } 79 | readLines += 1 80 | Some(annotated) 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/hmm/ConstantSmoothedHMM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.hmm 2 | 3 | import scala.collection.mutable 4 | 5 | /** 6 | * Extension of the base HMM implementation to use constant smoothing. 
This is especially useful for words and tag word 7 | * combinations that were not seen during training 8 | */ 9 | class ConstantSmoothedHMM(states: List[String], n: Int = 2, smoothingConstant: Int = 1) extends HMM(states, n) { 10 | override def calculateStartProbabilities(starts: Array[Double]) = { 11 | val sum = starts.sum + states.size * smoothingConstant 12 | starts.map(startCount => (startCount + smoothingConstant)/ sum) 13 | } 14 | 15 | override def calculateTransitionProbabilities(transitions: Array[Double]) = { 16 | val sum = transitions.sum + states.size * smoothingConstant 17 | transitions.map(transitionCount => (transitionCount + smoothingConstant) / sum) 18 | } 19 | 20 | override def calculateEmissionProbabilities(emissions: Array[mutable.Map[String, Int]]) = { 21 | emissions.map { emissionsForTag => 22 | val sum = emissionsForTag.values.sum + states.size * smoothingConstant 23 | emissionsForTag.mapValues { tokenFreq => 24 | (tokenFreq + smoothingConstant).toDouble / sum 25 | }.toMap.withDefaultValue(smoothingConstant.toDouble / sum) 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/hmm/HMM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.hmm 2 | 3 | import de.hpi.anlp.conll.AnnotatedToken 4 | import scala.collection.mutable 5 | 6 | /** 7 | * HMM to be configured. After train is called a new trainedHMM instance is created 8 | */ 9 | class HMM(states: List[String], n: Int = 2) { 10 | val stateIdx = states.zipWithIndex.toMap 11 | 12 | /** 13 | * Train traverses the input data, collects statistics and calculates probabilities for hidden states and outputs. 14 | * Those can then be used in the HMM to predict tags for unseen sentences 15 | */ 16 | def train(annotatedData: Iterable[List[AnnotatedToken]]) = { 17 | val transitions = new Array[Double](math.pow(states.size, n).toInt) 18 | 19 | val emissions = Array.fill(states.size)(mutable.HashMap.empty[String, Int].withDefaultValue(0)) 20 | 21 | val starts = new Array[Double](states.size) 22 | 23 | annotatedData.foreach { annotated => 24 | annotated.headOption.map { first => 25 | val sIdx = stateIdx(first.tag) 26 | starts(sIdx) += 1 27 | } 28 | 29 | val idxs = annotated.map { 30 | case AnnotatedToken(token, tag) => 31 | val idx = stateIdx(tag) 32 | emissions(idx).update(token, emissions(idx)(token) + 1) 33 | idx 34 | } 35 | 36 | idxs.sliding(n, 1).foreach { window => 37 | if (window.size == n) { 38 | val idx = window.foldLeft(0)((p, s) => p * states.size + s) 39 | transitions(idx) = transitions(idx) + 1 40 | } 41 | } 42 | } 43 | 44 | val emissionProbs = calculateEmissionProbabilities(emissions) 45 | val transitionProbs = calculateTransitionProbabilities(transitions) 46 | val startProbs = calculateStartProbabilities(starts) 47 | new TrainedHMM(states, n, emissionProbs, transitionProbs, startProbs) 48 | } 49 | 50 | /** 51 | * Given the number of seen starts, calculate the start probabilities 52 | */ 53 | def calculateStartProbabilities(starts: Array[Double]) = { 54 | val sum = starts.sum 55 | starts.map(_ / sum) 56 | } 57 | 58 | /** 59 | * Given the transition statistics, calculate transition probabilities 60 | */ 61 | def calculateTransitionProbabilities(transitions: Array[Double]) = { 62 | val sum = transitions.sum 63 | transitions.map(_ / sum) 64 | } 65 | 66 | /** 67 | * Given emission statistics calculate emission probabilities for each hidden state 68 | */ 69 | def 
calculateEmissionProbabilities(emissions: Array[mutable.Map[String, Int]]) = { 70 | emissions.map { emissionsForTag => 71 | val sum = emissionsForTag.values.sum 72 | emissionsForTag.mapValues { tokenFreq => 73 | tokenFreq.toDouble / sum 74 | }.toMap.withDefaultValue(0.0) 75 | } 76 | } 77 | } 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/hmm/TrainedHMM.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.hmm 2 | 3 | import scala.collection.breakOut 4 | 5 | /** 6 | * A trained HMM which can be used to predict tags on a given sentence. Its a result of the training of a configured HMM 7 | */ 8 | case class TrainedHMM(states: List[String], 9 | n: Int, 10 | underlyingEmissionProbs: Array[Map[String, Double]], 11 | underlyingTransitionProbs: Array[Double], 12 | underlyingStartProbs: Array[Double]) { 13 | 14 | def incommingProbabilities(trellis: Vector[Array[Double]], sidx: Int, trellisLevel: Int, emissionPs: Array[Double]): Map[Int, Double] = { 15 | (0 until states.size).map { prevStateIdx => 16 | val probability = trellis(trellisLevel - 1)(prevStateIdx) + 17 | math.log(transitionProbability(prevStateIdx, sidx)) + 18 | math.log(emissionPs(sidx)) 19 | 20 | prevStateIdx -> probability 21 | }(breakOut) 22 | } 23 | 24 | /** 25 | * Viterbi implementation for graph traversal. Used to find most probable hidden states for the observed outputs. 26 | */ 27 | def viterbi(observations: List[String]) = { 28 | observations match { 29 | case Nil => 30 | 0.0 -> Vector.empty 31 | case firstObservation :: remainingObservations => 32 | var paths = states.toArray.map(state => Vector(state)) 33 | val initialNodeLevel = states.zipWithIndex.toArray.map { 34 | case (state, idx) => 35 | math.log(startProbability(idx)) + math.log(emissionProbability(idx, firstObservation)) 36 | } 37 | 38 | var trellis = Vector(initialNodeLevel) 39 | 40 | remainingObservations.zipWithIndex.map { 41 | case (observation, trellisLevel) => 42 | val nextNodeLevel = new Array[Double](states.size) 43 | val updatedPaths = new Array[Vector[String]](states.size) 44 | var emissionPs: Array[Double] = (0 until states.size).map { idx => 45 | emissionProbability(idx, observation) 46 | }(breakOut) 47 | 48 | if (emissionPs.forall(_ == 0)) 49 | emissionPs = Array.fill(states.size)(1.0) 50 | 51 | states.zipWithIndex.map { 52 | case (state, sidx) => 53 | val (bestState, bestProb) = incommingProbabilities(trellis, sidx, trellisLevel + 1, emissionPs).maxBy(_._2) 54 | nextNodeLevel.update(sidx, bestProb) 55 | updatedPaths.update(sidx, paths(bestState) :+ state) 56 | } 57 | 58 | trellis :+= nextNodeLevel 59 | paths = updatedPaths 60 | } 61 | 62 | val (bestProb, bestState) = trellis.last.zipWithIndex.maxBy(_._1) 63 | bestProb -> paths(bestState) 64 | } 65 | } 66 | 67 | def emissionProbability(sidx: Int, observation: String): Double = 68 | underlyingEmissionProbs(sidx)(observation) 69 | 70 | def transitionProbability(from: Int, to: Int): Double = 71 | underlyingTransitionProbs(from * states.size + to) 72 | 73 | def startProbability(sidx: Int): Double = 74 | underlyingStartProbs(sidx) 75 | 76 | def mostProbablePath(observations: List[String]) = viterbi(observations) 77 | } -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLP.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import 
scala.util.{Try, Success, Failure} 4 | import org.apache.log4j.Logger 5 | import de.hpi.anlp.utils.ScalaMLTypes._ 6 | 7 | class MLP( 8 | config: MLPConfig, 9 | xt: Array[Array[Double]], 10 | labels: DblMatrix) 11 | (implicit mlpObjective: MLPTask) { 12 | 13 | private val logger = Logger.getLogger("MLP") 14 | 15 | // Flag that indicates that the training converged toward a definite model 16 | private[this] var converged = false 17 | 18 | /** 19 | * Model for the Multi-layer Perceptron of type MLPModel 20 | */ 21 | val model: Option[MLPModel] = train match { 22 | case Success(_model) => 23 | Some(_model) 24 | case Failure(e) => 25 | logger.error("MLP.model ", e) 26 | None 27 | } 28 | 29 | /** 30 | * Test whether the model has converged 31 | */ 32 | final def hasConverged: Boolean = converged 33 | 34 | /** 35 | * Define the predictive function of the classifier or regression 36 | */ 37 | def output: PartialFunction[Array[Double], DblVector] = { 38 | case x: Array[Double] if (!x.isEmpty && model != None && x.size == xt(0).size) => { 39 | 40 | Try(model.get.getOutput(x)) match { 41 | case Success(y) => y 42 | case Failure(e) => { 43 | logger.error("MLP ", e) 44 | Array.empty[Double] 45 | } 46 | } 47 | } 48 | } 49 | 50 | 51 | /** 52 | * Computes the accuracy of the training session. The accuracy is estimated 53 | * as the percentage of the training data points for which the square root of 54 | * the sum of squares error, normalized by the size of the training set exceed a 55 | * predefined threshold 56 | */ 57 | final def accuracy(threshold: Double): Option[Double] = model.map(m => { 58 | 59 | // counts the number of data points for were correctly classified 60 | val nCorrects = xt.zip(labels) 61 | .foldLeft(0)((s, xtl) => { 62 | 63 | // Get the output layer for this input xt. 64 | val output = model.get.getOutput(xtl._1) 65 | 66 | // Compute the sum of squared error while excluding bias element 67 | val _sse = xtl._2.zip(output.drop(1)) 68 | .foldLeft(0.0)((err, tp) => { 69 | val diff = tp._1 - tp._2 70 | err + diff * diff 71 | }) * 0.5 72 | 73 | // Compute the least square error and adjusts it for the number of output variables. 
74 | val error = Math.sqrt(_sse) / (output.size - 1) 75 | if (error < threshold) s + 1 else s 76 | }) 77 | 78 | // returns the percentage of observations correctly classified 79 | nCorrects.toDouble / xt.size 80 | }) 81 | 82 | /** 83 | * Training method for the Multi-layer perceptron 84 | */ 85 | private def train: Try[MLPModel] = { 86 | Try { 87 | val _model = new MLPModel(config, xt(0).size, labels(0).size)(mlpObjective) 88 | 89 | // Scaling or normalization factor for the sum of the squared error 90 | val errScale = 1.0 / (labels(0).size * xt.size) 91 | 92 | // Apply the exit condition for this online training strategy 93 | // The convergence criteria selected is the reconstruction error 94 | // generated during an epoch adjusted to the scaling factor and compare 95 | // to the predefined criteria config.eps 96 | converged = Range(0, config.numEpochs).find(epoch => { 97 | val e = xt.toArray.zip(labels).foldLeft(0.0)((s, xtlbl) => 98 | s + _model.trainEpoch(xtlbl._1, xtlbl._2) 99 | ) * errScale 100 | if (epoch % 10 == 0) 101 | println("SSE: " + e) 102 | e < config.eps 103 | }) != None 104 | _model 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPConfig.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | /** 4 | * Configuration of MLP. If params are out of range an exception is thrown 5 | * @param momentum Momentum parameter used to adjust the value of the gradient of the weights 6 | * with previous value (smoothing) 7 | * @param learningRate Learning rate ]0, 1] used in the computation of the gradient of the weights 8 | * during training 9 | * @param hidLayers Sequence of number of neurons for the hidden layers 10 | * @param numEpochs Number of epochs or iterations allowed to train the weights/model 11 | * @param eps Convergence criteria used as exit condition of the convergence toward optimum 12 | * weights that minimize the sum of squared error 13 | * @param activation Activation function (sigmoid or tanh) that computes the output of hidden 14 | * layers during forward propagation 15 | * 16 | */ 17 | case class MLPConfig( 18 | momentum: Double, 19 | learningRate: Double, 20 | hidLayers: Array[Int], 21 | numEpochs: Int, 22 | eps: Double = 1e-17, 23 | activation: Double => Double) { 24 | 25 | /** 26 | * Id of output layer 27 | */ 28 | final def outLayerId: Int = 29 | if (hidLayers.isEmpty) 30 | 1 31 | else 32 | hidLayers.size + 1 33 | 34 | /** 35 | * # hidden layers in network 36 | */ 37 | def nHiddens = 38 | if (hidLayers.isEmpty) 39 | 0 40 | else 41 | hidLayers.size 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPConnection.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import de.hpi.anlp.utils.ScalaMLTypes.MLPTask 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Class that defines the connection between two consecutive (or sequential layers) 9 | * in a Multi-layer Perceptron. 
The connections is composed of all the synapses between 10 | * any neuron or variable of each layer.The Synapse is defined as a nested tuple(Double, Double) 11 | * tuple (weights, deltaWeights) 12 | */ 13 | class MLPConnection( 14 | config: MLPConfig, 15 | src: MLPLayer, 16 | dst: MLPLayer) 17 | (implicit mlpObjective: MLPTask) { 18 | 19 | private val BETA = 0.01 20 | 21 | /** 22 | * Synapse defined as a tuple of [weight, gradient(weights)] 23 | */ 24 | type MLPSynapse = (Double, Double) 25 | 26 | /* 27 | * Initialize the matrix (Array of Array) of Synapse by generating 28 | * a random value between 0 and BETA 29 | */ 30 | private[this] val synapses: Array[Array[MLPSynapse]] = Array.tabulate(dst.len)(n => 31 | if (n > 0) 32 | Array.fill(src.len)((Random.nextDouble * BETA, 0.0)) 33 | else 34 | Array.fill(src.len)((1.0, 0.0))) 35 | 36 | /** 37 | * Implement the forward propagation of input value. The output 38 | * value depends on the conversion selected for the output. If the output or destination 39 | * layer is a hidden layer, then the activation function is applied to the dot product of 40 | * weights and values. If the destination is the output layer, the output value is just 41 | * the dot product weights and values 42 | */ 43 | def connectionForwardPropagation: Unit = { 44 | // Iterates over all the synapsed except the first or bian selement 45 | val _output = synapses.drop(1).map(x => { 46 | // Compute the dot product 47 | val sum = x.zip(src.output).foldLeft(0.0)((s, xy) => s + xy._1._1 * xy._2) 48 | 49 | // Applies the activation function if this is a hidden layer (not output) 50 | if (!isOutLayer) config.activation(sum) else sum 51 | }) 52 | 53 | // Apply the objective function (SoftMax,...) to the output layer 54 | val out = if (isOutLayer) mlpObjective(_output) else _output 55 | out.copyToArray(dst.output, 1) 56 | } 57 | 58 | /** 59 | * Access the identifier for the source and destination layers 60 | */ 61 | @inline 62 | final def getLayerIds: (Int, Int) = (src.id, dst.id) 63 | 64 | @inline 65 | final def getSynapses: Array[Array[MLPSynapse]] = synapses 66 | 67 | /** 68 | * Implement the back propagation of output error (target - output). The method uses 69 | * the derivative of the logistic function to compute the delta value for the output of 70 | * the source layer 71 | */ 72 | def connectionBackpropagation: Unit = 73 | Range(1, src.len).foreach(i => { 74 | val err = Range(1, dst.len).foldLeft(0.0)((s, j) => 75 | s + synapses(j)(i)._1 * dst.delta(j)) 76 | 77 | // The delta value is computed as the derivative of the 78 | // output value adjusted for the back-propagated error, err 79 | src.delta(i) = src.output(i) * (1.0 - src.output(i)) * err 80 | }) 81 | 82 | 83 | /** 84 | * Implement the update of the synapse (weight, grad weight) following the 85 | * back propagation of output error. This method is called during training. 
86 | */ 87 | def connectionUpdate: Unit = 88 | // Iterates through all element of the destination layer except the bias element 89 | Range(1, dst.len).foreach(i => { 90 | val delta = dst.delta(i) 91 | 92 | // Compute all the synapses (weight, gradient weight) between 93 | // the destination elements (index i) and the source elements (index j) 94 | Range(0, src.len).foreach(j => { 95 | val _output = src.output(j) 96 | val oldSynapse = synapses(i)(j) 97 | // Compute the gradient with the delta 98 | val grad = config.learningRate * delta * _output 99 | // Apply the gradient adjustment formula 100 | val deltaWeight = grad + config.momentum * oldSynapse._2 101 | // Update the synapse 102 | synapses(i)(j) = (oldSynapse._1 + deltaWeight, grad) 103 | }) 104 | }) 105 | 106 | /** 107 | * Convenient method to update the values of a synapse while 108 | * maintaining immutability 109 | */ 110 | private def update(i: Int, j: Int, x: Double, dx: Double): Unit = { 111 | val old = synapses(i)(j) 112 | synapses(i)(j) = (old._1 + x, dx) 113 | } 114 | 115 | private def isOutLayer: Boolean = dst.id == config.outLayerId 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPLayer.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import de.hpi.anlp.utils.ScalaMLTypes.DblVector 4 | 5 | /** 6 | * A MLP layer is built using the input vector and add an extra element to account for the bias w0 7 | */ 8 | class MLPLayer(val id: Int, val len: Int) { 9 | 10 | /** 11 | * Values of the output vector 12 | */ 13 | val output = new DblVector(len) 14 | 15 | /** 16 | * Difference for the propagated error on the source or upstream 17 | */ 18 | val delta = new DblVector(len) 19 | output.update(0, 1.0) 20 | 21 | /** 22 | * Initialize the value of the input for this MLP layer 23 | */ 24 | def set(_x: DblVector): Unit = { 25 | _x.copyToArray(output, 1) 26 | } 27 | 28 | /** 29 | * Compute the sum of squared error of the elements of this MLP layer 30 | */ 31 | final def sse(labels: DblVector): Double = { 32 | var _sse = 0.0 33 | output.drop(1).zipWithIndex.foreach { 34 | case (on, idx) => { 35 | val err = labels(idx) - on 36 | delta.update(idx + 1, on * (1.0 - on) * err) 37 | _sse += err * err 38 | } 39 | } 40 | _sse * 0.5 // normalized C 41 | } 42 | 43 | /** 44 | * Is this layer the output layer 45 | */ 46 | final def isOutput(lastId: Int): Boolean = id == lastId 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPModel.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import de.hpi.anlp.utils.Model 4 | import de.hpi.anlp.utils.ScalaMLTypes.{DblVector, MLPTask} 5 | 6 | /** 7 | * MLP model represents a MLP configuration and instance. 
A MLP model consists of MLPLayer s (layer of the MLP model), 8 | * MLPSynapse s (connection between two elements) and MLPConnections (container of synapses of a layer) 9 | */ 10 | class MLPModel( 11 | config: MLPConfig, 12 | nInputs: Int, 13 | nOutputs: Int)( 14 | implicit mlpObjective: MLPTask) extends Model { 15 | 16 | val topology = 17 | if (config.nHiddens == 0) 18 | Array[Int](nInputs, nOutputs) // if no hidden layer is set, there is only an output layer 19 | else 20 | Array[Int](nInputs) ++ config.hidLayers ++ Array[Int](nOutputs) 21 | 22 | /* 23 | * Aarrays of layers for the topology 24 | */ 25 | val layers: Array[MLPLayer] = topology.zipWithIndex 26 | .map { 27 | case (t, idx) => 28 | new MLPLayer(idx, t + 1) 29 | } 30 | 31 | /* 32 | * Create a array of connection between layer. A connection is 33 | * made of multiple synapses. 34 | */ 35 | val connections = Range(0, layers.size - 1).map(n => 36 | new MLPConnection(config, layers(n), layers(n + 1))(mlpObjective)).toArray 37 | 38 | /** 39 | * Alias for the input or first layer in the network 40 | */ 41 | @inline 42 | def inLayer: MLPLayer = layers.head 43 | 44 | /** 45 | * Alias for the last layer (output layer) in the network 46 | */ 47 | @inline 48 | def outLayer: MLPLayer = layers.last 49 | 50 | /** 51 | * Training cycle: Forward propagation of input, back propagation of error and the re-computation of the weight and 52 | * gradient of the elements. 53 | */ 54 | def trainEpoch(x: DblVector, y: DblVector): Double = { 55 | // Initialize the input layer 56 | inLayer.set(x) 57 | // Apply the forward progapation of input to all the connections 58 | // starting with the input layer 59 | connections.foreach(_.connectionForwardPropagation) 60 | 61 | // Compute the sum of squared errors 62 | val _sse = sse(y) 63 | 64 | // Create a back iterator 65 | val bckIterator = connections.reverseIterator 66 | 67 | // Apply the error back propagation to all the connections 68 | // starting with the output lauer 69 | bckIterator.foreach(_.connectionBackpropagation) 70 | 71 | // Finally update the connections (weigths and grad weights) of synapses 72 | connections.foreach(_.connectionUpdate) 73 | _sse 74 | } 75 | 76 | 77 | /** 78 | * Compute the mean squares error for the network as the sum 79 | * of the mean squares error for each output value. 80 | */ 81 | @inline 82 | final def sse(label: DblVector): Double = 83 | outLayer.sse(label) 84 | 85 | /** 86 | * Compute the output values for the network using the forward propagation 87 | */ 88 | def getOutput(x: DblVector): DblVector = { 89 | inLayer.set(x) 90 | 91 | connections.foreach(_.connectionForwardPropagation) 92 | 93 | outLayer.output 94 | } 95 | 96 | /** 97 | * Write the content of this model (weights) into a file 98 | */ 99 | override def saveToFile: Boolean = { 100 | val content = new StringBuilder(s"$nInputs,") 101 | if (config.nHiddens != 0) 102 | content.append(config.hidLayers.mkString(",")) 103 | 104 | content.append(s"$nOutputs\n") 105 | connections.foreach(c => { 106 | content.append(s"${c.getLayerIds._1},${c.getLayerIds._2}:") 107 | content.append(c.getSynapses.map(s => s"${s.mkString(",")}\n")) 108 | }) 109 | write(content.toString) 110 | } 111 | 112 | /** 113 | * Textual description of the model for Multi-layer Perceptron. The representation 114 | * include the description of the connections and layers. 
115 | */ 116 | override def toString: String = { 117 | val buf = new StringBuilder 118 | connections.foreach(buf.append(_)) 119 | layers.foreach(buf.append(_)) 120 | buf.toString 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/mlp/MLPTasks.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.mlp 2 | 3 | import de.hpi.anlp.utils.ScalaMLTypes._ 4 | 5 | /** 6 | * Class for the Regression objective for the MLP. This implementation uses softmax 7 | */ 8 | object MLPTasks { 9 | def MLPMultiClassifier(y: DblVector): DblVector = { 10 | val softmaxValues = new DblVector(y.size) 11 | val expY = y.map(Math.exp(_)) 12 | val expYSum = expY.sum 13 | expY.map(_ / expYSum).copyToArray(softmaxValues, 1) 14 | softmaxValues 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/nnpos/POSMLP.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.nnpos 2 | 3 | import de.hpi.anlp.mlp.{MLP, MLPConfig, MLPTasks} 4 | import de.hpi.anlp.conll.AnnotatedToken 5 | import de.hpi.anlp.utils.{SentenceUtils, WordDictionary, TagDictionary} 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | case class POSMLPModel(mlp: MLP, tags: TagDictionary, dict: WordDictionary, preW: Int, postW: Int) { 9 | def output(sentence: List[String]): Seq[String] = { 10 | SentenceUtils.slidingWindow(sentence, preW, postW).map { window => 11 | val features = dict.words2vec(window) 12 | labelForArray(mlp.output(features)) 13 | }.toSeq 14 | } 15 | 16 | private def imax(a: Array[Double]): Int = { 17 | var mv: Option[Double] = None 18 | var mi: Option[Int] = None 19 | var i = 1 20 | while (i < a.size) { 21 | if (mv.isEmpty || a(i) > mv.get) { 22 | mi = Some(i) 23 | mv = Some(a(i)) 24 | } 25 | i += 1 26 | } 27 | mi getOrElse 0 28 | } 29 | 30 | private def labelForArray(a: Array[Double]): String = { 31 | tags.revIdx(imax(a) - 1) 32 | } 33 | } 34 | 35 | object POSMLPModel { 36 | 37 | private def calculateXY(annotatedData: Iterable[List[AnnotatedToken]], preW: Int, postW: Int, dict: WordDictionary, tags: TagDictionary) = { 38 | val X = new ArrayBuffer[Array[Double]]() 39 | val y = new ArrayBuffer[Array[Double]]() 40 | 41 | annotatedData.foreach { annotated => 42 | SentenceUtils.slidingWindow(annotated.view.map(_.token), preW, postW).foreach { window => 43 | if (window.size == preW + 1 + postW) { 44 | X.append(dict.words2vec(window)) 45 | } 46 | } 47 | 48 | annotated.foreach { 49 | case AnnotatedToken(_, tag) => 50 | y.append(tags.tag2vec(tag)) 51 | } 52 | } 53 | (X.toArray, y.toArray) 54 | } 55 | 56 | def fit(mlpCfg: MLPConfig, states: List[String], annotatedData: Iterable[List[AnnotatedToken]], preW: Int = 2, postW: Int = 2) = { 57 | 58 | val tags = TagDictionary(states) 59 | val dict = WordDictionary.build(tags, annotatedData) 60 | println("Finished creation word dictionary. Size: " + dict.underlying.size) 61 | val (features, labels) = calculateXY(annotatedData, preW, postW, dict, tags) 62 | println("Finished calculating features. 
Size: " + features.size) 63 | 64 | val mlp = new MLP(mlpCfg, features, labels)(MLPTasks.MLPMultiClassifier _) 65 | 66 | POSMLPModel(mlp, tags, dict, preW, postW) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/FileUtils.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | import org.apache.log4j.Logger 4 | import scala.io.Source._ 5 | import scala.util.{Failure, Success, Try} 6 | 7 | /** 8 | * Read and write content from and to a file 9 | */ 10 | object FileUtils { 11 | private val logger = Logger.getLogger("FileUtils") 12 | 13 | /** 14 | * Read the content of a file as a String 15 | */ 16 | def read(toFile: String, className: String): Option[String] = 17 | Try(fromFile(toFile).mkString) match { 18 | case Success(content) => 19 | Some(content) 20 | case Failure(e) => 21 | logger.error(s"Reading $className failed. File $toFile", e) 22 | None 23 | } 24 | 25 | /** 26 | * Write the content into a file. The content is defined as a string. 27 | */ 28 | def write(content: String, pathName: String, className: String): Boolean = { 29 | import java.io.PrintWriter 30 | 31 | var printWriter: Option[PrintWriter] = None 32 | var status = false 33 | Try { 34 | printWriter = Some(new PrintWriter(pathName)) 35 | printWriter.map(_.write(content)) 36 | status = true 37 | } 38 | match { 39 | // Catch and display exception description and return false 40 | case Failure(e) => { 41 | logger.error(s"$className.write failed for $pathName", e) 42 | 43 | if (printWriter != None) { 44 | Try(printWriter.map(_.close)) match { 45 | case Success(res) => res 46 | case Failure(e) => 47 | logger.error(s"$className.write Failed for $pathName", e) 48 | } 49 | } 50 | } 51 | case Success(s) => {} 52 | } 53 | status 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/LCCFileReader.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | import scala.io.Source 4 | 5 | /** 6 | * File reader to process data sets from the NLP institute of the university of Leipzig 7 | */ 8 | class LCCFileReader(fileName: String) extends Iterable[String] { 9 | /** 10 | * Every sentence is preceeded by its id. We are going to strip the id since we are only interested in the sentence 11 | */ 12 | val sentenceFileRx = "(?s)^[0-9]+\\s(.*)$" r 13 | 14 | override def iterator = new Iterator[String] { 15 | val lineIt = Source.fromFile(fileName).getLines 16 | var nextVal = readNextFromInput() 17 | var readLines = 0 18 | 19 | override def hasNext: Boolean = nextVal.isDefined 20 | 21 | override def next(): String = { 22 | nextVal match { 23 | case Some(value) => 24 | nextVal = readNextFromInput() 25 | value 26 | case _ => 27 | throw new NoSuchElementException("next on empty iterator") 28 | } 29 | } 30 | 31 | private def readNextFromInput(): Option[String] = { 32 | if (lineIt.hasNext) { 33 | // Read next sentence and strip the id from it 34 | val current = lineIt.next() 35 | current match { 36 | case sentenceFileRx(sentence) => 37 | readLines += 1 38 | Some(sentence) 39 | case _ => 40 | throw new Exception(s"Invalid line #$readLines in LCC file. 
Line content: '$current'") 41 | } 42 | } 43 | else 44 | None 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/Model.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | trait Model { 4 | /** 5 | * Write the model parameters associated with this object into a file 6 | */ 7 | protected def write(content: String): Boolean = 8 | FileUtils.write(content, Model.RELATIVE_PATH + getClass.getSimpleName, getClass.getSimpleName) 9 | 10 | /** 11 | * This method has to be overridden for a model to be saved into a file 12 | */ 13 | def saveToFile: Boolean = 14 | false 15 | } 16 | 17 | object Model { 18 | private val RELATIVE_PATH = "models/" 19 | 20 | /** 21 | * Read the model parameters from the file named after the given `className` 22 | */ 23 | def read(className: String): Option[String] = 24 | FileUtils.read(RELATIVE_PATH + className, className) 25 | } -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/ScalaMLTypes.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | /** 4 | * Type aliases mapping ML types to native Scala types 5 | */ 6 | object ScalaMLTypes { 7 | type DblMatrix = Array[Array[Double]] 8 | type DblVector = Array[Double] 9 | type MLPTask = DblVector => DblVector 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/SentenceUtils.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | /** 4 | * Helper object to construct sliding windows over a given sentence. Windows at the border of the sentence are padded with 5 | * a border token 6 | */ 7 | object SentenceUtils { 8 | val SENTENCE_BORDER = "<=BORDER=>" 9 | 10 | 11 | def slidingWindow(sentence: Seq[String], preW: Int, postW: Int) = { 12 | val pre = List.fill(preW)(SENTENCE_BORDER) 13 | 14 | val post = List.fill(postW)(SENTENCE_BORDER) 15 | 16 | (pre ++ sentence ++ post).sliding(preW + 1 + postW) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/TagDictionary.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | /** 4 | * A wrapper around tag-related helpers. 
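Given the list of possible states it assigns every tag a fixed index (e.g. with the illustrative states List("NN", "VB"), "NN" gets index 0 and "VB" index 1). 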
Creates a reverse tag index and can be used to convert a tag to a one-hot vector 5 | */ 6 | case class TagDictionary(states: List[String]) { 7 | val stateIdx = states.zipWithIndex.toMap 8 | 9 | val revIdx = states.zipWithIndex.map { 10 | case (el, i) => i -> el 11 | }.toMap 12 | 13 | val size = states.size 14 | 15 | val tag2vec: Map[String, Array[Double]] = states.zipWithIndex.map { 16 | case (state, idx) => 17 | val a = Array.fill(size)(0.0) 18 | a(idx) = 1 19 | state -> a 20 | }.toMap.withDefaultValue(Array.fill(size)(0.0)) 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/Word2VecDataSetIterator.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | import de.hpi.WindowConverter 4 | import de.hpi.anlp.conll.AnnotatedToken 5 | import org.deeplearning4j.datasets.iterator.{DataSetIterator, DataSetPreProcessor} 6 | import org.deeplearning4j.models.word2vec.Word2Vec 7 | import org.deeplearning4j.text.inputsanitation.InputHomogenization 8 | import org.deeplearning4j.text.movingwindow.Windows 9 | import org.nd4j.linalg.dataset.DataSet 10 | import org.nd4j.linalg.factory.Nd4j 11 | import org.nd4j.linalg.util.FeatureUtil 12 | 13 | /** 14 | * Allows for customization of all of the params of the iterator 15 | * @param vec the word2vec model to use 16 | * @param sentenceIter the sentence iterator to use 17 | * @param labels the possible labels 18 | * @param batch the batch size 19 | */ 20 | class Word2VecDataSetIterator(vec: Word2Vec, sentenceIter: Iterable[List[AnnotatedToken]], labels: List[String], val batch: Int = 10) extends DataSetIterator { 21 | 22 | /** 23 | * Underlying active window iterator 24 | */ 25 | var iter = windowIter() 26 | 27 | /** 28 | * Index to lookup label ids 29 | */ 30 | val labelIdx = labels.zipWithIndex.toMap 31 | 32 | /** 33 | * Data set preprocessor 34 | */ 35 | var preProcessor: Option[DataSetPreProcessor] = None 36 | 37 | /** 38 | * Returns an iterate-once collection holding windows of the given size 39 | */ 40 | def windowIter() = { 41 | var counter = 0 42 | sentenceIter.flatMap { sentence => 43 | import scala.collection.JavaConversions._ 44 | val words = sentence.map(s => new InputHomogenization(s.token).transform()) 45 | val wordLabels = sentence.map(_.tag) 46 | counter += 1 47 | if (counter % 3500 == 0) 48 | println("Processing sentence " + counter) 49 | Windows.windows(words, vec.getWindow()).zip(wordLabels).map { 50 | case (window, label) => 51 | window.setLabel(label) 52 | window 53 | } 54 | }.toList 55 | } 56 | 57 | /** 58 | * Like the standard next method but allows a 59 | * customizable number of examples returned 60 | * 61 | * @param num the number of examples 62 | * @return the next data applyTransformToDestination 63 | */ 64 | override def next(num: Int): DataSet = { 65 | synchronized { 66 | try { 67 | val windows = iter.take(num).toList 68 | 69 | iter = iter.drop(num) 70 | 71 | if (windows.isEmpty) 72 | null 73 | else { 74 | val inputs = Nd4j.create(windows.size, inputColumns()) 75 | val labelOutput = Nd4j.create(windows.size, labels.size) 76 | 77 | // Iterate over all windows to convert them to matrix format 78 | windows.zipWithIndex.foreach { 79 | case (window, row) => 80 | inputs.putRow(row, WindowConverter.asExampleMatrix(window, vec)) 81 | labelOutput.putRow(row, FeatureUtil.toOutcomeVector(labelIdx(window.getLabel), labels.size)) 82 | } 83 | 84 | val ds = new DataSet(inputs, labelOutput) 85 | 86 | 
preProcessor.foreach { pp => 87 | pp.preProcess(ds) 88 | } 89 | 90 | ds 91 | } 92 | } catch { 93 | case e: Exception => 94 | println("Exception raised: " + e.getMessage) 95 | e.printStackTrace() 96 | throw e 97 | } 98 | } 99 | } 100 | 101 | override def totalExamples(): Int = { 102 | throw new UnsupportedOperationException() 103 | } 104 | 105 | override def inputColumns(): Int = { 106 | vec.lookupTable().layerSize() * vec.getWindow() 107 | } 108 | 109 | override def totalOutcomes(): Int = { 110 | labels.size 111 | } 112 | 113 | override def reset() = { 114 | iter = windowIter() 115 | } 116 | 117 | override def cursor(): Int = { 118 | 0 119 | } 120 | 121 | 122 | override def numExamples(): Int = { 123 | 0 124 | } 125 | 126 | /** 127 | * Returns {true} if the iteration has more elements. 128 | * (In other words, returns {true} if {#next} would 129 | * return an element rather than throwing an exception.) 130 | * 131 | * @return {true} if the iteration has more elements 132 | */ 133 | override def hasNext(): Boolean = { 134 | iter.nonEmpty 135 | } 136 | 137 | /** 138 | * Returns the next element in the iteration. 139 | * 140 | * @return the next element in the iteration 141 | */ 142 | override def next(): DataSet = { 143 | next(batch) 144 | } 145 | 146 | /** 147 | * Removes from the underlying collection the last element returned 148 | * by this iterator (optional operation). This method can be called 149 | * only once per call to {@link #next}. The behavior of an iterator 150 | * is unspecified if the underlying collection is modified while the 151 | * iteration is in progress in any way other than by calling this 152 | * method. 153 | */ 154 | override def remove(): Unit = { 155 | throw new UnsupportedOperationException() 156 | } 157 | 158 | override def setPreProcessor(dataSetPreprocessor: DataSetPreProcessor): Unit = { 159 | preProcessor = Some(dataSetPreprocessor) 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /src/main/scala/de/hpi/anlp/utils/WordDictionary.scala: -------------------------------------------------------------------------------- 1 | package de.hpi.anlp.utils 2 | 3 | import de.hpi.anlp.conll.AnnotatedToken 4 | import scala.collection.mutable 5 | 6 | /** 7 | * A word dictionary is a lookup table for seen words. There is a fallback for unseen words 8 | */ 9 | case class WordDictionary(underlying: scala.collection.Map[String, Array[Double]], numStates: Int) { 10 | /** 11 | * Value that gets returned if a sentence border is reached 12 | */ 13 | val nullVec = Array.fill(numStates)(0.0) 14 | 15 | /** 16 | * Value that gets returned if a word hasn't been seen during training 17 | */ 18 | val uniformVec = Array.fill(numStates)(1.0 / numStates) 19 | 20 | /** 21 | * Retrieve the vector representation of a word 22 | */ 23 | def word2vec(word: String): Array[Double] = underlying.get(word) match { 24 | case Some(vec) => vec 25 | case _ if word == SentenceUtils.SENTENCE_BORDER => nullVec 26 | case _ => uniformVec 27 | } 28 | 29 | /** 30 | * Retrieve the concatenated vector representation of a list of words 31 | */ 32 | def words2vec(words: List[String]) = 33 | Array.concat(words.map(word2vec): _*) 34 | } 35 | 36 | /** 37 | * Helper to construct a word dictionary given an input data set 38 | */ 39 | object WordDictionary { 40 | 41 | /** 42 | * Use the given annotated data to build up a word dictionary containing the probabilities of a word occurring with 43 | * each tag. 
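Each entry is the add-one smoothed relative frequency of the tag given the word, estimated from the training data. 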
Implements the Tag-Prob word representation 44 | */ 45 | def build(tags: TagDictionary, annotatedData: Iterable[List[AnnotatedToken]]): WordDictionary = { 46 | 47 | val emissions = mutable.HashMap.empty[String, Array[Double]] 48 | val counter = mutable.HashMap.empty[String, Int].withDefaultValue(0) 49 | 50 | // Iterate over the data set to count tag <-> token occurrences 51 | annotatedData.foreach { annotated => 52 | annotated.foreach { 53 | case AnnotatedToken(token, tag) => 54 | val idx = tags.stateIdx(tag) 55 | counter.update(token, counter(token) + 1) 56 | emissions.get(token) match { 57 | case Some(a) => 58 | a(idx) += 1 59 | case _ => 60 | val a = Array.fill(tags.size)(0.0) 61 | a(idx) = 1 62 | emissions += (token -> a) 63 | } 64 | } 65 | } 66 | 67 | // Calculate the emission probabilities for each word and tag combination 68 | emissions.foreach { 69 | case (token, freqs) => 70 | (0 until freqs.length).foreach { i => 71 | freqs.update(i, (freqs(i) + 1) / (counter(token) + freqs.length)) 72 | } 73 | } 74 | 75 | WordDictionary(emissions, tags.size) 76 | } 77 | } -------------------------------------------------------------------------------- /src/main/test-space.sc: -------------------------------------------------------------------------------- 1 | import java.io.File 2 | import java.net.{URL, URLClassLoader} 3 | 4 | import org.apache.commons.io.IOUtils 5 | import org.springframework.core.io.ClassPathResource 6 | 7 | def addPath(s: String){ 8 | val f = new File(s) 9 | println(f.exists()) 10 | val u = f.toURI() 11 | val urlClassLoader = ClassLoader.getSystemClassLoader().asInstanceOf[URLClassLoader] 12 | val urlClass = classOf[URLClassLoader] 13 | val method = urlClass.getDeclaredMethod("addURL", classOf[URL]) 14 | method.setAccessible(true) 15 | method.invoke(urlClassLoader, u.toURL()) 16 | } 17 | 18 | addPath("/Users/tombocklisch/Documents/Studium/ANLP/deep-nlp-scala/src/main/resources") 19 | 20 | import java.io.File 21 | 22 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 23 | import org.apache.commons.math3.random.MersenneTwister 24 | import org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator 25 | import org.deeplearning4j.distributions.Distributions 26 | import org.deeplearning4j.eval.Evaluation 27 | import org.deeplearning4j.models.featuredetectors.rbm.RBM 28 | import org.deeplearning4j.models.word2vec.Word2Vec 29 | import org.deeplearning4j.nn.conf.{MultiLayerConfiguration, NeuralNetConfiguration} 30 | import org.deeplearning4j.nn.multilayer.MultiLayerNetwork 31 | import org.deeplearning4j.nn.weights.WeightInit 32 | import org.deeplearning4j.text.inputsanitation.InputHomogenization 33 | import org.deeplearning4j.text.sentenceiterator.{SentencePreProcessor, FileSentenceIterator} 34 | import org.deeplearning4j.text.tokenization.tokenizerfactory.UimaTokenizerFactory 35 | import org.nd4j.linalg.api.activation.Activations 36 | import org.nd4j.linalg.lossfunctions.LossFunctions 37 | 38 | println("running") 39 | val sample = "This is a sample text." 
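// Hypothetical sanity check for the Stanford tagger created below: the sample sentence
// could be tagged in a single call, e.g. println(tagged.tagString(sample)), assuming
// MaxentTagger#tagString is available in the CoreNLP version used here (3.4.1).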
40 | val tagged = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger") 41 | def reuters = { 42 | val reutersFile = "assets/reuters21578/" 43 | val file = new File(reutersFile) 44 | 45 | new FileSentenceIterator(new SentencePreProcessor() { 46 | 47 | override def preProcess(sentence: String): String = 48 | new InputHomogenization(sentence).transform() 49 | 50 | },file) 51 | 52 | } 53 | 54 | val iter = reuters 55 | val t = new UimaTokenizerFactory() 56 | val vec = new Word2Vec.Builder() 57 | .windowSize(5) 58 | .layerSize(300) 59 | .iterate(iter) 60 | .tokenizerFactory(t) 61 | .build() 62 | 63 | vec.fit() 64 | 65 | val oil = "oil" 66 | 67 | printf("%f\n", vec.similarity(oil, oil)) 68 | 69 | printf("%f\n", vec.similarity(oil, "fish")); 70 | 71 | 72 | def deep() = { 73 | val gen = new MersenneTwister(123); 74 | val conf = new NeuralNetConfiguration.Builder() 75 | .hiddenUnit(RBM.HiddenUnit.RECTIFIED) 76 | .momentum(5e-1f) //this expresses decimals as floats. Remember e? 77 | .visibleUnit(RBM.VisibleUnit.GAUSSIAN) 78 | .regularization(true) 79 | .dist(Distributions.uniform(gen)) 80 | .activationFunction(Activations.tanh()) 81 | .iterations(10000) 82 | .weightInit(WeightInit.DISTRIBUTION) 83 | .lossFunction(LossFunctions.LossFunction.RECONSTRUCTION_CROSSENTROPY) 84 | .rng(gen) 85 | .learningRate(1e-3f) 86 | .nIn(4) 87 | .nOut(3) 88 | .build() 89 | val d = new MultiLayerNetwork(conf.asInstanceOf[MultiLayerConfiguration]) 90 | val iter = new IrisDataSetIterator(150, 150); 91 | val next = iter.next(); 92 | next.normalizeZeroMeanZeroUnitVariance(); 93 | next.shuffle(); 94 | val testAndTrain = next.splitTestAndTrain(110); 95 | val train = testAndTrain.getTrain(); 96 | d.fit(train); 97 | val test = testAndTrain.getTest(); 98 | 99 | val eval = new Evaluation(); 100 | val output = d.output(test.getFeatureMatrix()); 101 | eval.eval(test.getLabels(),output); 102 | println("Score " + eval.stats()); 103 | } --------------------------------------------------------------------------------