├── .gitignore ├── README.md ├── build.sbt ├── data ├── import_eventserver.py ├── send_query.py └── train.tsv ├── engine.json ├── project ├── assembly.sbt └── pio-build.sbt ├── src └── main │ └── scala │ ├── Algorithm.scala │ ├── CoreNLP-Scala │ ├── Makefile │ ├── README.md │ └── src │ │ └── edu │ │ └── stanford │ │ └── nlp │ │ ├── Berkeley.scala │ │ ├── Classify.scala │ │ ├── Document.scala │ │ ├── Magic.scala │ │ ├── NLP.scala │ │ ├── NLPConfig.scala │ │ ├── Optimize.scala │ │ ├── Sentence.scala │ │ └── TokensRegex.scala │ ├── DataSource.scala │ ├── Engine.scala │ ├── Model.scala │ ├── Preparator.scala │ └── Serving.scala └── template.json /.gitignore: -------------------------------------------------------------------------------- 1 | manifest.json 2 | pio.log 3 | /pio.sbt 4 | target/ 5 | data/*.csv 6 | data/*.tsv 7 | data/*.zip 8 | data/gen_submission.py 9 | *~ 10 | *.swp 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis Template 2 | 3 | Given a sentence, the engine returns a score between 0 and 4 indicating the sentence's sentiment: 0 is very negative, 2 is neutral, and 4 is very positive. 4 | 5 | The engine uses the Stanford CoreNLP library and the Scala binding `gangeli/CoreNLP-Scala` for parsing. 6 | 7 | ## Versions 8 | 9 | ### v0.1.0 10 | 11 | - initial version 12 | 13 | ## Import sample data 14 | 15 | ``` 16 | $ python data/import_eventserver.py --access_key <your_access_key> --file data/train.tsv 17 | ``` 18 | 19 | The sample training data comes from https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews. It is a TSV file. Each line contains four fields: `PhraseId`, `SentenceId`, `Phrase` and `Sentiment`. 20 | 21 | For example, 22 | ``` 23 | 1 1 bad 1 24 | ``` 25 | 26 | ## Steps to build, train and deploy the engine 27 | 28 | ``` 29 | $ pio build && pio train && pio deploy 30 | ``` 31 | 32 | ## Query 33 | 34 | The query takes a `String` field `s`. The result contains a `Double` field `sentiment`. 35 | 36 | Sample queries: 37 | 38 | ``` 39 | $ curl -H "Content-Type: application/json" \ 40 | -d '{ 41 | "s" : "I am happy" 42 | }' \ 43 | http://localhost:8000/queries.json \ 44 | -w %{time_connect}:%{time_starttransfer}:%{time_total} 45 | 46 | {"sentiment":3.0714285712172384}0.005:0.027:0.027 47 | ``` 48 | 49 | ``` 50 | $ curl -H "Content-Type: application/json" \ 51 | -d '{ 52 | "s" : "This movie sucks!"
53 | }' \ 54 | http://localhost:8000/queries.json \ 55 | -w %{time_connect}:%{time_starttransfer}:%{time_total} 56 | 57 | {"sentiment":0.8000000001798788}0.005:0.031:0.031 58 | ``` 59 | 60 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | name := "template-scala-sentiment-analysis" 6 | 7 | organization := "com.whhone" 8 | 9 | excludeFilter in unmanagedSources := "Berkeley.scala" 10 | 11 | libraryDependencies ++= Seq( 12 | "io.prediction" %% "core" % pioVersion.value % "provided", 13 | "org.apache.spark" %% "spark-core" % "1.2.0" % "provided", 14 | "org.apache.spark" %% "spark-mllib" % "1.2.0" % "provided", 15 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4", 16 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4" classifier "models", 17 | "edu.stanford.nlp" % "stanford-parser" % "3.4" 18 | ) 19 | 20 | -------------------------------------------------------------------------------- /data/import_eventserver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Import sample data for Sentiment Analysis Engine Template 3 | """ 4 | 5 | import predictionio 6 | import argparse 7 | 8 | def import_events(client, file): 9 | f = open(file, 'r') 10 | count = 0 11 | print "Importing data..." 12 | for line in f: 13 | data = line.rstrip('\r\n').split("\t") 14 | if True: 15 | client.create_event( 16 | event="train", 17 | entity_type="user", 18 | entity_id=data[0], 19 | properties= { 20 | "phrase" : str(data[2]), 21 | "sentiment" : float(data[3]) 22 | } 23 | ) 24 | count += 1 25 | if count % 100 == 0: 26 | print count 27 | 28 | f.close() 29 | print "%s events are imported." 
% count 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser( 33 | description="Import sample data for sentiment analysis engine") 34 | parser.add_argument('--access_key', default='invalid-access-key') 35 | parser.add_argument('--url', default="http://localhost:7070") 36 | parser.add_argument('--file', default="./data/train.tsv") 37 | 38 | args = parser.parse_args() 39 | print args 40 | 41 | client = predictionio.EventClient( 42 | access_key=args.access_key, 43 | url=args.url, 44 | threads=10, 45 | qsize=1000) 46 | import_events(client, args.file) 47 | -------------------------------------------------------------------------------- /data/send_query.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send sample query to prediction engine 3 | """ 4 | 5 | import predictionio 6 | client = predictionio.EngineClient(url="http://localhost:8000") 7 | 8 | def test(s): 9 | print s + ' : ' + str(client.send_query({"s": s})['sentiment']) 10 | 11 | test('sad') 12 | test('happy') 13 | test('oh') 14 | test('not') 15 | test('not sad') 16 | test('very sad') 17 | test('very happy') 18 | test('not very sad') 19 | -------------------------------------------------------------------------------- /engine.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "default", 3 | "description": "Default settings", 4 | "engineFactory": "org.template.sentimentanalysis.SentimentAnalysisEngine", 5 | "datasource": { 6 | "params" : { 7 | "appId": 2 8 | } 9 | }, 10 | "algorithms": [ 11 | { 12 | "name": "nlpparse", 13 | "params": { 14 | "baseWeight": 1 15 | } 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /project/pio-build.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("io.prediction" % "pio-build" % "0.9.0") 2 | -------------------------------------------------------------------------------- /src/main/scala/Algorithm.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.P2LAlgorithm 4 | import io.prediction.controller.Params 5 | import io.prediction.data.storage.BiMap 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.SparkContext._ 9 | import org.apache.spark.rdd.RDD 10 | 11 | import edu.stanford.nlp.Magic._ 12 | 13 | import grizzled.slf4j.Logger 14 | 15 | case class AlgorithmParams( 16 | val baseWeight: Double 17 | )extends Params 18 | 19 | class Algorithm(val ap: AlgorithmParams) 20 | extends P2LAlgorithm[PreparedData, Model, Query, PredictedResult] { 21 | 22 | @transient lazy val logger = Logger[this.type] 23 | 24 | def train(sc: SparkContext, data: PreparedData): Model = { 25 | require( 26 | !data.sentiments.take(1).isEmpty, 27 | s"RDD[sentiments] in PreparedData cannot be empty." 
+ 28 | " Please check if DataSource generates TrainingData" + 29 | " and Preprator generates PreparedData correctly.") 30 | 31 | val itemSets: RDD[(String, Double)] = data.sentiments.map( 32 | s => (s.phrase.toLowerCase(), s.sentiment) 33 | ).cache() 34 | 35 | // assume the last training data is the most up-to-date 36 | val rules = itemSets.groupByKey 37 | .mapValues(iter => iter.toVector.last) 38 | .collectAsMap.toMap 39 | 40 | new Model(rules) 41 | } 42 | 43 | def predict(model: Model, query: Query): PredictedResult = { 44 | new PredictedResult(model.getSentiment(query.s, ap)) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # To Build: 3 | # 1. Set CORENLP_HOME to the root of CoreNLP 4 | # 2. [optional] Set BERKELEY to the path to the Berkeley parser 5 | # 3. Build using either 'make stanford' or 'make berkeley' (if the Berkeley parser is configured) 6 | # 7 | 8 | CORENLP=$(CORENLP_HOME)/classes:$(CORENLP_HOME)/lib/joda-time.jar:$(CORENLP_HOME)/lib/jollyday-0.4.7.jar 9 | BERKELEY=$(CORENLP_HOME)/../more/lib/BerkeleyParser.jar 10 | 11 | JAVAC=javac 12 | SCALAC=scalac 13 | 14 | SRC=src 15 | SOURCES = $(wildcard src/edu/stanford/nlp/*.scala) 16 | TEST_SRC=test/src 17 | LIB=lib 18 | BUILD=classes 19 | TEST_BUILD=test/classes 20 | DIST=dist 21 | 22 | dist: stanford 23 | mkdir -p ${DIST} 24 | jar cf ${DIST}/corenlp-scala.jar -C $(BUILD) . 25 | jar uf ${DIST}/corenlp-scala.jar -C $(SRC) . 26 | 27 | berkeley: stanford 28 | $(SCALAC) -cp $(CORENLP):${BERKELEY} -d $(BUILD) `find $(SRC) -name "*.scala"` 29 | 30 | stanford: ${SOURCES} 31 | mkdir -p $(BUILD) 32 | sed -e 's/BerkeleyUtil.berkeleyParser/throw new IllegalStateException("Could not find parser model (and was not compiled to run with Berkeley parser)")/g' ${SRC}/edu/stanford/nlp/NLP.scala > /tmp/NLP_stanfordonly.scala 33 | $(SCALAC) -cp $(CORENLP) -d $(BUILD) `find $(SRC) -name "*.scala" ! -name "*Berkeley.scala" ! -name "NLP.scala"` /tmp/NLP_stanfordonly.scala 34 | rm /tmp/NLP_stanfordonly.scala 35 | 36 | default: stanford 37 | 38 | clean: 39 | rm -r $(BUILD) 40 | rm -r ${DIST} 41 | 42 | 43 | cmd: 44 | @echo "scala -J-Xmx4G -cp $(CORENLP):$(BUILD)":${HOME}/lib/corenlp-models.jar 45 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/README.md: -------------------------------------------------------------------------------- 1 | Since gangeli/CoreNLP-Scala does not provide a way to install by the build.sbt file, 2 | copy it from https://github.com/gangeli/CoreNLP-Scala. 
3 | 4 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.concurrent.Lock 5 | 6 | import edu.stanford.nlp.trees.Tree 7 | import edu.stanford.nlp.trees.Trees 8 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 9 | import edu.stanford.nlp.ling.HasWord 10 | import edu.stanford.nlp.ling.Word 11 | 12 | import edu.berkeley.nlp.PCFGLA._ 13 | import edu.berkeley.nlp.util.Numberer 14 | 15 | import NLPConfig._ 16 | 17 | object BerkeleyUtil { 18 | type BerkeleyTree = edu.berkeley.nlp.syntax.Tree[String] 19 | 20 | implicit def stanfordTree2BerkeleyTree(btree:BerkeleyTree):Tree = { 21 | val roots = TreeAnnotations.unAnnotateTree(btree).getChildren; 22 | if (roots.isEmpty) { 23 | new LabeledScoredTreeNode(); 24 | } else { 25 | def convert(src:BerkeleyTree):Tree = { 26 | val dst:Tree = new LabeledScoredTreeNode 27 | if (src.getLabel != null) dst.setLabel(new Word(src.getLabel)) 28 | dst.setChildren(src.getChildren.map( convert(_) ).toArray) 29 | dst 30 | } 31 | new LabeledScoredTreeNode(new Word("TOP"), 32 | List[Tree](convert(roots.get(0)))) 33 | } 34 | } 35 | 36 | lazy val berkeleyParser = { 37 | // (function to create parser) 38 | def mkParser = { 39 | // (setup parser) 40 | val pData = ParserData.Load(parse.model) 41 | if (pData == null) throw new RuntimeException("Failed to load Berkeley parser model") 42 | val grammar = pData.getGrammar(); 43 | val lexicon = pData.getLexicon(); 44 | Numberer.setNumberers(pData.getNumbs()); 45 | // (create parser object) 46 | val parser = new CoarseToFineMaxRuleParser( 47 | grammar, lexicon, 1.0, -1, false, false, false, 48 | false, false, true, true) 49 | // (set binarization) 50 | try { 51 | val binarizationField = classOf[ConstrainedArrayParser].getDeclaredField("binarization"); 52 | binarizationField.setAccessible(true); 53 | binarizationField.set(parser, pData.getBinarization()); 54 | binarizationField.setAccessible(false); 55 | } catch { case (e:Exception) => throw new RuntimeException(e) } 56 | // (parser object) 57 | new { 58 | def parse(words:List[String], pos:List[String]):Tree = { 59 | var parsedTree:BerkeleyTree 60 | = parser.getBestConstrainedParse(words, pos, null); 61 | if (parsedTree.getChildren().isEmpty()) { 62 | parsedTree = parser.getBestConstrainedParse(words, null, null); 63 | } 64 | parsedTree 65 | } 66 | } 67 | } 68 | // (create parsers) 69 | val parsers = (0 until numThreads).map{ x => (mkParser, new Lock) }.toList 70 | // (multithreaded implementation) 71 | new { 72 | def parse(words:List[String], pos:List[String]):Tree = { 73 | def tryParse:Tree = { 74 | val validParser = parsers.indexWhere{ 75 | (pair:({def parse(words:List[String],pos:List[String]):Tree},Lock)) => 76 | pair._2.available 77 | } 78 | if (validParser >= 0) { // case: [likely] found parser to run 79 | val (parser, lock) = parsers(validParser) 80 | lock.acquire 81 | val rtn = parser.parse(words, pos) 82 | lock.release 83 | rtn 84 | } else { Thread.sleep(1000); tryParse } // case: no parser found 85 | } 86 | tryParse 87 | } 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala: -------------------------------------------------------------------------------- 1 | package 
edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | import NLP._ 44 | 45 | // ---------- 46 | // Classifiers 47 | // ---------- 48 | @SerialVersionUID(1l) 49 | class Classifier[I,O]( 50 | regression:I=>Map[O,Double], 51 | val data:Map[I,(O,Float)]) extends Function1[I,O] with Serializable { 52 | override def apply(in:I):O = { 53 | regression(in).maxBy(_._2)._1 54 | } 55 | } 56 | 57 | class Mapping[I,O](map:Map[I,(O,Float)]) { 58 | import Mapping.{toCounter,defaultFeatures} 59 | 60 | def scorer[F](featurizer:I=>Iterable[F]):I=>Map[O,Double] = { 61 | // -- Create Dataset 62 | val weights = new Array[Float](map.size) 63 | val dataset = new RVFDataset[O, F](map.size) 64 | map.zipWithIndex.foreach{ 65 | case ((input:I, (output:O, weight:Float)),i:Int) => 66 | weights(i) = weight 67 | dataset.add( new RVFDatum[O, F](toCounter(featurizer(input)), output) ) 68 | } 69 | // -- Train 70 | val prior = new LogPrior(LogPrior.LogPriorType.QUADRATIC) 71 | val factory = new LinearClassifierFactory[O,F]() 72 | val classifier = factory.trainClassifier(dataset, weights, prior) 73 | // -- Return 74 | (input:I) => { 75 | val scores = classifier.scoresOf( 76 | new RVFDatum[O, F](toCounter(featurizer(input)), null.asInstanceOf[O])) 77 | scores.keySet.map{ x => (x, scores.getCount(x)) }.toMap 78 | } 79 | } 80 | def scorer:I=>Map[O,Double] = scorer(defaultFeatures(_, map.size)) 81 | 82 | def classifier[F](featurizer:I=>Iterable[F]):Classifier[I,O] 83 | = new Classifier(scorer(featurizer), map) 84 | def classifier:Classifier[I,O] 85 | = classifier(defaultFeatures(_, map.size)) 86 | } 87 | 88 | object Mapping { 89 | def toCounter[X,F](map:Iterable[X]):Counter[F] = { 90 | val counts = new ClassicCounter[F] 91 | map.foreach{ (x:X) => x match { 92 | case (feat:F, n:Number) => counts.incrementCount(feat, 
n.doubleValue) 93 | case (feat:F) => counts.incrementCount(feat, 1) 94 | case _ => throw new IllegalStateException("Type mismatch in toCounter") 95 | } } 96 | return counts 97 | } 98 | 99 | def apply[I,O,X](map:Map[I,X]):Mapping[I,O] = { 100 | new Mapping(map.map{ case (i:I, x:X) => x match { 101 | case (o:O, n:Number) => (i, (o, n.floatValue)) 102 | case (o:O) => (i, (o, 1.0.asInstanceOf[Float])) 103 | case _ => throw new IllegalStateException("Type mismatch in toCounter") 104 | } }) 105 | } 106 | 107 | def defaultFeatures[I](input:I, datasetSize:Int):Iterable[(String,Float)] = { 108 | def ngram[A](seq:List[A], n:Int, tail:List[A] = Nil):List[String] = { 109 | if (seq.isEmpty) Nil 110 | else (seq.head :: tail.slice(0, n-1)).reverse.mkString("_") :: ngram(seq.tail, n, seq.head :: tail) 111 | } 112 | input match { 113 | case (sent:Sentence) => 114 | val n:Int = (scala.math.log10(datasetSize) / 3.0).toInt + 1 115 | // N-grams 116 | (ngram(sent.words.toList, n) ::: 117 | ngram(sent.words.toList.map( _.toLowerCase ), n) ::: 118 | ngram(sent.lemma.toList, n) ::: 119 | ngram(sent.ner.toList, n) ::: 120 | ngram(sent.pos.toList, n) ::: 121 | // Bag-of-words 122 | { if (n > 1) 123 | sent.words.toList ::: 124 | sent.words.toList.map( _.toLowerCase ) ::: 125 | sent.lemma.toList ::: 126 | sent.ner.toList ::: 127 | sent.pos.toList 128 | else Nil } 129 | ).map{ (_, 1.0.toFloat) } 130 | case (str:String) => 131 | val tokens = str.split(" ") 132 | val n:Int = (scala.math.log10(datasetSize) / 3.0).toInt + 1 133 | if (tokens.length <= 1) { 134 | // Case: a single word 135 | (tokens(0) :: // memorize 136 | ngram(str.toCharArray.toList, n) ::: // literal n-grams 137 | ngram(str.toLowerCase.toCharArray.toList, n) // case-insensitive n-grams 138 | ).map{ (_, 1.0.toFloat) } 139 | } else { 140 | // Case: a phrase 141 | (ngram(tokens.toList, n) ::: // literal n-grams 142 | ngram(tokens.toList.map( _.toLowerCase), n) // case-insensitive n-grams 143 | ).map{ (_, 1.0.toFloat) } 144 | } 145 | case (seq:Iterable[Any]) => 146 | seq.map{ (x:Any) => x match { 147 | case (feat:Any, n:Number) => (feat.toString, n.floatValue) 148 | case (feat:Any) => (feat.toString, 1.0.toFloat) 149 | case _ => (x.toString, 1.0.toFloat) 150 | } } 151 | case _ => List[(String,Float)]( (input.toString, 1.0.toFloat) ) 152 | } 153 | } 154 | } 155 | 156 | // ---------- 157 | // Ensemble Classifiers 158 | // ---------- 159 | 160 | class Ensemble[I](members:Seq[I=>Boolean], dat:Option[Map[I,(Boolean,Float)]]) { 161 | // -- Get Data 162 | if (!dat.isDefined) { 163 | members.foldLeft(Option[Map[I,(Boolean,Float)]](null)){ 164 | (dat:Option[Map[I,(Boolean,Float)]], fn:I=>Boolean) => 165 | fn match { 166 | case (classifier:Classifier[I,Boolean]) => 167 | dat match { 168 | case Some(existingData) => 169 | if (classifier.data != existingData) { 170 | warn("Classifiers trained on different data; taking union") 171 | Some(classifier.data ++ existingData) 172 | } else { 173 | Some(existingData) 174 | } 175 | case None => Some(classifier.data) 176 | } 177 | case _ => dat 178 | } 179 | } 180 | } 181 | 182 | // -- Methods 183 | def data(d:Map[I,(Boolean,Float)]):Ensemble[I] = new Ensemble(members, Some(d)) 184 | def data(d:Seq[(I,Boolean)]):Ensemble[I] 185 | = data( d.map( x => (x._1, (x._2, 1.0f)) ).toMap ) 186 | 187 | /** 188 | * Implementation of AdaBoost. 
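 * In the implementation below, each round selects the weak learner whose weighted error e_t is farthest from 0.5, assigns it the vote a_t = 0.5 * ln((1 - e_t) / e_t), rescales each example weight by exp(-a_t) when that learner is correct and by exp(+a_t) when it is wrong, renormalizes the weights, and stops once |0.5 - e_t| falls below the tolerance or the iteration limit is reached.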
189 | * Taken from http://en.wikipedia.org/wiki/AdaBoost 190 | */ 191 | def boost(data:Map[I,(Boolean,Float)]):Classifier[I,Boolean] = { 192 | if (data.isEmpty) throw new IllegalArgumentException("No data to train on!") 193 | // -- Cache 194 | startTrack("Running Weak Learners") 195 | val dataAsArray = data.toArray 196 | val gold = dataAsArray.map( _._2._1 ) 197 | val predictions:Array[(I=>Boolean,Array[(Boolean, Float)])] 198 | = members.toList.par.map{ (h:I=>Boolean) => 199 | log("running " + h.toString) 200 | (h, dataAsArray.map{ case (in:I, (out:Boolean, weight:Float)) => 201 | (h(in), weight) 202 | }) 203 | }.toArray 204 | endTrack("Running Weak Learners") 205 | // -- Error Rate 206 | def error(predictions:Array[(Boolean,Float)], 207 | gold:Array[Boolean], 208 | d:Array[Double] = (0 until data.size).map( x => 1.0 / data.size ).toArray 209 | ):Double = { 210 | predictions.zip(gold).zip(d).foldLeft(0.0){ 211 | case (sum:Double, 212 | (( (guess:Boolean, weight:Float), 213 | gold:Boolean), 214 | di:Double)) => 215 | if(guess == gold) sum else sum + di * weight 216 | } 217 | } 218 | def regressor(coefficients:Seq[(Double, I=>Boolean)] 219 | ):(I => Map[Boolean, Double]) = (in:I) => { 220 | val sum = coefficients.foldLeft(0.0){ 221 | case (sum:Double, (alpha:Double, h:(I=>Boolean))) => 222 | sum + alpha * { if(h(in)) 1.0 else -1.0 } 223 | } 224 | Map[Boolean, Double]( true -> {if(sum >= 0.0) 1.0 else 0.0 }, 225 | false -> {if(sum >= 0.0) 0.0 else 1.0 } ) 226 | } 227 | // -- Run an Iteration 228 | def iter(t:Int, 229 | predictions:Array[(I=>Boolean, Array[(Boolean,Float)])], 230 | gold:Array[Boolean], 231 | soFar:List[(Double, I=>Boolean)], 232 | d:Array[Double] = data.map( x => 1.0 / data.size.toDouble ).toArray, 233 | tolerance:Double = NLPConfig.classify.tolerance 234 | ):List[(Double, I=>Boolean)] = { 235 | startTrack("Iteration " + t) 236 | // (get errors) 237 | val errors = predictions.map{ case (h, pred:Array[(Boolean,Float)]) => 238 | ( h, pred, error(pred, gold, d) ) 239 | } 240 | val (hOpt, predOpt, et) = errors.maxBy( x => scala.math.abs(0.5 - x._3) ) 241 | // (compute update) 242 | log("optimal classifier: " + hOpt) 243 | log("e_t: " + et) 244 | val at = 0.5 * scala.math.log( (1.0 - et) / et ) 245 | val newD = predOpt.zip(gold).zip(d).map{ 246 | case (((guess:Boolean, weight:Float), gold:Boolean), di:Double) => 247 | di * scala.math.exp(- {if (guess == gold) 1.0 else -1.0} * at) 248 | } 249 | val sumD = newD.sum 250 | for (i <- 0 until newD.length) { newD(i) /= sumD } 251 | // (update coefficients) 252 | val coeffs = (at, hOpt) :: soFar 253 | log("a_t: " + at) 254 | endTrack("Iteration " + t) 255 | // (recurse) 256 | if ( scala.math.abs(0.5 - et) < tolerance || 257 | t >= NLPConfig.classify.iterations) { 258 | coeffs 259 | } else { 260 | iter(t+1, predictions, gold, coeffs, newD, tolerance) 261 | } 262 | } 263 | // -- Construct Classifier 264 | startTrack("Boosting over " + members.length + " classifier and " + data.size + " examples") 265 | val fn = regressor(iter(1, predictions, gold, Nil)) 266 | endTrack("Boosting over " + members.length + " classifier and " + data.size + " examples") 267 | new Classifier(fn, data) 268 | } 269 | 270 | def boost:Classifier[I,Boolean] 271 | = boost(dat.getOrElse(Map[I,(Boolean,Float)]())) 272 | } 273 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala: -------------------------------------------------------------------------------- 1 | package 
edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | import NLP._ 44 | 45 | 46 | object Document { 47 | } 48 | 49 | 50 | @SerialVersionUID(1l) 51 | case class Document(sentences:Array[String]) { 52 | // TODO(gabor) coreference 53 | } 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 
33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | 44 | 45 | object Magic { 46 | import NLP._ 47 | 48 | /* 49 | * Implicit Conversions 50 | */ 51 | implicit def seq2nlpseq(seq:Seq[String]):Sentence = new Sentence(seq) 52 | implicit def string2nlpseq(gloss:String):Sentence = new Sentence(gloss) 53 | 54 | implicit def map2mapping[I,O,X](map:Map[I,X]):Mapping[I,O] = Mapping(map) 55 | 56 | implicit def seq2ensemble[I](seq:Seq[I=>Boolean]):Ensemble[I] = new Ensemble(seq, None) 57 | 58 | implicit def fn2optimizable( 59 | fn:Array[Double]=>Double):OptimizableFunction = { 60 | optimize.algorithm.toLowerCase match { 61 | case "lbfgs" => LBFGSOptimizableApproximateFunction(fn, None) 62 | case "braindead" => BraindeadGradientDescent(fn, None) 63 | case _ => throw new IllegalStateException("Unknown algorithm: " + optimize.algorithm) 64 | } 65 | } 66 | implicit def fnPair2optimizable( 67 | pair:(Array[Double]=>Double,Array[Double]=>Array[Double])):OptimizableFunction = { 68 | optimize.algorithm.toLowerCase match { 69 | case "lbfgs" => LBFGSOptimizableApproximateFunction(pair._1, Some(pair._2)) 70 | case "braindead" => BraindeadGradientDescent(pair._1, Some(pair._2)) 71 | case _ => throw new IllegalStateException("Unknown algorithm: " + optimize.algorithm) 72 | } 73 | } 74 | 75 | implicit def string2tokensregex(str:String):TokensRegex = new TokensRegex(str) 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.ling.CoreAnnotations._ 15 | import edu.stanford.nlp.classify.LinearClassifierFactory 16 | import edu.stanford.nlp.classify.LogPrior 17 | import edu.stanford.nlp.classify.RVFDataset 18 | import edu.stanford.nlp.ie.NERClassifierCombiner 19 | import edu.stanford.nlp.ie.crf.CRFBiasedClassifier 20 | import edu.stanford.nlp.io.IOUtils 21 | import edu.stanford.nlp.ling.HasWord 22 | import edu.stanford.nlp.ling.RVFDatum 23 | import edu.stanford.nlp.ling.Word 24 | import edu.stanford.nlp.ling.CoreLabel 25 | import edu.stanford.nlp.optimization.DiffFunction 26 | import edu.stanford.nlp.optimization.QNMinimizer 27 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 28 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 29 | import edu.stanford.nlp.process.Morphology 30 | import edu.stanford.nlp.process.PTBTokenizer 31 | import edu.stanford.nlp.stats.ClassicCounter 32 | import edu.stanford.nlp.stats.Counter 33 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 34 | import edu.stanford.nlp.trees.CollinsHeadFinder 35 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 36 | import 
edu.stanford.nlp.trees.Tree 37 | import edu.stanford.nlp.trees.Trees 38 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 39 | import edu.stanford.nlp.trees.GrammaticalStructure 40 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 41 | import edu.stanford.nlp.trees.TypedDependency 42 | import edu.stanford.nlp.util.logging.Redwood.Util._ 43 | 44 | import NLPConfig._ 45 | 46 | object NLP { 47 | implicit def list2hasWordList(lst:Seq[String]):java.util.List[_<:HasWord] 48 | = lst.map( new Word(_) ).toList 49 | 50 | // ---------- 51 | // Parsers 52 | // ---------- 53 | lazy val stanfordParser = { 54 | val parser = LexicalizedParser.loadModel(parse.model) 55 | new { 56 | def parse(words:List[String], pos:List[String]):Tree = { 57 | parser.parseStrings(words); 58 | } 59 | } 60 | } 61 | lazy val parser = stanfordParser 62 | // ---------- 63 | // Stanford CoreNLP Components 64 | // ---------- 65 | lazy val tagger = new MaxentTagger(pos.model) 66 | 67 | lazy val collinsHeadFinder = new CollinsHeadFinder() 68 | 69 | lazy val morph:((Morphology=>Any)=>Any) = { 70 | val morph = new Morphology() 71 | val morphLock = new Lock() 72 | val f = { (fn:Morphology=>Any) => 73 | morphLock.acquire; 74 | val rtn = fn(morph); 75 | morphLock.release 76 | rtn 77 | } 78 | f 79 | } 80 | 81 | lazy val nerCRF:(Array[String], Array[String])=>Array[String] = { 82 | val classifier = new NERClassifierCombiner(ner.model, ner.aux); 83 | (words:Array[String], pos:Array[String]) => { 84 | val offsets:List[Int] = words.foldLeft( (List[Int](), 0) ){ 85 | case ((offsetsSoFar:List[Int], offset:Int), word:String) => 86 | (offset :: offsetsSoFar, offset + word.length + 1) 87 | }._1.reverse 88 | // (construct CoreLabel sentence) 89 | val coreSentence = new java.util.ArrayList[CoreLabel](words.length) 90 | words.zip(pos).zip(offsets)foreach{ 91 | case ((word:String, pos:String), offset:Int) => 92 | val label = new CoreLabel 93 | label.setWord(word) 94 | label.setOriginalText(word) 95 | label.setTag(pos) 96 | label.setBeginPosition(offset) 97 | label.setEndPosition(offset + word.length) 98 | coreSentence.add(label) 99 | } 100 | // (classify) 101 | classifier.classifySentence(coreSentence) 102 | val output:java.util.List[CoreLabel] = classifier.classifySentence(coreSentence); 103 | // (convert back) 104 | output.map{ (label:CoreLabel) => 105 | label.ner() 106 | }.toArray 107 | } 108 | } 109 | 110 | /** 111 | * The TrueCase classifier implementation. 112 | * Takes as input an array of tokens, POS tags, and lemmas, 113 | * and returns as output the tokens with their true case applied. 114 | * The length of the tokens, POS tags, and lemmas must match. 115 | * @return An array of tokens (words as Strings) of the same length 116 | * as the input tokens, but with their inferred true case. 
117 | */ 118 | lazy val trueCaser:(Array[String], Array[String], Array[String])=>Array[String] = { 119 | // Create classifier 120 | val props:Properties = { 121 | val p = new Properties 122 | p.setProperty("loadClassifier", NLPConfig.truecase.model) 123 | p.setProperty("mixedCaseMapFile", NLPConfig.truecase.disambiguation_list) 124 | p.setProperty("classBias", NLPConfig.truecase.bias) 125 | p 126 | } 127 | val classifier = new CRFBiasedClassifier[CoreLabel](props); 128 | classifier.loadClassifierNoExceptions(NLPConfig.truecase.model, props); 129 | // Set classifier biases 130 | NLPConfig.truecase.bias.split(",").foreach{ (bias:String) => 131 | val terms = bias.split(":") 132 | classifier.setBiasWeight(terms(0), terms(1).toDouble) 133 | } 134 | // Get mixed case map 135 | val mixedCaseMap:Map[String,String] 136 | = scala.io.Source.fromInputStream(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(NLPConfig.truecase.disambiguation_list)) 137 | .getLines 138 | .map( _.trim.split("""\s+""") ) 139 | .map{ case Array(a:String, b:String) => (a ,b) } 140 | .toMap 141 | // Return function 142 | (words:Array[String], pos:Array[String], lemma:Array[String]) => { 143 | // (mock offsets) 144 | val offsets:List[Int] = words.foldLeft( (List[Int](), 0) ){ 145 | case ((offsetsSoFar:List[Int], offset:Int), word:String) => 146 | (offset :: offsetsSoFar, offset + word.length + 1) 147 | }._1.reverse 148 | // (construct CoreLabel sentence) 149 | val coreSentence = new java.util.ArrayList[CoreLabel](words.length) 150 | words.zip(pos).zip(offsets)foreach{ 151 | case ((word:String, pos:String), offset:Int) => 152 | val label = new CoreLabel 153 | label.setWord(word.toLowerCase) 154 | label.setOriginalText(word) 155 | label.setTag(pos) 156 | label.setBeginPosition(offset) 157 | label.setEndPosition(offset + word.length) 158 | coreSentence.add(label) 159 | } 160 | // (classify) 161 | val output:java.util.List[CoreLabel] = classifier.classifySentence(coreSentence); 162 | // (convert back) 163 | output.map{ (label:CoreLabel) => 164 | val word:String = label.word 165 | label.get(classOf[AnswerAnnotation]) match { 166 | case "UPPER" => word.toUpperCase 167 | case "LOWER" => word.toLowerCase 168 | case "INIT_UPPER" => word.substring(0, 1).toUpperCase + word.substring(1).toLowerCase 169 | case "O" => mixedCaseMap.get(word).getOrElse(word) 170 | case _ => word 171 | } 172 | }.toArray 173 | } 174 | } 175 | 176 | // ---------- 177 | // Methods 178 | // ---------- 179 | def preload(obj: => Any) { new Thread(){ override def run:Unit = obj }.start } 180 | } 181 | 182 | trait CoreLabelSeq extends Seq[CoreLabel] { 183 | // 184 | // Trivial overrides (still have to define apply(Int):CoreLabel and length:Int though) 185 | // 186 | override def iterator:Iterator[CoreLabel] = new Iterator[CoreLabel] { 187 | var index:Int = 0 188 | override def hasNext:Boolean = index < CoreLabelSeq.this.length 189 | override def next:CoreLabel = { index += 1; apply(index - 1); } 190 | } 191 | 192 | // 193 | // Common Methods 194 | // 195 | def matches(t:TokensRegex) = t.matches(this) 196 | } 197 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp 2 | 3 | import edu.stanford.nlp.pipeline.DefaultPaths._ 4 | 5 | object NLPConfig { 6 | object parse { 7 | var model:String = DEFAULT_PARSER_MODEL 8 | } 9 | 10 | object pos { 11 | var model:String = 
DEFAULT_POS_MODEL 12 | } 13 | 14 | object ner { 15 | var model:String = DEFAULT_NER_CONLL_MODEL 16 | var aux:String = DEFAULT_NER_MUC_MODEL 17 | } 18 | 19 | object classify { 20 | var tolerance:Double = 1e-5 21 | var iterations:Double = 40 22 | } 23 | 24 | object optimize { 25 | var tolerance:Double = 1e-5 26 | var wiggle:Double = 1e-5 27 | var algorithm = "LBFGS" // | braindead | ... 28 | } 29 | 30 | object truecase { 31 | var model:String = "edu/stanford/nlp/models/truecase/truecasing.fast.caseless.qn.ser.gz" 32 | var disambiguation_list:String = "edu/stanford/nlp/models/truecase/MixDisambiguation.list" 33 | var bias:String = "INIT_UPPER:-0.7,UPPER:-0.7,O:0" 34 | } 35 | 36 | def caseless:Unit = { 37 | parse.model = "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz" 38 | pos.model = "edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger" 39 | ner.model = "edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz" 40 | ner.aux = "edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz" 41 | } 42 | 43 | var numThreads = Runtime.getRuntime().availableProcessors(); 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | import NLP._ 44 | import Optimize._ 45 | 46 | // ---------- 47 | // Optimizers 48 | // ---------- 49 | object Optimize { 50 | def empiricalDerivative(fn:Array[Double]=>Double, 51 | x:Array[Double]):Array[Double] = { 52 | val y0 = fn(x) 53 | def tweak(i:Int, delta:Double):(Double, Double) = { 54 | x(i) += delta 55 | val y1 = fn(x) 56 | x(i) -= delta 57 | if 
(delta < 1e-5 * optimize.wiggle || delta > 1e5 * optimize.wiggle) { 58 | (y1, delta) 59 | } else { 60 | if (scala.math.abs(y1 - y0) / delta > 1e5) tweak(i, delta / 2.0) 61 | else if (scala.math.abs(y1 - y0) / delta < 1e-5) tweak(i, delta * 2.0) 62 | else (y1, delta) 63 | } 64 | } 65 | {for (i <- 0 until x.length) yield { 66 | val (y1, step) = tweak(i, optimize.wiggle) 67 | (y1 - y0) / step 68 | }}.toArray 69 | } 70 | } 71 | 72 | trait OptimizableFunction { 73 | def minimize(initial:Array[Double]):Array[Double] 74 | def derivative(ddx:Array[Double]=>Array[Double]):OptimizableFunction 75 | } 76 | 77 | /** 78 | * A wrapper for QNMinimizer (L-BFGS) 79 | */ 80 | case class LBFGSOptimizableApproximateFunction( 81 | fn:Array[Double]=>Double, derivative:Option[Array[Double]=>Array[Double]]) 82 | extends OptimizableFunction{ 83 | 84 | override def minimize(initial:Array[Double]):Array[Double] = { 85 | // (define a differentiable function) 86 | val javaFn:DiffFunction = new DiffFunction { 87 | override def domainDimension:Int = initial.length 88 | override def valueAt(x:Array[Double]):Double = fn(x) 89 | override def derivativeAt(x:Array[Double]):Array[Double] = { 90 | derivative match { 91 | case Some(ddx) => ddx(x) 92 | case None => empiricalDerivative(fn, x) 93 | } 94 | } 95 | } 96 | // (optimize using QNMinimizer) 97 | val javaInit = initial.map{ (n:Double) => n } 98 | val optimizer = new QNMinimizer() 99 | optimizer.setRobustOptions() 100 | optimizer.minimize(javaFn, optimize.tolerance, javaInit) 101 | } 102 | 103 | override def derivative(ddx:Array[Double]=>Array[Double]):LBFGSOptimizableApproximateFunction 104 | = new LBFGSOptimizableApproximateFunction(fn, Some(ddx)) 105 | } 106 | 107 | /** 108 | * An optimization algorithm I made up (thus, "braindead"), that tries its 109 | * best to move against the gradient (thus, "gradient descent"). 110 | * The only motivation to use this over L-BFGS is that it's more robust to 111 | * non-convex problems (i.e., won't crash and burn). 
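 * Concretely, minimize below starts from the supplied point, repeatedly moves against the gradient (the analytic derivative if one was given, otherwise an empirical estimate), halves the step size from 1.0 until the objective improves by more than the tolerance, treats a step below 1e-5 as convergence, and gives up after 100 iterations.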
112 | */ 113 | case class BraindeadGradientDescent( 114 | fn:Array[Double]=>Double, derivative:Option[Array[Double]=>Array[Double]]) 115 | extends OptimizableFunction{ 116 | 117 | override def minimize(initial:Array[Double]):Array[Double] = { 118 | // (helpers) 119 | def dx(x:Array[Double], y0:Double):Array[Double] = derivative match { 120 | case Some(ddx) => ddx(x) 121 | case None => empiricalDerivative(fn, x) 122 | } 123 | def move(init:Array[Double], direction:Array[Double], scaling:Double):Array[Double] = { 124 | init.zip(direction).map{ case (a:Double, d:Double) => a + scaling * d} 125 | } 126 | def isImprovementOver(newY:Double, y:Double):Boolean 127 | = newY + optimize.tolerance < y 128 | // (state) 129 | val initialX:Array[Double] = initial 130 | val initialY:Double = fn(initialX) 131 | var x:Array[Double] = initialX 132 | var y:Double = initialY 133 | var numIters = 0 134 | // (optimization) 135 | while (numIters < 100) { 136 | var step:Double = 1.0 137 | val dir:Array[Double] = dx(x, y).map( - _ ) 138 | var newX:Array[Double] = move(x, dir, step) 139 | var newY:Double = fn(newX) 140 | while (!isImprovementOver(newY, y) && step > 1e-5) { 141 | step /= 2.0 142 | newX = move(x, dir, step) 143 | newY = fn(newX) 144 | } 145 | if (step <= 1e-5) return x // convergence 146 | assert(newY < y, "Function value did not decrease!") 147 | x = newX 148 | y = newY 149 | numIters += 1 150 | } 151 | // (timeout -- no convergence) 152 | return x 153 | } 154 | 155 | override def derivative(ddx:Array[Double]=>Array[Double]):BraindeadGradientDescent 156 | = new BraindeadGradientDescent(fn, Some(ddx)) 157 | } 158 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import 
edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | import NLP._ 44 | 45 | object Sentence { 46 | val tokenizerFactory = PTBTokenizer.factory 47 | val grammaticalStructureFactory 48 | = new PennTreebankLanguagePack().grammaticalStructureFactory 49 | 50 | def apply(word:Seq[String]):Sentence = new Sentence(word.toArray) 51 | def apply(gloss:String):Sentence = new Sentence(gloss) 52 | } 53 | 54 | 55 | @SerialVersionUID(2l) 56 | case class Sentence(word:Array[String]) extends CoreLabelSeq { 57 | 58 | def this(word:Seq[String]) = this(word.toArray) 59 | 60 | def this(sentence:String) = this( 61 | Sentence.tokenizerFactory.getTokenizer(new java.io.StringReader(sentence)) 62 | .tokenize 63 | .map( _.word ) 64 | .toArray 65 | ) 66 | 67 | // 68 | // Necessary Overrides for Seq[CoreLabel] 69 | // 70 | override def length:Int = word.length 71 | override def apply(index:Int):CoreLabel = { 72 | val label = new CoreLabel(8) 73 | label.setWord(word(index)) 74 | label.setTag(pos(index)) 75 | if (index > 0) { label.setAfter(word(index - 1)) } 76 | if (index < word.length - 1) { label.setBefore(word(index + 1)) } 77 | label.setNER(ner(index)) 78 | label.setLemma(lemma(index)) 79 | label.setIndex(index) 80 | // TODO(gabor) things like character offsets, original text, etc. 81 | label 82 | } 83 | 84 | 85 | 86 | var id:Option[Int] = None 87 | // values 88 | lazy val parse:Tree = { 89 | NLP.parser.parse(word.toList, pos.toList) 90 | } 91 | 92 | lazy val stanfordDependencies:Array[(Int, String)] = { 93 | if (length == 0) { 94 | new Array[(Int, String)](0) 95 | } else { 96 | val depArray = new Array[(Int, String)](length) 97 | // (get dependencies) 98 | val structure:GrammaticalStructure 99 | = Sentence.grammaticalStructureFactory.newGrammaticalStructure(parse) 100 | val deps:java.util.Collection[TypedDependency] 101 | = structure.typedDependencies() 102 | // (fill dependencies) 103 | deps.foreach{ (arc:TypedDependency) => 104 | depArray(arc.dep.index - 1) = 105 | ( arc.gov.index - 1, 106 | arc.reln.getShortName + {if (arc.reln.getSpecific == null) "" else "_" + arc.reln.getSpecific} ) 107 | } 108 | // (pad empty dependencies) 109 | for (i <- 0 until depArray.length) { 110 | if (depArray(i) == null) depArray(i) = (i, "noop") 111 | } 112 | depArray 113 | } 114 | } 115 | 116 | def dependencyRoot:Int 117 | = stanfordDependencies.zipWithIndex.filter( _._1._1 < 0 ).headOption match { 118 | case Some( (dep, index) ) => index 119 | case None => throw new IllegalStateException("Could not find head: '" + 120 | this + "' --- dependencies: " + stanfordDependencies.mkString(" ")) 121 | } 122 | 123 | def dependencyChild(root:Int, depType:String):Option[Int] 124 | = stanfordDependencies.zipWithIndex.filter( x => x._1._1 == root && x._1._2 == depType ) 125 | .map( _._2 ).headOption 126 | 127 | def dependencyChildren(root:Int):Seq[(Int, String)] 128 | = stanfordDependencies.zipWithIndex.filter( _._1._1 == root ).map( x => (x._2, x._1._2) ) 129 | 130 | def dependencyYield(root:Int):Set[Int] = { 131 | def recursiveSearch(root:Int, seen:Set[Int]):Set[Int] = { 132 | val directChildren = dependencyChildren(root).map( _._1 ) 133 | directChildren.foldLeft(seen) { 134 | case (soFar:Set[Int], index:Int) => 135 | if (!soFar(index)) recursiveSearch(index, seen + index) 136 | else soFar 137 | } 138 | } 139 | recursiveSearch(root, Set[Int](root)) 140 | } 141 | 142 | def dependencyPathMonotonic(ancestor:Int, descendent:Int):Option[Seq[Int]] = { 143 | def 
recurse(ancestor:Int, descendent:Int, lst:List[Int]):Option[List[Int]] = { 144 | if (descendent == ancestor) Some(ancestor :: lst) 145 | else if (descendent < 0) None 146 | else recurse(ancestor, stanfordDependencies(descendent)._1, descendent :: lst) 147 | } 148 | recurse(ancestor, stanfordDependencies(descendent)._1, Nil) 149 | } 150 | 151 | lazy val headIndex:Int = { 152 | if (word.length == 1) { 0 } 153 | else { 154 | val headLeaf = parse.headTerminal(collinsHeadFinder) 155 | val index = parse.getLeaves().indexWhere{ (x:Tree) => x eq headLeaf } 156 | if (index < 0) word.length - 1 else index 157 | } 158 | } 159 | 160 | def headIndex(spanBegin:Int, spanEnd:Int):Int = { 161 | parse.setSpans 162 | val (score, tree) = parse.foldLeft( spanBegin + (length - spanEnd), parse ){ 163 | case ( (smallestDiffSoFar:Int, bestTreeSoFar:Tree), tree:Tree ) => 164 | if (tree != null && tree.getSpan != null) { 165 | val (treeBegin, treeEnd) = (tree.getSpan.getSource, tree.getSpan.getTarget) 166 | val diff = scala.math.abs(spanBegin - treeBegin) 167 | + scala.math.abs(spanEnd - treeEnd) 168 | if (treeBegin >= spanBegin && treeEnd <= spanEnd && 169 | diff < smallestDiffSoFar) { (diff, tree) } 170 | else { (smallestDiffSoFar, bestTreeSoFar) } 171 | } else { (smallestDiffSoFar, bestTreeSoFar) } 172 | } 173 | val headLeaf = tree.headTerminal(collinsHeadFinder) 174 | val index = parse.getLeaves().indexWhere{ (x:Tree) => x eq headLeaf } 175 | if (index < spanBegin || index >= spanEnd) spanEnd - 1 else index 176 | } 177 | 178 | def headWord(spanBegin:Int, spanEnd:Int):String = word(headIndex(spanBegin, spanEnd)) 179 | 180 | lazy val pos:Array[String] 181 | = if (length == 0) new Array[String](0) 182 | else NLP.tagger.apply(word.toList).map( _.tag ).toArray 183 | 184 | lazy val lemma:Array[String] = word.zip(pos).map{ case (w:String,p:String) => 185 | morph( m => m.lemma(w,p) ).toString 186 | }.toArray 187 | 188 | lazy val ner:Array[String] = nerCRF(word, pos) 189 | 190 | lazy val truecase:Array[String] = trueCaser(word, pos, lemma) 191 | 192 | // helper functions 193 | def words:Array[String] = word 194 | def tags:Array[String] = pos 195 | 196 | def headWord:String = word(headIndex) 197 | def headLemma:String = lemma(headIndex) 198 | def headPOS:String = pos(headIndex) 199 | def namedEntities:Array[(Array[String],String)] = { 200 | // (collect tags) 201 | val nerTags = word.zip(ner).foldLeft(List[(List[String],String)]()){ 202 | case (soFar:List[(List[String],String)], (word:String, tag:String)) => 203 | val (chunk, lastTag) = if (soFar.isEmpty) (List[String](), "O") 204 | else soFar.head 205 | val tailList:List[(List[String],String)] 206 | = if (soFar.isEmpty) Nil else soFar.tail 207 | if (lastTag != tag) { 208 | (List[String](word), tag) :: { 209 | if (lastTag != "O") (chunk.reverse, lastTag) :: tailList 210 | else tailList 211 | } 212 | } else { 213 | (word :: chunk, tag) :: tailList 214 | } 215 | } 216 | // (some cleanup) 217 | val headPair = nerTags.head 218 | (if (headPair._2 == "O") nerTags.tail 219 | else (headPair._1.reverse, headPair._2) :: nerTags.tail) 220 | .reverse 221 | .map{ case (c,t) => (c.toArray,t) } 222 | .toArray 223 | } 224 | 225 | def toSentence:Sentence = this 226 | 227 | override def equals(a:Any):Boolean = { 228 | def seqMatch(s:Seq[String]):Boolean = { 229 | s.length == word.length && s.zip(word).forall{ case (a,b) => a == b } 230 | } 231 | a match { 232 | case (s:Sentence) => 233 | for (id1 <- this.id; 234 | id2 <- s.id) return id1 == id2 235 | return seqMatch(s.word) 236 | case 
(s:Seq[String]) => seqMatch(s) 237 | case _ => false 238 | } 239 | } 240 | private var code:Int = 0 241 | override def hashCode:Int = { 242 | if (code == 0) { word.foreach( w => code = 37 * code + w.hashCode ) } 243 | code 244 | } 245 | override def toString:String = word.mkString(" ") 246 | } 247 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | 5 | import edu.stanford.nlp.ling.CoreLabel 6 | import edu.stanford.nlp.util.CoreMap 7 | import edu.stanford.nlp.ling.tokensregex._ 8 | 9 | import NLPConfig._ 10 | 11 | 12 | case class TokensRegex(override val toString:String) { 13 | val pattern:TokenSequencePattern = TokenSequencePattern.compile(toString) 14 | 15 | 16 | def matches(input:Seq[CoreLabel]):Boolean = pattern.getMatcher(input.toList).matches 17 | 18 | def allMatches(input:Seq[CoreLabel]):Iterator[Seq[CoreLabel]] = { 19 | val matcher = pattern.getMatcher(input.toList) 20 | new Iterator[Seq[CoreLabel]] { 21 | var theNext:Option[Boolean] = None 22 | override def hasNext:Boolean = theNext match { 23 | case Some(x) => x 24 | case None => theNext = Some(matcher.find); theNext.get 25 | } 26 | override def next:Seq[CoreLabel] = { 27 | if (!hasNext) throw new NoSuchElementException 28 | theNext = None 29 | val m:java.util.List[_ <: CoreMap] = matcher.groupNodes 30 | m.map( _ match { 31 | case (x:CoreLabel) => x 32 | case (x:CoreMap) => new CoreLabel(x) 33 | }) 34 | } 35 | } 36 | } 37 | 38 | def unapplySeq(target:Any):Option[Seq[Seq[CoreLabel]]] = target match { 39 | case (input:Seq[CoreLabel]) => 40 | val matcher = pattern getMatcher(input toList) 41 | if (matcher matches) { 42 | Some(for (i <- 1 to matcher.groupCount) yield 43 | matcher groupNodes(i) map( _ match { 44 | case (x:CoreLabel) => x 45 | case (x:CoreMap) => new CoreLabel(x) 46 | })) 47 | } else { None } 48 | case _ => None 49 | } 50 | } 51 | 52 | 53 | object TokensRegex { 54 | // Built-in predicates 55 | def word(pattern:String):MarkedString = MarkedString(s"""{word : /$pattern/}""") 56 | def tag(pattern:String):MarkedString = MarkedString(s"""{tag : /$pattern/}""") 57 | def lemma(pattern:String):MarkedString = MarkedString(s"""{lemma : /$pattern/}""") 58 | def ner(pattern:String):MarkedString = MarkedString(s"""{ner : /$pattern/}""") 59 | def normalized(pattern:String):MarkedString = MarkedString(s"""{normalized : /$pattern/}""") 60 | 61 | // Decorate predicates 62 | case class MarkedString(str:String) extends AnyVal { override def toString:String = str } 63 | implicit def stringDecorator(str:MarkedString) = new { 64 | def unary_!():String = s"""!$str""" 65 | } 66 | implicit def string2string(str:MarkedString):String = str.str 67 | 68 | // Create token sequence 69 | implicit def product2tokens(p:Product):Tokens = new Tokens(List[String](p.productIterator.map( _.toString ).mkString(" & "))) 70 | implicit def string2tokens(str:MarkedString):Tokens = new Tokens(List[String](str.str)) 71 | class Tokens(val regexps:List[String]) { 72 | def apply(terms:String*):Tokens = { 73 | new Tokens(terms.mkString(" & ") :: regexps) 74 | } 75 | } 76 | 77 | // Dump to TokensRegex object 78 | implicit def string2tokensregex(str:MarkedString):TokensRegex 79 | = new TokensRegex(s"""[${str.str}]""") 80 | implicit def tokens2tokensregex(tokens:Tokens):TokensRegex 81 | = new 
TokensRegex(s"""[${tokens.regexps.reverse.mkString("] [")}]""") 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/DataSource.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.PDataSource 4 | import io.prediction.controller.EmptyEvaluationInfo 5 | import io.prediction.controller.EmptyActualResult 6 | import io.prediction.controller.Params 7 | import io.prediction.data.storage.Event 8 | import io.prediction.data.storage.Storage 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.SparkContext._ 12 | import org.apache.spark.rdd.RDD 13 | 14 | import grizzled.slf4j.Logger 15 | 16 | case class DataSourceParams(appId: Int) extends Params 17 | 18 | class DataSource(val dsp: DataSourceParams) 19 | extends PDataSource[TrainingData, 20 | EmptyEvaluationInfo, Query, EmptyActualResult] { 21 | 22 | @transient lazy val logger = Logger[this.type] 23 | 24 | override 25 | def readTraining(sc: SparkContext): TrainingData = { 26 | val eventsDB = Storage.getPEvents() 27 | val eventsRDD: RDD[Event] = eventsDB.find( 28 | appId = dsp.appId, 29 | entityType = Some("user"), 30 | eventNames = Some(List("train")) 31 | ) (sc) 32 | 33 | val sentimentsRDD: RDD[Sentiment] = eventsRDD.map { event => 34 | val sentiment = try { 35 | val sentimentValue: Double = event.event match { 36 | case "train" => event.properties.get[Double]("sentiment") 37 | case _ => throw new Exception(s"Unexpected event ${event} is read.") 38 | } 39 | 40 | Sentiment( 41 | event.properties.get[String]("phrase"), 42 | sentimentValue) 43 | } catch { 44 | case e: Exception => { 45 | logger.error( 46 | s"Cannot convert ${event} to Sentiment. 
Exception: ${e}.") 47 | throw e 48 | } 49 | } 50 | sentiment 51 | }.cache() 52 | 53 | new TrainingData(sentimentsRDD) 54 | } 55 | } 56 | 57 | case class Sentiment( 58 | phrase: String, 59 | sentiment: Double 60 | ) 61 | 62 | class TrainingData( 63 | val sentiments: RDD[Sentiment] 64 | ) extends Serializable { } 65 | 66 | -------------------------------------------------------------------------------- /src/main/scala/Engine.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.IEngineFactory 4 | import io.prediction.controller.Engine 5 | 6 | case class Query( 7 | s: String 8 | ) extends Serializable 9 | 10 | case class PredictedResult( 11 | sentiment: Double 12 | ) extends Serializable 13 | 14 | object SentimentAnalysisEngine extends IEngineFactory { 15 | def apply() = { 16 | new Engine( 17 | classOf[DataSource], 18 | classOf[Preparator], 19 | Map("nlpparse" -> classOf[Algorithm]), 20 | classOf[Serving]) 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/main/scala/Model.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import edu.stanford.nlp.Magic._ 4 | import edu.stanford.nlp.trees.Tree 5 | 6 | class Model ( 7 | var rules: Map[String, Double] 8 | ) extends Serializable { 9 | 10 | /** 11 | * Return the sentiment in [-2 , 2] scale 12 | */ 13 | def getWordSentiment(word: String): Double = { 14 | var score = rules.get(word.toLowerCase()) 15 | if (score.isEmpty) { 16 | return 0.0 17 | } else { 18 | return score.get - 2.0 19 | } 20 | } 21 | 22 | /** 23 | * Parse the input to a tree structure. Calculate the sentiment from bottom 24 | * to the top. 25 | * 26 | * For a leaf node, it is always a word token. Use the sentiment 27 | * from the training data in this case. If the word did not appear in the 28 | * training data. Assume it is neutral. 29 | * 30 | * For a non-leaf node, calculate the sentiments of each of its children. 31 | * Determine whether the sentence is positive or negative by the number of 32 | * negative children. If it is odd, then assume the sentence is negative. 33 | */ 34 | def getSentiment(s: String, ap: AlgorithmParams): Double = { 35 | var m = scala.collection.mutable.Map[Tree, Double]() 36 | var tree = s.parse 37 | var root = tree.preOrderNodeList().get(0) 38 | var post_order = tree.postOrderNodeList() 39 | var i = 0 40 | while (i < post_order.size()) { 41 | var cur = post_order.get(i) 42 | i = i + 1 43 | 44 | if (cur.isLeaf()) { 45 | m(cur) = getWordSentiment(cur.value) 46 | } else { 47 | var children = cur.children() 48 | var weight = 0.0000000001 49 | var positive = 1 50 | var sentiment = 0.0 51 | m(cur) = 0 52 | for (child <- children) { 53 | var child_sentiment = m(child) 54 | 55 | // The weight of a the child is proportional to the absolute value 56 | // of its sentiment. 
This avoids having the sentiment neutralized by 57 | // other neutral children. 58 | var child_weight = Math.abs(child_sentiment) + ap.baseWeight 59 | 60 | weight = weight + child_weight 61 | sentiment = sentiment + child_weight * Math.abs(child_sentiment) 62 | if (child_sentiment < -0.0000000001) { 63 | positive = positive * -1 64 | } 65 | } 66 | m(cur) = ( sentiment / weight ) * positive 67 | } 68 | } 69 | 70 | return m(root) + 2.0 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/Preparator.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.PPreparator 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.rdd.RDD 8 | 9 | class Preparator 10 | extends PPreparator[TrainingData, PreparedData] { 11 | 12 | def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { 13 | new PreparedData(sentiments = trainingData.sentiments) 14 | } 15 | } 16 | 17 | class PreparedData( 18 | val sentiments: RDD[Sentiment] 19 | ) extends Serializable 20 | 21 | -------------------------------------------------------------------------------- /src/main/scala/Serving.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.LServing 4 | 5 | class Serving extends LServing[Query, PredictedResult] { 6 | 7 | override 8 | def serve( 9 | query: Query, 10 | predictedResults: Seq[PredictedResult] 11 | ): PredictedResult = { 12 | predictedResults.head 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /template.json: -------------------------------------------------------------------------------- 1 | {"pio": {"version": { "min": "0.9.0" }}} 2 | --------------------------------------------------------------------------------
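
The per-node combination rule described in the `Model.scala` docstring can be read in isolation. The sketch below is not part of the template: the child sentiment values and the `baseWeight` of 0.1 are made-up inputs (in the engine the weight comes from `AlgorithmParams`), and it only restates the arithmetic that `getSentiment` applies at each non-leaf node.

```
// Standalone sketch of the per-node combination rule used by Model.getSentiment.
// The baseWeight value and the child sentiments below are illustrative
// assumptions, not values taken from the template's configuration.
object CombineSketch {
  // children: per-child sentiments on the [-2, 2] scale used by getWordSentiment
  def combine(children: Seq[Double], baseWeight: Double): Double = {
    // Each child is weighted by the magnitude of its own sentiment, so a
    // strongly polar child is not washed out by neutral siblings.
    val weights = children.map(s => math.abs(s) + baseWeight)
    val magnitude =
      weights.zip(children).map { case (w, s) => w * math.abs(s) }.sum /
        (weights.sum + 1e-10)
    // An odd number of negative children flips the node's sign.
    val sign = if (children.count(_ < -1e-10) % 2 == 1) -1.0 else 1.0
    sign * magnitude
  }

  def main(args: Array[String]): Unit = {
    println(combine(Seq(0.0, 1.5), 0.1))  // neutral + positive child -> positive (~1.41)
    println(combine(Seq(-1.0, 1.5), 0.1)) // one negative child -> sign flips (~-1.30)
  }
}
```

`Seq(0.0, 1.5)` yields a positive score because the neutral child carries almost no weight, while `Seq(-1.0, 1.5)` flips to negative because exactly one child is negative; at the root, `getSentiment` then shifts the result back to the 0–4 range by adding 2.0.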