├── .gitignore ├── README.md ├── build.sbt ├── data ├── import_eventserver.py ├── send_query.py └── train.tsv ├── engine.json ├── project ├── assembly.sbt └── pio-build.sbt ├── src └── main │ └── scala │ ├── Algorithm.scala │ ├── CoreNLP-Scala │ ├── Makefile │ ├── README.md │ └── src │ │ └── edu │ │ └── stanford │ │ └── nlp │ │ ├── Berkeley.scala │ │ ├── Classify.scala │ │ ├── Document.scala │ │ ├── Magic.scala │ │ ├── NLP.scala │ │ ├── NLPConfig.scala │ │ ├── Optimize.scala │ │ ├── Sentence.scala │ │ └── TokensRegex.scala │ ├── DataSource.scala │ ├── Engine.scala │ ├── Model.scala │ ├── Preparator.scala │ └── Serving.scala └── template.json /.gitignore: -------------------------------------------------------------------------------- 1 | manifest.json 2 | pio.log 3 | /pio.sbt 4 | target/ 5 | data/*.csv 6 | data/*.tsv 7 | data/*.zip 8 | data/gen_submission.py 9 | *~ 10 | *.swp 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis Template 2 | 3 | Given a sentence, the engine returns a score between 0 and 4 indicating the sentence's sentiment: 0 is very negative, 2 is neutral, and 4 is very positive. 4 | 5 | The engine uses the Stanford CoreNLP library and the Scala binding `gangeli/CoreNLP-Scala` for parsing. 6 | 7 | ## Versions 8 | 9 | ### v0.1.0 10 | 11 | - initial version 12 | 13 | ## Import sample data 14 | 15 | ``` 16 | $ python data/import_eventserver.py --access_key <your_access_key> --file data/train.tsv 17 | ``` 18 | 19 | The sample training data comes from https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews. It is a TSV file. Each line contains four fields: `PhraseId`, `SentenceId`, `Phrase` and `Sentiment`. 20 | 21 | For example, 22 | ``` 23 | 1 1 bad 1 24 | ``` 25 | 26 | ## Steps to build, train and deploy the engine 27 | 28 | ``` 29 | $ pio build && pio train && pio deploy 30 | ``` 31 | 32 | ## Query 33 | 34 | The query takes a `String` field `s`. The result contains a `Double` field `sentiment`. 35 | 36 | Sample queries: 37 | 38 | ``` 39 | $ curl -H "Content-Type: application/json" \ 40 | -d '{ 41 | "s" : "I am happy" 42 | }' \ 43 | http://localhost:8000/queries.json \ 44 | -w %{time_connect}:%{time_starttransfer}:%{time_total} 45 | 46 | {"sentiment":3.0714285712172384}0.005:0.027:0.027 47 | ``` 48 | 49 | ``` 50 | $ curl -H "Content-Type: application/json" \ 51 | -d '{ 52 | "s" : "This movie sucks!"
53 | }' \ 54 | http://localhost:8000/queries.json \ 55 | -w %{time_connect}:%{time_starttransfer}:%{time_total} 56 | 57 | {"sentiment":0.8000000001798788}0.005:0.031:0.031 58 | ``` 59 | 60 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | assemblySettings 4 | 5 | name := "template-scala-sentiment-analysis" 6 | 7 | organization := "com.whhone" 8 | 9 | excludeFilter in unmanagedSources := "Berkeley.scala" 10 | 11 | libraryDependencies ++= Seq( 12 | "io.prediction" %% "core" % pioVersion.value % "provided", 13 | "org.apache.spark" %% "spark-core" % "1.2.0" % "provided", 14 | "org.apache.spark" %% "spark-mllib" % "1.2.0" % "provided", 15 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4", 16 | "edu.stanford.nlp" % "stanford-corenlp" % "3.4" classifier "models", 17 | "edu.stanford.nlp" % "stanford-parser" % "3.4" 18 | ) 19 | 20 | -------------------------------------------------------------------------------- /data/import_eventserver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Import sample data for Sentiment Analysis Engine Template 3 | """ 4 | 5 | import predictionio 6 | import argparse 7 | 8 | def import_events(client, file): 9 | f = open(file, 'r') 10 | count = 0 11 | print "Importing data..." 12 | for line in f: 13 | data = line.rstrip('\r\n').split("\t") 14 | if True: 15 | client.create_event( 16 | event="train", 17 | entity_type="user", 18 | entity_id=data[0], 19 | properties= { 20 | "phrase" : str(data[2]), 21 | "sentiment" : float(data[3]) 22 | } 23 | ) 24 | count += 1 25 | if count % 100 == 0: 26 | print count 27 | 28 | f.close() 29 | print "%s events are imported." 
% count 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser( 33 | description="Import sample data for sentiment analysis engine") 34 | parser.add_argument('--access_key', default='invalid-access-key') 35 | parser.add_argument('--url', default="http://localhost:7070") 36 | parser.add_argument('--file', default="./data/train.tsv") 37 | 38 | args = parser.parse_args() 39 | print args 40 | 41 | client = predictionio.EventClient( 42 | access_key=args.access_key, 43 | url=args.url, 44 | threads=10, 45 | qsize=1000) 46 | import_events(client, args.file) 47 | -------------------------------------------------------------------------------- /data/send_query.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send sample query to prediction engine 3 | """ 4 | 5 | import predictionio 6 | client = predictionio.EngineClient(url="http://localhost:8000") 7 | 8 | def test(s): 9 | print s + ' : ' + str(client.send_query({"s": s})['sentiment']) 10 | 11 | test('sad') 12 | test('happy') 13 | test('oh') 14 | test('not') 15 | test('not sad') 16 | test('very sad') 17 | test('very happy') 18 | test('not very sad') 19 | -------------------------------------------------------------------------------- /engine.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "default", 3 | "description": "Default settings", 4 | "engineFactory": "org.template.sentimentanalysis.SentimentAnalysisEngine", 5 | "datasource": { 6 | "params" : { 7 | "appId": 2 8 | } 9 | }, 10 | "algorithms": [ 11 | { 12 | "name": "nlpparse", 13 | "params": { 14 | "baseWeight": 1 15 | } 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 2 | -------------------------------------------------------------------------------- /project/pio-build.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("io.prediction" % "pio-build" % "0.9.0") 2 | -------------------------------------------------------------------------------- /src/main/scala/Algorithm.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.P2LAlgorithm 4 | import io.prediction.controller.Params 5 | import io.prediction.data.storage.BiMap 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.SparkContext._ 9 | import org.apache.spark.rdd.RDD 10 | 11 | import edu.stanford.nlp.Magic._ 12 | 13 | import grizzled.slf4j.Logger 14 | 15 | case class AlgorithmParams( 16 | val baseWeight: Double 17 | )extends Params 18 | 19 | class Algorithm(val ap: AlgorithmParams) 20 | extends P2LAlgorithm[PreparedData, Model, Query, PredictedResult] { 21 | 22 | @transient lazy val logger = Logger[this.type] 23 | 24 | def train(sc: SparkContext, data: PreparedData): Model = { 25 | require( 26 | !data.sentiments.take(1).isEmpty, 27 | s"RDD[sentiments] in PreparedData cannot be empty." 
+ 28 | " Please check if DataSource generates TrainingData" + 29 | " and Preprator generates PreparedData correctly.") 30 | 31 | val itemSets: RDD[(String, Double)] = data.sentiments.map( 32 | s => (s.phrase.toLowerCase(), s.sentiment) 33 | ).cache() 34 | 35 | // assume the last training data is the most up-to-date 36 | val rules = itemSets.groupByKey 37 | .mapValues(iter => iter.toVector.last) 38 | .collectAsMap.toMap 39 | 40 | new Model(rules) 41 | } 42 | 43 | def predict(model: Model, query: Query): PredictedResult = { 44 | new PredictedResult(model.getSentiment(query.s, ap)) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # To Build: 3 | # 1. Set CORENLP_HOME to the root of CoreNLP 4 | # 2. [optional] Set BERKELEY to the path to the Berkeley parser 5 | # 3. Build using either 'make stanford' or 'make berkeley' (if the Berkeley parser is configured) 6 | # 7 | 8 | CORENLP=$(CORENLP_HOME)/classes:$(CORENLP_HOME)/lib/joda-time.jar:$(CORENLP_HOME)/lib/jollyday-0.4.7.jar 9 | BERKELEY=$(CORENLP_HOME)/../more/lib/BerkeleyParser.jar 10 | 11 | JAVAC=javac 12 | SCALAC=scalac 13 | 14 | SRC=src 15 | SOURCES = $(wildcard src/edu/stanford/nlp/*.scala) 16 | TEST_SRC=test/src 17 | LIB=lib 18 | BUILD=classes 19 | TEST_BUILD=test/classes 20 | DIST=dist 21 | 22 | dist: stanford 23 | mkdir -p ${DIST} 24 | jar cf ${DIST}/corenlp-scala.jar -C $(BUILD) . 25 | jar uf ${DIST}/corenlp-scala.jar -C $(SRC) . 26 | 27 | berkeley: stanford 28 | $(SCALAC) -cp $(CORENLP):${BERKELEY} -d $(BUILD) `find $(SRC) -name "*.scala"` 29 | 30 | stanford: ${SOURCES} 31 | mkdir -p $(BUILD) 32 | sed -e 's/BerkeleyUtil.berkeleyParser/throw new IllegalStateException("Could not find parser model (and was not compiled to run with Berkeley parser)")/g' ${SRC}/edu/stanford/nlp/NLP.scala > /tmp/NLP_stanfordonly.scala 33 | $(SCALAC) -cp $(CORENLP) -d $(BUILD) `find $(SRC) -name "*.scala" ! -name "*Berkeley.scala" ! -name "NLP.scala"` /tmp/NLP_stanfordonly.scala 34 | rm /tmp/NLP_stanfordonly.scala 35 | 36 | default: stanford 37 | 38 | clean: 39 | rm -r $(BUILD) 40 | rm -r ${DIST} 41 | 42 | 43 | cmd: 44 | @echo "scala -J-Xmx4G -cp $(CORENLP):$(BUILD)":${HOME}/lib/corenlp-models.jar 45 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/README.md: -------------------------------------------------------------------------------- 1 | Since gangeli/CoreNLP-Scala does not provide a way to install by the build.sbt file, 2 | copy it from https://github.com/gangeli/CoreNLP-Scala. 
3 | 4 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.concurrent.Lock 5 | 6 | import edu.stanford.nlp.trees.Tree 7 | import edu.stanford.nlp.trees.Trees 8 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 9 | import edu.stanford.nlp.ling.HasWord 10 | import edu.stanford.nlp.ling.Word 11 | 12 | import edu.berkeley.nlp.PCFGLA._ 13 | import edu.berkeley.nlp.util.Numberer 14 | 15 | import NLPConfig._ 16 | 17 | object BerkeleyUtil { 18 | type BerkeleyTree = edu.berkeley.nlp.syntax.Tree[String] 19 | 20 | implicit def stanfordTree2BerkeleyTree(btree:BerkeleyTree):Tree = { 21 | val roots = TreeAnnotations.unAnnotateTree(btree).getChildren; 22 | if (roots.isEmpty) { 23 | new LabeledScoredTreeNode(); 24 | } else { 25 | def convert(src:BerkeleyTree):Tree = { 26 | val dst:Tree = new LabeledScoredTreeNode 27 | if (src.getLabel != null) dst.setLabel(new Word(src.getLabel)) 28 | dst.setChildren(src.getChildren.map( convert(_) ).toArray) 29 | dst 30 | } 31 | new LabeledScoredTreeNode(new Word("TOP"), 32 | List[Tree](convert(roots.get(0)))) 33 | } 34 | } 35 | 36 | lazy val berkeleyParser = { 37 | // (function to create parser) 38 | def mkParser = { 39 | // (setup parser) 40 | val pData = ParserData.Load(parse.model) 41 | if (pData == null) throw new RuntimeException("Failed to load Berkeley parser model") 42 | val grammar = pData.getGrammar(); 43 | val lexicon = pData.getLexicon(); 44 | Numberer.setNumberers(pData.getNumbs()); 45 | // (create parser object) 46 | val parser = new CoarseToFineMaxRuleParser( 47 | grammar, lexicon, 1.0, -1, false, false, false, 48 | false, false, true, true) 49 | // (set binarization) 50 | try { 51 | val binarizationField = classOf[ConstrainedArrayParser].getDeclaredField("binarization"); 52 | binarizationField.setAccessible(true); 53 | binarizationField.set(parser, pData.getBinarization()); 54 | binarizationField.setAccessible(false); 55 | } catch { case (e:Exception) => throw new RuntimeException(e) } 56 | // (parser object) 57 | new { 58 | def parse(words:List[String], pos:List[String]):Tree = { 59 | var parsedTree:BerkeleyTree 60 | = parser.getBestConstrainedParse(words, pos, null); 61 | if (parsedTree.getChildren().isEmpty()) { 62 | parsedTree = parser.getBestConstrainedParse(words, null, null); 63 | } 64 | parsedTree 65 | } 66 | } 67 | } 68 | // (create parsers) 69 | val parsers = (0 until numThreads).map{ x => (mkParser, new Lock) }.toList 70 | // (multithreaded implementation) 71 | new { 72 | def parse(words:List[String], pos:List[String]):Tree = { 73 | def tryParse:Tree = { 74 | val validParser = parsers.indexWhere{ 75 | (pair:({def parse(words:List[String],pos:List[String]):Tree},Lock)) => 76 | pair._2.available 77 | } 78 | if (validParser >= 0) { // case: [likely] found parser to run 79 | val (parser, lock) = parsers(validParser) 80 | lock.acquire 81 | val rtn = parser.parse(words, pos) 82 | lock.release 83 | rtn 84 | } else { Thread.sleep(1000); tryParse } // case: no parser found 85 | } 86 | tryParse 87 | } 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala: -------------------------------------------------------------------------------- 1 | package 
edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | import NLP._ 44 | 45 | // ---------- 46 | // Classifiers 47 | // ---------- 48 | @SerialVersionUID(1l) 49 | class Classifier[I,O]( 50 | regression:I=>Map[O,Double], 51 | val data:Map[I,(O,Float)]) extends Function1[I,O] with Serializable { 52 | override def apply(in:I):O = { 53 | regression(in).maxBy(_._2)._1 54 | } 55 | } 56 | 57 | class Mapping[I,O](map:Map[I,(O,Float)]) { 58 | import Mapping.{toCounter,defaultFeatures} 59 | 60 | def scorer[F](featurizer:I=>Iterable[F]):I=>Map[O,Double] = { 61 | // -- Create Dataset 62 | val weights = new Array[Float](map.size) 63 | val dataset = new RVFDataset[O, F](map.size) 64 | map.zipWithIndex.foreach{ 65 | case ((input:I, (output:O, weight:Float)),i:Int) => 66 | weights(i) = weight 67 | dataset.add( new RVFDatum[O, F](toCounter(featurizer(input)), output) ) 68 | } 69 | // -- Train 70 | val prior = new LogPrior(LogPrior.LogPriorType.QUADRATIC) 71 | val factory = new LinearClassifierFactory[O,F]() 72 | val classifier = factory.trainClassifier(dataset, weights, prior) 73 | // -- Return 74 | (input:I) => { 75 | val scores = classifier.scoresOf( 76 | new RVFDatum[O, F](toCounter(featurizer(input)), null.asInstanceOf[O])) 77 | scores.keySet.map{ x => (x, scores.getCount(x)) }.toMap 78 | } 79 | } 80 | def scorer:I=>Map[O,Double] = scorer(defaultFeatures(_, map.size)) 81 | 82 | def classifier[F](featurizer:I=>Iterable[F]):Classifier[I,O] 83 | = new Classifier(scorer(featurizer), map) 84 | def classifier:Classifier[I,O] 85 | = classifier(defaultFeatures(_, map.size)) 86 | } 87 | 88 | object Mapping { 89 | def toCounter[X,F](map:Iterable[X]):Counter[F] = { 90 | val counts = new ClassicCounter[F] 91 | map.foreach{ (x:X) => x match { 92 | case (feat:F, n:Number) => counts.incrementCount(feat, 
n.doubleValue) 93 | case (feat:F) => counts.incrementCount(feat, 1) 94 | case _ => throw new IllegalStateException("Type mismatch in toCounter") 95 | } } 96 | return counts 97 | } 98 | 99 | def apply[I,O,X](map:Map[I,X]):Mapping[I,O] = { 100 | new Mapping(map.map{ case (i:I, x:X) => x match { 101 | case (o:O, n:Number) => (i, (o, n.floatValue)) 102 | case (o:O) => (i, (o, 1.0.asInstanceOf[Float])) 103 | case _ => throw new IllegalStateException("Type mismatch in toCounter") 104 | } }) 105 | } 106 | 107 | def defaultFeatures[I](input:I, datasetSize:Int):Iterable[(String,Float)] = { 108 | def ngram[A](seq:List[A], n:Int, tail:List[A] = Nil):List[String] = { 109 | if (seq.isEmpty) Nil 110 | else (seq.head :: tail.slice(0, n-1)).reverse.mkString("_") :: ngram(seq.tail, n, seq.head :: tail) 111 | } 112 | input match { 113 | case (sent:Sentence) => 114 | val n:Int = (scala.math.log10(datasetSize) / 3.0).toInt + 1 115 | // N-grams 116 | (ngram(sent.words.toList, n) ::: 117 | ngram(sent.words.toList.map( _.toLowerCase ), n) ::: 118 | ngram(sent.lemma.toList, n) ::: 119 | ngram(sent.ner.toList, n) ::: 120 | ngram(sent.pos.toList, n) ::: 121 | // Bag-of-words 122 | { if (n > 1) 123 | sent.words.toList ::: 124 | sent.words.toList.map( _.toLowerCase ) ::: 125 | sent.lemma.toList ::: 126 | sent.ner.toList ::: 127 | sent.pos.toList 128 | else Nil } 129 | ).map{ (_, 1.0.toFloat) } 130 | case (str:String) => 131 | val tokens = str.split(" ") 132 | val n:Int = (scala.math.log10(datasetSize) / 3.0).toInt + 1 133 | if (tokens.length <= 1) { 134 | // Case: a single word 135 | (tokens(0) :: // memorize 136 | ngram(str.toCharArray.toList, n) ::: // literal n-grams 137 | ngram(str.toLowerCase.toCharArray.toList, n) // case-insensitive n-grams 138 | ).map{ (_, 1.0.toFloat) } 139 | } else { 140 | // Case: a phrase 141 | (ngram(tokens.toList, n) ::: // literal n-grams 142 | ngram(tokens.toList.map( _.toLowerCase), n) // case-insensitive n-grams 143 | ).map{ (_, 1.0.toFloat) } 144 | } 145 | case (seq:Iterable[Any]) => 146 | seq.map{ (x:Any) => x match { 147 | case (feat:Any, n:Number) => (feat.toString, n.floatValue) 148 | case (feat:Any) => (feat.toString, 1.0.toFloat) 149 | case _ => (x.toString, 1.0.toFloat) 150 | } } 151 | case _ => List[(String,Float)]( (input.toString, 1.0.toFloat) ) 152 | } 153 | } 154 | } 155 | 156 | // ---------- 157 | // Ensemble Classifiers 158 | // ---------- 159 | 160 | class Ensemble[I](members:Seq[I=>Boolean], dat:Option[Map[I,(Boolean,Float)]]) { 161 | // -- Get Data 162 | if (!dat.isDefined) { 163 | members.foldLeft(Option[Map[I,(Boolean,Float)]](null)){ 164 | (dat:Option[Map[I,(Boolean,Float)]], fn:I=>Boolean) => 165 | fn match { 166 | case (classifier:Classifier[I,Boolean]) => 167 | dat match { 168 | case Some(existingData) => 169 | if (classifier.data != existingData) { 170 | warn("Classifiers trained on different data; taking union") 171 | Some(classifier.data ++ existingData) 172 | } else { 173 | Some(existingData) 174 | } 175 | case None => Some(classifier.data) 176 | } 177 | case _ => dat 178 | } 179 | } 180 | } 181 | 182 | // -- Methods 183 | def data(d:Map[I,(Boolean,Float)]):Ensemble[I] = new Ensemble(members, Some(d)) 184 | def data(d:Seq[(I,Boolean)]):Ensemble[I] 185 | = data( d.map( x => (x._1, (x._2, 1.0f)) ).toMap ) 186 | 187 | /** 188 | * Implementation of AdaBoost. 
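 * In the implementation below, each round selects the weak learner whose weighted error e_t is farthest from 0.5, assigns it the vote a_t = 0.5 * ln((1 - e_t) / e_t), rescales each example weight by exp(-a_t) when that learner is correct and by exp(+a_t) when it is wrong, renormalizes the weights, and stops once |0.5 - e_t| falls below the tolerance or the iteration limit is reached.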
189 | * Taken from http://en.wikipedia.org/wiki/AdaBoost 190 | */ 191 | def boost(data:Map[I,(Boolean,Float)]):Classifier[I,Boolean] = { 192 | if (data.isEmpty) throw new IllegalArgumentException("No data to train on!") 193 | // -- Cache 194 | startTrack("Running Weak Learners") 195 | val dataAsArray = data.toArray 196 | val gold = dataAsArray.map( _._2._1 ) 197 | val predictions:Array[(I=>Boolean,Array[(Boolean, Float)])] 198 | = members.toList.par.map{ (h:I=>Boolean) => 199 | log("running " + h.toString) 200 | (h, dataAsArray.map{ case (in:I, (out:Boolean, weight:Float)) => 201 | (h(in), weight) 202 | }) 203 | }.toArray 204 | endTrack("Running Weak Learners") 205 | // -- Error Rate 206 | def error(predictions:Array[(Boolean,Float)], 207 | gold:Array[Boolean], 208 | d:Array[Double] = (0 until data.size).map( x => 1.0 / data.size ).toArray 209 | ):Double = { 210 | predictions.zip(gold).zip(d).foldLeft(0.0){ 211 | case (sum:Double, 212 | (( (guess:Boolean, weight:Float), 213 | gold:Boolean), 214 | di:Double)) => 215 | if(guess == gold) sum else sum + di * weight 216 | } 217 | } 218 | def regressor(coefficients:Seq[(Double, I=>Boolean)] 219 | ):(I => Map[Boolean, Double]) = (in:I) => { 220 | val sum = coefficients.foldLeft(0.0){ 221 | case (sum:Double, (alpha:Double, h:(I=>Boolean))) => 222 | sum + alpha * { if(h(in)) 1.0 else -1.0 } 223 | } 224 | Map[Boolean, Double]( true -> {if(sum >= 0.0) 1.0 else 0.0 }, 225 | false -> {if(sum >= 0.0) 0.0 else 1.0 } ) 226 | } 227 | // -- Run an Iteration 228 | def iter(t:Int, 229 | predictions:Array[(I=>Boolean, Array[(Boolean,Float)])], 230 | gold:Array[Boolean], 231 | soFar:List[(Double, I=>Boolean)], 232 | d:Array[Double] = data.map( x => 1.0 / data.size.toDouble ).toArray, 233 | tolerance:Double = NLPConfig.classify.tolerance 234 | ):List[(Double, I=>Boolean)] = { 235 | startTrack("Iteration " + t) 236 | // (get errors) 237 | val errors = predictions.map{ case (h, pred:Array[(Boolean,Float)]) => 238 | ( h, pred, error(pred, gold, d) ) 239 | } 240 | val (hOpt, predOpt, et) = errors.maxBy( x => scala.math.abs(0.5 - x._3) ) 241 | // (compute update) 242 | log("optimal classifier: " + hOpt) 243 | log("e_t: " + et) 244 | val at = 0.5 * scala.math.log( (1.0 - et) / et ) 245 | val newD = predOpt.zip(gold).zip(d).map{ 246 | case (((guess:Boolean, weight:Float), gold:Boolean), di:Double) => 247 | di * scala.math.exp(- {if (guess == gold) 1.0 else -1.0} * at) 248 | } 249 | val sumD = newD.sum 250 | for (i <- 0 until newD.length) { newD(i) /= sumD } 251 | // (update coefficients) 252 | val coeffs = (at, hOpt) :: soFar 253 | log("a_t: " + at) 254 | endTrack("Iteration " + t) 255 | // (recurse) 256 | if ( scala.math.abs(0.5 - et) < tolerance || 257 | t >= NLPConfig.classify.iterations) { 258 | coeffs 259 | } else { 260 | iter(t+1, predictions, gold, coeffs, newD, tolerance) 261 | } 262 | } 263 | // -- Construct Classifier 264 | startTrack("Boosting over " + members.length + " classifier and " + data.size + " examples") 265 | val fn = regressor(iter(1, predictions, gold, Nil)) 266 | endTrack("Boosting over " + members.length + " classifier and " + data.size + " examples") 267 | new Classifier(fn, data) 268 | } 269 | 270 | def boost:Classifier[I,Boolean] 271 | = boost(dat.getOrElse(Map[I,(Boolean,Float)]())) 272 | } 273 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala: -------------------------------------------------------------------------------- 1 | package 
edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | import NLP._ 44 | 45 | 46 | object Document { 47 | } 48 | 49 | 50 | @SerialVersionUID(1l) 51 | case class Document(sentences:Array[String]) { 52 | // TODO(gabor) coreference 53 | } 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 
33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | 44 | 45 | object Magic { 46 | import NLP._ 47 | 48 | /* 49 | * Implicit Conversions 50 | */ 51 | implicit def seq2nlpseq(seq:Seq[String]):Sentence = new Sentence(seq) 52 | implicit def string2nlpseq(gloss:String):Sentence = new Sentence(gloss) 53 | 54 | implicit def map2mapping[I,O,X](map:Map[I,X]):Mapping[I,O] = Mapping(map) 55 | 56 | implicit def seq2ensemble[I](seq:Seq[I=>Boolean]):Ensemble[I] = new Ensemble(seq, None) 57 | 58 | implicit def fn2optimizable( 59 | fn:Array[Double]=>Double):OptimizableFunction = { 60 | optimize.algorithm.toLowerCase match { 61 | case "lbfgs" => LBFGSOptimizableApproximateFunction(fn, None) 62 | case "braindead" => BraindeadGradientDescent(fn, None) 63 | case _ => throw new IllegalStateException("Unknown algorithm: " + optimize.algorithm) 64 | } 65 | } 66 | implicit def fnPair2optimizable( 67 | pair:(Array[Double]=>Double,Array[Double]=>Array[Double])):OptimizableFunction = { 68 | optimize.algorithm.toLowerCase match { 69 | case "lbfgs" => LBFGSOptimizableApproximateFunction(pair._1, Some(pair._2)) 70 | case "braindead" => BraindeadGradientDescent(pair._1, Some(pair._2)) 71 | case _ => throw new IllegalStateException("Unknown algorithm: " + optimize.algorithm) 72 | } 73 | } 74 | 75 | implicit def string2tokensregex(str:String):TokensRegex = new TokensRegex(str) 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.ling.CoreAnnotations._ 15 | import edu.stanford.nlp.classify.LinearClassifierFactory 16 | import edu.stanford.nlp.classify.LogPrior 17 | import edu.stanford.nlp.classify.RVFDataset 18 | import edu.stanford.nlp.ie.NERClassifierCombiner 19 | import edu.stanford.nlp.ie.crf.CRFBiasedClassifier 20 | import edu.stanford.nlp.io.IOUtils 21 | import edu.stanford.nlp.ling.HasWord 22 | import edu.stanford.nlp.ling.RVFDatum 23 | import edu.stanford.nlp.ling.Word 24 | import edu.stanford.nlp.ling.CoreLabel 25 | import edu.stanford.nlp.optimization.DiffFunction 26 | import edu.stanford.nlp.optimization.QNMinimizer 27 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 28 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 29 | import edu.stanford.nlp.process.Morphology 30 | import edu.stanford.nlp.process.PTBTokenizer 31 | import edu.stanford.nlp.stats.ClassicCounter 32 | import edu.stanford.nlp.stats.Counter 33 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 34 | import edu.stanford.nlp.trees.CollinsHeadFinder 35 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 36 | import 
edu.stanford.nlp.trees.Tree 37 | import edu.stanford.nlp.trees.Trees 38 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 39 | import edu.stanford.nlp.trees.GrammaticalStructure 40 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 41 | import edu.stanford.nlp.trees.TypedDependency 42 | import edu.stanford.nlp.util.logging.Redwood.Util._ 43 | 44 | import NLPConfig._ 45 | 46 | object NLP { 47 | implicit def list2hasWordList(lst:Seq[String]):java.util.List[_<:HasWord] 48 | = lst.map( new Word(_) ).toList 49 | 50 | // ---------- 51 | // Parsers 52 | // ---------- 53 | lazy val stanfordParser = { 54 | val parser = LexicalizedParser.loadModel(parse.model) 55 | new { 56 | def parse(words:List[String], pos:List[String]):Tree = { 57 | parser.parseStrings(words); 58 | } 59 | } 60 | } 61 | lazy val parser = stanfordParser 62 | // ---------- 63 | // Stanford CoreNLP Components 64 | // ---------- 65 | lazy val tagger = new MaxentTagger(pos.model) 66 | 67 | lazy val collinsHeadFinder = new CollinsHeadFinder() 68 | 69 | lazy val morph:((Morphology=>Any)=>Any) = { 70 | val morph = new Morphology() 71 | val morphLock = new Lock() 72 | val f = { (fn:Morphology=>Any) => 73 | morphLock.acquire; 74 | val rtn = fn(morph); 75 | morphLock.release 76 | rtn 77 | } 78 | f 79 | } 80 | 81 | lazy val nerCRF:(Array[String], Array[String])=>Array[String] = { 82 | val classifier = new NERClassifierCombiner(ner.model, ner.aux); 83 | (words:Array[String], pos:Array[String]) => { 84 | val offsets:List[Int] = words.foldLeft( (List[Int](), 0) ){ 85 | case ((offsetsSoFar:List[Int], offset:Int), word:String) => 86 | (offset :: offsetsSoFar, offset + word.length + 1) 87 | }._1.reverse 88 | // (construct CoreLabel sentence) 89 | val coreSentence = new java.util.ArrayList[CoreLabel](words.length) 90 | words.zip(pos).zip(offsets)foreach{ 91 | case ((word:String, pos:String), offset:Int) => 92 | val label = new CoreLabel 93 | label.setWord(word) 94 | label.setOriginalText(word) 95 | label.setTag(pos) 96 | label.setBeginPosition(offset) 97 | label.setEndPosition(offset + word.length) 98 | coreSentence.add(label) 99 | } 100 | // (classify) 101 | classifier.classifySentence(coreSentence) 102 | val output:java.util.List[CoreLabel] = classifier.classifySentence(coreSentence); 103 | // (convert back) 104 | output.map{ (label:CoreLabel) => 105 | label.ner() 106 | }.toArray 107 | } 108 | } 109 | 110 | /** 111 | * The TrueCase classifier implementation. 112 | * Takes as input an array of tokens, POS tags, and lemmas, 113 | * and returns as output the tokens with their true case applied. 114 | * The length of the tokens, POS tags, and lemmas must match. 115 | * @return An array of tokens (words as Strings) of the same length 116 | * as the input tokens, but with their inferred true case. 
117 | */ 118 | lazy val trueCaser:(Array[String], Array[String], Array[String])=>Array[String] = { 119 | // Create classifier 120 | val props:Properties = { 121 | val p = new Properties 122 | p.setProperty("loadClassifier", NLPConfig.truecase.model) 123 | p.setProperty("mixedCaseMapFile", NLPConfig.truecase.disambiguation_list) 124 | p.setProperty("classBias", NLPConfig.truecase.bias) 125 | p 126 | } 127 | val classifier = new CRFBiasedClassifier[CoreLabel](props); 128 | classifier.loadClassifierNoExceptions(NLPConfig.truecase.model, props); 129 | // Set classifier biases 130 | NLPConfig.truecase.bias.split(",").foreach{ (bias:String) => 131 | val terms = bias.split(":") 132 | classifier.setBiasWeight(terms(0), terms(1).toDouble) 133 | } 134 | // Get mixed case map 135 | val mixedCaseMap:Map[String,String] 136 | = scala.io.Source.fromInputStream(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(NLPConfig.truecase.disambiguation_list)) 137 | .getLines 138 | .map( _.trim.split("""\s+""") ) 139 | .map{ case Array(a:String, b:String) => (a ,b) } 140 | .toMap 141 | // Return function 142 | (words:Array[String], pos:Array[String], lemma:Array[String]) => { 143 | // (mock offsets) 144 | val offsets:List[Int] = words.foldLeft( (List[Int](), 0) ){ 145 | case ((offsetsSoFar:List[Int], offset:Int), word:String) => 146 | (offset :: offsetsSoFar, offset + word.length + 1) 147 | }._1.reverse 148 | // (construct CoreLabel sentence) 149 | val coreSentence = new java.util.ArrayList[CoreLabel](words.length) 150 | words.zip(pos).zip(offsets)foreach{ 151 | case ((word:String, pos:String), offset:Int) => 152 | val label = new CoreLabel 153 | label.setWord(word.toLowerCase) 154 | label.setOriginalText(word) 155 | label.setTag(pos) 156 | label.setBeginPosition(offset) 157 | label.setEndPosition(offset + word.length) 158 | coreSentence.add(label) 159 | } 160 | // (classify) 161 | val output:java.util.List[CoreLabel] = classifier.classifySentence(coreSentence); 162 | // (convert back) 163 | output.map{ (label:CoreLabel) => 164 | val word:String = label.word 165 | label.get(classOf[AnswerAnnotation]) match { 166 | case "UPPER" => word.toUpperCase 167 | case "LOWER" => word.toLowerCase 168 | case "INIT_UPPER" => word.substring(0, 1).toUpperCase + word.substring(1).toLowerCase 169 | case "O" => mixedCaseMap.get(word).getOrElse(word) 170 | case _ => word 171 | } 172 | }.toArray 173 | } 174 | } 175 | 176 | // ---------- 177 | // Methods 178 | // ---------- 179 | def preload(obj: => Any) { new Thread(){ override def run:Unit = obj }.start } 180 | } 181 | 182 | trait CoreLabelSeq extends Seq[CoreLabel] { 183 | // 184 | // Trivial overrides (still have to define apply(Int):CoreLabel and length:Int though) 185 | // 186 | override def iterator:Iterator[CoreLabel] = new Iterator[CoreLabel] { 187 | var index:Int = 0 188 | override def hasNext:Boolean = index < CoreLabelSeq.this.length 189 | override def next:CoreLabel = { index += 1; apply(index - 1); } 190 | } 191 | 192 | // 193 | // Common Methods 194 | // 195 | def matches(t:TokensRegex) = t.matches(this) 196 | } 197 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp 2 | 3 | import edu.stanford.nlp.pipeline.DefaultPaths._ 4 | 5 | object NLPConfig { 6 | object parse { 7 | var model:String = DEFAULT_PARSER_MODEL 8 | } 9 | 10 | object pos { 11 | var model:String = 
DEFAULT_POS_MODEL 12 | } 13 | 14 | object ner { 15 | var model:String = DEFAULT_NER_CONLL_MODEL 16 | var aux:String = DEFAULT_NER_MUC_MODEL 17 | } 18 | 19 | object classify { 20 | var tolerance:Double = 1e-5 21 | var iterations:Double = 40 22 | } 23 | 24 | object optimize { 25 | var tolerance:Double = 1e-5 26 | var wiggle:Double = 1e-5 27 | var algorithm = "LBFGS" // | braindead | ... 28 | } 29 | 30 | object truecase { 31 | var model:String = "edu/stanford/nlp/models/truecase/truecasing.fast.caseless.qn.ser.gz" 32 | var disambiguation_list:String = "edu/stanford/nlp/models/truecase/MixDisambiguation.list" 33 | var bias:String = "INIT_UPPER:-0.7,UPPER:-0.7,O:0" 34 | } 35 | 36 | def caseless:Unit = { 37 | parse.model = "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz" 38 | pos.model = "edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger" 39 | ner.model = "edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz" 40 | ner.aux = "edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz" 41 | } 42 | 43 | var numThreads = Runtime.getRuntime().availableProcessors(); 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | import NLP._ 44 | import Optimize._ 45 | 46 | // ---------- 47 | // Optimizers 48 | // ---------- 49 | object Optimize { 50 | def empiricalDerivative(fn:Array[Double]=>Double, 51 | x:Array[Double]):Array[Double] = { 52 | val y0 = fn(x) 53 | def tweak(i:Int, delta:Double):(Double, Double) = { 54 | x(i) += delta 55 | val y1 = fn(x) 56 | x(i) -= delta 57 | if 
(delta < 1e-5 * optimize.wiggle || delta > 1e5 * optimize.wiggle) { 58 | (y1, delta) 59 | } else { 60 | if (scala.math.abs(y1 - y0) / delta > 1e5) tweak(i, delta / 2.0) 61 | else if (scala.math.abs(y1 - y0) / delta < 1e-5) tweak(i, delta * 2.0) 62 | else (y1, delta) 63 | } 64 | } 65 | {for (i <- 0 until x.length) yield { 66 | val (y1, step) = tweak(i, optimize.wiggle) 67 | (y1 - y0) / step 68 | }}.toArray 69 | } 70 | } 71 | 72 | trait OptimizableFunction { 73 | def minimize(initial:Array[Double]):Array[Double] 74 | def derivative(ddx:Array[Double]=>Array[Double]):OptimizableFunction 75 | } 76 | 77 | /** 78 | * A wrapper for QNMinimizer (L-BFGS) 79 | */ 80 | case class LBFGSOptimizableApproximateFunction( 81 | fn:Array[Double]=>Double, derivative:Option[Array[Double]=>Array[Double]]) 82 | extends OptimizableFunction{ 83 | 84 | override def minimize(initial:Array[Double]):Array[Double] = { 85 | // (define a differentiable function) 86 | val javaFn:DiffFunction = new DiffFunction { 87 | override def domainDimension:Int = initial.length 88 | override def valueAt(x:Array[Double]):Double = fn(x) 89 | override def derivativeAt(x:Array[Double]):Array[Double] = { 90 | derivative match { 91 | case Some(ddx) => ddx(x) 92 | case None => empiricalDerivative(fn, x) 93 | } 94 | } 95 | } 96 | // (optimize using QNMinimizer) 97 | val javaInit = initial.map{ (n:Double) => n } 98 | val optimizer = new QNMinimizer() 99 | optimizer.setRobustOptions() 100 | optimizer.minimize(javaFn, optimize.tolerance, javaInit) 101 | } 102 | 103 | override def derivative(ddx:Array[Double]=>Array[Double]):LBFGSOptimizableApproximateFunction 104 | = new LBFGSOptimizableApproximateFunction(fn, Some(ddx)) 105 | } 106 | 107 | /** 108 | * An optimization algorithm I made up (thus, "braindead"), that tries its 109 | * best to move against the gradient (thus, "gradient descent"). 110 | * The only motivation to use this over L-BFGS is that it's more robust to 111 | * non-convex problems (i.e., won't crash and burn). 
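 * Concretely, minimize below starts from the supplied point, repeatedly moves against the gradient (the analytic derivative if one was given, otherwise an empirical estimate), halves the step size from 1.0 until the objective improves by more than the tolerance, treats a step below 1e-5 as convergence, and gives up after 100 iterations.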
112 | */ 113 | case class BraindeadGradientDescent( 114 | fn:Array[Double]=>Double, derivative:Option[Array[Double]=>Array[Double]]) 115 | extends OptimizableFunction{ 116 | 117 | override def minimize(initial:Array[Double]):Array[Double] = { 118 | // (helpers) 119 | def dx(x:Array[Double], y0:Double):Array[Double] = derivative match { 120 | case Some(ddx) => ddx(x) 121 | case None => empiricalDerivative(fn, x) 122 | } 123 | def move(init:Array[Double], direction:Array[Double], scaling:Double):Array[Double] = { 124 | init.zip(direction).map{ case (a:Double, d:Double) => a + scaling * d} 125 | } 126 | def isImprovementOver(newY:Double, y:Double):Boolean 127 | = newY + optimize.tolerance < y 128 | // (state) 129 | val initialX:Array[Double] = initial 130 | val initialY:Double = fn(initialX) 131 | var x:Array[Double] = initialX 132 | var y:Double = initialY 133 | var numIters = 0 134 | // (optimization) 135 | while (numIters < 100) { 136 | var step:Double = 1.0 137 | val dir:Array[Double] = dx(x, y).map( - _ ) 138 | var newX:Array[Double] = move(x, dir, step) 139 | var newY:Double = fn(newX) 140 | while (!isImprovementOver(newY, y) && step > 1e-5) { 141 | step /= 2.0 142 | newX = move(x, dir, step) 143 | newY = fn(newX) 144 | } 145 | if (step <= 1e-5) return x // convergence 146 | assert(newY < y, "Function value did not decrease!") 147 | x = newX 148 | y = newY 149 | numIters += 1 150 | } 151 | // (timeout -- no convergence) 152 | return x 153 | } 154 | 155 | override def derivative(ddx:Array[Double]=>Array[Double]):BraindeadGradientDescent 156 | = new BraindeadGradientDescent(fn, Some(ddx)) 157 | } 158 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | import scala.collection.MapLike 5 | import scala.collection.Map 6 | import scala.collection.generic.CanBuildFrom 7 | import scala.concurrent.Lock 8 | 9 | import java.io.ObjectInputStream 10 | import java.lang.ref.SoftReference 11 | import java.lang.ref.ReferenceQueue 12 | import java.util.Properties 13 | 14 | import edu.stanford.nlp.classify.LinearClassifierFactory 15 | import edu.stanford.nlp.classify.LogPrior 16 | import edu.stanford.nlp.classify.RVFDataset 17 | import edu.stanford.nlp.ie.NERClassifierCombiner 18 | import edu.stanford.nlp.io.IOUtils 19 | import edu.stanford.nlp.ling.HasWord 20 | import edu.stanford.nlp.ling.RVFDatum 21 | import edu.stanford.nlp.ling.Word 22 | import edu.stanford.nlp.ling.CoreLabel 23 | import edu.stanford.nlp.optimization.DiffFunction 24 | import edu.stanford.nlp.optimization.QNMinimizer 25 | import edu.stanford.nlp.optimization.SGDToQNMinimizer 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 27 | import edu.stanford.nlp.process.Morphology 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | import edu.stanford.nlp.stats.ClassicCounter 30 | import edu.stanford.nlp.stats.Counter 31 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 32 | import edu.stanford.nlp.trees.CollinsHeadFinder 33 | import edu.stanford.nlp.trees.LabeledScoredTreeNode 34 | import edu.stanford.nlp.trees.Tree 35 | import edu.stanford.nlp.trees.Trees 36 | import edu.stanford.nlp.trees.GrammaticalStructureFactory 37 | import edu.stanford.nlp.trees.GrammaticalStructure 38 | import edu.stanford.nlp.trees.PennTreebankLanguagePack 39 | import 
edu.stanford.nlp.trees.TypedDependency 40 | import edu.stanford.nlp.util.logging.Redwood.Util._ 41 | 42 | import NLPConfig._ 43 | import NLP._ 44 | 45 | object Sentence { 46 | val tokenizerFactory = PTBTokenizer.factory 47 | val grammaticalStructureFactory 48 | = new PennTreebankLanguagePack().grammaticalStructureFactory 49 | 50 | def apply(word:Seq[String]):Sentence = new Sentence(word.toArray) 51 | def apply(gloss:String):Sentence = new Sentence(gloss) 52 | } 53 | 54 | 55 | @SerialVersionUID(2l) 56 | case class Sentence(word:Array[String]) extends CoreLabelSeq { 57 | 58 | def this(word:Seq[String]) = this(word.toArray) 59 | 60 | def this(sentence:String) = this( 61 | Sentence.tokenizerFactory.getTokenizer(new java.io.StringReader(sentence)) 62 | .tokenize 63 | .map( _.word ) 64 | .toArray 65 | ) 66 | 67 | // 68 | // Necessary Overrides for Seq[CoreLabel] 69 | // 70 | override def length:Int = word.length 71 | override def apply(index:Int):CoreLabel = { 72 | val label = new CoreLabel(8) 73 | label.setWord(word(index)) 74 | label.setTag(pos(index)) 75 | if (index > 0) { label.setAfter(word(index - 1)) } 76 | if (index < word.length - 1) { label.setBefore(word(index + 1)) } 77 | label.setNER(ner(index)) 78 | label.setLemma(lemma(index)) 79 | label.setIndex(index) 80 | // TODO(gabor) things like character offsets, original text, etc. 81 | label 82 | } 83 | 84 | 85 | 86 | var id:Option[Int] = None 87 | // values 88 | lazy val parse:Tree = { 89 | NLP.parser.parse(word.toList, pos.toList) 90 | } 91 | 92 | lazy val stanfordDependencies:Array[(Int, String)] = { 93 | if (length == 0) { 94 | new Array[(Int, String)](0) 95 | } else { 96 | val depArray = new Array[(Int, String)](length) 97 | // (get dependencies) 98 | val structure:GrammaticalStructure 99 | = Sentence.grammaticalStructureFactory.newGrammaticalStructure(parse) 100 | val deps:java.util.Collection[TypedDependency] 101 | = structure.typedDependencies() 102 | // (fill dependencies) 103 | deps.foreach{ (arc:TypedDependency) => 104 | depArray(arc.dep.index - 1) = 105 | ( arc.gov.index - 1, 106 | arc.reln.getShortName + {if (arc.reln.getSpecific == null) "" else "_" + arc.reln.getSpecific} ) 107 | } 108 | // (pad empty dependencies) 109 | for (i <- 0 until depArray.length) { 110 | if (depArray(i) == null) depArray(i) = (i, "noop") 111 | } 112 | depArray 113 | } 114 | } 115 | 116 | def dependencyRoot:Int 117 | = stanfordDependencies.zipWithIndex.filter( _._1._1 < 0 ).headOption match { 118 | case Some( (dep, index) ) => index 119 | case None => throw new IllegalStateException("Could not find head: '" + 120 | this + "' --- dependencies: " + stanfordDependencies.mkString(" ")) 121 | } 122 | 123 | def dependencyChild(root:Int, depType:String):Option[Int] 124 | = stanfordDependencies.zipWithIndex.filter( x => x._1._1 == root && x._1._2 == depType ) 125 | .map( _._2 ).headOption 126 | 127 | def dependencyChildren(root:Int):Seq[(Int, String)] 128 | = stanfordDependencies.zipWithIndex.filter( _._1._1 == root ).map( x => (x._2, x._1._2) ) 129 | 130 | def dependencyYield(root:Int):Set[Int] = { 131 | def recursiveSearch(root:Int, seen:Set[Int]):Set[Int] = { 132 | val directChildren = dependencyChildren(root).map( _._1 ) 133 | directChildren.foldLeft(seen) { 134 | case (soFar:Set[Int], index:Int) => 135 | if (!soFar(index)) recursiveSearch(index, seen + index) 136 | else soFar 137 | } 138 | } 139 | recursiveSearch(root, Set[Int](root)) 140 | } 141 | 142 | def dependencyPathMonotonic(ancestor:Int, descendent:Int):Option[Seq[Int]] = { 143 | def 
recurse(ancestor:Int, descendent:Int, lst:List[Int]):Option[List[Int]] = { 144 | if (descendent == ancestor) Some(ancestor :: lst) 145 | else if (descendent < 0) None 146 | else recurse(ancestor, stanfordDependencies(descendent)._1, descendent :: lst) 147 | } 148 | recurse(ancestor, stanfordDependencies(descendent)._1, Nil) 149 | } 150 | 151 | lazy val headIndex:Int = { 152 | if (word.length == 1) { 0 } 153 | else { 154 | val headLeaf = parse.headTerminal(collinsHeadFinder) 155 | val index = parse.getLeaves().indexWhere{ (x:Tree) => x eq headLeaf } 156 | if (index < 0) word.length - 1 else index 157 | } 158 | } 159 | 160 | def headIndex(spanBegin:Int, spanEnd:Int):Int = { 161 | parse.setSpans 162 | val (score, tree) = parse.foldLeft( spanBegin + (length - spanEnd), parse ){ 163 | case ( (smallestDiffSoFar:Int, bestTreeSoFar:Tree), tree:Tree ) => 164 | if (tree != null && tree.getSpan != null) { 165 | val (treeBegin, treeEnd) = (tree.getSpan.getSource, tree.getSpan.getTarget) 166 | val diff = scala.math.abs(spanBegin - treeBegin) 167 | + scala.math.abs(spanEnd - treeEnd) 168 | if (treeBegin >= spanBegin && treeEnd <= spanEnd && 169 | diff < smallestDiffSoFar) { (diff, tree) } 170 | else { (smallestDiffSoFar, bestTreeSoFar) } 171 | } else { (smallestDiffSoFar, bestTreeSoFar) } 172 | } 173 | val headLeaf = tree.headTerminal(collinsHeadFinder) 174 | val index = parse.getLeaves().indexWhere{ (x:Tree) => x eq headLeaf } 175 | if (index < spanBegin || index >= spanEnd) spanEnd - 1 else index 176 | } 177 | 178 | def headWord(spanBegin:Int, spanEnd:Int):String = word(headIndex(spanBegin, spanEnd)) 179 | 180 | lazy val pos:Array[String] 181 | = if (length == 0) new Array[String](0) 182 | else NLP.tagger.apply(word.toList).map( _.tag ).toArray 183 | 184 | lazy val lemma:Array[String] = word.zip(pos).map{ case (w:String,p:String) => 185 | morph( m => m.lemma(w,p) ).toString 186 | }.toArray 187 | 188 | lazy val ner:Array[String] = nerCRF(word, pos) 189 | 190 | lazy val truecase:Array[String] = trueCaser(word, pos, lemma) 191 | 192 | // helper functions 193 | def words:Array[String] = word 194 | def tags:Array[String] = pos 195 | 196 | def headWord:String = word(headIndex) 197 | def headLemma:String = lemma(headIndex) 198 | def headPOS:String = pos(headIndex) 199 | def namedEntities:Array[(Array[String],String)] = { 200 | // (collect tags) 201 | val nerTags = word.zip(ner).foldLeft(List[(List[String],String)]()){ 202 | case (soFar:List[(List[String],String)], (word:String, tag:String)) => 203 | val (chunk, lastTag) = if (soFar.isEmpty) (List[String](), "O") 204 | else soFar.head 205 | val tailList:List[(List[String],String)] 206 | = if (soFar.isEmpty) Nil else soFar.tail 207 | if (lastTag != tag) { 208 | (List[String](word), tag) :: { 209 | if (lastTag != "O") (chunk.reverse, lastTag) :: tailList 210 | else tailList 211 | } 212 | } else { 213 | (word :: chunk, tag) :: tailList 214 | } 215 | } 216 | // (some cleanup) 217 | val headPair = nerTags.head 218 | (if (headPair._2 == "O") nerTags.tail 219 | else (headPair._1.reverse, headPair._2) :: nerTags.tail) 220 | .reverse 221 | .map{ case (c,t) => (c.toArray,t) } 222 | .toArray 223 | } 224 | 225 | def toSentence:Sentence = this 226 | 227 | override def equals(a:Any):Boolean = { 228 | def seqMatch(s:Seq[String]):Boolean = { 229 | s.length == word.length && s.zip(word).forall{ case (a,b) => a == b } 230 | } 231 | a match { 232 | case (s:Sentence) => 233 | for (id1 <- this.id; 234 | id2 <- s.id) return id1 == id2 235 | return seqMatch(s.word) 236 | case 
(s:Seq[String]) => seqMatch(s) 237 | case _ => false 238 | } 239 | } 240 | private var code:Int = 0 241 | override def hashCode:Int = { 242 | if (code == 0) { word.foreach( w => code = 37 * code + w.hashCode ) } 243 | code 244 | } 245 | override def toString:String = word.mkString(" ") 246 | } 247 | -------------------------------------------------------------------------------- /src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala: -------------------------------------------------------------------------------- 1 | package edu.stanford.nlp; 2 | 3 | import scala.collection.JavaConversions._ 4 | 5 | import edu.stanford.nlp.ling.CoreLabel 6 | import edu.stanford.nlp.util.CoreMap 7 | import edu.stanford.nlp.ling.tokensregex._ 8 | 9 | import NLPConfig._ 10 | 11 | 12 | case class TokensRegex(override val toString:String) { 13 | val pattern:TokenSequencePattern = TokenSequencePattern.compile(toString) 14 | 15 | 16 | def matches(input:Seq[CoreLabel]):Boolean = pattern.getMatcher(input.toList).matches 17 | 18 | def allMatches(input:Seq[CoreLabel]):Iterator[Seq[CoreLabel]] = { 19 | val matcher = pattern.getMatcher(input.toList) 20 | new Iterator[Seq[CoreLabel]] { 21 | var theNext:Option[Boolean] = None 22 | override def hasNext:Boolean = theNext match { 23 | case Some(x) => x 24 | case None => theNext = Some(matcher.find); theNext.get 25 | } 26 | override def next:Seq[CoreLabel] = { 27 | if (!hasNext) throw new NoSuchElementException 28 | theNext = None 29 | val m:java.util.List[_ <: CoreMap] = matcher.groupNodes 30 | m.map( _ match { 31 | case (x:CoreLabel) => x 32 | case (x:CoreMap) => new CoreLabel(x) 33 | }) 34 | } 35 | } 36 | } 37 | 38 | def unapplySeq(target:Any):Option[Seq[Seq[CoreLabel]]] = target match { 39 | case (input:Seq[CoreLabel]) => 40 | val matcher = pattern getMatcher(input toList) 41 | if (matcher matches) { 42 | Some(for (i <- 1 to matcher.groupCount) yield 43 | matcher groupNodes(i) map( _ match { 44 | case (x:CoreLabel) => x 45 | case (x:CoreMap) => new CoreLabel(x) 46 | })) 47 | } else { None } 48 | case _ => None 49 | } 50 | } 51 | 52 | 53 | object TokensRegex { 54 | // Built-in predicates 55 | def word(pattern:String):MarkedString = MarkedString(s"""{word : /$pattern/}""") 56 | def tag(pattern:String):MarkedString = MarkedString(s"""{tag : /$pattern/}""") 57 | def lemma(pattern:String):MarkedString = MarkedString(s"""{lemma : /$pattern/}""") 58 | def ner(pattern:String):MarkedString = MarkedString(s"""{ner : /$pattern/}""") 59 | def normalized(pattern:String):MarkedString = MarkedString(s"""{normalized : /$pattern/}""") 60 | 61 | // Decorate predicates 62 | case class MarkedString(str:String) extends AnyVal { override def toString:String = str } 63 | implicit def stringDecorator(str:MarkedString) = new { 64 | def unary_!():String = s"""!$str""" 65 | } 66 | implicit def string2string(str:MarkedString):String = str.str 67 | 68 | // Create token sequence 69 | implicit def product2tokens(p:Product):Tokens = new Tokens(List[String](p.productIterator.map( _.toString ).mkString(" & "))) 70 | implicit def string2tokens(str:MarkedString):Tokens = new Tokens(List[String](str.str)) 71 | class Tokens(val regexps:List[String]) { 72 | def apply(terms:String*):Tokens = { 73 | new Tokens(terms.mkString(" & ") :: regexps) 74 | } 75 | } 76 | 77 | // Dump to TokensRegex object 78 | implicit def string2tokensregex(str:MarkedString):TokensRegex 79 | = new TokensRegex(s"""[${str.str}]""") 80 | implicit def tokens2tokensregex(tokens:Tokens):TokensRegex 81 | = new 
TokensRegex(s"""[${tokens.regexps.reverse.mkString("] [")}]""") 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/DataSource.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.PDataSource 4 | import io.prediction.controller.EmptyEvaluationInfo 5 | import io.prediction.controller.EmptyActualResult 6 | import io.prediction.controller.Params 7 | import io.prediction.data.storage.Event 8 | import io.prediction.data.storage.Storage 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.SparkContext._ 12 | import org.apache.spark.rdd.RDD 13 | 14 | import grizzled.slf4j.Logger 15 | 16 | case class DataSourceParams(appId: Int) extends Params 17 | 18 | class DataSource(val dsp: DataSourceParams) 19 | extends PDataSource[TrainingData, 20 | EmptyEvaluationInfo, Query, EmptyActualResult] { 21 | 22 | @transient lazy val logger = Logger[this.type] 23 | 24 | override 25 | def readTraining(sc: SparkContext): TrainingData = { 26 | val eventsDB = Storage.getPEvents() 27 | val eventsRDD: RDD[Event] = eventsDB.find( 28 | appId = dsp.appId, 29 | entityType = Some("user"), 30 | eventNames = Some(List("train")) 31 | ) (sc) 32 | 33 | val sentimentsRDD: RDD[Sentiment] = eventsRDD.map { event => 34 | val sentiment = try { 35 | val sentimentValue: Double = event.event match { 36 | case "train" => event.properties.get[Double]("sentiment") 37 | case _ => throw new Exception(s"Unexpected event ${event} is read.") 38 | } 39 | 40 | Sentiment( 41 | event.properties.get[String]("phrase"), 42 | sentimentValue) 43 | } catch { 44 | case e: Exception => { 45 | logger.error( 46 | s"Cannot convert ${event} to Sentiment. 
Exception: ${e}.") 47 | throw e 48 | } 49 | } 50 | sentiment 51 | }.cache() 52 | 53 | new TrainingData(sentimentsRDD) 54 | } 55 | } 56 | 57 | case class Sentiment( 58 | phrase: String, 59 | sentiment: Double 60 | ) 61 | 62 | class TrainingData( 63 | val sentiments: RDD[Sentiment] 64 | ) extends Serializable { } 65 | 66 | -------------------------------------------------------------------------------- /src/main/scala/Engine.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.IEngineFactory 4 | import io.prediction.controller.Engine 5 | 6 | case class Query( 7 | s: String 8 | ) extends Serializable 9 | 10 | case class PredictedResult( 11 | sentiment: Double 12 | ) extends Serializable 13 | 14 | object SentimentAnalysisEngine extends IEngineFactory { 15 | def apply() = { 16 | new Engine( 17 | classOf[DataSource], 18 | classOf[Preparator], 19 | Map("nlpparse" -> classOf[Algorithm]), 20 | classOf[Serving]) 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/main/scala/Model.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import edu.stanford.nlp.Magic._ 4 | import edu.stanford.nlp.trees.Tree 5 | 6 | class Model ( 7 | var rules: Map[String, Double] 8 | ) extends Serializable { 9 | 10 | /** 11 | * Return the sentiment in [-2 , 2] scale 12 | */ 13 | def getWordSentiment(word: String): Double = { 14 | var score = rules.get(word.toLowerCase()) 15 | if (score.isEmpty) { 16 | return 0.0 17 | } else { 18 | return score.get - 2.0 19 | } 20 | } 21 | 22 | /** 23 | * Parse the input to a tree structure. Calculate the sentiment from bottom 24 | * to the top. 25 | * 26 | * For a leaf node, it is always a word token. Use the sentiment 27 | * from the training data in this case. If the word did not appear in the 28 | * training data. Assume it is neutral. 29 | * 30 | * For a non-leaf node, calculate the sentiments of each of its children. 31 | * Determine whether the sentence is positive or negative by the number of 32 | * negative children. If it is odd, then assume the sentence is negative. 33 | */ 34 | def getSentiment(s: String, ap: AlgorithmParams): Double = { 35 | var m = scala.collection.mutable.Map[Tree, Double]() 36 | var tree = s.parse 37 | var root = tree.preOrderNodeList().get(0) 38 | var post_order = tree.postOrderNodeList() 39 | var i = 0 40 | while (i < post_order.size()) { 41 | var cur = post_order.get(i) 42 | i = i + 1 43 | 44 | if (cur.isLeaf()) { 45 | m(cur) = getWordSentiment(cur.value) 46 | } else { 47 | var children = cur.children() 48 | var weight = 0.0000000001 49 | var positive = 1 50 | var sentiment = 0.0 51 | m(cur) = 0 52 | for (child <- children) { 53 | var child_sentiment = m(child) 54 | 55 | // The weight of a the child is proportional to the absolute value 56 | // of its sentiment. 
This avoids having the sentiment neutralized by 57 | // other neutral children. 58 | var child_weight = Math.abs(child_sentiment) + ap.baseWeight 59 | 60 | weight = weight + child_weight 61 | sentiment = sentiment + child_weight * Math.abs(child_sentiment) 62 | if (child_sentiment < -0.0000000001) { 63 | positive = positive * -1 64 | } 65 | } 66 | m(cur) = ( sentiment / weight ) * positive 67 | } 68 | } 69 | 70 | return m(root) + 2.0 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/Preparator.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.PPreparator 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.rdd.RDD 8 | 9 | class Preparator 10 | extends PPreparator[TrainingData, PreparedData] { 11 | 12 | def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { 13 | new PreparedData(sentiments = trainingData.sentiments) 14 | } 15 | } 16 | 17 | class PreparedData( 18 | val sentiments: RDD[Sentiment] 19 | ) extends Serializable 20 | 21 | -------------------------------------------------------------------------------- /src/main/scala/Serving.scala: -------------------------------------------------------------------------------- 1 | package org.template.sentimentanalysis 2 | 3 | import io.prediction.controller.LServing 4 | 5 | class Serving extends LServing[Query, PredictedResult] { 6 | 7 | override 8 | def serve( 9 | query: Query, 10 | predictedResults: Seq[PredictedResult] 11 | ): PredictedResult = { 12 | predictedResults.head 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /template.json: -------------------------------------------------------------------------------- 1 | {"pio": {"version": { "min": "0.9.0" }}} 2 | --------------------------------------------------------------------------------
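
The per-node combination rule described in the `Model.scala` docstring can be read in isolation. The sketch below is not part of the template: the child sentiment values and the `baseWeight` of 0.1 are made-up inputs (in the engine the weight comes from `AlgorithmParams`), and it only restates the arithmetic that `getSentiment` applies at each non-leaf node.

```
// Standalone sketch of the per-node combination rule used by Model.getSentiment.
// The baseWeight value and the child sentiments below are illustrative
// assumptions, not values taken from the template's configuration.
object CombineSketch {
  // children: per-child sentiments on the [-2, 2] scale used by getWordSentiment
  def combine(children: Seq[Double], baseWeight: Double): Double = {
    // Each child is weighted by the magnitude of its own sentiment, so a
    // strongly polar child is not washed out by neutral siblings.
    val weights = children.map(s => math.abs(s) + baseWeight)
    val magnitude =
      weights.zip(children).map { case (w, s) => w * math.abs(s) }.sum /
        (weights.sum + 1e-10)
    // An odd number of negative children flips the node's sign.
    val sign = if (children.count(_ < -1e-10) % 2 == 1) -1.0 else 1.0
    sign * magnitude
  }

  def main(args: Array[String]): Unit = {
    println(combine(Seq(0.0, 1.5), 0.1))  // neutral + positive child -> positive (~1.41)
    println(combine(Seq(-1.0, 1.5), 0.1)) // one negative child -> sign flips (~-1.30)
  }
}
```

`Seq(0.0, 1.5)` yields a positive score because the neutral child carries almost no weight, while `Seq(-1.0, 1.5)` flips to negative because exactly one child is negative; at the root, `getSentiment` then shifts the result back to the 0–4 range by adding 2.0.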