├── .gitignore ├── README.md ├── build.sbt ├── cli ├── README.md ├── build.sbt └── src │ └── main │ ├── resources │ ├── application.conf │ └── logback.xml │ └── scala │ └── org │ └── allenai │ └── nlpstack │ └── cli │ ├── ArgumentHeadExtractorMain.scala │ ├── ChunkerMain.scala │ ├── ConstituencyParserMain.scala │ ├── DependencyParserMain.scala │ ├── LineProcessor.scala │ ├── PostaggerMain.scala │ ├── RelationHeadExtractorMain.scala │ ├── SegmenterMain.scala │ ├── SrlMain.scala │ ├── StemmerMain.scala │ └── TokenizerMain.scala ├── conf └── logback.xml ├── project ├── Dependencies.scala ├── build.properties └── plugins.sbt ├── tools ├── chunk │ ├── LICENSE │ ├── build.sbt │ ├── conf │ │ └── deploy.conf │ └── src │ │ ├── main │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── chunk │ │ │ │ ├── OpenNlpChunker.scala │ │ │ │ └── package.scala │ │ └── universal │ │ │ └── chunk-server.sh │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── chunk │ │ └── OpenNlpChunkerSpec.scala ├── core │ ├── LICENSE │ ├── build.sbt │ └── src │ │ ├── main │ │ └── scala │ │ │ └── org │ │ │ └── allenai │ │ │ └── nlpstack │ │ │ └── core │ │ │ ├── ChunkedToken.scala │ │ │ ├── Chunker.scala │ │ │ ├── ConstituencyParser.scala │ │ │ ├── DependencyParser.scala │ │ │ ├── FactorieUtilities.scala │ │ │ ├── Format.scala │ │ │ ├── HashCodeHelper.scala │ │ │ ├── IdentityStemmer.scala │ │ │ ├── Lemmatized.scala │ │ │ ├── PostaggedToken.scala │ │ │ ├── Postagger.scala │ │ │ ├── Segmenter.scala │ │ │ ├── Stemmer.scala │ │ │ ├── Token.scala │ │ │ ├── Tokenizer.scala │ │ │ ├── conf │ │ │ ├── ConfidenceFunction.scala │ │ │ ├── ConfidenceTrainer.scala │ │ │ ├── Feature.scala │ │ │ ├── FeatureSet.scala │ │ │ ├── Labelled.scala │ │ │ ├── Trainer.scala │ │ │ └── impl │ │ │ │ └── LogisticRegression.scala │ │ │ ├── coref │ │ │ └── CorefResolver.scala │ │ │ ├── graph │ │ │ ├── Bipath.scala │ │ │ ├── DirectedEdge.scala │ │ │ ├── Graph.scala │ │ │ └── pattern │ │ │ │ ├── Match.scala │ │ │ │ ├── Matcher.scala │ │ │ │ └── Pattern.scala │ │ │ ├── headword │ │ │ └── HeadExtractor.scala │ │ │ ├── parse │ │ │ └── graph │ │ │ │ ├── Dependency.scala │ │ │ │ ├── DependencyGraph.scala │ │ │ │ ├── DependencyNode.scala │ │ │ │ ├── DependencyPattern.scala │ │ │ │ ├── JoinedDependencyGraph.scala │ │ │ │ ├── JoinedDependencyNode.scala │ │ │ │ ├── TokenDependencyNode.scala │ │ │ │ └── package.scala │ │ │ ├── remote │ │ │ ├── Remote.scala │ │ │ ├── RemoteDependencyParser.scala │ │ │ ├── RemoteSegmenter.scala │ │ │ └── RemoteStemmer.scala │ │ │ ├── repr │ │ │ ├── Chunks.scala │ │ │ ├── Dependencies.scala │ │ │ ├── Document.scala │ │ │ ├── Lemmas.scala │ │ │ ├── Postags.scala │ │ │ ├── Sentence.scala │ │ │ ├── Sentenced.scala │ │ │ └── Tokens.scala │ │ │ ├── srl │ │ │ ├── Frame.scala │ │ │ ├── FrameHierarchy.scala │ │ │ ├── RemoteSrl.scala │ │ │ └── Srl.scala │ │ │ └── typer │ │ │ └── Typer.scala │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── core │ │ ├── ChunkerSpec.scala │ │ ├── CorefResolverSpec.scala │ │ ├── DependencyGraphSpec.scala │ │ ├── DependencyNodeSpec.scala │ │ ├── DependencySpec.scala │ │ ├── FormatSpec.scala │ │ ├── TokenSpec.scala │ │ └── TokenizerSpec.scala ├── headword │ ├── build.sbt │ └── src │ │ ├── main │ │ └── scala │ │ │ ├── JwiTools.scala │ │ │ └── KnowitallHeadExtractor.scala │ │ └── test │ │ └── scala │ │ ├── JwiToolsSpec.scala │ │ └── KnowitallHeadExtractorSpec.scala ├── 
lemmatize │ ├── LICENSE │ ├── build.sbt │ └── src │ │ ├── main │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── lemmatize │ │ │ │ └── MorphaStemmer.scala │ │ └── universal │ │ │ └── lemmatize-server.sh │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── lemmatize │ │ └── MorphaLemmatizerSpec.scala ├── parse │ ├── LICENSE │ ├── build.sbt │ ├── conf │ │ └── deploy.conf │ ├── jvm.sbt │ └── src │ │ ├── main │ │ ├── resources │ │ │ └── featuretaggers.config │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── parse │ │ │ │ ├── BankerProtocol.scala │ │ │ │ ├── FactorieParser.scala │ │ │ │ ├── PolytreeParser.scala │ │ │ │ ├── package.scala │ │ │ │ └── poly │ │ │ │ ├── core │ │ │ │ ├── AnnotatedSentence.scala │ │ │ │ ├── DirectedGraph.scala │ │ │ │ ├── PositionTree.scala │ │ │ │ ├── Sentence.scala │ │ │ │ ├── SentenceTagger.scala │ │ │ │ ├── TaggedSentence.scala │ │ │ │ ├── Token.scala │ │ │ │ ├── TokenTagger.scala │ │ │ │ ├── Util.scala │ │ │ │ └── WordClusters.scala │ │ │ │ ├── decisiontree │ │ │ │ ├── DecisionTree.scala │ │ │ │ ├── DecisionTreeTrainer.scala │ │ │ │ ├── FeatureVector.scala │ │ │ │ ├── FeatureVectorSource.scala │ │ │ │ ├── OmnibusTrainer.scala │ │ │ │ ├── OneVersusAll.scala │ │ │ │ ├── ProbabilisticClassifier.scala │ │ │ │ ├── RandomForest.scala │ │ │ │ └── package.scala │ │ │ │ ├── eval │ │ │ │ ├── Evaluate.scala │ │ │ │ ├── ParseAnalyzer.scala │ │ │ │ ├── ParseBank.scala │ │ │ │ ├── ParseEvaluation.scala │ │ │ │ ├── ParseScore.scala │ │ │ │ └── TaggingEvaluation.scala │ │ │ │ ├── fsm │ │ │ │ ├── ClassificationTask.scala │ │ │ │ ├── EmbeddedClassifier.scala │ │ │ │ ├── FSMTrainingVectorSource.scala │ │ │ │ ├── MarbleBlock.scala │ │ │ │ ├── NbestCorpus.scala │ │ │ │ ├── NbestSearch.scala │ │ │ │ ├── Reranker.scala │ │ │ │ ├── Sculpture.scala │ │ │ │ ├── SculptureCost.scala │ │ │ │ ├── SculptureFeature.scala │ │ │ │ ├── SculptureTrainingVectorSource.scala │ │ │ │ ├── Search.scala │ │ │ │ ├── State.scala │ │ │ │ ├── StateCostFunction.scala │ │ │ │ ├── StateCostFunctionTrainer.scala │ │ │ │ ├── StateFeature.scala │ │ │ │ ├── StateTransition.scala │ │ │ │ ├── TransitionClassifier.scala │ │ │ │ ├── TransitionConstraint.scala │ │ │ │ ├── TransitionSystem.scala │ │ │ │ └── Walk.scala │ │ │ │ ├── ml │ │ │ │ ├── BrownClusters.scala │ │ │ │ ├── FeatureVector.scala │ │ │ │ ├── GoogleNGram.scala │ │ │ │ ├── LinearModel.scala │ │ │ │ ├── TrainingData.scala │ │ │ │ ├── Verbnet.scala │ │ │ │ └── WrapperClassifier.scala │ │ │ │ ├── polyparser │ │ │ │ ├── AdaptiveTraining.scala │ │ │ │ ├── ArcEagerTransitionSystem.scala │ │ │ │ ├── ArcHybridTransitionSystem.scala │ │ │ │ ├── ArcInverter.scala │ │ │ │ ├── DependencyParsingTransitionSystem.scala │ │ │ │ ├── MultiWordTagger.scala │ │ │ │ ├── NbestParser.scala │ │ │ │ ├── Neighborhood.scala │ │ │ │ ├── ParseCache.scala │ │ │ │ ├── ParseFile.scala │ │ │ │ ├── ParsePool.scala │ │ │ │ ├── Parser.scala │ │ │ │ ├── ParserClassificationTask.scala │ │ │ │ ├── ParserConfiguration.scala │ │ │ │ ├── ParserConstraint.scala │ │ │ │ ├── PolytreeParse.scala │ │ │ │ ├── PolytreeParseSource.scala │ │ │ │ ├── RerankingTransitionParser.scala │ │ │ │ ├── StateRef.scala │ │ │ │ ├── TokenFeature.scala │ │ │ │ ├── TokenTransform.scala │ │ │ │ ├── Training.scala │ │ │ │ ├── TransitionParser.scala │ │ │ │ ├── TransitionParserFeature.scala │ │ │ │ └── TransitionParserState.scala │ │ │ │ └── reranking │ │ │ │ ├── NeighborhoodEventStatistic.scala │ │ │ 
│ ├── NeighborhoodExtractor.scala │ │ │ │ ├── NeighborhoodTransform.scala │ │ │ │ ├── OracleReranker.scala │ │ │ │ ├── ParseNodeFeature.scala │ │ │ │ ├── ParseReranker.scala │ │ │ │ ├── ParseRerankerTraining.scala │ │ │ │ ├── ParseRerankingFunction.scala │ │ │ │ ├── PolytreeParseFeature.scala │ │ │ │ └── QualityEstimation.scala │ │ └── universal │ │ │ └── parse-server.sh │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── parse │ │ ├── FactorieParserSpec.scala │ │ ├── PolytreeParserSpec.scala │ │ └── poly │ │ ├── core │ │ ├── PositionTreeSpec.scala │ │ ├── SentenceSpec.scala │ │ ├── SentenceTaggerSpec.scala │ │ ├── TokenSpec.scala │ │ └── TokenTaggerSpec.scala │ │ ├── decisiontree │ │ ├── DecisionTreeSpec.scala │ │ └── OneVersusAllSpec.scala │ │ ├── eval │ │ ├── ParseBankSpec.scala │ │ └── ParseScoreSpec.scala │ │ ├── fsm │ │ ├── ClassificationTaskSpec.scala │ │ ├── SearchSpec.scala │ │ └── TrainingVectorSourceSpec.scala │ │ ├── ml │ │ ├── BrownClustersSpec.scala │ │ ├── FeatureVectorSpec.scala │ │ ├── LinearModelSpec.scala │ │ ├── NgramSetSpec.scala │ │ └── VerbnetSpec.scala │ │ ├── polyparser │ │ ├── ArcEagerConstraintsSpec.scala │ │ ├── ArcEagerTransitionSystemSpec.scala │ │ ├── ArcHybridTransitionSystemSpec.scala │ │ ├── ArcInverterSpec.scala │ │ ├── GoldParseTrainingVectorSourceSpec.scala │ │ ├── GreedySearchSpec.scala │ │ ├── MultiWordTaggerSpec.scala │ │ ├── ParserClassificationTaskSpec.scala │ │ ├── PolytreeParseFeatureSpec.scala │ │ ├── PolytreeParseSourceSpec.scala │ │ ├── PolytreeParseSpec.scala │ │ ├── StateRefSpec.scala │ │ ├── TokenTransformSpec.scala │ │ └── TransitionSpec.scala │ │ └── reranking │ │ ├── NeighborhoodExtractorSpec.scala │ │ └── NeighborhoodTransformSpec.scala ├── postag │ ├── LICENSE │ ├── build.sbt │ └── src │ │ ├── main │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── postag │ │ │ │ ├── FactoriePostagger.scala │ │ │ │ ├── OpenNlpPostagger.scala │ │ │ │ ├── StanfordPostagger.scala │ │ │ │ └── package.scala │ │ └── universal │ │ │ └── postag-server.sh │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── postag │ │ ├── FactoriePostaggerSpec.scala │ │ ├── OpenNlpPostaggerSpec.scala │ │ └── PostaggerSpec.scala ├── segment │ ├── LICENSE │ ├── build.sbt │ └── src │ │ ├── main │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── segment │ │ │ │ ├── ChalkSentenceSegmenter.scala │ │ │ │ ├── FactorieSegmenter.scala │ │ │ │ ├── StanfordSegmenter.scala │ │ │ │ └── package.scala │ │ └── universal │ │ │ └── segment-server.scala │ │ └── test │ │ ├── resources │ │ ├── logback.xml │ │ └── org │ │ │ └── allenai │ │ │ └── nlpstack │ │ │ └── segment │ │ │ └── unclosed_tag_test.txt │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── segment │ │ ├── ChalkSentenceSegmenter.scala │ │ └── FactorieSegmenterSpec.scala └── tokenize │ ├── LICENSE │ ├── build.sbt │ └── src │ ├── main │ ├── scala │ │ └── org │ │ │ └── allenai │ │ │ └── nlpstack │ │ │ └── tokenize │ │ │ ├── FactorieTokenizer.scala │ │ │ ├── PennTokenizer.scala │ │ │ ├── RemoteTokenizer.scala │ │ │ ├── SimpleEnglishTokenizer.scala │ │ │ ├── StanfordTokenizer.scala │ │ │ ├── WhitespaceTokenizer.scala │ │ │ └── package.scala │ └── universal │ │ └── tokenize-server.sh │ └── test │ ├── resources │ ├── logback.xml │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── tokenize │ │ └── unclosed_tag_test.txt │ └── scala │ └── org │ 
└── allenai │ └── nlpstack │ └── tokenize │ ├── FactorieTokenizerSpec.scala │ └── TokenizerSpec.scala ├── version.sbt └── webapp ├── README.md ├── build.sbt ├── conf ├── deploy.conf └── global_deploy.conf ├── public ├── css │ ├── bootstrap-3.1.1.min.css │ └── main.css ├── img │ └── spinner.gif ├── index.html ├── js │ ├── angular-1.2.13.min.js │ ├── bootstrap-3.1.1.min.js │ ├── jquery-2.0.1.min.js │ ├── tools.js │ ├── ui-bootstrap-tpls-0.10.0.min.js │ └── visualize.js ├── tools.html └── visualize.html ├── src └── main │ ├── bin │ └── webapp.sh │ ├── resources │ └── application.conf │ └── scala │ └── org │ └── allenai │ └── nlpstack │ └── webapp │ ├── BasicService.scala │ ├── Nlpweb.scala │ ├── NlpwebActor.scala │ ├── ToolService.scala │ ├── VisualizationService.scala │ ├── Whatswrong.scala │ └── tools │ ├── ChunkerTool.scala │ ├── DependencyParserTool.scala │ ├── Impl.scala │ ├── LemmatizerTool.scala │ ├── PostaggerTool.scala │ ├── SentenceSegmenterTool.scala │ ├── TokenizerTool.scala │ └── Tool.scala └── webapp └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | .idea/ 4 | -------------------------------------------------------------------------------- /cli/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "nlpstack-cli" 4 | 5 | libraryDependencies ++= Seq( 6 | scopt, 7 | "com.typesafe.akka" %% "akka-actor" % defaultAkkaVersion, 8 | sprayCan, 9 | sprayRouting, 10 | typesafeConfig) 11 | 12 | fork in run := true 13 | 14 | javaOptions += "-Xmx8G" 15 | -------------------------------------------------------------------------------- /cli/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | nlpstack { 2 | tools { 3 | tokenizer { 4 | defaultPort = "14000" 5 | } 6 | segmenter { 7 | defaultPort = "14001" 8 | } 9 | postagger { 10 | defaultPort = "14002" 11 | } 12 | chunker { 13 | defaultPort = "14003" 14 | } 15 | dep-parser { 16 | defaultPort = "14004" 17 | } 18 | argumentheadextractor { 19 | defaultPort = "14005" 20 | } 21 | relationheadextractor { 22 | defaultPort = "14006" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /cli/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/ArgumentHeadExtractorMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.core.headword.HeadExtractor 5 | import org.allenai.nlpstack.headword.KnowitallHeadExtractor 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | import org.allenai.nlpstack.tokenize.defaultTokenizer 8 | 9 | abstract class ArgumentHeadExtractorMain extends LineProcessor("argumentheadextractor") { 10 | def tokenizer: Tokenizer 11 | def postagger: Postagger 12 | def headExtractor: HeadExtractor 13 | 14 | override def process(line: String) = { 15 | val headTokens = headExtractor.argumentHead(tokenizer, postagger)(line) 16 | Postagger.multilineStringFormat.write(headTokens) 17 | } 18 | } 19 | 20 | object 
KnowitallArgumentHeadExtractorMain extends ArgumentHeadExtractorMain { 21 | override val tokenizer = defaultTokenizer 22 | override val postagger = defaultPostagger 23 | override val headExtractor = new KnowitallHeadExtractor() 24 | } -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/ChunkerMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.chunk._ 5 | import org.allenai.nlpstack.postag._ 6 | import org.allenai.nlpstack.tokenize._ 7 | 8 | abstract class ChunkerMain 9 | extends LineProcessor("chunker") { 10 | def tokenizer: Tokenizer 11 | def postagger: Postagger 12 | def chunker: Chunker 13 | 14 | override def process(line: String) = { 15 | val chunkedTokens = chunker.chunk(tokenizer, postagger)(line) 16 | Chunker.multilineStringFormat.write(chunkedTokens) 17 | } 18 | 19 | override def init(config: Config) { 20 | // for timing purposes 21 | chunker.chunk(tokenizer, postagger)("I want to initialize the chunker.") 22 | } 23 | } 24 | 25 | object OpenNlpChunkerMain extends ChunkerMain { 26 | override lazy val tokenizer = defaultTokenizer 27 | override lazy val postagger = defaultPostagger 28 | override lazy val chunker = new OpenNlpChunker() 29 | } 30 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/ConstituencyParserMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core.ConstituencyParser 4 | import org.allenai.nlpstack.parse.FactorieParser 5 | import org.allenai.nlpstack.parse.PolytreeParser 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | import org.allenai.nlpstack.tokenize.defaultTokenizer 8 | 9 | abstract class ConstituencyParserMain extends LineProcessor("constit-parser") { 10 | def constituencyParser: ConstituencyParser 11 | override def process(line: String) = { 12 | constituencyParser.parse(line).toString 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/DependencyParserMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.parse._ 5 | import org.allenai.nlpstack.postag._ 6 | import org.allenai.nlpstack.tokenize._ 7 | 8 | abstract class DependencyParserMain extends LineProcessor("dep-parser") { 9 | def tokenizer: Tokenizer 10 | def postagger: Postagger 11 | def dependencyParser: DependencyParser 12 | 13 | override def init(config: Config) { 14 | // for timing purposes 15 | val tokens = tokenizer("I want to initialize the parser.") 16 | val postagged = postagger.postagTokenized(tokens) 17 | dependencyParser.dependencyGraphPostagged(postagged) 18 | } 19 | 20 | override def process(line: String) = { 21 | val tokens = tokenizer(line) 22 | val postagged = postagger.postagTokenized(tokens) 23 | val dgraph = dependencyParser.dependencyGraphPostagged(postagged) 24 | DependencyParser.multilineStringFormat.write((postagged, dgraph)) 25 | } 26 | } 27 | 28 | object FactorieParserMain extends DependencyParserMain { 29 | override lazy val tokenizer = defaultTokenizer 30 | override lazy val postagger = defaultPostagger 31 | 
override lazy val dependencyParser = new FactorieParser 32 | } 33 | 34 | object PolytreeParserMain extends DependencyParserMain { 35 | override lazy val tokenizer = defaultTokenizer 36 | override lazy val postagger = defaultPostagger 37 | override lazy val dependencyParser = new PolytreeParser 38 | } 39 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/PostaggerMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.tokenize._ 5 | import org.allenai.nlpstack.postag.FactoriePostagger 6 | import org.allenai.nlpstack.postag.StanfordPostagger 7 | 8 | abstract class PostaggerMain extends LineProcessor("postagger") { 9 | def tokenizer: Tokenizer 10 | def postagger: Postagger 11 | override def process(line: String) = { 12 | val postaggedTokens = postagger.postag(tokenizer)(line) 13 | Postagger.multilineStringFormat.write(postaggedTokens) 14 | } 15 | } 16 | 17 | object FactoriePostaggerMain extends PostaggerMain { 18 | override val tokenizer = defaultTokenizer 19 | override val postagger = new FactoriePostagger() 20 | } 21 | 22 | object StanfordPostaggerMain extends PostaggerMain { 23 | override val tokenizer = defaultTokenizer 24 | override val postagger = new StanfordPostagger() 25 | } -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/RelationHeadExtractorMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.core.headword.HeadExtractor 5 | import org.allenai.nlpstack.headword.KnowitallHeadExtractor 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | import org.allenai.nlpstack.tokenize.defaultTokenizer 8 | 9 | abstract class RelationHeadExtractorMain extends LineProcessor("relationheadextractor") { 10 | def tokenizer: Tokenizer 11 | def postagger: Postagger 12 | def headExtractor: HeadExtractor 13 | 14 | override def process(line: String) = { 15 | val headTokens = headExtractor.relationHead(tokenizer, postagger)(line) 16 | Postagger.multilineStringFormat.write(headTokens) 17 | } 18 | } 19 | 20 | object KnowitallRelationHeadExtractorMain extends RelationHeadExtractorMain { 21 | override val tokenizer = defaultTokenizer 22 | override val postagger = defaultPostagger 23 | override val headExtractor = new KnowitallHeadExtractor() 24 | } -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/SegmenterMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.segment.{ StanfordSegmenter, ChalkSentenceSegmenter, FactorieSegmenter } 5 | 6 | abstract class SegmenterMain 7 | extends LineProcessor("segmenter") { 8 | def sentencer: Segmenter 9 | override def process(line: String) = sentencer(line).map(_.text).mkString("\n") 10 | } 11 | 12 | object FactorieSegmenterMain extends SegmenterMain { 13 | override val sentencer = new FactorieSegmenter() 14 | } 15 | 16 | object StanfordSegmenterMain extends SegmenterMain { 17 | override val sentencer = StanfordSegmenter 18 | } 19 | 
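Most of the CLI mains in this directory follow the same pattern: compose a tokenizer (and, where needed, a postagger) with the wrapped tool, then serialize the result with that tool's multilineStringFormat. A minimal, self-contained sketch of the same pipeline used directly as a library (illustrative only, not a file in this repository; it assumes the tokenize, postag, chunk, and core modules are on the classpath):

import org.allenai.nlpstack.chunk.OpenNlpChunker
import org.allenai.nlpstack.core.Chunker
import org.allenai.nlpstack.postag.defaultPostagger
import org.allenai.nlpstack.tokenize.defaultTokenizer

object ChunkPipelineExample extends App {
  // The same steps ChunkerMain.process performs for each input line:
  // tokenize, postag, then chunk.
  val chunker = new OpenNlpChunker()
  val chunkedTokens =
    chunker.chunk(defaultTokenizer, defaultPostagger)("This is a test of the OpenNlp chunker.")
  // One token per output line, as the CLI prints it.
  println(Chunker.multilineStringFormat.write(chunkedTokens))
}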
-------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/SrlMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core.srl._ 4 | import org.allenai.nlpstack.core.DependencyParser 5 | 6 | abstract class SrlMain extends LineProcessor("srl") { 7 | def srl: Srl 8 | 9 | override def process(line: String) = { 10 | val (tokens, dgraph) = DependencyParser.multilineStringFormat.read(line) 11 | (srl(tokens, dgraph) map (_.serialize)).mkString("\n") 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/StemmerMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.lemmatize.MorphaStemmer 5 | 6 | abstract class StemmerMain 7 | extends LineProcessor("stemmer") { 8 | def lemmatizer: Stemmer 9 | override def process(line: String) = line.split("\\s+").map(lemmatizer.stem(_)).mkString(" ") 10 | } 11 | 12 | object MorphaStemmerMain extends StemmerMain { 13 | lazy val lemmatizer = new MorphaStemmer 14 | } 15 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/TokenizerMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | import org.allenai.nlpstack.tokenize._ 5 | 6 | abstract class TokenizerMain extends LineProcessor("tokenizer") { 7 | def tokenizer: Tokenizer 8 | override def process(sentence: String) = 9 | Tokenizer.multilineStringFormat.write(tokenizer.tokenize(sentence)) 10 | } 11 | 12 | object FactorieTokenizerMain extends TokenizerMain { 13 | val tokenizer = new FactorieTokenizer() 14 | } 15 | 16 | object PennTokenizerMain extends TokenizerMain { 17 | val tokenizer = PennTokenizer 18 | } 19 | 20 | object WhitespaceTokenizerMain extends TokenizerMain { 21 | val tokenizer = WhitespaceTokenizer 22 | } 23 | 24 | object StanfordTokenizerMain extends TokenizerMain { 25 | val tokenizer = StanfordTokenizer 26 | } 27 | -------------------------------------------------------------------------------- /conf/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import org.allenai.plugins.CoreDependencies 3 | 4 | object Dependencies extends CoreDependencies { 5 | val datastore = "org.allenai" %% "datastore" % "1.0.2" 6 | 7 | val commonsIo = "commons-io" % "commons-io" % "2.4" 8 | 9 | val parserCombinators = "org.scala-lang.modules" %% "scala-parser-combinators" % "1.0.3" 10 | 11 | val clearGroup = "com.clearnlp" 12 | val clearVersion = "2.0.2" 13 | val clear = clearGroup % "clearnlp" % clearVersion 14 | val opennlp = ("org.apache.opennlp" % "opennlp-tools" % "1.5.3" 15 | exclude ("net.sf.jwordnet", "jwnl")) 16 | 17 | val stanfordCoreNlp = "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1" 18 | 19 | val factorie = ("cc.factorie" %% 
"factorie" % "1.1.1" 20 | exclude ("junit", "junit") 21 | exclude ("commons-logging", "commons-logging")) 22 | val factorieWordnet = "cc.factorie.app.nlp" % "wordnet" % "1.0" 23 | 24 | val testingLibraries = Seq(allenAiTestkit % "test") 25 | 26 | val apache2 = "Apache 2.0 " -> url("http://www.opensource.org/licenses/bsd-3-clause") 27 | 28 | val loggingDependencies = Seq( 29 | Logging.slf4jApi, 30 | Logging.logbackCore, 31 | Logging.logbackClassic, 32 | "org.slf4j" % "jcl-over-slf4j" % Logging.slf4jVersion, 33 | "org.slf4j" % "log4j-over-slf4j" % Logging.slf4jVersion, 34 | "org.slf4j" % "jul-to-slf4j" % Logging.slf4jVersion 35 | ) 36 | 37 | val jVerbnet = "edu.mit" % "jverbnet" % "1.2.0.1" 38 | 39 | val jwiWordnet = "edu.mit" % "jwi" % "2.2.3" 40 | 41 | val reming = "com.github.jkinkead" %% "reming-json" % "0.0.9" 42 | 43 | val Overrides = loggingDependencies.toSet 44 | } 45 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.allenai.plugins" % "allenai-sbt-plugins" % "1.4.8") 2 | 3 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 4 | -------------------------------------------------------------------------------- /tools/chunk/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | libraryDependencies ++= loggingDependencies 4 | -------------------------------------------------------------------------------- /tools/chunk/conf/deploy.conf: -------------------------------------------------------------------------------- 1 | chunker = { 2 | project = { 3 | name = "tools-chunk" 4 | } 5 | 6 | deploy = { 7 | startup_script = "bin/chunker-server.sh" 8 | directory = "/local/deploy/tools/chunker" 9 | user.ssh_username = "ec2-user" 10 | } 11 | 12 | deploy.host = "nlp.allenai.org" 13 | } 14 | -------------------------------------------------------------------------------- /tools/chunk/src/main/scala/org/allenai/nlpstack/chunk/OpenNlpChunker.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.chunk 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.nlpstack.core.{ ChunkedToken, Chunker, PostaggedToken } 5 | 6 | import opennlp.tools.chunker.{ ChunkerME, ChunkerModel } 7 | 8 | class OpenNlpChunker extends Chunker { 9 | //Added ThreadLocal to prevent concurrency issues 10 | private final val chunker: ThreadLocal[ChunkerME] = new ThreadLocal[ChunkerME]() { 11 | override protected def initialValue(): ChunkerME = new ChunkerME(OpenNlpChunker.model) 12 | } 13 | 14 | def chunkPostagged(tokens: Seq[PostaggedToken]): Seq[ChunkedToken] = { 15 | // OpenNLP uses : as the postag for hyphens, but we use HYPH, so we change it back before 16 | // sending it to the chunker. 
17 | val fixedTokens = tokens.map { t => 18 | if (t.string == "-") PostaggedToken(t, ":") else t 19 | } 20 | 21 | val chunks = chunker.get().chunk(tokens.map(_.string).toArray, fixedTokens.map(_.postag) 22 | .toArray) 23 | (tokens zip chunks) map { case (token, chunk) => ChunkedToken(token, chunk) } 24 | } 25 | } 26 | 27 | object OpenNlpChunker { 28 | private val defaultModelName = "en-chunker.bin" 29 | private val model = Resource.using(this.getClass.getClassLoader.getResourceAsStream(defaultModelName)) { is => 30 | new ChunkerModel(is) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tools/chunk/src/main/scala/org/allenai/nlpstack/chunk/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.Chunker 4 | 5 | package object chunk { 6 | val defaultChunker: Chunker = new OpenNlpChunker 7 | } -------------------------------------------------------------------------------- /tools/chunk/src/main/universal/chunk-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.chunk.OpenNlpChunker" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx128M" 9 | 10 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/chunk/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/chunk/src/test/scala/org/allenai/nlpstack/chunk/OpenNlpChunkerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package chunk 3 | 4 | import org.allenai.common.testkit.UnitSpec 5 | import org.allenai.nlpstack.postag._ 6 | import org.allenai.nlpstack.tokenize._ 7 | 8 | class OpenNlpChunkerSpec extends UnitSpec { 9 | "chunker" should "correctly chunk an example sentence" in { 10 | val text = "This is a test of the OpenNlp chunker." 11 | val tokenizer = defaultTokenizer 12 | val postagger = new OpenNlpPostagger 13 | val chunker = new OpenNlpChunker 14 | 15 | val chunked = chunker.chunk(tokenizer, postagger)(text) 16 | chunked.mkString("; ") === "This 0 DT B-NP; is 5 VBZ B-VP; a 8 DT B-NP; test 10 NN I-NP; of 15 IN B-PP; the 18 DT B-NP; OpenNlp 22 NNP I-NP; chunker 30 NN I-NP; . 37 . O" 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /tools/core/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, University of Washington 2 | BSD 3-clause License / BSD Modified License / New BSD License 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 
9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the nor the 13 | names of its contributors may be used to endorse or promote products 14 | derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /tools/core/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "nlpstack-core" 4 | 5 | licenses := Seq(apache2) 6 | 7 | libraryDependencies ++= Seq( 8 | parserCombinators, 9 | // for remotes 10 | "net.databinder.dispatch" %% "dispatch-core" % "0.11.2") 11 | 12 | dependencyOverrides ++= Set( 13 | "org.scala-lang.modules" %% "scala-xml" % "1.0.2") 14 | 15 | libraryDependencies ++= loggingDependencies 16 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/ConstituencyParser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | /** A constituency parser turns a sentence into a constituency 4 | * tree, a structure that is somewhat like chunking but 5 | * hierarchical. 6 | */ 7 | trait ConstituencyParser { 8 | def parse(string: String): ParseTree 9 | } 10 | 11 | /** A representation of the constituency parse. */ 12 | abstract class ParseTree( 13 | val token: String, var index: Int, val children: Array[ParseTree] 14 | ) extends Iterable[ParseTree] { 15 | 16 | /** Prints the tree in Penn treebank format. 
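 * For example, a two-word sentence might render as (S (NP (PRP It)) (VP (VBZ works))),
 * while a leaf node renders as its bare token (illustrative; exact labels depend on the parser).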
*/ 17 | override def toString() = 18 | if (children.size == 0) { 19 | token 20 | } else { 21 | "(" + token + " " + children.map(child => child.toString).mkString(" ") + ")" 22 | } 23 | 24 | def value = token 25 | 26 | def iterator = { 27 | def preorder(node: ParseTree): List[ParseTree] = { 28 | node :: node.children.toList.flatMap(preorder(_)) 29 | } 30 | preorder(this).iterator 31 | } 32 | 33 | def print(): Unit = { 34 | def print(tree: ParseTree, indent: Int) { 35 | if (tree.children.isEmpty) { 36 | println(" " * indent + "(" + tree.token + ")") 37 | } else { 38 | println(" " * indent + "(" + tree.token) 39 | tree.children.foreach { tree => print(tree, indent + 2) } 40 | println(" " * indent + ")") 41 | } 42 | } 43 | 44 | print(this, 0) 45 | } 46 | 47 | def printDOT(writer: java.lang.Appendable) { 48 | def quote(string: String) = "\"" + string + "\"" 49 | def nodeString(node: ParseTree) = node.token 50 | val indent = " " * 2 51 | 52 | writer.append("digraph g {\n") 53 | 54 | for (node <- this) { 55 | val shape = node match { 56 | case node: ParseTreePhrase => "box" 57 | case node: ParseTreePostag => "invtriangle" 58 | case node: ParseTreeToken => "circle" 59 | } 60 | writer.append(indent + node.index + " [label=" + quote(nodeString(node)) + 61 | ", shape=" + quote(shape) + "]\n") 62 | } 63 | 64 | for (node <- this) { 65 | for (child <- node.children) { 66 | writer.append(indent + node.index.toString + " -> " + child.index.toString + "\n") 67 | } 68 | } 69 | writer.append("}") 70 | } 71 | } 72 | 73 | class ParseTreePhrase( 74 | token: String, index: Int, 75 | children: Array[ParseTree] 76 | ) extends ParseTree(token, index, children) 77 | 78 | class ParseTreePostag( 79 | token: String, 80 | index: Int, 81 | children: Array[ParseTree] 82 | ) extends ParseTree(token, index, children) 83 | 84 | class ParseTreeToken( 85 | token: String, 86 | index: Int, 87 | children: Array[ParseTree] 88 | ) extends ParseTree(token, index, children) 89 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/FactorieUtilities.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import java.util.regex.Matcher 4 | 5 | /** Shared utilities for making Factorie work. These are probably not generally 6 | * useful. 7 | */ 8 | object FactorieUtilities { 9 | // Factorie's tokenizer crashes on unclosed XML tags. To work around this, we 10 | // detect unclosed tags, and replace the opening < with a space. 11 | private val unclosedTagRegex = "<([^>]{100})".r 12 | def replaceUnclosedTag(s: String): String = { 13 | val replaced = unclosedTagRegex.replaceAllIn(s, m => Matcher.quoteReplacement(" " + m.group(1))) 14 | // Have to do this repeatedly for the case of "foo << barbarbarbar..." 
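    // (each replacement can expose a new match, e.g. the second '<' above once the first
    // has been blanked out, so we recurse until the string stops changing)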
15 | if (replaced == s) s else replaceUnclosedTag(replaced) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Format.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import scala.util.Try 4 | 5 | trait Writer[F, T] { 6 | def write(from: F): T 7 | } 8 | 9 | trait Reader[F, T] { 10 | def read(from: F): T 11 | def readTry(from: F): Try[T] = Try(this.read(from)) 12 | } 13 | 14 | trait Format[F, T] extends Writer[F, T] with Reader[T, F] { 15 | def roundtrip(f: F) = read(write(f)) 16 | def reverseRoundtrip(t: T) = write(read(t)) 17 | } 18 | 19 | object Format { 20 | object stringQuoter extends Quoter(Set('"')) 21 | 22 | class Quoter(val chars: Set[Char]) { 23 | def this(charString: String) = this(charString.toSet) 24 | 25 | def quote(s: String): String = { 26 | val escapedBackslashes = s.replace("\\", "\\\\") 27 | chars.foldLeft(escapedBackslashes)((unreplaced: String, char: Char) => 28 | unreplaced.replace(char.toString, "\\" + char)) 29 | } 30 | 31 | def unquote(s: String): String = { 32 | val escapedBackslashes = chars.foldLeft(s)((quoted: String, char: Char) => 33 | quoted.replace("\\" + char, char.toString)) 34 | escapedBackslashes.replace("\\\\", "\\") 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/HashCodeHelper.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | /** This object provides a function to generate a hash code 4 | * out of multiple hashable parts. 5 | * 6 | * @author Michael Schmitz 7 | */ 8 | object HashCodeHelper { 9 | def apply(parts: Any*): Int = this.apply(41)(parts: _*) 10 | def apply(prime: Int)(parts: Any*): Int = { 11 | var code = 0; 12 | for (part <- parts) { 13 | code = prime * code + part.hashCode 14 | } 15 | 16 | code 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/IdentityStemmer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | /** A trivial stemmer that doesn't apply a stemming algorithm. */ 4 | object IdentityStemmer extends Stemmer { 5 | override def stem(word: String) = word 6 | 7 | implicit def instance: Stemmer = IdentityStemmer 8 | } 9 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Lemmatized.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | case class Lemmatized[+T <: Token](token: T, lemma: String) 4 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Segmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.immutable.Interval 4 | 5 | import spray.json.DefaultJsonProtocol._ 6 | 7 | /** A sentencer breaks text into sentences. 
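 * For example, segmenting "He left. She stayed." would typically yield two Segments,
 * serialized as "He left.@0" and "She stayed.@9", where the number is the character
 * offset of each sentence in the original document.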
8 | */ 9 | abstract class Segmenter { 10 | def apply(document: String) = segment(document) 11 | 12 | def segmentTexts(document: String) = { 13 | this.segment(document).map(_.text) 14 | } 15 | 16 | def segment(document: String): Iterable[Segment] 17 | } 18 | 19 | case class Segment(text: String, offset: Int) { 20 | override def toString = serialize 21 | 22 | def interval = Interval.open(offset, offset + text.length) 23 | def length = text.length 24 | 25 | def serialize = text + "@" + offset 26 | } 27 | 28 | object Segment { 29 | private[this] val segmentRegex = """(.+)@(\d+)""".r 30 | def deserialize(pickled: String): Segment = { 31 | pickled match { 32 | case segmentRegex(string, offset) => new Segment(string, offset.toInt) 33 | case s => throw new MatchError("Could not deserialize: " + s) 34 | } 35 | } 36 | 37 | implicit val segmentJsonFormat = jsonFormat2(Segment.apply) 38 | } 39 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Stemmer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | /** A stemmer takes a string token and produces a normalized form. */ 4 | abstract class Stemmer { 5 | def apply(word: String) = lemmatize(word) 6 | 7 | /** Apply the stemming algorithm. */ 8 | def stem(word: String): String 9 | 10 | /** Stem a token without a postag. */ 11 | def stemToken[T <: Token](token: T) = Lemmatized(token, this.stem(token.string)) 12 | 13 | /** Apply the normalizing algorithm and then the stemming algorithm. */ 14 | def lemmatize(word: String) = this.stem(Stemmer.normalize(word)) 15 | 16 | /** Lemmatize a token without a postag. */ 17 | def lemmatizeToken[T <: Token](token: T) = Lemmatized(token, this.lemmatize(token.string)) 18 | } 19 | 20 | trait PostaggedStemmer { 21 | /** Some stemmers can take advantage of postags. */ 22 | def stem(word: String, postag: String): String 23 | 24 | /** Apply the normalizing algorithm and then the stemming algorithm with postag. */ 25 | def lemmatize(word: String, postag: String) = this.stem(Stemmer.normalize(word), postag) 26 | 27 | /** Stem a token with a postag. */ 28 | def stemPostaggedToken[T <: PostaggedToken](token: T): Lemmatized[T] = 29 | Lemmatized(token, this.stem(token.string, token.postag)) 30 | 31 | /** Lemmatize a token with a postag. */ 32 | def lemmatizePostaggedToken[T <: PostaggedToken](token: T): Lemmatized[T] = 33 | Lemmatized(token, this.lemmatize(token.string, token.postag)) 34 | } 35 | 36 | object Stemmer { 37 | /** Special characters to remove. */ 38 | val remove = """[()\[\].,;:"']""".r; 39 | 40 | /** Remove special characters and lowercase the string. */ 41 | def normalize(word: String) = Stemmer.remove.replaceAllIn( 42 | word.trim.toLowerCase, "" 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Token.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.immutable.Interval 4 | 5 | import spray.json._ 6 | 7 | /** The most simple representation of a token. A token has a string 8 | * and a character offset in the original text. 
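 * For example, in the text "Hello world" the token "world" has offset 6, and
 * Token.stringFormat writes it as "world 6".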
9 | * 10 | * @param string the string of the token 11 | * @param offset the character offset of the token in the source sentence 12 | */ 13 | class Token(val string: String, val offset: Int) { 14 | override def toString = Token.stringFormat.write(this) 15 | 16 | override def hashCode = HashCodeHelper(this.string, this.offset) 17 | def canEqual(that: Token) = that.isInstanceOf[Token] 18 | override def equals(that: Any) = that match { 19 | case that: Token => (that canEqual this) && 20 | this.string == that.string && 21 | this.offset == that.offset 22 | case _ => false 23 | } 24 | 25 | @deprecated("Use offsets instead.", "2.4.0") 26 | def interval = offsets 27 | 28 | def offsets = Interval.open(offset, offset + string.length) 29 | } 30 | 31 | object Token { 32 | def apply(string: String, offset: Int) = new Token(string, offset) 33 | def unapply(token: Token): Option[(String, Int)] = Some((token.string, token.offset)) 34 | 35 | object stringFormat extends Format[Token, String] { 36 | val tokenRegex = "(.*?) +([^ ]*)".r 37 | def write(token: Token): String = token.string + " " + token.offset 38 | def read(pickled: String): Token = { 39 | val (string, offset) = pickled match { 40 | case tokenRegex(string, offset) => (string, offset.toInt) 41 | case _ => throw new MatchError("Error parsing token: " + pickled) 42 | } 43 | Token(string, offset) 44 | } 45 | } 46 | 47 | implicit object tokenJsonFormat extends RootJsonFormat[Token] { 48 | def write(t: Token) = JsObject( 49 | "string" -> JsString(t.string), 50 | "offset" -> JsNumber(t.offset) 51 | ) 52 | 53 | def read(value: JsValue) = value.asJsObject.getFields("string", "offset") match { 54 | case Seq(JsString(string), JsNumber(offset)) => 55 | Token.apply(string, offset.toInt) 56 | case _ => throw new DeserializationException("Token expected.") 57 | } 58 | } 59 | 60 | def rebuildString(tokens: Iterable[Token]): String = { 61 | val str = new StringBuilder 62 | for (token <- tokens) { 63 | if (str.length < token.offset) { 64 | str.append(" " * (token.offset - str.length)) 65 | } 66 | str.replace(token.offset, token.offset + token.string.length, token.string) 67 | } 68 | str.mkString 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/ConfidenceFunction.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | import org.allenai.common.Resource.using 4 | 5 | import java.io.{ BufferedOutputStream, File, FileOutputStream, OutputStream } 6 | 7 | /** A confidence function for ranking how likely an extraction is correct. 8 | * 9 | * @tparam E the extraction to rank 10 | * @param featureSet the features to use 11 | */ 12 | abstract class ConfidenceFunction[E](val featureSet: FeatureSet[E, Double]) 13 | extends Function[E, Double] { 14 | def apply(that: E): Double 15 | 16 | def save(output: OutputStream): Unit 17 | def saveFile(file: File) { 18 | using(new BufferedOutputStream(new FileOutputStream(file))) { stream => 19 | this.save(stream) 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/ConfidenceTrainer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | /** A trainer for a confidence function. 
4 | * 5 | * @tparam E the extraction the confidence function will rank 6 | * @param featureSet the features to use 7 | */ 8 | abstract class ConfidenceTrainer[E](features: FeatureSet[E, Double]) 9 | extends Trainer[E, Double](features) { 10 | override val apply = train _ 11 | override def train(examples: Iterable[Labelled[E]]): ConfidenceFunction[E] 12 | } 13 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/Feature.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | import scala.language.implicitConversions 4 | 5 | /** An abstract representation for a feature used by the 6 | * confidence function. 7 | * 8 | * @param name a human-readable name for this feature 9 | */ 10 | abstract class Feature[E, V](val name: String) extends Function[E, V] { 11 | def apply(that: E): V 12 | } 13 | 14 | object Feature { 15 | /** A convenience factory method for creating a Feature from 16 | * an anonymous function. 17 | */ 18 | def from[E, V](name: String, f: E => V) = new Feature[E, V](name) { 19 | override def apply(that: E): V = f(that) 20 | } 21 | 22 | implicit def booleanToDouble[E](feature: Feature[E, Boolean]) = 23 | new Feature[E, Double](feature.name) { 24 | override def apply(item: E) = { 25 | if (feature(item)) { 26 | 1.0 27 | } else { 28 | 0.0 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/FeatureSet.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | import scala.collection.immutable.SortedMap 4 | 5 | /** FeatureSet represents a set of features on T that can be 6 | * represented as a double. 7 | * 8 | * @param featureMap a lookup for the features 9 | */ 10 | class FeatureSet[T, V](val featureMap: SortedMap[String, Feature[T, V]]) { 11 | def this() = this(SortedMap.empty[String, Feature[T, V]]) 12 | 13 | def apply(name: String) = featureMap(name) 14 | 15 | def featureNames(): Seq[String] = 16 | featureMap.keys.toSeq 17 | 18 | def numFeatures(): Int = 19 | featureNames.size 20 | 21 | def vectorize(example: T): Seq[V] = 22 | featureNames.map({ name => 23 | val featureFunction = featureMap(name) 24 | featureFunction(example) 25 | })(scala.collection.breakOut) 26 | } 27 | 28 | object FeatureSet { 29 | val binaryClass = true 30 | 31 | def apply[T, V](features: Iterable[Feature[T, V]]): FeatureSet[T, V] = { 32 | new FeatureSet[T, V](SortedMap.empty[String, Feature[T, V]] ++ 33 | features.map(feature => (feature.name, feature))) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/Labelled.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | /** A representation of a labelled extraction. 
4 | * 5 | * @param label whether this extraction is true or false 6 | * @param item the item labelled 7 | */ 8 | case class Labelled[E](label: Boolean, item: E) 9 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/Trainer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | abstract class Trainer[E, V](val features: FeatureSet[E, V]) { 4 | val apply = train _ 5 | def train(examples: Iterable[Labelled[E]]): Function[E, V] 6 | } 7 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/coref/CorefResolver.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.coref 2 | 3 | import org.allenai.nlpstack.core.Format.Quoter 4 | import org.allenai.nlpstack.core.{ PostaggedToken, Format, Token } 5 | import org.allenai.nlpstack.core.parse.graph.{ DependencyNode, DependencyGraph } 6 | 7 | import scala.util.matching.Regex 8 | import java.util.regex.Pattern 9 | 10 | case class Referent( 11 | val references: Seq[DependencyNode], 12 | val mainReference: Option[DependencyNode] 13 | ) 14 | 15 | abstract class CorefResolver[T <: Token] { 16 | def resolveCoreferences(postaggedParse: (Seq[T], DependencyGraph)): Seq[Referent] 17 | } 18 | 19 | object CorefResolver { 20 | object multilineStringFormat extends StringFormat("\n") 21 | object singlelineStringFormat extends StringFormat(";") 22 | 23 | class StringFormat(val separator: String) 24 | extends Format[(DependencyGraph, Seq[Referent]), String] { 25 | private val dgraphStringFormat = new DependencyGraph.StringFormat(separator) 26 | 27 | private val regex = 28 | new Regex("""^\((.*[^)])\)( refers to (.*))?$""", "list", "_", "mainRef") 29 | 30 | override def read(from: String): (DependencyGraph, Seq[Referent]) = { 31 | val parts = from.split(Pattern.quote(separator * 2), 2) 32 | require(parts.length == 2) 33 | val (dgraphString, corefString) = (parts(0), parts(1)) 34 | 35 | val dgraph = dgraphStringFormat.read(dgraphString) 36 | 37 | val coref = corefString.split(Pattern.quote(separator)).map(s => { 38 | val m = regex.findFirstMatchIn(s) 39 | require(m.isDefined) 40 | val stringReferences = m.get.group("list").split(Pattern.quote(", ")) 41 | val references = stringReferences map DependencyNode.stringFormat.read 42 | 43 | val mainReference = m.get.group("mainRef") match { 44 | case null => None 45 | case mainRefString => Some(DependencyNode.stringFormat.read(mainRefString)) 46 | } 47 | 48 | Referent(references, mainReference) 49 | }) 50 | 51 | (dgraph, coref) 52 | } 53 | 54 | override def write(from: (DependencyGraph, Seq[Referent])): String = { 55 | val (dgraph, coref) = from 56 | dgraphStringFormat.write(dgraph) + 57 | separator + 58 | separator + 59 | coref.map(r => 60 | "(%s)".format(r.references.mkString(", ")) + (r.mainReference match { 61 | case None => "" 62 | case Some(node) => " refers to %s".format(node) 63 | })).mkString(separator) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/graph/Bipath.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.graph 2 | 3 | import org.allenai.nlpstack.core.graph.Graph._ 4 | 5 | /** A representation of a path 
through a graph. The path is represented 6 | * by a list of directed edges. 7 | * 8 | * @author Michael Schmitz 9 | */ 10 | class Bipath[T](val path: List[DirectedEdge[T]]) { 11 | require(path != null) 12 | 13 | // extend Object 14 | override def toString = "[" + path.mkString(", ") + "]"; 15 | def canEqual(that: Any) = that.isInstanceOf[Bipath[_]] 16 | override def equals(that: Any) = that match { 17 | case that: Bipath[_] => (that canEqual this) && that.path == this.path 18 | case _ => false 19 | } 20 | override def hashCode = 37 * (path.hashCode + 1) 21 | 22 | /** the undirected edges of the path */ 23 | def edges: Set[Edge[T]] = path.foldRight[Set[Edge[T]]](Set()) { 24 | case (item, set) => set + item.edge 25 | } 26 | 27 | /** the unique vertices along the path */ 28 | def nodes: List[T] = path.head.start :: path.map(_.end) 29 | 30 | /** the first vertex in the path */ 31 | def start: T = path.head.start 32 | 33 | /** collapse edges in the path that match `pred` */ 34 | def collapse(pred: Edge[T] => Boolean, merge: (T, T) => T) = { 35 | if (path.forall(dep => pred(dep.edge))) { 36 | this 37 | } else { 38 | val array = path.toArray 39 | for (i <- array.indices) { 40 | val current = array(i) 41 | if (pred(current.edge)) { 42 | // TODO: sorted 43 | val merged = merge(current.start, current.end) 44 | if (current.isInstanceOf[UpEdge[_]]) { 45 | if (array.indices contains (i + 1)) { 46 | array(i + 1) = array(i + 1).switchStart(merged) 47 | } 48 | 49 | if (array.indices contains (i - 1)) { 50 | array(i - 1) = array(i - 1).switchEnd(merged) 51 | } 52 | } else if (current.isInstanceOf[DownEdge[_]]) { 53 | if (array.indices contains (i + 1)) { 54 | array(i + 1).switchStart(merged) 55 | } 56 | 57 | if (array.indices contains (i - 1)) { 58 | array(i - 1) = array(i - 1).switchEnd(merged) 59 | } 60 | } 61 | } 62 | } 63 | 64 | new Bipath(array.filter(dep => !pred(dep.edge)).toList) 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/graph/pattern/Match.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.graph.pattern 2 | 3 | import org.allenai.nlpstack.core.graph.{ Bipath, DirectedEdge, Graph } 4 | 5 | /** A representation of a match of a pattern in a graph. 
6 | * 7 | * @author Michael Schmitz 8 | */ 9 | class Match[T]( 10 | /** the pattern that was applied */ 11 | val pattern: Pattern[T], 12 | /** the matched path through the graph */ 13 | val bipath: Bipath[T], 14 | /** the pattern groups in the match */ 15 | val nodeGroups: Map[String, Match.NodeGroup[T]], 16 | val edgeGroups: Map[String, Match.EdgeGroup[T]] 17 | ) { 18 | // extend Object 19 | override def toString = bipath.toString + ": " + nodeGroups.toString + " and " + 20 | edgeGroups.toString 21 | 22 | def groups: Map[String, Match.Group] = nodeGroups ++ edgeGroups 23 | 24 | def nodes: Iterable[T] = bipath.nodes 25 | def edges: Iterable[Graph.Edge[T]] = bipath.edges 26 | } 27 | 28 | object Match { 29 | sealed abstract class Group(val text: String) 30 | case class NodeGroup[T](node: T, matchText: String) extends Group(matchText) 31 | case class EdgeGroup[T](dedge: DirectedEdge[T], matchText: String) extends Group(matchText) 32 | } 33 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/headword/HeadExtractor.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.headword 2 | 3 | import org.allenai.nlpstack.core.{ PostaggedToken, Postagger, Tokenizer } 4 | 5 | trait HeadExtractor { 6 | 7 | /** Given a string representing a relation, will return those 8 | * tokens comprising the headword(s) of the relation, possibly empty if 9 | * headword(s) couldn't be determined. 10 | */ 11 | def relationHead( 12 | tokenizer: Tokenizer, postagger: Postagger 13 | )(relation: String): Seq[PostaggedToken] 14 | 15 | /** Given a string representing an argument, will return those 16 | * tokens comprising the headword(s) of the relation, possibly empty if 17 | * headword(s) couldn't be determined. 18 | */ 19 | def argumentHead( 20 | tokenizer: Tokenizer, postagger: Postagger 21 | )(argument: String): Seq[PostaggedToken] 22 | 23 | /** Given a Seq[PostaggedToken] representing a relation, will return those 24 | * tokens comprising the headword(s) of the relation, possibly empty if 25 | * headword(s) couldn't be determined. 26 | */ 27 | def relationHead(tokens: Seq[PostaggedToken]): Seq[PostaggedToken] 28 | 29 | /** Given a Seq[PostaggedToken] representing an argument, will return those 30 | * tokens comprising the headword(s) of the argument, possibly empty if 31 | * headword(s) couldn't be determined. 
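 * (For a noun phrase such as "the big red barn" the expected head would be the single
 * token "barn"; the exact result depends on the extractor implementation.)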
32 | */ 33 | def argumentHead(tokens: Seq[PostaggedToken]): Seq[PostaggedToken] 34 | } 35 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/Dependency.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse.graph 2 | 3 | import org.allenai.nlpstack.core.Format 4 | 5 | import scala.util.matching.Regex 6 | 7 | object Dependency { 8 | val Serialized = 9 | new Regex("""(\p{Graph}+)\(\s*(\p{Graph}*?-\d\d*?),\s*(\p{Graph}*?-\d\d*)\s*\)""") 10 | 11 | implicit object DependencyOrdering extends Ordering[Dependency] { 12 | def compare(a: Dependency, b: Dependency) = { 13 | def tupled(x: Dependency) = (x.dest.id, x.source.id) 14 | 15 | implicitly[Ordering[(Int, Int)]].compare(tupled(a), tupled(b)) 16 | } 17 | } 18 | 19 | object stringFormat extends Format[Dependency, String] { 20 | def write(dep: Dependency): String = { 21 | dep.label + "(" + DependencyNode.stringFormat.write(dep.source) + ", " + 22 | DependencyNode.stringFormat.write(dep.dest) + ")" 23 | } 24 | 25 | def read(pickled: String): Dependency = try { 26 | val Serialized(label, source, dest) = pickled 27 | new Dependency( 28 | DependencyNode.stringFormat.read(source), 29 | DependencyNode.stringFormat.read(dest), 30 | label 31 | ) 32 | } catch { 33 | case e: Throwable => 34 | throw new Dependency.SerializationException( 35 | "could not deserialize dependency: " + pickled, 36 | e 37 | ) 38 | } 39 | } 40 | 41 | @deprecated("Use stringFormat instead.", "2.4.5") 42 | def deserialize(string: String) = stringFormat.read(string) 43 | 44 | class SerializationException(message: String, cause: Throwable) 45 | extends RuntimeException(message, cause) 46 | } 47 | 48 | object Dependencies { 49 | def serialize(deps: Iterable[Dependency]) = { 50 | deps.iterator.map { 51 | Dependency.stringFormat.write(_) 52 | }.mkString("; ") 53 | } 54 | def deserialize(string: String): Seq[Dependency] = string.split("""\s*(?:;|\n)\s*"""). 55 | map(Dependency.stringFormat.read(_)) 56 | 57 | object DependencyOrdering extends Ordering[Dependency] { 58 | def compare(a: Dependency, b: Dependency) = { 59 | def tuplize(dep: Dependency) = 60 | (dep.source.id, dep.dest.id, dep.label) 61 | implicitly[Ordering[(Int, Int, String)]].compare(tuplize(a), tuplize(b)) 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/DependencyNode.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse.graph 2 | 3 | import org.allenai.nlpstack.core.Format 4 | 5 | import spray.json.DefaultJsonProtocol._ 6 | 7 | import scala.util.matching.Regex 8 | 9 | /** A representation for a node in the graph of dependencies. A node 10 | * represents one or more adjacent tokens in the source sentence. 
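
A small usage sketch for the string serialization defined below; the behaviour matches the round-trip checks in DependencyNodeSpec later in this listing:

import org.allenai.nlpstack.core.parse.graph.DependencyNode

val node = DependencyNode(9, "reflection")
val pickled = DependencyNode.stringFormat.write(node) // "reflection-9"
assert(DependencyNode.stringFormat.read(pickled) == node)
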
11 | */ 12 | case class DependencyNode(val id: Int, val string: String) { 13 | require(string != null) 14 | 15 | // extend Object 16 | override def toString() = s"$string-$id" 17 | } 18 | 19 | object DependencyNode { 20 | implicit object DependencyNodeOrdering extends Ordering[DependencyNode] { 21 | def compare(a: DependencyNode, b: DependencyNode) = a.id compare b.id 22 | } 23 | 24 | object stringFormat extends Format[DependencyNode, String] { 25 | val Serialized = new Regex("""(\p{Graph}*?)-(\d\d*)""") 26 | def write(node: DependencyNode): String = { 27 | val cleanText = node.string.replaceAll("[[_()][^\\p{Graph}]]", "") 28 | Iterator(cleanText, node.id).mkString("-") 29 | } 30 | 31 | def read(pickled: String): DependencyNode = { 32 | val (text, id) = pickled match { 33 | case Serialized(text, id) => (text, id) 34 | case _ => throw new MatchError("Could not split pickled node into parts: " + pickled) 35 | } 36 | 37 | new DependencyNode(id.toInt, text) 38 | } 39 | } 40 | 41 | implicit val dependencyNodeJsonFormat = jsonFormat2(DependencyNode.apply) 42 | 43 | @deprecated("Use StringFormat instead.", "2.4.5") 44 | def deserialize(string: String) = { 45 | stringFormat.read(string) 46 | } 47 | 48 | class SerializationException(message: String, cause: Throwable) 49 | extends RuntimeException(message, cause) 50 | } 51 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/JoinedDependencyNode.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse.graph 2 | 3 | import org.allenai.common.immutable.Interval 4 | import org.allenai.nlpstack.core.graph.Graph 5 | 6 | /** A representation for a node in the graph of dependencies. A node 7 | * represents one or more adjacent tokens in the source sentence. 8 | */ 9 | case class JoinedDependencyNode(val ids: Seq[Int], val strings: Seq[String]) { 10 | require(!ids.isEmpty) 11 | require(!strings.isEmpty) 12 | 13 | def string = strings.mkString(" ") 14 | 15 | def span = Interval.closed(ids.min, ids.max) 16 | 17 | // extend Object 18 | override def toString() = s"${strings.mkString(" ")}-${ids.mkString(",")}" 19 | } 20 | 21 | object JoinedDependencyNode { 22 | def from(node: DependencyNode) = JoinedDependencyNode(Seq(node.id), Seq(node.string)) 23 | 24 | /** Merge nodes that correspond to adjacent tokens. 
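
Before the merge helper documented below, a quick sketch of the accessors defined on JoinedDependencyNode above (ids and strings must be non-empty):

import org.allenai.nlpstack.core.parse.graph.{ DependencyNode, JoinedDependencyNode }

val joined = JoinedDependencyNode(Seq(6, 7), Seq("New", "York"))
joined.string   // "New York"
joined.span     // Interval.closed(6, 7)
joined.toString // "New York-6,7"

// Lift a single DependencyNode into a JoinedDependencyNode.
JoinedDependencyNode.from(DependencyNode(3, "city"))
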
25 | * 26 | * @throws IllegalArgumentException there is no superior of the set 27 | * @return the superior node of the set 28 | */ 29 | implicit def directedMerge( 30 | graph: Graph[JoinedDependencyNode] 31 | )(nodes: Traversable[JoinedDependencyNode]) = { 32 | if (nodes.isEmpty) throw new IllegalArgumentException("argument nodes empty") 33 | val sorted = nodes.toList.sortBy(_.span) 34 | val strings = sorted.map(_.string) 35 | 36 | // ensure the nodes are adjacent in the source sentence 37 | // or at least that the spans are 38 | val spans = sorted.map(_.span) 39 | if (!(Interval.span(spans) forall (point => spans.exists(span => span contains point)))) { 40 | throw new IllegalArgumentException("A set of non-adjacent intervals cannot be merged: " + 41 | nodes.mkString(", ")) 42 | } 43 | 44 | new JoinedDependencyNode(sorted.flatMap(_.ids).sorted, strings) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/TokenDependencyNode.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse.graph 2 | 3 | import org.allenai.nlpstack.core._ 4 | 5 | /** A representation for a node in the graph of dependencies. A node 6 | * represents one or more adjacent tokens in the source sentence. 7 | */ 8 | case class TokenDependencyNode(val id: Int, val lemmatizedToken: Lemmatized[PostaggedToken]) { 9 | def string = token.string 10 | def postag = token.postag 11 | def lemma = lemmatizedToken.lemma 12 | 13 | def token: PostaggedToken = lemmatizedToken.token 14 | 15 | // extend Object 16 | override def toString() = s"$string-$id" 17 | } 18 | 19 | object TokenDependencyNode { 20 | def from(tokens: Seq[Lemmatized[PostaggedToken]])(node: DependencyNode) = 21 | TokenDependencyNode(node.id, tokens(node.id)) 22 | } 23 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse 2 | 3 | import org.allenai.nlpstack.core.graph.Graph.Edge 4 | 5 | package object graph { 6 | type Dependency = Edge[DependencyNode] 7 | } 8 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/remote/Remote.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.remote 2 | 3 | import dispatch.{ Http, as, url } 4 | 5 | import scala.concurrent.duration.DurationInt 6 | import scala.concurrent.{ Await, ExecutionContext } 7 | 8 | trait Remote { 9 | def urlString: String 10 | def timeout = 5.minutes 11 | 12 | val svc = url(urlString) 13 | 14 | def post(string: String)(implicit executor: ExecutionContext) = 15 | Await.result(Http(svc << string OK as.String), timeout) 16 | } 17 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/remote/RemoteDependencyParser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.remote 2 | 3 | import org.allenai.nlpstack.core.DependencyParser 4 | 5 | import scala.concurrent.ExecutionContext 6 | 7 | class RemoteDependencyParser( 8 | val urlString: String 9 | )(implicit executionContext: ExecutionContext) extends 
Remote { 10 | def dependencyGraph(sentence: String) = { 11 | val response = post(sentence) 12 | 13 | DependencyParser.multilineStringFormat.read(response) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/remote/RemoteSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.remote 2 | 3 | import org.allenai.nlpstack.core.{ Segment, Segmenter } 4 | 5 | import scala.concurrent.ExecutionContext 6 | 7 | class RemoteSegmenter( 8 | val urlString: String 9 | )(implicit executionContext: ExecutionContext) extends Segmenter with Remote { 10 | def segment(sentence: String) = { 11 | val response = this.post(sentence) 12 | response.split("\\n").map(Segment.deserialize)(scala.collection.breakOut) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/remote/RemoteStemmer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.remote 2 | 3 | import org.allenai.nlpstack.core.Stemmer 4 | 5 | import scala.concurrent.ExecutionContext 6 | 7 | class RemoteStemmer( 8 | val urlString: String 9 | )(implicit executionContext: ExecutionContext) extends Stemmer with Remote { 10 | override def stem(word: String) = { 11 | post(word) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Chunks.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.common.immutable.Interval 4 | import org.allenai.nlpstack.core.ChunkedToken 5 | 6 | trait ChunksSupertrait extends PostagsSupertrait { 7 | this: Sentence => 8 | 9 | type token <: ChunkedToken 10 | 11 | def chunks: Seq[String] = tokens.map(_.chunk) 12 | def chunkIntervals: Seq[(String, Interval)] = 13 | org.allenai.nlpstack.core.Chunker.intervals(tokens) 14 | } 15 | 16 | trait Chunks extends ChunksSupertrait { 17 | this: Sentence => 18 | 19 | type token = ChunkedToken 20 | } 21 | 22 | trait Chunker extends Chunks { 23 | this: Sentence => 24 | 25 | def tokenizer: org.allenai.nlpstack.core.Tokenizer 26 | def postagger: org.allenai.nlpstack.core.Postagger 27 | def chunker: org.allenai.nlpstack.core.Chunker 28 | 29 | override lazy val tokens: Seq[ChunkedToken] = 30 | chunker.chunk(tokenizer, postagger)(this.text) 31 | } 32 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Document.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | class Document(val text: String) { 4 | override def toString = { 5 | if (text.length > 80) { 6 | s"Document(${text.take(80) + "..."})" 7 | } else { 8 | s"Document($text)" 9 | } 10 | } 11 | 12 | def canEqual(that: Document) = that.isInstanceOf[Document] 13 | override def equals(that: Any) = that match { 14 | case that: Document => (that canEqual this) && this.text == that.text 15 | } 16 | override def hashCode = text.hashCode 17 | } 18 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Lemmas.scala: 
-------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.nlpstack.core.Stemmer 4 | 5 | trait Lemmas { 6 | tokenized: TokensSupertrait => 7 | 8 | def lemmatizedTokens: Seq[org.allenai.nlpstack.core.Lemmatized[token]] 9 | } 10 | 11 | trait Lemmatizer extends Lemmas { 12 | tokenized: TokensSupertrait => 13 | 14 | def lemmatizer: Stemmer 15 | 16 | override lazy val lemmatizedTokens: Seq[org.allenai.nlpstack.core.Lemmatized[token]] = 17 | tokenized.tokens map lemmatizer.lemmatizeToken 18 | } 19 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Postags.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.nlpstack.core.PostaggedToken 4 | 5 | trait PostagsSupertrait extends TokensSupertrait { 6 | this: Sentence => 7 | 8 | type token <: PostaggedToken 9 | 10 | def postags: Seq[String] = tokens.map(_.postag) 11 | } 12 | 13 | trait Postags extends PostagsSupertrait { 14 | this: Sentence => 15 | 16 | type token = PostaggedToken 17 | } 18 | 19 | trait Postagger extends Postags { 20 | this: Sentence => 21 | def tokenizer: org.allenai.nlpstack.core.Tokenizer 22 | def postagger: org.allenai.nlpstack.core.Postagger 23 | 24 | override lazy val tokens: Seq[PostaggedToken] = 25 | postagger.postag(tokenizer)(this.text) 26 | } 27 | 28 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Sentence.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | class Sentence(val text: String) { 4 | override def toString = s"Sentence($text)" 5 | 6 | def canEqual(that: Sentence) = that.isInstanceOf[Sentence] 7 | override def equals(that: Any) = that match { 8 | case that: Sentence => (that canEqual this) && this.text == that.text 9 | } 10 | override def hashCode = text.hashCode 11 | } 12 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Sentenced.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.nlpstack.core.Segmenter 4 | 5 | case class DocumentSentence[S <: Sentence](sentence: S, offset: Int) 6 | 7 | trait Sentenced[S <: Sentence] { 8 | this: Document => 9 | 10 | def sentences: Stream[DocumentSentence[S]] 11 | } 12 | 13 | trait Sentencer[S <: Sentence] extends Sentenced[S] { 14 | this: Document => 15 | 16 | def constructor(text: String): S 17 | def sentencer: Segmenter 18 | 19 | override lazy val sentences: Stream[DocumentSentence[S]] = 20 | sentencer(text).toStream.map { segment => 21 | DocumentSentence(constructor(segment.text), segment.offset) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Tokens.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.nlpstack.core.Token 4 | 5 | trait TokensSupertrait { 6 | this: Sentence => 7 | type token <: Token 8 | 9 | def tokens: Seq[token] 10 | 11 | def strings: Seq[String] = tokens.map(_.string) 12 | } 13 | 14 | trait 
Tokens extends TokensSupertrait { 15 | this: Sentence => 16 | type token = Token 17 | } 18 | 19 | trait Tokenizer extends Tokens { 20 | this: Sentence => 21 | 22 | def tokenizer: org.allenai.nlpstack.core.Tokenizer 23 | 24 | override lazy val tokens: Seq[Token] = 25 | tokenizer.tokenize(text) 26 | } 27 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/srl/RemoteSrl.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.srl 2 | 3 | import org.allenai.nlpstack.core.DependencyParser 4 | import org.allenai.nlpstack.core.parse.graph._ 5 | import org.allenai.nlpstack.core.PostaggedToken 6 | import org.allenai.nlpstack.core.remote.Remote 7 | 8 | import scala.concurrent.ExecutionContext 9 | 10 | class RemoteSrl(val urlString: String)(implicit executionContext: ExecutionContext) 11 | extends Srl with Remote { 12 | def apply(tokens: Seq[PostaggedToken], dgraph: DependencyGraph) = { 13 | val response = this.post(DependencyParser.multilineStringFormat.write(tokens -> dgraph)) 14 | if (response.isEmpty) { 15 | Seq.empty 16 | } else { 17 | response.split("\\n").map(Frame.deserialize(dgraph))(scala.collection.breakOut) 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/srl/Srl.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.srl 2 | 3 | import org.allenai.nlpstack.core.parse.graph.DependencyGraph 4 | import org.allenai.nlpstack.core.PostaggedToken 5 | 6 | abstract class Srl { 7 | def apply(tokens: Seq[PostaggedToken], graph: DependencyGraph): Seq[Frame] 8 | } 9 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/typer/Typer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.typer 2 | 3 | import org.allenai.common.immutable.Interval 4 | import org.allenai.nlpstack.core.Token 5 | 6 | abstract class Typer[E <: Token] { 7 | def name: String 8 | def source: String 9 | 10 | def apply(seq: Seq[E]): Seq[Type] 11 | } 12 | 13 | abstract class Type { 14 | def name: String 15 | def source: String 16 | def tokenInterval: Interval 17 | def text: String 18 | 19 | def matchText[E <: Token](seq: Seq[E]): String = 20 | seq.iterator.slice(tokenInterval.start, tokenInterval.end).map(_.string).mkString(" ") 21 | def tokens[E <: Token](seq: Seq[E]): Seq[E] = seq.slice(tokenInterval.start, tokenInterval.end) 22 | } 23 | 24 | object Type { 25 | def apply(name: String, source: String, tokenInterval: Interval, text: String): Type = { 26 | this.create(name, source, tokenInterval, text) 27 | } 28 | 29 | def create(name: String, source: String, tokenInterval: Interval, text: String): Type = { 30 | TypeImpl(name, source, tokenInterval, text) 31 | } 32 | 33 | private case class TypeImpl( 34 | val name: String, 35 | val source: String, 36 | val tokenInterval: Interval, 37 | val text: String 38 | ) extends Type 39 | } 40 | -------------------------------------------------------------------------------- /tools/core/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 
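
Taken together, the repr traits listed above let a caller assemble a Sentence whose annotations come from pluggable analyzers. A minimal sketch follows; the default* analyzer instances are assumed to live in the tokenize, postag and chunk modules (by analogy with defaultDependencyParser in the parse package object later in this listing), so treat those names as illustrative:

import org.allenai.nlpstack.core.repr._

val sentence = new Sentence("John walks down the hall.") with Chunker {
  val tokenizer = org.allenai.nlpstack.tokenize.defaultTokenizer // assumed instance
  val postagger = org.allenai.nlpstack.postag.defaultPostagger   // assumed instance
  val chunker = org.allenai.nlpstack.chunk.defaultChunker        // assumed instance
}

sentence.strings        // token strings: Seq("John", "walks", "down", "the", "hall", ".")
sentence.postags        // one POS tag per token
sentence.chunkIntervals // (chunk label, token interval) pairs

Because tokens is declared as a lazy val in the Chunker trait, nothing is computed until one of these fields is first accessed.
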
-------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/ChunkerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class ChunkerSpec extends UnitSpec { 6 | "chunker" should "determine intervals correctly." in { 7 | val strings = "John very quickly ran away from the deep blue reflection in the mirror .".split(" ") 8 | val postags = "NNP RB RB VBD RB IN DT JJ JJ NN IN DT NN .".split(" ") 9 | val chunks = "B-NP B-ADVP B-ADVP B-VP B-ADVP B-PP B-NP I-NP I-NP I-NP B-PP B-NP I-NP O".split(" ") 10 | 11 | val text = "John very quickly ran away from the deep blue reflection in the mirror." 12 | val tokens = Tokenizer.computeOffsets(strings, text) 13 | val chunkedTokens = Chunker.tokensFrom(chunks, postags, tokens) 14 | 15 | Chunker.intervals(chunkedTokens).map(_.toString) should contain theSameElementsAs ( 16 | List( 17 | "(NP,{0})", 18 | "(ADVP,{1})", 19 | "(ADVP,{2})", 20 | "(VP,{3})", 21 | "(ADVP,{4})", 22 | "(PP,{5})", 23 | "(NP,[6, 10))", 24 | "(PP,{10})", 25 | "(NP,[11, 13))", 26 | "(O,{13})" 27 | ) 28 | ) 29 | } 30 | 31 | it should "join of" in { 32 | val strings = "John 's dog ate at the University of Washington".split(" ") 33 | val postags = "NNP POS NN VBD IN DT NNP IN NNP".split(" ") 34 | val chunks = "B-NP B-NP I-NP V-BP B-PP B-NP I-NP B-PP B-NP".split(" ") 35 | 36 | val text = "John's dog ate at the University of Washington." 37 | val tokens = Tokenizer.computeOffsets(strings, text) 38 | val chunkedTokens = Chunker.tokensFrom(chunks, postags, tokens) 39 | 40 | Chunker.joinOf(chunkedTokens).map(_.chunk).mkString(" ") === 41 | "B-NP B-NP I-NP V-BP B-PP B-NP I-NP I-NP I-NP" 42 | } 43 | 44 | it should "join possessives" in { 45 | val strings = "John 's dog ate at the University of Washington".split(" ") 46 | val postags = "NNP POS NN VBD IN DT NNP IN NNP".split(" ") 47 | val chunks = "B-NP B-NP I-NP V-BP B-PP B-NP I-NP B-PP B-NP".split(" ") 48 | 49 | val text = "John's dog ate at the University of Washington." 
50 | val tokens = Tokenizer.computeOffsets(strings, text) 51 | val chunkedTokens = Chunker.tokensFrom(chunks, postags, tokens) 52 | 53 | Chunker.joinPos(chunkedTokens).map(_.chunk).mkString(" ") === 54 | "B-NP I-NP I-NP V-BP B-PP B-NP I-NP B-PP B-NP" 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/CorefResolverSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.coref.{ CorefResolver, Referent } 5 | import org.allenai.nlpstack.core.parse.graph.DependencyGraph 6 | 7 | class CorefResolverSpec extends UnitSpec { 8 | "CorefResolverSerialization" should "round trip through serialization" in { 9 | val dgraphString = 10 | """|det(amphibians-3, The-1) 11 | |amod(amphibians-3, first-2) 12 | |nn(.-25, amphibians-3) 13 | |vmod(amphibians-3, evolved-4) 14 | |aux(move-6, to-5) 15 | |xcomp(evolved-4, move-6) 16 | |prep(move-6, out-7) 17 | |pcomp(out-7, of-8) 18 | |nsubj(.-25, the-9) 19 | |nn(.-25, water-10) 20 | |cc(water-10, and-11) 21 | |nn(land-13, colonize-12) 22 | |conj(water-10, land-13) 23 | |punct(water-10, ,-14) 24 | |cc(water-10, but-15) 25 | |nsubj(had-17, they-16) 26 | |rcmod(water-10, had-17) 27 | |aux(return-19, to-18) 28 | |xcomp(had-17, return-19) 29 | |prep(return-19, to-20) 30 | |det(water-22, the-21) 31 | |pobj(to-20, water-22) 32 | |aux(reproduce-24, to-23) 33 | |vmod(water-22, reproduce-24) 34 | |root(ROOT-0, .-25)""".stripMargin 35 | val dgraph = DependencyGraph.multilineStringFormat.read(dgraphString) 36 | 37 | // minus 1 because the dgraph's serialization format increases the numbers 38 | // by one 39 | val amphibians = dgraph.nodeById(3 - 1).get 40 | val they = dgraph.nodeById(16 - 1).get 41 | 42 | for ( 43 | format <- Seq(CorefResolver.multilineStringFormat, CorefResolver.singlelineStringFormat); 44 | mainReference <- Seq(Some(amphibians), None) 45 | ) { 46 | val coref = Seq(Referent(Seq(amphibians, they), mainReference)) 47 | val corefString = format.write((dgraph, coref)) 48 | val newCoref = format.read(corefString) 49 | assert((dgraph, coref) === newCoref) 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/DependencyNodeSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.parse.graph.DependencyNode 5 | 6 | class DependencyNodeSpec extends UnitSpec { 7 | "DependencyNode" should "round trip through string serialization when it contains a hyphen" in { 8 | val pickledDepNode = "Co-Redemptrix-13" 9 | val depNode = DependencyNode.stringFormat.read(pickledDepNode) 10 | val repickled = DependencyNode.stringFormat.write(depNode) 11 | 12 | assert(pickledDepNode === repickled) 13 | } 14 | 15 | "DependencyNode" should "round trip through json serialization" in { 16 | val node = new DependencyNode(4, "Michael") 17 | val pickled = DependencyNode.dependencyNodeJsonFormat.write(node) 18 | val unpickled = DependencyNode.dependencyNodeJsonFormat.read(pickled) 19 | 20 | assert(node === unpickled) 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- 
/tools/core/src/test/scala/org/allenai/nlpstack/core/DependencySpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.parse.graph.Dependency 5 | 6 | class DependencySpec extends UnitSpec { 7 | "Dependency" should "round trip through serialization" in { 8 | val pickledDep = "det(reflection-9, the-6)" 9 | val dep = Dependency.stringFormat.read(pickledDep) 10 | val repickled = Dependency.stringFormat.write(dep) 11 | 12 | assert(pickledDep === repickled) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/FormatSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.Format.Quoter 5 | 6 | class FormatSpec extends UnitSpec { 7 | "stringQuoter" should "quote strings" in { 8 | assert(Format.stringQuoter.quote("A 3\" diameter") === "A 3\\\" diameter") 9 | assert(Format.stringQuoter.quote("C:\\Windows\\System32") === "C:\\\\Windows\\\\System32") 10 | } 11 | 12 | "custom Quoter" should "quote strings" in { 13 | val q = new Quoter(";\"") 14 | val unquoted = "To be; Or \\not\\ \"to be\"" 15 | val quoted = "To be\\; Or \\\\not\\\\ \\\"to be\\\"" 16 | assert(q.quote(unquoted) == quoted) 17 | assert(q.unquote(quoted) == unquoted) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/TokenizerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class TokenizerSpecTest extends UnitSpec { 6 | "tokenizer" should "compute offsets correctly and infer the original text" in { 7 | val sentence = "John walks down the hall." 8 | val tokens = Tokenizer.computeOffsets(Seq("John", "walks", "down", "the", "hall", "."), sentence) 9 | 10 | // make sure offsets were computed correctly 11 | assert(tokens.map(_.offsets.start) === Seq(0, 5, 11, 16, 20, 24)) 12 | 13 | // make sure we can go back to the original sentence 14 | assert(Tokenizer.originalText(tokens) === sentence) 15 | } 16 | 17 | it should "compute offsets correctly and infer the original text when there is a leading space" in { 18 | val sentence = " John walks down the hall." 19 | val tokens = Tokenizer.computeOffsets(Seq("John", "walks", "down", "the", "hall", "."), sentence) 20 | 21 | // make sure offsets were computed correctly 22 | assert(tokens.map(_.offsets.start) === Seq(2, 7, 13, 18, 22, 26)) 23 | 24 | // make sure we can go back to the original sentence 25 | assert(Tokenizer.originalText(tokens) === sentence) 26 | } 27 | 28 | it should "trim original text correctly when a start offset is specified" in { 29 | val sentence = " John walks down the hall." 30 | val trimmedSentence = "John walks down the hall." 
31 | val tokens = Tokenizer.computeOffsets(Seq("John", "walks", "down", "the", "hall", "."), sentence) 32 | 33 | // make sure offsets were computed correctly 34 | assert(tokens.map(_.offsets.start) === Seq(2, 7, 13, 18, 22, 26)) 35 | 36 | // make sure we can go back to the original sentence 37 | assert(Tokenizer.originalText(tokens, tokens.head.offset) === trimmedSentence) 38 | } 39 | 40 | it should "throw an exception if tokens are out of order" in { 41 | val tokens = Seq( 42 | new Token("large-scale", 0), 43 | new Token("large", 0), 44 | new Token("scale", 6) 45 | ) 46 | 47 | a[IllegalArgumentException] should be thrownBy { 48 | Tokenizer.originalText(tokens, 10) 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/headword/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | libraryDependencies ++= loggingDependencies 4 | -------------------------------------------------------------------------------- /tools/headword/src/test/scala/JwiToolsSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.headword 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class JwiToolsSpec extends UnitSpec { 6 | 7 | val jwiTools = new JwiTools() 8 | 9 | "JwiTools" should "correctly stem a word" in { 10 | val word = "elephants" 11 | val stem = jwiTools.stem(word) 12 | assert(stem === "elephant") 13 | } 14 | 15 | it should "throw an exception if wordnet path is invalid" in { 16 | a[IllegalArgumentException] should be thrownBy { 17 | new JwiTools("foo/bar") 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tools/lemmatize/LICENSE: -------------------------------------------------------------------------------- 1 | This licence accompanied the original LEX files from which Morpha.java was 2 | generated. 3 | 4 | All supplemental code is copyright under Apache 2.0. 5 | 6 | Copyright (c) 1995-2001 University of Sheffield, University of Sussex 7 | All rights reserved. 8 | 9 | Redistribution and use of source and derived binary forms are permitted 10 | provided that: 11 | - they are not used in commercial products 12 | - the above copyright notice and this paragraph are duplicated in 13 | all such forms 14 | - any documentation, advertising materials, and other materials 15 | related to such distribution and use acknowledge that the software 16 | was developed by Kevin Humphreys and John 17 | Carroll and Guido Minnen 18 | and refer to the following related 19 | publication: 20 | 21 | Guido Minnen, John Carroll and Darren Pearce. 2000. Robust, Applied 22 | Morphological Generation. In Proceedings of the First International 23 | Natural Language Generation Conference (INLG), Mitzpe Ramon, Israel. 24 | 201-208. 25 | 26 | The name of University of Sheffield may not be used to endorse or 27 | promote products derived from this software without specific prior 28 | written permission. 29 | 30 | This software is provided "as is" and without any express or 31 | implied warranties, including, without limitation, the implied 32 | warranties of merchantibility and fitness for a particular purpose. 33 | 34 | If you make any changes, the authors would appreciate it 35 | if you sent them details of what you have done. 
36 | -------------------------------------------------------------------------------- /tools/lemmatize/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | libraryDependencies ++= loggingDependencies 4 | -------------------------------------------------------------------------------- /tools/lemmatize/src/main/scala/org/allenai/nlpstack/lemmatize/MorphaStemmer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.lemmatize 2 | 3 | import org.allenai.nlpstack.core.{ PostaggedStemmer, Stemmer } 4 | 5 | import edu.washington.cs.knowitall.morpha.{ MorphaStemmer => MorphaStem } 6 | 7 | /** This stemmer handles many cases, but the JFlex is 5 MB. */ 8 | class MorphaStemmer extends Stemmer with PostaggedStemmer { 9 | private val whitespace = "\\s".r 10 | 11 | private def stem(word: String, stemmer: (String => String)) = 12 | if (whitespace.findFirstMatchIn(word).isDefined) { 13 | word 14 | } else { 15 | stemmer(word) 16 | } 17 | 18 | def stem(word: String) = stem(word, MorphaStem.stemToken(_)) 19 | override def stem(word: String, postag: String) = 20 | stem(word, MorphaStem.stemToken(_, postag)) 21 | } 22 | 23 | /** MorphaStemmer is threadsafe. Clients can use this global instance. */ 24 | object MorphaStemmer extends MorphaStemmer 25 | -------------------------------------------------------------------------------- /tools/lemmatize/src/main/universal/lemmatize-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.lemmatize.MorphaStemmer" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx128M" 9 | 10 | . 
"${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/lemmatize/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/lemmatize/src/test/scala/org/allenai/nlpstack/lemmatize/MorphaLemmatizerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.lemmatize 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class MorphaLemmatizerSpec extends UnitSpec { 6 | "lemmatizer" should "correctly lemmatize a word" in { 7 | val word = "ate" 8 | val lemma = MorphaStemmer.lemmatize(word) 9 | assert(lemma === "eat") 10 | } 11 | 12 | it should "not lemmatize a word with spaces" in { 13 | val wordWithSpace = "29 1/2" 14 | assert(MorphaStemmer.lemmatize(wordWithSpace) === wordWithSpace) 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /tools/parse/build.sbt: -------------------------------------------------------------------------------- 1 | parallelExecution in ThisBuild := false 2 | 3 | javaOptions += "-XX:ReservedCodeCacheSize=512M" 4 | 5 | javaOptions += "-Xmx16G" 6 | 7 | // uncomment if you want to train the parser 8 | javaOptions += "-Xss800m" 9 | 10 | fork in test := true 11 | -------------------------------------------------------------------------------- /tools/parse/conf/deploy.conf: -------------------------------------------------------------------------------- 1 | clearParser = { 2 | 3 | project = { 4 | subdirectory = "parse/clear" 5 | name = "parse-clear" 6 | } 7 | 8 | deploy = { 9 | startup_script = "bin/clear-parse-server.sh" 10 | directory = "/local/deploy/nlptools-clear-parser" 11 | user.ssh_username = "ec2-user" 12 | } 13 | 14 | 15 | // For now set this on the command line via -Ddeploy.host=ec2-54-200-156-107.us-west-2.compute.amazonaws.com 16 | // TODO is to get a real aname for nlptools and configure it here. 
17 | // deploy.host = 18 | } 19 | -------------------------------------------------------------------------------- /tools/parse/jvm.sbt: -------------------------------------------------------------------------------- 1 | fork := true 2 | -------------------------------------------------------------------------------- /tools/parse/src/main/resources/featuretaggers.config: -------------------------------------------------------------------------------- 1 | verbnet { 2 | group: "org.allenai.nlp.resources" 3 | name: "verbnet-3.2" 4 | version: 1 5 | } 6 | googleUnigram { 7 | group: "org.allenai.nlp.resources" 8 | name: "googleNgramsNodes-20130501-freq1000Filtered" 9 | version: 1 10 | features: [ "depLabel", "posTag" ] 11 | } 12 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.DependencyParser 4 | 5 | package object parse { 6 | val defaultDependencyParser: DependencyParser = new FactorieParser 7 | } 8 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/core/AnnotatedSentence.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.{ FeatureName, FeatureVector } 4 | import reming.DefaultJsonProtocol._ 5 | 6 | /** An AnnotatedSentence is a sentence whose tokens are each annotated with a feature 7 | * vector. 8 | * 9 | * @param sentence the unannotated sentence 10 | * @param annotation an indexed sequence, of which the nth element is the feature vector for 11 | * the nth token of the sentence 12 | */ 13 | case class AnnotatedSentence(sentence: Sentence, annotation: IndexedSeq[FeatureVector]) 14 | 15 | object AnnotatedSentence { 16 | implicit val annotatedSentenceJsonFormat = jsonFormat2(AnnotatedSentence.apply) 17 | 18 | /** Converts a TaggedSentence into an AnnotatedSentence by making simple features from 19 | * the tags. 20 | * 21 | * @param tagged the original tagged sentence 22 | * @return an annotated sentence (with feature vectors derived from the tags) 23 | */ 24 | def annotate(tagged: TaggedSentence): AnnotatedSentence = { 25 | AnnotatedSentence( 26 | tagged.sentence, 27 | Range(0, tagged.sentence.size) map { tokenIndex => 28 | FeatureVector( 29 | tagged.tags.getOrElse(tokenIndex, Set[TokenTag]()).toSeq 30 | map { tag => 31 | FeatureName(Seq(tag.name, tag.value)) -> 1.0 32 | } 33 | ) 34 | } 35 | ) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/core/TaggedSentence.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.nlpstack.parse.poly.fsm.{ Sculpture, SculptureSource } 4 | 5 | /** A TaggedSentence is a sentence accompanied by a map that assigns tags to its tokens. 6 | * 7 | * Specifically, the `tags` field maps each token index to a set of TokenTag objects corresponding 8 | * to that token. 
9 | * 10 | * @param sentence the untagged sentence 11 | * @param tags maps each token index to a set of TokenTag objects 12 | */ 13 | case class TaggedSentence(sentence: Sentence, tags: Map[Int, Set[TokenTag]]) extends Sculpture { 14 | override val marbleBlock = sentence 15 | } 16 | 17 | /** A data source for TaggedSentence objects. */ 18 | trait TaggedSentenceSource extends SculptureSource with SentenceSource { 19 | def taggedSentenceIterator: Iterator[TaggedSentence] 20 | 21 | override def sculptureIterator: Iterator[Sculpture] = taggedSentenceIterator 22 | 23 | override def sentenceIterator: Iterator[Sentence] = taggedSentenceIterator map { taggedSentence => 24 | taggedSentence.sentence 25 | } 26 | } 27 | 28 | /** A TaggedSentenceSource derived from a SentenceSource. 29 | * 30 | * Tokens are tagged with a specified property from their `properties` field. 31 | * 32 | * @param sentenceSource the sentence source to derive the tagged sentences from 33 | * @param propertyName the token property to use as the "tag" 34 | */ 35 | case class DerivedTaggedSentenceSource( 36 | sentenceSource: SentenceSource, 37 | propertyName: Symbol 38 | ) extends TaggedSentenceSource { 39 | 40 | override def taggedSentenceIterator: Iterator[TaggedSentence] = { 41 | for { 42 | sentence <- sentenceSource.sentenceIterator 43 | } yield { 44 | TaggedSentence( 45 | sentence, 46 | (sentence.tokens.zipWithIndex map { 47 | case (tok, index) => 48 | (index, tok.getProperty(propertyName) map { prop => TokenTag(propertyName, prop) }) 49 | }).toMap 50 | ) 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/core/Util.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.nlpstack.core.{ 5 | Token => NlpStackToken, 6 | PostaggedToken, 7 | Postagger, 8 | Tokenizer 9 | } 10 | import reming.{ JsonFormat, JsonParser } 11 | 12 | import java.io.{ File, InputStream, PushbackInputStream } 13 | import java.net.URL 14 | import java.util.zip.GZIPInputStream 15 | 16 | import scala.io.BufferedSource 17 | 18 | object Util { 19 | def readFromFile[T: JsonFormat](filename: String): T = { 20 | readFromUrl(new File(filename).toURI.toURL) 21 | } 22 | 23 | def readFromUrl[T: JsonFormat](url: URL): T = { 24 | Resource.using(url.openStream()) { readFromStream[T] } 25 | } 26 | 27 | def readFromStream[T: JsonFormat](stream: InputStream): T = { 28 | val headerLength = 2 29 | val pbStream = new PushbackInputStream(stream, headerLength) 30 | val header = new Array[Byte](headerLength) 31 | val readBytes = pbStream.read(header, 0, headerLength) 32 | pbStream.unread(header, 0, readBytes) 33 | 34 | val isZipped = 35 | (readBytes == headerLength) && 36 | (header(0) == GZIPInputStream.GZIP_MAGIC.toByte) && 37 | (header(1) == (GZIPInputStream.GZIP_MAGIC >> 8).toByte) 38 | 39 | val uncompressedStream = 40 | if (isZipped) { 41 | new GZIPInputStream(pbStream) 42 | } else { 43 | pbStream 44 | } 45 | 46 | JsonParser.read[T](new BufferedSource(uncompressedStream)) 47 | } 48 | 49 | /** Uses an NlpStack postagger to tag a Sentence object. 
50 | * 51 | * @param sentence the Sentence to tag 52 | * @param posTagger the nlpstack postagger to use 53 | * @return a map from Sentence token indices to their POS tags 54 | */ 55 | def getPostaggedTokens(sentence: Sentence, posTagger: Postagger): Map[Int, PostaggedToken] = { 56 | val words: IndexedSeq[String] = sentence.tokens.tail map { tok => tok.word.name } 57 | val nlpStackTokens: IndexedSeq[NlpStackToken] = 58 | Tokenizer.computeOffsets(words, words.mkString).toIndexedSeq 59 | (posTagger.postagTokenized(nlpStackTokens).zipWithIndex map { 60 | case (taggedTok, index) => 61 | (index + 1, taggedTok) 62 | }).toMap 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/decisiontree/OmnibusTrainer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.decisiontree 2 | 3 | class OmnibusTrainer() 4 | extends ProbabilisticClassifierTrainer { 5 | 6 | val dtTrainer = new RandomForestTrainer(0, 12, 0.1f, MultinomialGainMetric(0.5f), numThreads = 6) 7 | val rfTrainer = new RandomForestTrainer(0, 12, 0.1f, MultinomialGainMetric(0.5f), numThreads = 6) 8 | 9 | override def apply(data: FeatureVectorSource): ProbabilisticClassifier = { 10 | val trainer = data.classificationTask.filenameFriendlyName match { 11 | case name if name.startsWith("dt-") => 12 | dtTrainer 13 | case _ => 14 | rfTrainer 15 | } 16 | trainer(data) 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/decisiontree/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly 2 | 3 | /** Implements C4.5 decision trees for integral labels and attributes. 4 | * 5 | * Main class to use is [[org.allenai.nlpstack.parse.poly.decisiontree.DecisionTree]]. 6 | * Use the companion object to build the tree. 7 | * Then use [[org.allenai.nlpstack.parse.poly.decisiontree.DecisionTree.classify( )]] 8 | * or 9 | * [[org.allenai.nlpstack.parse.poly.decisiontree.DecisionTree.outcomeDistribution( )]] 10 | * to do prediction. 11 | * 12 | * The tree takes data in the form of 13 | * [[org.allenai.nlpstack.parse.poly.decisiontree.FeatureVectors]]. 14 | * This is a container for a collection of 15 | * [[org.allenai.nlpstack.parse.poly.decisiontree.FeatureVector]] objects. 16 | * 17 | * Implementations of these are 18 | * [[org.allenai.nlpstack.parse.poly.decisiontree.SparseVector]] 19 | * or 20 | * [[org.allenai.nlpstack.parse.poly.decisiontree.DenseVector]]. 21 | */ 22 | package object decisiontree { 23 | 24 | } 25 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/eval/Evaluate.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.eval 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser._ 4 | import scopt.OptionParser 5 | 6 | private case class EvaluateConfig(candidateFilename: String = "", goldFilename: String = "") 7 | 8 | object Evaluate { 9 | 10 | /** Command-line for evaluating a set of parses against a gold set. 
11 | * 12 | * Usage: Evaluate [options] 13 | * 14 | * -c | --candidate 15 | * the file containing the candidate parses (CoNLL-X format) 16 | * -g | --gold 17 | * the file containing the gold parses (CoNLL-X format) 18 | * 19 | * @param args see above 20 | */ 21 | def main(args: Array[String]) { 22 | val optionParser = new OptionParser[EvaluateConfig]("Evaluate") { 23 | opt[String]('c', "candidate") required () valueName ("") action 24 | { (x, c) => c.copy(candidateFilename = x) } text ("the file containing the candidate " + 25 | "parses (CoNLL-X format)") 26 | opt[String]('g', "gold") required () valueName ("") action 27 | { (x, c) => c.copy(goldFilename = x) } text ("the file containing the gold " + 28 | "parses (CoNLL-X format)") 29 | } 30 | val config: EvaluateConfig = optionParser.parse(args, EvaluateConfig()).get 31 | val fileFormat: PolytreeParseFileFormat = ConllX(true) 32 | val candidateParses = 33 | InMemoryPolytreeParseSource( 34 | (PolytreeParse.fromFile(config.candidateFilename, fileFormat) map { Some(_) }).flatten.toSeq 35 | ) 36 | 37 | val goldParseBank = 38 | ParseBank.createParseBankFromSource( 39 | InMemoryPolytreeParseSource(PolytreeParse.fromFile(config.goldFilename, fileFormat).toSeq) 40 | ) 41 | ParseEvaluation.performStandardEvaluation(candidateParses, goldParseBank) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/MarbleBlock.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | /** A MarbleBlock is an unstructured input corresponding to a start state of a finite-state 4 | * machine. The goal of the finite-state machine is to find a final state (which correponds 5 | * to a Sculpture, i.e. a structured output). 6 | * 7 | * As an example, consider a transition-based parser. A MarbleBlock would be a sentence to be 8 | * parsed, whereas a Sculpture would be a parse tree for that sentence. 9 | */ 10 | trait MarbleBlock 11 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/NbestCorpus.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.core.Util 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A sequence of (scored) sculptures. */ 8 | case class NbestList(scoredSculptures: Iterable[(Sculpture, Double)]) 9 | 10 | object NbestList { 11 | implicit val jsFormat = jsonFormat1(NbestList.apply) 12 | } 13 | 14 | /** A sequence of NbestLists. */ 15 | 16 | case class NbestCorpus(nbestLists: Iterable[NbestList]) 17 | 18 | object NbestCorpus { 19 | implicit val jsFormat = jsonFormat1(NbestCorpus.apply) 20 | 21 | def loadNbestCorpus(filename: String): NbestCorpus = Util.readFromFile(filename) 22 | } 23 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/NbestSearch.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import scala.collection.mutable 4 | 5 | /** Finds the best n greedy paths through a finite-state machine. 
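
A sketch of how NbestSearch is driven; the cost function and the initial state are produced elsewhere (by a trained cost function factory and a transition system), so they are taken here as inputs:

import org.allenai.nlpstack.parse.poly.fsm.{ NbestList, NbestSearch, State, StateCostFunction }

// Return up to ten scored goal states (as Sculptures) reachable from initialState.
def tenBest(costFunction: StateCostFunction, initialState: State): NbestList =
  new NbestSearch(costFunction).find(initialState, maxDesiredWalks = 10)
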
6 | * 7 | * @param costFunction the cost function to use to evaluate transitions from a given state 8 | */ 9 | class NbestSearch( 10 | costFunction: StateCostFunction, 11 | timeout: Int = NbestSearch.defaultTimeout 12 | ) { 13 | 14 | // Right now, we use a rather generous "qualifying cost delta" of 10000.0, to make sure that 15 | // most reasonable alternatives are remembered by the nostalgic parser. 16 | val baseParser: NostalgicSearch = new NostalgicSearch(costFunction, 10000.0) 17 | 18 | /** Finds the best n greedy paths through a finite-state machine. 19 | * 20 | * @param initialState the initial state in the finite-state machine 21 | * @param maxDesiredWalks the number of walks desired (i.e. n) 22 | * @param constraints a set of constraints that must be satisfied by returned paths 23 | * @return an n-best list containing n greedy paths through the FSM 24 | */ 25 | def find(initialState: State, maxDesiredWalks: Int, 26 | constraints: Set[TransitionConstraint] = Set()): NbestList = { 27 | 28 | val queue = mutable.PriorityQueue[ScoredWalk]()( 29 | Ordering.by({ walk: ScoredWalk => -walk.score }) 30 | ) 31 | var results: Seq[ScoredWalk] = Seq() 32 | var iterNumber: Int = 0 33 | queue.enqueue(ScoredWalk(Walk(initialState, Seq()), 0.0)) 34 | while (queue.nonEmpty && results.size < maxDesiredWalks && iterNumber < timeout) { 35 | iterNumber += 1 36 | val scoredWalk: ScoredWalk = queue.dequeue() 37 | if (scoredWalk.walk.isGoal) { 38 | results = scoredWalk +: results 39 | } else { 40 | val (mementos, _) = 41 | baseParser.getPromisingWalks(scoredWalk.walk, scoredWalk.score, constraints) 42 | mementos.headOption match { 43 | case Some(memento) => 44 | if (memento.walk.isGoal) { 45 | results = memento +: results 46 | queue ++= mementos.tail 47 | } else { 48 | queue ++= mementos 49 | } 50 | case _ => 51 | } 52 | } 53 | } 54 | val allWalks: Seq[ScoredWalk] = results 55 | NbestList( 56 | (allWalks map { scoredWalk => 57 | scoredWalk.walk.finalState flatMap { state => 58 | state.asSculpture 59 | } map { sculpture => 60 | (sculpture, scoredWalk.score) 61 | } 62 | }).flatten 63 | ) 64 | } 65 | } 66 | 67 | object NbestSearch { 68 | val defaultTimeout = 1000 69 | } 70 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/Sculpture.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser.{ PolytreeParse } 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A Sculpture is a structured output corresponding to a final state of a finite-state 8 | * machine, whose goal is to transform an unstructured input (a MarbleBlock) into a 9 | * structured output. 10 | * 11 | * As an example, consider a transition-based parser. A MarbleBlock would be a sentence to be 12 | * parsed, whereas a Sculpture would be a parse tree for that sentence. 13 | */ 14 | trait Sculpture { 15 | def marbleBlock: MarbleBlock 16 | } 17 | 18 | object Sculpture { 19 | private implicit val polytreeParseFormat = jsonFormat4(PolytreeParse.apply) 20 | implicit val sculptureJsonFormat = parentFormat[Sculpture](childFormat[PolytreeParse, Sculpture]) 21 | } 22 | 23 | /** An interface for a Sculpture data source. */ 24 | trait SculptureSource { 25 | 26 | /** Returns a use-once iterator over all sculptures in the data source. 
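
An in-memory implementation of this interface is a one-liner, mirroring InMemoryStateSource in State.scala below (illustrative, not part of the original sources):

import org.allenai.nlpstack.parse.poly.fsm.{ Sculpture, SculptureSource }

case class InMemorySculptureSource(sculptures: Iterable[Sculpture]) extends SculptureSource {
  override def sculptureIterator: Iterator[Sculpture] = sculptures.iterator
}
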
*/ 27 | def sculptureIterator: Iterator[Sculpture] 28 | } 29 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/SculptureCost.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | abstract class SculptureCost extends (Sculpture => Double) 4 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/SculptureFeature.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureVector 4 | 5 | /** A SculptureFeature computes a feature vector corresponding to a given sculpture. */ 6 | abstract class SculptureFeature extends (Sculpture => FeatureVector) 7 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/SculptureTrainingVectorSource.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | /** A SculptureTrainingVectorSource reduces a sculpture to a set of feature vectors for 4 | * classifier training. 5 | * 6 | * Essentially, we derive the transition states that lead to the gold sculpture. 7 | * Each of these states becomes a feature vector, 8 | * labeled with the transition executed from that state in the gold sculpture. 9 | * 10 | * One of the constructor arguments is a TaskIdentifer. This will dispatch the feature vectors 11 | * to train different classifiers. For instance, if taskIdentifier(state) != 12 | * taskIdentifier(state2), then their respective feature vectors (i.e. feature(state) and 13 | * feature(state2)) will be used to train different classifiers. 14 | * 15 | * @param trainingSculptures the data source for the training sculptures 16 | * @param transitionSystemFactory the transition system factory to use (for generating states) 17 | * @param baseCostFunctionFactory a trained cost function factory to adapt (optional) 18 | */ 19 | case class SculptureTrainingVectorSource( 20 | trainingSculptures: SculptureSource, 21 | transitionSystemFactory: TransitionSystemFactory, 22 | baseCostFunctionFactory: Option[StateCostFunctionFactory] = None 23 | ) 24 | extends FSMTrainingVectorSource(transitionSystemFactory, baseCostFunctionFactory) { 25 | 26 | def getVectorIterator: Iterator[FSMTrainingVector] = { 27 | for { 28 | taggedSentence <- trainingSculptures.sculptureIterator 29 | vector <- generateVectors(taggedSentence) 30 | } yield vector 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/State.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser.TransitionParserState 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A state of a finite-state machine. 
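
A small sketch of the intended contract: only a final state can be turned back into a structured output.

import org.allenai.nlpstack.parse.poly.fsm.{ Sculpture, State }

// Convert a state into its structured output, but only once the walk is complete.
def finish(state: State): Option[Sculpture] =
  if (state.isFinal) state.asSculpture else None
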
*/ 8 | trait State { 9 | val isFinal: Boolean 10 | def asSculpture: Option[Sculpture] 11 | } 12 | 13 | object State { 14 | private implicit val transitionParserStateFormat = jsonFormat8(TransitionParserState.apply) 15 | implicit val stateJsonFormat = parentFormat[State](childFormat[TransitionParserState, State]) 16 | } 17 | 18 | /** A StateCost maps a state to a cost. */ 19 | trait StateCost extends (Option[State] => Double) 20 | 21 | trait StateSource { 22 | /** Generates an iterator over State objects. 23 | * 24 | * @return a use-once iterator over State objects 25 | */ 26 | def getStateIterator: Iterator[State] 27 | } 28 | 29 | /** A StateSource that keeps all its states in memory. */ 30 | case class InMemoryStateSource(states: Iterable[State]) extends StateSource { 31 | override def getStateIterator: Iterator[State] = states.iterator 32 | } 33 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/StateCostFunctionTrainer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureName 4 | 5 | /** A StateCostFunctionTrainer trains a StateCostFunction from data. Training is 6 | * triggered during construction, after which the .costFunction field contains the trained 7 | * TransitionCostFunctionAndClassifier. 8 | * 9 | * @param trainingVectorSource a source of training vectors 10 | */ 11 | abstract class StateCostFunctionTrainer( 12 | transitionSystemFactory: TransitionSystemFactory, trainingVectorSource: FSMTrainingVectorSource 13 | ) { 14 | 15 | /** The trained cost function factory. */ 16 | def costFunctionFactory: StateCostFunctionFactory 17 | 18 | protected val featureNames: List[FeatureName] = 19 | FSMTrainingVectorSource.collectFeatureNames(trainingVectorSource).toList 20 | 21 | protected val featureNameToIndex: Map[FeatureName, Int] = featureNames.zipWithIndex.toMap 22 | 23 | protected val transitions: IndexedSeq[StateTransition] = 24 | FSMTrainingVectorSource.collectTransitions(trainingVectorSource).toIndexedSeq 25 | } 26 | 27 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/StateFeature.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureVector 4 | import org.allenai.nlpstack.parse.poly.polyparser._ 5 | 6 | import reming.LazyFormat 7 | import reming.DefaultJsonProtocol._ 8 | 9 | /** A StateFeature computes a feature vector corresponding to a given parser state. 
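
An illustrative custom StateFeature; the FeatureName and FeatureVector constructor shapes are inferred from their use in AnnotatedSentence earlier in this listing, so treat the exact signatures as assumptions:

import org.allenai.nlpstack.parse.poly.fsm.{ State, StateFeature }
import org.allenai.nlpstack.parse.poly.ml.{ FeatureName, FeatureVector }

// Emits a single indicator feature recording whether the state is final.
object IsFinalFeature extends StateFeature {
  override def apply(state: State): FeatureVector =
    FeatureVector(Seq(FeatureName(Seq(Symbol("isFinal"), Symbol(state.isFinal.toString))) -> 1.0))
}
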
*/ 10 | abstract class StateFeature extends (State => FeatureVector) 11 | 12 | object StateFeature { 13 | private implicit val tokenTransformFeatureFormat = jsonFormat2(TokenTransformFeature.apply) 14 | private implicit val offlineTokenFeatureFormat = jsonFormat2(OfflineTokenFeature.apply) 15 | private implicit val tokenCardinalityFeatureFormat = jsonFormat1(TokenCardinalityFeature.apply) 16 | 17 | implicit object StateFeatureJsonFormat extends LazyFormat[StateFeature] { 18 | private implicit val featureUnionFormat = jsonFormat1(FeatureUnion.apply) 19 | 20 | override val delegate = parentFormat[StateFeature]( 21 | childFormat[TokenTransformFeature, StateFeature], 22 | childFormat[OfflineTokenFeature, StateFeature], 23 | childFormat[TokenCardinalityFeature, StateFeature], 24 | childFormat[FeatureUnion, StateFeature] 25 | ) 26 | } 27 | } 28 | 29 | /** A FeatureUnion simply merges the output of a list of features. 30 | * 31 | * @param features a list of the features we want to merge into a single feature 32 | */ 33 | case class FeatureUnion(val features: Iterable[StateFeature]) 34 | extends StateFeature { 35 | 36 | override def apply(state: State): FeatureVector = { 37 | features map { f => 38 | f(state) 39 | } reduce { (m1, m2) => 40 | FeatureVector(m1.values ++ m2.values) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/StateTransition.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser._ 4 | import reming.DefaultJsonProtocol._ 5 | 6 | abstract class StateTransition extends (Option[State] => Option[State]) { 7 | val name: String 8 | override def toString: String = name 9 | } 10 | 11 | object StateTransition { 12 | def applyTransitionSequence( 13 | initialState: State, 14 | transitions: Seq[StateTransition] 15 | ): Option[State] = { 16 | 17 | transitions.foldLeft(Option(initialState)) { (state, transition) => transition(state) } 18 | } 19 | 20 | def applicable(transition: StateTransition, state: Option[State]): Boolean = { 21 | transition(state) != None 22 | } 23 | 24 | private implicit val arcEagerShiftFormat = jsonFormat0(() => ArcEagerShift) 25 | private implicit val arcEagerReduceFormat = jsonFormat0(() => ArcEagerReduce) 26 | private implicit val arcHybridShiftFormat = jsonFormat0(() => ArcHybridShift) 27 | private implicit val fallbackFormat = jsonFormat0(() => Fallback) 28 | private implicit val leftArcFormat = jsonFormat1(ArcEagerLeftArc.apply) 29 | private implicit val rightArcFormat = jsonFormat1(ArcEagerRightArc.apply) 30 | private implicit val hybridLeftArcFormat = jsonFormat1(ArcHybridLeftArc.apply) 31 | private implicit val hybridRightArcFormat = jsonFormat1(ArcHybridRightArc.apply) 32 | private implicit val leftLabelArcFormat = jsonFormat1(LabelLeftArc.apply) 33 | private implicit val rightLabelArcFormat = jsonFormat1(LabelRightArc.apply) 34 | //private implicit val tagTokenFormat = jsonFormat1(AssignTag.apply) 35 | 36 | implicit val stateTransitionJsonFormat = parentFormat[StateTransition]( 37 | childFormat[ArcEagerShift.type, StateTransition]("Sh"), 38 | childFormat[ArcEagerReduce.type, StateTransition]("Re"), 39 | childFormat[ArcHybridShift.type, StateTransition]("HySh"), 40 | childFormat[Fallback.type, StateTransition]("Fb"), 41 | childFormat[ArcEagerLeftArc, StateTransition]("Lt"), 42 | 
childFormat[ArcEagerRightArc, StateTransition]("Rt"), 43 | childFormat[ArcHybridLeftArc, StateTransition]("HyLt"), 44 | childFormat[ArcHybridRightArc, StateTransition]("HyRt"), 45 | //childFormat[AssignTag, StateTransition]("Tag"), 46 | childFormat[LabelLeftArc, StateTransition]("LtLbl"), 47 | childFormat[LabelRightArc, StateTransition]("RtLbl") 48 | ) 49 | } 50 | 51 | case object Fallback extends StateTransition { 52 | 53 | override def apply(state: Option[State]): Option[State] = None 54 | 55 | override val name: String = "Fb" 56 | } 57 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/TransitionClassifier.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureVector 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A TransitionClassifier maps Transitions to probabilities. */ 8 | abstract class TransitionClassifier { 9 | 10 | /** Returns the most probable Transition according to .getDistribution(featureVector). 11 | * 12 | * @param featureVector the feature vector to use to compute the distribution 13 | * @return the most probable Transition, given the argument feature vector 14 | */ 15 | def classify(featureVector: FeatureVector): StateTransition 16 | 17 | /** Given the argument feature vector, this assigns a probability to a set of Transitions. 18 | * 19 | * @param featureVector the feature vector to use to compute the distribution 20 | * @return a probability distribution over Transitions 21 | */ 22 | def getDistribution(featureVector: FeatureVector): Map[StateTransition, Float] 23 | 24 | } 25 | 26 | /** Companion class for serializing TransitionClassifier instances. */ 27 | object TransitionClassifier { 28 | private implicit val embeddedClassifierFormat = jsonFormat4(EmbeddedClassifier.apply) 29 | 30 | implicit val transitionClassifierJsonFormat = parentFormat[TransitionClassifier]( 31 | childFormat[EmbeddedClassifier, TransitionClassifier] 32 | ) 33 | } 34 | 35 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/TransitionConstraint.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser.{ 4 | RequestedCpos, 5 | RequestedArc, 6 | ForbiddenArcLabel, 7 | ForbiddenEdge 8 | } 9 | import reming.DefaultJsonProtocol._ 10 | 11 | /** A TransitionConstraint returns true if a given transition is illegal to 12 | * apply in a given state. 
13 | */ 14 | trait TransitionConstraint 15 | 16 | object TransitionConstraint { 17 | private implicit val forbiddenEdgeFormat = jsonFormat2(ForbiddenEdge.apply) 18 | private implicit val forbiddenArcLabelFormat = jsonFormat3(ForbiddenArcLabel.apply) 19 | private implicit val requestedArcFormat = jsonFormat3(RequestedArc.apply) 20 | private implicit val requestedCposFormat = jsonFormat2(RequestedCpos.apply) 21 | implicit val parserConstraintFormat = parentFormat[TransitionConstraint]( 22 | childFormat[ForbiddenEdge, TransitionConstraint], 23 | childFormat[ForbiddenArcLabel, TransitionConstraint], 24 | childFormat[RequestedArc, TransitionConstraint], 25 | childFormat[RequestedCpos, TransitionConstraint] 26 | ) 27 | } 28 | 29 | /** A ConstraintInterpretation tells you whether a transition is inapplicable in a given state. 30 | * 31 | * Specifically, it is a function that takes a (state, transition) pair, and returns true 32 | * if the transition is inapplicable. 33 | */ 34 | trait ConstraintInterpretation extends ((State, StateTransition) => Boolean) 35 | 36 | /** The TrivialConstraintInterpretation returns false for any state/transition pair. 37 | * 38 | * This means that transitions are always considered applicable. 39 | */ 40 | class TrivialConstraintInterpretation extends ConstraintInterpretation { 41 | def apply(state: State, transition: StateTransition): Boolean = false 42 | } 43 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/TransitionSystem.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureVector 4 | import org.allenai.nlpstack.parse.poly.polyparser.{ 5 | ArcEagerTransitionSystemFactory, 6 | ArcHybridTransitionSystemFactory 7 | } 8 | 9 | import reming.DefaultJsonProtocol._ 10 | 11 | trait TransitionSystem { 12 | val taskIdentifier: TaskIdentifier 13 | def initialState(constraints: Seq[TransitionConstraint]): Option[State] 14 | def guidedCostFunction(goldObj: Sculpture): Option[StateCostFunction] 15 | def computeFeature(state: State): FeatureVector 16 | def toSculpture(state: State): Option[Sculpture] 17 | def interpretConstraint(constraint: TransitionConstraint): ((State, StateTransition) => Boolean) 18 | } 19 | 20 | object TransitionSystem { 21 | def trivialConstraint(state: State, transition: StateTransition): Boolean = false 22 | } 23 | 24 | /** A TransitionSystemFactory is a factory that constructs marbleblock-specific transition 25 | * systems. For instance, in parsing, this would create a transition system for each input 26 | * sentence that you want to parse. 
27 | */ 28 | trait TransitionSystemFactory { 29 | def buildTransitionSystem( 30 | marbleBlock: MarbleBlock, 31 | constraints: Set[TransitionConstraint] 32 | ): TransitionSystem 33 | } 34 | 35 | object TransitionSystemFactory { 36 | private implicit val arcHybridFormat = jsonFormat1(ArcHybridTransitionSystemFactory.apply) 37 | private implicit val arcEagerFormat = jsonFormat1(ArcEagerTransitionSystemFactory.apply) 38 | //private implicit val postaggerFormat = jsonFormat1(PostaggerTransitionSystemFactory.apply) 39 | implicit val transitionSystemFactoryJsonFormat = parentFormat[TransitionSystemFactory]( 40 | childFormat[ArcHybridTransitionSystemFactory, TransitionSystemFactory], 41 | childFormat[ArcEagerTransitionSystemFactory, TransitionSystemFactory] 42 | //childFormat[PostaggerTransitionSystemFactory, TransitionSystemFactory] 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/Walk.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import reming.DefaultJsonProtocol._ 4 | 5 | /** A WalkStep is a single step in an FSM walk. 6 | * 7 | * @param state the current state 8 | * @param transition the transition to take 9 | * 10 | */ 11 | case class WalkStep(state: State, transition: StateTransition) 12 | 13 | object WalkStep { 14 | implicit val jsFormat = jsonFormat2(WalkStep.apply) 15 | } 16 | 17 | /** A Walk is a walk through a finite-state machine. 18 | * 19 | * @param initialState the state in which we begin 20 | * @param steps the sequence of steps we take from the initial state 21 | */ 22 | case class Walk(initialState: State, steps: Seq[WalkStep]) { 23 | 24 | /** The sequence of transitions taken during this walk (in order). */ 25 | lazy val transitions = steps map { case WalkStep(_, transition) => transition } 26 | 27 | /** The sequence of states encountered during this walk (in order). */ 28 | lazy val states: Seq[State] = { 29 | finalState match { 30 | case Some(reachableState) => 31 | val walkStates: Seq[State] = steps map { step => step.state } 32 | walkStates :+ reachableState 33 | case None => Seq() 34 | } 35 | } 36 | 37 | /** Returns the state that results from executing the steps of this walk, starting 38 | * from the initial state. 39 | */ 40 | lazy val finalState: Option[State] = { 41 | if (steps.isEmpty) { 42 | Some(initialState) 43 | } else { 44 | (transitions.last)(Some(steps.last.state)) 45 | } 46 | } 47 | 48 | /** Returns whether this walk ends up in a goal state. */ 49 | lazy val isGoal: Boolean = { 50 | finalState match { 51 | case Some(state) => state.isFinal 52 | case _ => false 53 | } 54 | } 55 | 56 | override def toString: String = { 57 | "[" + (steps map { _.transition }).mkString(" ") + "]" 58 | } 59 | } 60 | 61 | object Walk { 62 | implicit val jsFormat = jsonFormat2(Walk.apply) 63 | } 64 | 65 | /** A ScoredWalk attaches a score to a Walk.
66 | * 67 | * @param walk the unscored Walk 68 | * @param score the floating-point score 69 | */ 70 | case class ScoredWalk(walk: Walk, score: Double) 71 | 72 | object ScoredWalk { 73 | implicit val jsFormat = jsonFormat2(ScoredWalk.apply) 74 | } 75 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/ml/FeatureVector.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import reming.DefaultJsonProtocol._ 4 | 5 | /** The name of a feature, represented as a list of Symbols. 6 | * 7 | * @param symbols the list of symbols comprising the feature name 8 | */ 9 | case class FeatureName(symbols: Seq[Symbol]) { 10 | override def toString(): String = { 11 | (symbols map { sym => sym.name }).mkString(".") 12 | } 13 | } 14 | 15 | object FeatureName { 16 | implicit val jsFormat = jsonFormat1(FeatureName.apply) 17 | } 18 | 19 | /** A mapping from feature names to values. 20 | * 21 | * Unspecified feature names are assumed to correspond to a value of zero. 22 | * 23 | * @param values the map from feature names to values 24 | */ 25 | case class FeatureVector(values: Seq[(FeatureName, Double)]) { 26 | 27 | @transient lazy val featureNames = values map { _._1 } 28 | 29 | @transient lazy val featureMap = values.toMap 30 | 31 | /** Returns the value of the specified feature name. 32 | * 33 | * Note that this returns zero if the feature is not present in the map. 34 | * 35 | * @param name the feature name of interest 36 | * @return the value assigned to that feature name 37 | */ 38 | def getFeatureValue(name: FeatureName): Double = { 39 | featureMap.getOrElse(name, 0.0) 40 | } 41 | 42 | override def toString(): String = { 43 | "[" + (values map { 44 | case (featureName, featureValue) => 45 | f"${featureName} -> $featureValue%.3f" 46 | }).mkString(" ") + "]" 47 | } 48 | } 49 | 50 | object FeatureVector { 51 | implicit val jsFormat = jsonFormat1(FeatureVector.apply) 52 | 53 | /** Takes the difference between two feature vectors. 54 | * 55 | * @param vec1 first vector 56 | * @param vec2 second vector 57 | * @return the difference vector (first - second) 58 | */ 59 | def subtractVectors(vec1: FeatureVector, vec2: FeatureVector): FeatureVector = { 60 | FeatureVector(((vec1.featureNames ++ vec2.featureNames) map { featureName => 61 | (featureName, vec1.getFeatureValue(featureName) 62 | - vec2.getFeatureValue(featureName)) 63 | }).toMap.toSeq) 64 | } 65 | 66 | /** Merges two feature vectors. 67 | * 68 | * In case of conflict, values in the first vector are preferred. 69 | * 70 | * @param vec1 first vector 71 | * @param vec2 second vector 72 | * @return the merged vector 73 | */ 74 | def mergeVectors(vec1: FeatureVector, vec2: FeatureVector): FeatureVector = { 75 | FeatureVector((vec2.values.toMap ++ vec1.values.toMap).toSeq) 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/ml/LinearModel.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.nlpstack.parse.poly.core.Util 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A weighted linear combination of features. 
8 | * 9 | * @param coefficients map from feature names to weight coefficients 10 | */ 11 | case class LinearModel(val coefficients: Seq[(FeatureName, Double)]) { 12 | 13 | @transient val coefficientMap = coefficients.toMap 14 | 15 | /** Returns the coefficient corresponding to the specified feature name. 16 | * 17 | * For unspecified coefficients, zero is returned. 18 | * 19 | * @param featureName the feature name of interest 20 | * @return the coefficient corresponding to the specified feature name 21 | */ 22 | def getCoefficient(featureName: FeatureName): Double = { 23 | coefficientMap.getOrElse(featureName, 0.0) 24 | } 25 | 26 | /** Computes the weighted linear combination, given the feature values in the argument vector. 27 | * 28 | * @param featureVector the feature vector of interest 29 | * @return the weighted linear combination 30 | */ 31 | def score(featureVector: FeatureVector): Double = { 32 | def add(x: Double, y: Double): Double = { x + y } 33 | (featureVector.featureNames map { featureName => 34 | getCoefficient(featureName) * featureVector.getFeatureValue(featureName) 35 | }).fold(0.0)(add) 36 | } 37 | } 38 | 39 | object LinearModel { 40 | implicit val jsFormat = jsonFormat1(LinearModel.apply) 41 | 42 | def loadLinearModel(filename: String): LinearModel = Util.readFromFile[LinearModel](filename) 43 | } 44 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/ml/TrainingData.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import reming.DefaultJsonProtocol._ 4 | 5 | /** Maps feature names to integers. Useful for serializing TrainingData instances for 6 | * consumption by command-line machine learning tools. 7 | * 8 | * @param featureNames an indexed sequence of feature names 9 | */ 10 | case class FeatureEncoding(featureNames: IndexedSeq[FeatureName]) { 11 | @transient lazy val featureNameToIndex: Map[FeatureName, Int] = featureNames.zipWithIndex.toMap 12 | } 13 | 14 | object FeatureEncoding { 15 | implicit val jsFormat = jsonFormat1(FeatureEncoding.apply) 16 | } 17 | 18 | /** Abstraction for a set of labeled feature vectors. 19 | * 20 | * Provides various serialization options for different machine learning tools. 21 | * 22 | * @param labeledVectors a sequence of feature vectors labeled with integer outcomes 23 | */ 24 | case class TrainingData(labeledVectors: Iterable[(FeatureVector, Int)]) { 25 | 26 | /** The set of feature names found in the training data. */ 27 | lazy val featureNames: Set[FeatureName] = { 28 | val featureNameSets: Iterable[Set[FeatureName]] = (labeledVectors map { 29 | case (fvec, _) => 30 | fvec.featureNames.toSet 31 | }) 32 | featureNameSets.fold(Set[FeatureName]())((x: Set[FeatureName], y: Set[FeatureName]) => 33 | x union y) 34 | } 35 | 36 | /** Expresses this training data in "SVMlight" format, which is 37 | * <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info> 38 | * <target> .=. +1 | -1 | 0 | <float> 39 | * <feature> .=. <integer> | "qid" 40 | * <value> .=. <float> 41 | * <info> .=. <string>
42 | * 43 | * @param signature the signature to use for encoding feature names as integer 44 | * @return the training data in SVMlight format 45 | */ 46 | def asSvmLight(signature: FeatureEncoding): String = { 47 | (labeledVectors map { 48 | case (fvec: FeatureVector, label) => 49 | val sortedValues: Seq[(Int, Double)] = (fvec.values.toSeq map { 50 | case (featureName, featureValue) => 51 | (signature.featureNameToIndex(featureName), featureValue) 52 | }).sortBy(_._1) 53 | val featureString = (sortedValues map { 54 | case (featureIndex, featureValue) => 55 | s"${featureIndex}:${featureValue}" 56 | }).mkString(" ") 57 | s"${svmLightLabel(label)} ${featureString}" 58 | }).mkString("\n") 59 | } 60 | 61 | protected def svmLightLabel(label: Double): String = s"${label}" 62 | } 63 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/ArcInverter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | /** The ArcInverter takes a PolytreeParse and inverts arcs whose labels are in the argument set 4 | * `inverseArcLabels`. Note that this operation should only affect the `children` field of a 5 | * PolytreeParse, since the other fields only care about the underlying undirected tree. 6 | * 7 | * The purpose of this class is to convert standard dependency parses into polytree 8 | * dependency parses. For instance, we may wish to invert all arcs x ---> y for which 9 | * the arc label is 'det (effectively this would invert the relationship between a determiner 10 | * and its noun to say that the determiner "requires" the noun, rather than vice-versa). 11 | * 12 | * @param inverseArcLabels the set of arc labels to invert 13 | */ 14 | class ArcInverter(val inverseArcLabels: Set[ArcLabel]) extends (PolytreeParse => PolytreeParse) { 15 | 16 | /** Inverts the arcs whose labels are in `inverseArcLabels` 17 | * 18 | * @param parse the polytree parse we want to transform 19 | * @return a new polytree parse, with the specified arcs inverted 20 | */ 21 | def apply(parse: PolytreeParse): PolytreeParse = { 22 | 23 | // for each node, determine the neighbors for which the arcs should be inverted 24 | val invertibleNeighbors: Vector[Set[Int]] = for { 25 | labeledNeighbors <- parse.arclabels 26 | } yield for { 27 | (neighbor, label) <- labeledNeighbors if isInvertible(label) 28 | } yield neighbor 29 | 30 | // compute the new children using an XOR operation 31 | val newChildren: Vector[Set[Int]] = for { 32 | (neighbors, children) <- invertibleNeighbors.zip(parse.children) 33 | } yield ((neighbors diff children) union (children diff neighbors)) 34 | 35 | PolytreeParse(parse.sentence, parse.breadcrumb, newChildren, parse.arclabels) 36 | } 37 | 38 | def isInvertible(arcLabel: ArcLabel): Boolean = { 39 | val stanLabel = arcLabel match { 40 | case dpLabel: DependencyParsingArcLabel => 41 | dpLabel.stanLabel 42 | case _ => 43 | arcLabel.toSymbol 44 | } 45 | inverseArcLabels.contains(SingleSymbolArcLabel(stanLabel)) 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/Neighborhood.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import reming.DefaultJsonProtocol._ 4 | 5 | /** A Neighborhood is a sequence of token 
indices, generally referring to a parse tree. 6 | * 7 | * For instance, one might want to consider neighborhoods like: 8 | * - a node and its children 9 | * - a node and its parents 10 | * - a node and its breadcrumb 11 | * 12 | * @param tokens a sequence of token indices, usually associated in some way 13 | * (see NeighborhoodExtractor instances for examples of such associations) 14 | */ 15 | case class Neighborhood(tokens: Seq[Int]) 16 | 17 | object Neighborhood { 18 | implicit val neighborhoodJsonFormat = jsonFormat1(Neighborhood.apply) 19 | } 20 | 21 | /** A data source for neighborhoods. */ 22 | trait NeighborhoodSource { 23 | /** Returns an iterator over the neighborhoods in this data source. 24 | * 25 | * @return an iterator over the neighborhoods in this data source 26 | */ 27 | def getNeighborhoodIterator(): Iterator[(PolytreeParse, Neighborhood)] 28 | } 29 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/ParsePool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import java.io.{ File, PrintWriter } 4 | 5 | import org.allenai.common.Resource 6 | import org.allenai.nlpstack.parse.poly.fsm.NbestList 7 | 8 | import scala.io.Source 9 | import scala.util.Random 10 | 11 | import reming.{ CompactPrinter, JsonParser } 12 | import reming.DefaultJsonProtocol._ 13 | 14 | /** A ParsePool is a collection of parse candidates for the same input sentence. 15 | * 16 | * @param parses a sequence of parse trees 17 | */ 18 | case class ParsePool(parses: Iterable[(PolytreeParse, Double)]) { 19 | def toNbestList: NbestList = { 20 | NbestList(parses) 21 | } 22 | 23 | @transient lazy val indexedParses = parses.toIndexedSeq 24 | 25 | def chooseRandomParse: PolytreeParse = { 26 | indexedParses(Random.nextInt(indexedParses.size))._1 27 | } 28 | } 29 | 30 | object ParsePool { 31 | implicit val jsFormat = jsonFormat1(ParsePool.apply) 32 | } 33 | 34 | /** A data source for ParsePool objects. 
*/ 35 | trait ParsePoolSource { 36 | def poolIterator: Iterator[ParsePool] 37 | } 38 | 39 | case class InMemoryParsePoolSource(inputIterator: Iterator[ParsePool]) extends ParsePoolSource { 40 | 41 | private val cachedPools = inputIterator.toIterable 42 | 43 | override def poolIterator: Iterator[ParsePool] = { 44 | cachedPools.iterator 45 | } 46 | } 47 | 48 | case class FileBasedParsePoolSource(filename: String) extends ParsePoolSource { 49 | 50 | override def poolIterator: Iterator[ParsePool] = { 51 | val lines: Iterator[String] = Source.fromFile(filename).getLines 52 | lines map { line => 53 | JsonParser.read[ParsePool](line) 54 | } 55 | } 56 | } 57 | 58 | object FileBasedParsePoolSource { 59 | 60 | def writePools(pools: Iterator[ParsePool], filename: String) { 61 | Resource.using(new PrintWriter(new File(filename))) { writer => 62 | for (pool <- pools) { 63 | CompactPrinter.printTo(writer, pool) 64 | } 65 | } 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/Parser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.parse.poly.fsm.TransitionConstraint 5 | import org.allenai.nlpstack.tokenize.defaultTokenizer 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | 8 | import java.io.InputStream 9 | 10 | import org.allenai.nlpstack.parse.poly.core 11 | import org.allenai.nlpstack.parse.poly.core.{ WordClusters, Sentence, NexusToken } 12 | 13 | object Parser { 14 | 15 | /** Loads a parser from its file. 16 | * 17 | * @param filename the JSON configuration file or model prefix 18 | * @return the parser initialized from the file 19 | */ 20 | def loadParser(filename: String): TransitionParser = { 21 | TransitionParser.load(filename) 22 | } 23 | 24 | /** Loads a parser from its configuration file. Also allows you to specify a set of 25 | * "gold" parses to cache (the cache is checked before the base parser is utilized) 26 | * 27 | * @param filename the JSON configuration file or model prefix 28 | * @param parsesToCache a sequence of "gold" parses to cache 29 | * @return the initialized parser 30 | */ 31 | def loadParserWithCache(filename: String, parsesToCache: Iterator[PolytreeParse]): TransitionParser = { 32 | val fallbackParser = loadParser(filename) 33 | ParseCache(parsesToCache.toSeq, fallbackParser) 34 | } 35 | 36 | /** Loads a parser from an InputStream of a models file 37 | * @param inputStream stream of models config file 38 | * @return the parser initialized from the input stream 39 | */ 40 | def loadParser(inputStream: InputStream): TransitionParser = { 41 | TransitionParser.loadFromStream(inputStream) 42 | } 43 | 44 | private val tokenizer = defaultTokenizer 45 | private val postagger = defaultPostagger 46 | 47 | /** Tokenizes (and tags) an untokenized sentence. 
48 | * 49 | * @param text the untokenized sentence 50 | * @return a sequence of tokens 51 | */ 52 | def tokenizeSentence(text: String): Seq[core.Token] = { 53 | val postagged: Seq[PostaggedToken] = postagger.postag(tokenizer)(text) 54 | NexusToken +: (postagged map { 55 | case tok => 56 | core.Token( 57 | word = Symbol(tok.string), 58 | Map( 59 | 'autoPos -> 60 | Set(Symbol(tok.postag)), 61 | 'autoCpos -> 62 | Set(Symbol(WordClusters.ptbToUniversalPosTag.getOrElse(tok.postag, tok.postag))) 63 | ) 64 | ) 65 | }) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/ParserConfiguration.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import java.io.{ PrintWriter, File, InputStream } 4 | import java.net.URL 5 | 6 | import org.allenai.common.Resource._ 7 | import org.allenai.nlpstack.parse.poly.fsm.{ 8 | RerankingFunction, 9 | StateCostFunction, 10 | StateCostFunctionFactory 11 | } 12 | import reming.DefaultJsonProtocol._ 13 | 14 | /** Contains the key components of a parser (for serialization purposes). 15 | * 16 | * @param parsingCostFunctionFactory the cost function factory for the transition parser 17 | * @param rerankingFunction the cost function for parse reranking 18 | * @param parsingNbestSize the nbest size to generate for reranking 19 | */ 20 | case class ParserConfiguration( 21 | parsingCostFunctionFactory: StateCostFunctionFactory, 22 | rerankingFunction: RerankingFunction, 23 | parsingNbestSize: Int 24 | ) 25 | 26 | object ParserConfiguration { 27 | implicit val parserConfigurationFormat = jsonFormat3(ParserConfiguration.apply) 28 | } 29 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/ParserConstraint.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.nlpstack.parse.poly.fsm.TransitionConstraint 4 | 5 | /** A ForbiddenEdge constraint designates a transition as illegal if it would directly create 6 | * an arc (in either direction) between the tokens at the given indices. 7 | * 8 | * Note that argument order does not matter for the constructor. 9 | * 10 | * @param token1 index of the first token 11 | * @param token2 index of the second token 12 | */ 13 | case class ForbiddenEdge(token1: Int, token2: Int) extends TransitionConstraint 14 | 15 | /** A ForbiddenArcLabel constraint designates a transition as illegal if it would directly 16 | * create an arc (in either direction) with the specified label between the tokens at the given 17 | * indices. It also implicitly creates a RequestedArc constraint for the specified arc 18 | * (basically it says that we DO want an arc between the specified indices, just not with this 19 | * label). 20 | * 21 | * Note that argument order (of the token indices) does not matter for the constructor. 22 | * 23 | * @param token1 index of the first token 24 | * @param token2 index of the second token 25 | * @param arcLabel label that is forbidden between the two tokens 26 | */ 27 | case class ForbiddenArcLabel(token1: Int, token2: Int, 28 | arcLabel: Symbol) extends TransitionConstraint 29 | 30 | /** A RequestedArc constraint requests that the output parse MUST contain the requested arc. 
31 | * 32 | * The arc is specified using the index of the token at the arc's head followed by the index of 33 | * the token at the arc's tail. 34 | * 35 | * Note: currently this constraint does not pay attention to the arc direction, nor the arc 36 | * label. It only enforces that there is some edge between the two specified tokens. 37 | * 38 | * @param token1 index of the first token 39 | * @param token2 index of the second token 40 | * @param arcLabel desired label for the arc 41 | */ 42 | case class RequestedArc(token1: Int, token2: Int, 43 | arcLabel: Option[Symbol] = None) extends TransitionConstraint 44 | 45 | /** A RequestedCpos constraint specifies the coarse part-of-speech tag of a particular token. 46 | * This means that in the returned parse, the 'cpos property for that token will correspond 47 | * to the requested coarse tag. 48 | * 49 | * @param tokenIndex index of the desired token 50 | * @param cpos desired coarse tag for the token 51 | */ 52 | case class RequestedCpos(tokenIndex: Int, cpos: Symbol) extends TransitionConstraint 53 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/RerankingTransitionParser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.nlpstack.parse.poly.core.Sentence 4 | import org.allenai.nlpstack.parse.poly.fsm._ 5 | 6 | /** Uses the parser model to create an n-best list, then chooses the best parse from this n-best 7 | * list (according to the reranking function). 8 | * 9 | * @param config configuration object for the parser 10 | */ 11 | case class RerankingTransitionParser(config: ParserConfiguration) extends TransitionParser { 12 | 13 | @transient val reranker: Reranker = new Reranker(config.rerankingFunction) 14 | 15 | def parseWithScore( 16 | sentence: Sentence, 17 | constraints: Set[TransitionConstraint] = Set(), 18 | doFastApproximation: Boolean = false 19 | ): Option[(PolytreeParse, Double)] = { 20 | 21 | val parsingCostFunction = 22 | config.parsingCostFunctionFactory.buildCostFunction(sentence, constraints) 23 | val baseParser = new NbestSearch(parsingCostFunction) 24 | val nbestList: Option[NbestList] = 25 | parsingCostFunction.transitionSystem.initialState( 26 | constraints.toSeq 27 | ) map { initState => 28 | val nbestSize = // do full reranking only in the absence of constraints 29 | if (constraints.nonEmpty) { 1 } else if (doFastApproximation) { 2 } else { config.parsingNbestSize } 30 | baseParser.find(initState, nbestSize, constraints) 31 | } 32 | val mappedNbestList: Option[NbestList] = nbestList map { x => 33 | NbestList(x.scoredSculptures) 34 | } 35 | val candidate: Option[(Sculpture, Double)] = mappedNbestList flatMap { nbList => 36 | reranker.rerankWithScore(nbList) 37 | } 38 | candidate match { 39 | case Some((parse: PolytreeParse, cost)) => 40 | Some((parse, cost)) 41 | case _ => None 42 | } 43 | } 44 | def parse( 45 | sentence: Sentence, 46 | constraints: Set[TransitionConstraint] = Set() 47 | ): Option[PolytreeParse] = { 48 | 49 | parseWithScore(sentence, constraints) map { case (parse, _) => parse } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/reranking/ParseNodeFeature.scala: -------------------------------------------------------------------------------- 1 | package
org.allenai.nlpstack.parse.poly.reranking 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.{ FeatureName, FeatureVector } 4 | import org.allenai.nlpstack.parse.poly.polyparser.PolytreeParse 5 | 6 | import reming.LazyFormat 7 | import reming.DefaultJsonProtocol._ 8 | 9 | /** Maps a selected node of a parse tree into a feature vector. */ 10 | abstract class ParseNodeFeature extends ((PolytreeParse, Int) => FeatureVector) 11 | 12 | object ParseNodeFeature { 13 | implicit object ParseNodeFeatureJsonFormat extends LazyFormat[ParseNodeFeature] { 14 | private implicit val parseNodeFeatureUnionFormat = jsonFormat1(ParseNodeFeatureUnion.apply) 15 | 16 | private implicit val transformedNeighborhoodFeatureFormat = 17 | jsonFormat2(TransformedNeighborhoodFeature.apply) 18 | 19 | override val delegate = parentFormat[ParseNodeFeature]( 20 | childFormat[ParseNodeFeatureUnion, ParseNodeFeature], 21 | childFormat[TransformedNeighborhoodFeature, ParseNodeFeature] 22 | ) 23 | } 24 | } 25 | 26 | /** A ParseNodeFeatureUnion merges the output of a list of features. 27 | * 28 | * @param features a list of the features we want to merge into a single feature 29 | */ 30 | case class ParseNodeFeatureUnion( 31 | features: Seq[ParseNodeFeature] 32 | ) extends ParseNodeFeature { 33 | 34 | override def apply(parse: PolytreeParse, token: Int): FeatureVector = { 35 | features map (f => f(parse, token)) reduce ((m1, m2) => FeatureVector.mergeVectors(m1, m2)) 36 | } 37 | } 38 | 39 | /** A TransformedNeighborhoodFeature creates a feature vector from a set of neighborhood 40 | * extractors and transforms. 41 | * 42 | * @param neighborhoodExtractors the neighborhood extractors you want to apply to each parse node 43 | * @param transforms the transforms you want to apply to the extracted neighborhoods 44 | */ 45 | case class TransformedNeighborhoodFeature( 46 | neighborhoodExtractors: Seq[(String, NeighborhoodExtractor)], 47 | transforms: Seq[(String, NeighborhoodTransform)] 48 | ) extends ParseNodeFeature { 49 | 50 | override def apply(parse: PolytreeParse, token: Int): FeatureVector = { 51 | FeatureVector( 52 | for { 53 | (extractorName, extractor) <- neighborhoodExtractors 54 | neighborhood <- extractor(parse, token) 55 | (transformName, transform) <- transforms 56 | transformedNeighborhood <- transform(parse, neighborhood) 57 | } yield { 58 | val featureName = (Seq(extractorName, transformName) map { x => Symbol(x) }) ++ 59 | transformedNeighborhood.symbols 60 | FeatureName(featureName) -> 1.0 61 | } 62 | ) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/reranking/ParseRerankingFunction.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.reranking 2 | 3 | import org.allenai.nlpstack.parse.poly.eval.ParseScore 4 | import org.allenai.nlpstack.parse.poly.fsm._ 5 | import org.allenai.nlpstack.parse.poly.ml.LinearModel 6 | import org.allenai.nlpstack.parse.poly.polyparser.PolytreeParse 7 | 8 | /** Rescores a parse tree according to a specified scoring function. 
9 | * 10 | * @param scoringFunction the desired scoring function 11 | */ 12 | case class ParseRerankingFunction(scoringFunction: ParseScore) 13 | extends RerankingFunction { 14 | 15 | override def apply(sculpture: Sculpture, baseCost: Double): Double = { 16 | sculpture match { 17 | case parse: PolytreeParse => 1.0 - scoringFunction(parse) 18 | case _ => 1.0 19 | } 20 | } 21 | } 22 | 23 | /** Rescores a parse tree based on a linear combination of features. 24 | * 25 | * @param feature computes a feature vector from the parse tree 26 | * @param linearModel computes a linear combination of the computed features 27 | */ 28 | case class LinearParseRerankingFunction( 29 | feature: PolytreeParseFeature, 30 | linearModel: Option[LinearModel] 31 | ) extends RerankingFunction { 32 | 33 | override def apply(sculpture: Sculpture, baseCost: Double): Double = { 34 | sculpture match { 35 | case parse: PolytreeParse => 36 | linearModel.get.score(feature(parse, baseCost)) 37 | case _ => Double.PositiveInfinity 38 | } 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/reranking/PolytreeParseFeature.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.reranking 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.{ 4 | FeatureName => MLFeatureName, 5 | FeatureVector => MLFeatureVector 6 | } 7 | import org.allenai.nlpstack.parse.poly.polyparser.PolytreeParse 8 | 9 | import reming.LazyFormat 10 | import reming.DefaultJsonProtocol._ 11 | 12 | /** Maps a scored parse into a feature vector. */ 13 | abstract class PolytreeParseFeature extends ((PolytreeParse, Double) => MLFeatureVector) 14 | 15 | object PolytreeParseFeature { 16 | implicit object PolytreeParseFeatureJsonFormat extends LazyFormat[PolytreeParseFeature] { 17 | implicit val polytreeParseFeatureUnionFormat = jsonFormat1(PolytreeParseFeatureUnion.apply) 18 | implicit val baseParserScoreFeatureFormat = jsonFormat0(() => BaseParserScoreFeature) 19 | implicit val sentenceLengthFeatureFormat = jsonFormat0(() => SentenceLengthFeature) 20 | 21 | override val delegate = parentFormat[PolytreeParseFeature]( 22 | childFormat[PolytreeParseFeatureUnion, PolytreeParseFeature], 23 | childFormat[BaseParserScoreFeature.type, PolytreeParseFeature], 24 | childFormat[SentenceLengthFeature.type, PolytreeParseFeature] 25 | ) 26 | } 27 | } 28 | 29 | /** Simply passes along the length of the sentence as a feature. */ 30 | case object SentenceLengthFeature extends PolytreeParseFeature { 31 | 32 | override def apply(parse: PolytreeParse, score: Double): MLFeatureVector = { 33 | MLFeatureVector(Seq(MLFeatureName(List(name)) -> parse.sentence.tokens.tail.size)) 34 | } 35 | 36 | val name: Symbol = 'sentLen 37 | } 38 | 39 | /** Simply passes along the original score of the parse as a feature. */ 40 | case object BaseParserScoreFeature extends PolytreeParseFeature { 41 | 42 | override def apply(parse: PolytreeParse, score: Double): MLFeatureVector = { 43 | MLFeatureVector(Seq(MLFeatureName(List(name)) -> score)) 44 | } 45 | 46 | val name: Symbol = 'baseParserScore 47 | } 48 | 49 | /** A PolytreeParseFeatureUnion merges the output of a list of features. 
50 | * 51 | * @param features a list of the features we want to merge into a single feature 52 | */ 53 | case class PolytreeParseFeatureUnion( 54 | val features: Seq[PolytreeParseFeature] 55 | ) extends PolytreeParseFeature { 56 | 57 | override def apply(parse: PolytreeParse, score: Double): MLFeatureVector = { 58 | features map (f => f(parse, score)) reduce ((m1, m2) => MLFeatureVector.mergeVectors(m1, m2)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /tools/parse/src/main/universal/parse-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.parse.ClearDependencyParserMain" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx3G" 9 | 10 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/parse/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/FactorieParserSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.parse.graph.DependencyGraph 5 | import org.allenai.nlpstack.postag.defaultPostagger 6 | import org.allenai.nlpstack.tokenize.defaultTokenizer 7 | 8 | class FactorieParserSpec extends UnitSpec { 9 | private def parseTreeString(text: String) = { 10 | val tokens = defaultTokenizer.tokenize(text) 11 | val postaggedTokens = defaultPostagger.postagTokenized(tokens) 12 | 13 | val parser = new FactorieParser 14 | val parseTree = parser.dependencyGraphPostagged(postaggedTokens) 15 | 16 | DependencyGraph.multilineStringFormat.write(parseTree) 17 | } 18 | 19 | /* 20 | * When these tests fail with anything but an exception, it's a judgement call 21 | * whether the trees that the parser produces are valid parses or whether this 22 | * is a genuine error. If in doubt, consult your favorite linguist, but by and 23 | * large, don't worry too much about accuracy here. This is not a quality test 24 | * suite. 25 | */ 26 | 27 | "FactorieParser" should "correctly parse a simple sentence" in { 28 | val parseTreeStr = parseTreeString("A waffle is like a pancake with a syrup trap.") 29 | val expectedParseTreeStr = 30 | """|det(waffle-2, A-1) 31 | |nsubj(is-3, waffle-2) 32 | |root(ROOT-0, is-3) 33 | |prep(is-3, like-4) 34 | |det(pancake-6, a-5) 35 | |pobj(like-4, pancake-6) 36 | |prep(pancake-6, with-7) 37 | |det(trap-10, a-8) 38 | |nn(trap-10, syrup-9) 39 | |pobj(with-7, trap-10) 40 | |punct(is-3, .-11)""".stripMargin 41 | assert(parseTreeStr === expectedParseTreeStr) 42 | } 43 | 44 | it should "correctly parse a complicated sentence" in { 45 | // This sentence has two roots when it comes out of Factorie. 
46 | val parseTreeStr = parseTreeString("Big investment banks refused to step up to the plate, traders say.") 47 | val expectedParseTreeStr = 48 | """|amod(banks-3, Big-1) 49 | |nn(banks-3, investment-2) 50 | |nsubj(refused-4, banks-3) 51 | |root(ROOT-0, refused-4) 52 | |aux(step-6, to-5) 53 | |xcomp(refused-4, step-6) 54 | |prt(step-6, up-7) 55 | |prep(step-6, to-8) 56 | |det(plate-10, the-9) 57 | |pobj(to-8, plate-10) 58 | |punct(say-13, ,-11) 59 | |nsubj(say-13, traders-12) 60 | |punct(say-13, .-14)""".stripMargin 61 | assert(parseTreeStr === expectedParseTreeStr) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/core/SentenceSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.common.immutable.Interval 4 | import org.allenai.common.testkit.UnitSpec 5 | 6 | class SentenceSpec extends UnitSpec { 7 | // scalastyle:off 8 | 9 | ".initializeFromWhitespaceSeparatedString" should "give the correct sentence" in { 10 | Sentence.initializeFromWhitespaceSeparatedString("This is input .") shouldBe 11 | Sentence(IndexedSeq(NexusToken, Token('This), Token('is), Token('input), Token(Symbol(".")))) 12 | } 13 | 14 | it should "ignore leading and trailing whitespace" in { 15 | Sentence.initializeFromWhitespaceSeparatedString(" This is input . ") shouldBe 16 | Sentence(IndexedSeq(NexusToken, Token('This), Token('is), Token('input), Token(Symbol(".")))) 17 | } 18 | 19 | "Initializing a sentence" should "give the correct paren intervals for sent1" in { 20 | val sent1 = Sentence.initializeFromWhitespaceSeparatedString("we saw black cats") 21 | sent1.parenIntervals shouldBe Set.empty 22 | } 23 | 24 | it should "give the correct paren intervals for sent2" in { 25 | val sent2 = Sentence.initializeFromWhitespaceSeparatedString( 26 | "with the help of animals ( insects and birds ) flowers can be pollinated ( fertilized ) ." 27 | ) 28 | sent2.parenIntervals shouldBe Set(Interval.closed(6, 10), Interval.closed(15, 17)) 29 | } 30 | 31 | it should "give the correct paren intervals for sent3" in { 32 | val sent3 = Sentence.initializeFromWhitespaceSeparatedString( 33 | "with the help of animals ( ( insects ) and birds ) flowers can " + 34 | "be pollinated ( fertilized ) ." 35 | ) 36 | sent3.parenIntervals shouldBe 37 | Set(Interval.closed(7, 9), Interval.closed(6, 12), Interval.closed(17, 19)) 38 | } 39 | 40 | it should "give the correct paren intervals for sent4" in { 41 | val sent4 = Sentence.initializeFromWhitespaceSeparatedString( 42 | "with the help of animals insects ) and birds ) flowers can " + 43 | "be pollinated ( fertilized ." 
44 | ) 45 | sent4.parenIntervals shouldBe 46 | Set(Interval.closed(0, 7), Interval.closed(0, 10), Interval.closed(15, 17)) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/core/TokenSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class TokenSpec extends UnitSpec { 6 | // scalastyle:off 7 | 8 | ".getProperty" should "return the empty set for an undefined property" in { 9 | Token.create("the").getProperty('unknownProperty) shouldBe Set() 10 | } 11 | 12 | ".getDeterministicProperty" should "return the correct answer" in { 13 | val tok = Token.create("the", coarsePos = Some("DET"), finePos = Some("DT")) 14 | tok.updateProperty('myProperty, Set('good)) 15 | tok.getDeterministicProperty('cpos) shouldEqual 'DET 16 | } 17 | 18 | it should "return Token.propertyNotFound" in { 19 | val tok = Token.create("the").updateProperty('definite, Set('yes)) 20 | tok.getDeterministicProperty('indefinite) shouldEqual Token.propertyNotFound 21 | } 22 | 23 | ".updateProperty" should "override the previous value" in { 24 | val tok = Token.create("the").updateProperty('definite, Set('yes)) 25 | tok.getDeterministicProperty('definite) shouldEqual Symbol("yes") 26 | tok.updateProperty('definite, Set('no)).getDeterministicProperty('definite) shouldEqual 'no 27 | } 28 | 29 | ".extendProperty" should "extend the previous value" in { 30 | val tok = Token.create("the").extendProperty('definite, 'yes) 31 | tok.getProperty('definite) shouldBe Set('yes) 32 | tok.extendProperty('definite, 'no).getProperty('definite) shouldBe Set('yes, 'no) 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/core/TokenTaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class TokenTaggerSpec extends UnitSpec { 6 | // scalastyle:off 7 | 8 | "LexicalPropertiesTagger" should "give the correct tags" in { 9 | val sent = Sentence.initializeFromWhitespaceSeparatedString("apple and blueberry pie") 10 | LexicalPropertiesTagger.tag(Token(Symbol("hello"))) shouldBe Set() 11 | LexicalPropertiesTagger.tag(Token(Symbol("Hello"))) shouldBe Set( 12 | TokenTag(LexicalPropertiesTagger.taggerName, 'firstCap), 13 | TokenTag(LexicalPropertiesTagger.taggerName, 'existsCap) 14 | ) 15 | LexicalPropertiesTagger.tag(Token(Symbol("HELLO"))) shouldBe Set( 16 | TokenTag(LexicalPropertiesTagger.taggerName, 'firstCap), 17 | TokenTag(LexicalPropertiesTagger.taggerName, 'existsCap), 18 | TokenTag(LexicalPropertiesTagger.taggerName, 'allCaps) 19 | ) 20 | LexicalPropertiesTagger.tag(Token(Symbol("HELLO22"))) shouldBe Set( 21 | TokenTag(LexicalPropertiesTagger.taggerName, 'firstCap), 22 | TokenTag(LexicalPropertiesTagger.taggerName, 'existsCap), 23 | TokenTag(LexicalPropertiesTagger.taggerName, 'existsNum) 24 | ) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/decisiontree/DecisionTreeSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.decisiontree 2 | 3 | import 
org.allenai.common.testkit.UnitSpec 4 | 5 | class DecisionTreeSpec extends UnitSpec { 6 | 7 | val decisionTree1 = DecisionTree( 8 | outcomes = Seq(0, 1), 9 | child = Vector( 10 | Map(0 -> 1, 1 -> 2), // node 0 11 | Map(), // node 1 12 | Map(0 -> 3, 1 -> 4), // node 2 13 | Map(), // node 3 14 | Map() 15 | ), // node 4 16 | splittingFeature = Vector( 17 | Some(35), // node 0 18 | None, // node 1 19 | Some(20), // node 2 20 | None, // node 3 21 | None 22 | ), // node 4 23 | outcomeHistograms = Vector( 24 | Map(0 -> 45, 1 -> 55), // node 0 25 | Map(0 -> 29, 1 -> 9), // node 1 26 | Map(0 -> 16, 1 -> 46), // node 2 27 | Map(0 -> 5, 1 -> 10), // node 3 28 | Map(0 -> 11, 1 -> 36) 29 | ) // node 4 30 | ) 31 | 32 | "DecisionTree.outcomeHistogram" should "get node 3's histogram" in { 33 | val fv = SparseVector(outcome = None, numFeatures = 100, trueFeatures = Set(34, 35)) 34 | decisionTree1.outcomeHistogram(fv) shouldBe Map(0 -> 5, 1 -> 10) 35 | } 36 | 37 | it should "get node 1's histogram" in { 38 | val fv = SparseVector(outcome = None, numFeatures = 100, trueFeatures = Set(34, 20)) 39 | decisionTree1.outcomeHistogram(fv) shouldBe Map(0 -> 29, 1 -> 9) 40 | } 41 | 42 | "DecisionTree.allFeatures" should "return 20 and 35 for decisionTree1" in { 43 | decisionTree1.allFeatures shouldBe Set(20, 35) 44 | } 45 | 46 | "DecisionTree.outcomeDistribution" should "get node 3's smoothed distribution" in { 47 | val fv = SparseVector(outcome = None, numFeatures = 100, trueFeatures = Set(34, 20)) 48 | (math floor decisionTree1.outcomeDistribution(fv)._1.dist(0) * 1000) / 1000 shouldBe 0.755 49 | (math floor decisionTree1.outcomeDistribution(fv)._1.dist(1) * 1000) / 1000 shouldBe 0.244 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/fsm/TrainingVectorSourceSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class TrainingVectorSourceSpec extends UnitSpec { 6 | 7 | } 8 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/BrownClustersSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.parse.poly.core.{ Token, Sentence } 5 | 6 | class BrownClustersSpec extends UnitSpec { 7 | 8 | val clusters1 = BrownClusters.fromStringMap(Map( 9 | ("apple", "00"), 10 | ("cherry", "00"), 11 | ("banana", "01"), 12 | ("carrot", "100"), 13 | ("beet", "101"), 14 | ("turnip", "101"), 15 | ("celery", "11") 16 | ), Map()) 17 | 18 | val clusters2 = BrownClusters.fromStringMap(Map( 19 | ("apple", "10"), 20 | ("beet", "01") 21 | ), Map()) 22 | 23 | val sentence1 = Sentence(IndexedSeq(Token('apple), Token('and), Token('cherry), Token('beet))) 24 | 25 | "BrownClusters.getAllClusters" should "return the correct answer" in { 26 | clusters1.getAllClusters('turnip).size shouldBe 4 27 | clusters1.getAllClusters('turnip) shouldBe clusters1.getAllClusters('beet) 28 | clusters1.getAllClusters('turnip) == clusters1.getAllClusters('carrot) shouldBe false 29 | } 30 | 31 | it should "return zero for an unknown word" in { 32 | clusters1.getAllClusters('rutabaga) shouldBe List(Symbol("0")) 33 | } 34 | /* 35 | "Sentence.taggedWithBrownClusters" 
should "return the correct answer" in { 36 | sentence1.taggedWithBrownClusters(Seq(clusters1, clusters2)) shouldBe 37 | Sentence(Seq( 38 | Token('apple, Map('brown0 -> Set(Symbol("0"), Symbol("00")), 39 | 'brown1 -> Set(Symbol("1"), Symbol("10")))), 40 | Token('and, Map('brown0 -> Set[Symbol](), 41 | 'brown1 -> Set[Symbol]())), 42 | Token('cherry, Map('brown0 -> Set(Symbol("0"), Symbol("00")), 43 | 'brown1 -> Set[Symbol]())), 44 | Token('beet, Map('brown0 -> Set(Symbol("1"), Symbol("10"), Symbol("101")), 45 | 'brown1 -> Set(Symbol("0"), Symbol("01")))))) 46 | } 47 | */ 48 | } 49 | 50 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/FeatureVectorSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | import reming.{ CompactPrinter, JsonParser } 6 | 7 | class FeatureVectorSpec extends UnitSpec { 8 | // scalastyle:off 9 | 10 | val nameA = FeatureName(List('a)) 11 | val nameB = FeatureName(List('b)) 12 | val nameC = FeatureName(List('c)) 13 | 14 | "Calling .getFeatureValue" should "return the correct value" in { 15 | val vec1 = FeatureVector(Seq(nameA -> 0.5, nameB -> 0.3)) 16 | vec1.getFeatureValue(nameA) shouldBe 0.5 17 | vec1.getFeatureValue(nameB) shouldBe 0.3 18 | } 19 | 20 | it should "return zero for an unrecognized feature name" in { 21 | val vec1 = FeatureVector(Seq(nameA -> 0.5, nameB -> 0.3)) 22 | vec1.getFeatureValue(nameC) shouldBe 0 23 | } 24 | 25 | "Calling subtractVectors" should "return the correct difference vector" in { 26 | FeatureVector.subtractVectors( 27 | FeatureVector(Seq(nameA -> 1, nameB -> 5)), 28 | FeatureVector(Seq(nameA -> 3, nameC -> 4)) 29 | ).featureMap shouldBe 30 | Map(nameA -> -2, nameB -> 5, nameC -> -4) 31 | } 32 | 33 | "Calling mergeVectors" should "prioritize mappings in the first vector" in { 34 | FeatureVector.mergeVectors( 35 | FeatureVector(Seq(nameA -> 0.5, nameB -> 0.3)), 36 | FeatureVector(Seq(nameA -> 0.7, nameC -> 0.4)) 37 | ).featureMap shouldBe 38 | Map(nameA -> 0.5, nameB -> 0.3, nameC -> 0.4) 39 | } 40 | 41 | "Serializing a FeatureVector" should "preserve the vector" in { 42 | val vec1 = FeatureVector(Seq(nameA -> 0.5, nameB -> 0.3)) 43 | JsonParser.read[FeatureVector](CompactPrinter.printToString(vec1)) shouldBe vec1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/LinearModelSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class LinearModelSpec extends UnitSpec { 6 | 7 | val nameA = FeatureName(List('a)) 8 | val nameB = FeatureName(List('b)) 9 | val nameC = FeatureName(List('c)) 10 | val model1 = LinearModel(Seq((nameA, -2.0), (nameB, 3.0))) 11 | 12 | "Calling .getCoefficient" should "return the correct value" in { 13 | model1.getCoefficient(nameA) shouldBe -2.0 14 | model1.getCoefficient(nameB) shouldBe 3.0 15 | } 16 | 17 | it should "return zero for unspecified coefficients" in { 18 | model1.getCoefficient(nameC) shouldBe 0 19 | } 20 | 21 | "Calling .score" should "return the correct score" in { 22 | model1.score(FeatureVector(Seq(nameA -> 6.0, nameB -> 5.0, nameC -> -3.0))) shouldBe 3.0 23 | } 24 | } 25 | 
-------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/NgramSetSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.parse.poly.core.{ Token, Sentence } 5 | import java.io.File 6 | 7 | class NgramSetSpec extends UnitSpec { 8 | 9 | /* 10 | val ngramSet1 = NgramSet.initializeFromUnderscoreSeparatedTerms( 11 | Seq("graduated_cylinder", "Bunsen_burner", "oneword") 12 | ) 13 | 14 | 15 | "BrownClusters.getAllClusters" should "return the correct answer" in { 16 | val sentence = Sentence.initializeFromWhitespaceSeparatedString( 17 | "My Bunsen burner is better than Bunsen and his graduated cylinder" 18 | ) 19 | ngramSet1.identifyNgrams(sentence) shouldBe Set((2, 4), (10, 12)) 20 | } 21 | 22 | it should "correctly handle one-word ngrams" in { 23 | val sentence = Sentence.initializeFromWhitespaceSeparatedString( 24 | "This oneword should be easy ." 25 | ) 26 | ngramSet1.identifyNgrams(sentence) shouldBe Set((2, 3)) 27 | } 28 | 29 | it should "correctly handle the beginning-of-sentence edge case" in { 30 | val sentence = Sentence.initializeFromWhitespaceSeparatedString( 31 | "Bunsen burner ? I hardly know her ." 32 | ) 33 | println(s"ngramSet: ${ngramSet1.prefixes}") 34 | ngramSet1.identifyNgrams(sentence) shouldBe Set((1, 3)) 35 | } 36 | */ 37 | } 38 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/VerbnetSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.Logging 4 | import org.allenai.common.testkit.UnitSpec 5 | 6 | import com.typesafe.config.ConfigFactory 7 | import java.io.File 8 | 9 | import org.allenai.nlpstack.parse.poly.core.SentenceTagger 10 | 11 | class VerbnetSpec extends UnitSpec with Logging { 12 | 13 | val taggersConfig = ConfigFactory.parseFile(new File(SentenceTagger.taggersConfigFile)) 14 | val verbnetConfig = taggersConfig.getConfig("verbnet") 15 | val groupName = verbnetConfig.getString("group") 16 | val artifactName = verbnetConfig.getString("name") 17 | val version = verbnetConfig.getInt("version") 18 | val verbnet = new Verbnet(groupName, artifactName, version) 19 | 20 | "VerbnetUtil.getVerbnetClasses" should 21 | "return the correct answer for verbs present in VerbNet" in { 22 | verbnet.getVerbnetClassNames("roar") shouldBe Set( 23 | Symbol("run-51.3.2"), 24 | Symbol("weather-57"), 25 | Symbol("animal_sounds-38"), 26 | Symbol("manner_speaking-37.3"), 27 | Symbol("sound_emission-43.2") 28 | ) 29 | verbnet.getVerbnetClassNames("boast") shouldBe Set(Symbol("complain-37.8")) 30 | verbnet.getVerbnetClassNames("synthesize") shouldBe Set(Symbol("create-26.4")) 31 | verbnet.getVerbnetClassNames("run") shouldBe Set( 32 | Symbol("swarm-47.5.1-1"), 33 | Symbol("meander-47.7"), 34 | Symbol("carry-11.4"), 35 | Symbol("preparing-26.3-1"), 36 | Symbol("run-51.3.2-2-1"), 37 | Symbol("bump-18.4") 38 | ) 39 | } 40 | 41 | "VerbnetUtil.getVerbnetClasses" should 42 | "return the correct answer for words NOT present in VerbNet" in { 43 | verbnet.getVerbnetClassNames("synthesis") shouldBe Set() 44 | verbnet.getVerbnetClassNames("apple") shouldBe Set() 45 | } 46 | 47 | } 48 | 
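For readers wondering what the taggers configuration read by VerbnetSpec above must contain: the spec only requires a verbnet block with group, name, and version keys (see its getConfig/getString/getInt calls). The sketch below builds such a config in code purely for illustration; the values are hypothetical placeholders, not the real datastore coordinates.

import com.typesafe.config.ConfigFactory

object VerbnetConfigSketch extends App {
  // Hypothetical config with the same shape VerbnetSpec expects to find on disk.
  val taggersConfig = ConfigFactory.parseString(
    """verbnet {
      |  group = "org.example.group"   # placeholder
      |  name = "example-verbnet"      # placeholder
      |  version = 1                   # placeholder
      |}""".stripMargin
  )
  val verbnetConfig = taggersConfig.getConfig("verbnet")
  println(verbnetConfig.getString("group"))
  println(verbnetConfig.getString("name"))
  println(verbnetConfig.getInt("version"))
}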
-------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/polyparser/ArcInverterSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class ArcInverterSpec extends UnitSpec { 6 | 7 | "Calling apply" should "give back a modified parse for a simple parse" in { 8 | val inverter: ArcInverter = 9 | new ArcInverter( 10 | Set(SingleSymbolArcLabel('det), SingleSymbolArcLabel('amod), SingleSymbolArcLabel('prep)) 11 | ) 12 | inverter(PolytreeParseTestData.parse1) shouldBe PolytreeParseTestData.parse1b 13 | } 14 | 15 | it should "give back the same parse with no inverting labels" in { 16 | val inverter: ArcInverter = new ArcInverter(Set()) 17 | inverter(PolytreeParseTestData.parse1) shouldBe PolytreeParseTestData.parse1 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/polyparser/MultiWordTaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.parse.poly.core.{ Sentence, Token } 5 | 6 | class MultiWordTaggerSpec extends UnitSpec { 7 | // scalastyle:off 8 | 9 | import MultiWordTagger.{ mweSymbol, symbolFor, mweValue } 10 | 11 | val string = "I like pizza because of the Ninja Turtles" 12 | val sentence = Sentence(string.split(" ").map(x => Token(Symbol(x)))) 13 | val mwe1 = IndexedSeq('pizza, 'because, 'of) 14 | val mwe2 = IndexedSeq('because, 'of) 15 | val mwe3 = IndexedSeq('I, 'like) 16 | val mwe4 = IndexedSeq('Ninja, 'Turtles) 17 | val mwe5 = IndexedSeq('Turtles) 18 | val dictionary = Set(mwe1, mwe2, mwe3, mwe4, mwe5) 19 | //val tagger = MultiWordTagger(dictionary) 20 | //val got = tagger(sentence) 21 | 22 | // Empty property map 23 | val propNone = Map.empty[Symbol, String] 24 | 25 | // Property map containing "part of mwe" property 26 | val mweProp = Map(mweSymbol -> mweValue) 27 | 28 | // Expected property map for a token in the given mwe 29 | def propFor(mwe: IndexedSeq[Symbol]) = mweProp + (symbolFor(mwe) -> mweValue) 30 | 31 | /* 32 | "MultiWordTagger" should "predict properties correctly" in { 33 | val expected = Seq( 34 | propFor(mwe3), // I 35 | propFor(mwe3), // like 36 | propFor(mwe1), // pizza 37 | propFor(mwe1) ++ propFor(mwe2), // because 38 | propFor(mwe1) ++ propFor(mwe2), // of 39 | propNone, // the 40 | propFor(mwe4), // Ninja 41 | propFor(mwe4) ++ propFor(mwe5)) // Turtles 42 | val predicted = tagger(sentence).tokens.map(_.properties) 43 | assert(expected == predicted) 44 | } 45 | */ 46 | } 47 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/polyparser/PolytreeParseFeatureSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.parse.poly.core.{ Sentence, NexusToken, Token } 5 | import org.allenai.nlpstack.parse.poly.ml.{ FeatureVector => MLFeatureVector, FeatureName => MLFeatureName } 6 | import org.allenai.nlpstack.parse.poly.reranking.BaseParserScoreFeature 7 | 8 | class 
PolytreeParseFeatureSpec extends UnitSpec { 9 | // scalastyle:off 10 | 11 | /** This represents the following polytree parse: 12 | * format: OFF 13 | * 14 | * NEXUS_0 15 | * | 16 | * | the_1-- 17 | * | \ 18 | * | -->cat_2 19 | * \ / 20 | * -----> sat_3-- 21 | * / 22 | * by_4 -- 23 | * \ 24 | * --> me_5 25 | * 26 | * format: ON 27 | */ 28 | val parse1 = PolytreeParse( 29 | sentence = Sentence(Vector(NexusToken, Token('the), Token('cat), Token('sat), 30 | Token('by), Token('me))), 31 | breadcrumb = Vector(-1, 2, 3, 0, 3, 4), 32 | children = Vector(Set(3), Set(2), Set(), Set(2), Set(3, 5), Set()), 33 | arclabels = 34 | Vector( 35 | Set((3, SingleSymbolArcLabel('root))), 36 | Set((2, SingleSymbolArcLabel('det))), 37 | Set((1, SingleSymbolArcLabel('det)), (3, SingleSymbolArcLabel('nsubj))), 38 | Set((0, SingleSymbolArcLabel('root)), (2, SingleSymbolArcLabel('nsubj)), 39 | (4, SingleSymbolArcLabel('prep))), 40 | Set((3, SingleSymbolArcLabel('prep)), (5, SingleSymbolArcLabel('pobj))), 41 | Set((4, SingleSymbolArcLabel('pobj))) 42 | ) 43 | ) 44 | 45 | "Calling the .apply method of BaseParserScoreFeature" should "return the correct value" in { 46 | val featureName = BaseParserScoreFeature.name 47 | BaseParserScoreFeature(parse1, 12.0) shouldBe MLFeatureVector(Seq( 48 | MLFeatureName(List(featureName)) -> 12.0 49 | )) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/postag/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | dependencyOverrides += "org.apache.commons" % "commons-compress" % "1.8" 4 | 5 | libraryDependencies ++= loggingDependencies 6 | -------------------------------------------------------------------------------- /tools/postag/src/main/scala/org/allenai/nlpstack/postag/FactoriePostagger.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.postag.FactoriePostagger.factorieFormat 5 | import org.allenai.nlpstack.tokenize.FactorieTokenizer 6 | import org.allenai.datastore.Datastore 7 | 8 | import cc.factorie.app.nlp.{ Document => FactorieDocument } 9 | import cc.factorie.app.nlp.pos.OntonotesForwardPosTagger 10 | import cc.factorie.app.nlp.pos.PennPosTag 11 | 12 | /** This is thread-safe. The only thing we call on OntonotesForwardPosTagger is 13 | * predict(), and predict() is threadsafe. I don't know about the other methods 14 | * on OntonotesForwardPosTagger. 
15 | */ 16 | class FactoriePostagger extends Postagger { 17 | val tagger = FactoriePostagger.tagger 18 | 19 | override def postagTokenized(tokens: Seq[Token]): Seq[PostaggedToken] = { 20 | val factorieDoc = FactorieTokenizer.factorieFormat.write(tokens) 21 | val factorieTokens = factorieDoc.tokens.toSeq 22 | 23 | tagger.predict(factorieTokens) // modifies factorieTokens 24 | 25 | factorieFormat.read(factorieDoc) 26 | } 27 | } 28 | 29 | object FactoriePostagger { 30 | private val tagger = 31 | new OntonotesForwardPosTagger( 32 | Datastore.filePath( 33 | "cc.factorie.app.nlp.pos", 34 | "OntonotesForwardPosTagger.factorie", 35 | 1 36 | ).toUri.toURL 37 | ) 38 | 39 | object factorieFormat extends Format[Seq[PostaggedToken], FactorieDocument] { 40 | override def read(from: FactorieDocument): Seq[PostaggedToken] = 41 | from.tokens.map(t => PostaggedToken( 42 | tagger.tokenAnnotationString(t), 43 | t.string, 44 | t.stringStart 45 | )).toSeq 46 | 47 | override def write(from: Seq[PostaggedToken]): FactorieDocument = { 48 | val factorieDoc = FactorieTokenizer.factorieFormat.write(from) 49 | require(factorieDoc.tokenCount == from.size) 50 | (from, factorieDoc.tokens).zipped.foreach((token, factorieToken) => { 51 | factorieToken.attr += new PennPosTag(factorieToken, token.postag) 52 | factorieToken.attr += token 53 | }) 54 | factorieDoc 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tools/postag/src/main/scala/org/allenai/nlpstack/postag/OpenNlpPostagger.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.nlpstack.core._ 5 | 6 | import opennlp.tools.postag.{ POSTaggerME, POSModel } 7 | 8 | class OpenNlpPostagger extends Postagger { 9 | private val postagger = new POSTaggerME(OpenNlpPostagger.model) 10 | 11 | override def postagTokenized(tokens: Seq[Token]): Seq[PostaggedToken] = { 12 | val postags = postagger.tag(tokens.iterator.map(_.string).toArray) 13 | (tokens zip postags).map { 14 | case (token, postag) => 15 | val fixedPostag = if (token.string == "-") "HYPH" else postag 16 | PostaggedToken(token, fixedPostag) 17 | } 18 | } 19 | } 20 | 21 | object OpenNlpPostagger { 22 | private val defaultModelName = "en-pos-maxent.bin" 23 | private val model = 24 | Resource.using(this.getClass.getClassLoader.getResourceAsStream(defaultModelName)) { is => 25 | new POSModel(is) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tools/postag/src/main/scala/org/allenai/nlpstack/postag/StanfordPostagger.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | import edu.stanford.nlp.ling.CoreLabel 4 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 5 | 6 | import org.allenai.nlpstack.core._ 7 | import org.allenai.datastore.Datastore 8 | 9 | import java.net.URL 10 | import scala.collection.JavaConverters._ 11 | 12 | class StanfordPostagger( 13 | val tagger: MaxentTagger 14 | ) extends Postagger { 15 | 16 | def this() = this(StanfordPostagger.loadDefaultModel()) 17 | 18 | override def postagTokenized(tokens: Seq[Token]): Seq[PostaggedToken] = { 19 | val labels = tokens.map { token => 20 | val corelabel = new CoreLabel() 21 | corelabel.setWord(token.string) 22 | corelabel 23 | } 24 | val postags = tagger.tagSentence(labels.asJava).asScala.map(_.tag()) 25 | 26 | (tokens zip postags).map { 
27 | case (token, postag) => 28 | PostaggedToken(token, postag) 29 | } 30 | } 31 | } 32 | 33 | object StanfordPostagger { 34 | def loadDefaultModel(): MaxentTagger = { 35 | val filePath = Datastore.directoryPath( 36 | "edu.stanford.nlp.models.pos-tagger", 37 | "english-left3words-3.4.1", 38 | 1 39 | ) 40 | new MaxentTagger(filePath.toString + "/english-left3words/english-left3words-distsim.tagger") 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tools/postag/src/main/scala/org/allenai/nlpstack/postag/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.Postagger 4 | 5 | package object postag { 6 | val defaultPostagger: Postagger = new FactoriePostagger 7 | } 8 | -------------------------------------------------------------------------------- /tools/postag/src/main/universal/postag-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.postag.OpenNlpPostagger" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx128M" 9 | 10 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/postag/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/postag/src/test/scala/org/allenai/nlpstack/postag/FactoriePostaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | class FactoriePostaggerSpec extends PostaggerSpec { 4 | val taggerToTest = new FactoriePostagger 5 | } 6 | -------------------------------------------------------------------------------- /tools/postag/src/test/scala/org/allenai/nlpstack/postag/OpenNlpPostaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package postag 3 | 4 | class OpenNlpPostaggerSpec extends PostaggerSpec { 5 | // The OpenNLP postagger disagrees about the tags for "Pardon", where it is wrong, and "snub", 6 | // where the difference is acceptable. 7 | protected override def taggedTexts = Seq( 8 | super.taggedTexts(0), 9 | """|Pardon 0 NNP 10 | |me 7 PRP 11 | |for 10 IN 12 | |asking 14 VBG 13 | |, 20 , 14 | |sir 22 NN 15 | |, 25 , 16 | |but 27 CC 17 | |what 31 WP 18 | |good 36 JJ 19 | |are 41 VBP 20 | |snub 45 NN 21 | |fighters 50 NNS 22 | |going 59 VBG 23 | |to 65 TO 24 | |be 68 VB 25 | |against 71 IN 26 | |that 79 DT 27 | |? 
83 .""".stripMargin 28 | ) 29 | 30 | val taggerToTest = new OpenNlpPostagger 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tools/postag/src/test/scala/org/allenai/nlpstack/postag/PostaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.Postagger 5 | import org.allenai.nlpstack.tokenize.defaultTokenizer 6 | 7 | abstract class PostaggerSpec extends UnitSpec { 8 | def taggerToTest: Postagger 9 | 10 | protected val texts = Seq( 11 | """|The battle station is heavily shielded and carries a firepower greater 12 | |than half the star fleet. Its defenses are designed around a direct, 13 | |large-scale assault. A small one-man fighter should be able to 14 | |penetrate the outer defense.""".stripMargin, 15 | """|Pardon me for asking, sir, but what good are snub fighters going to be 16 | |against that?""".stripMargin 17 | ) 18 | 19 | protected def taggedTexts = Seq( 20 | """|The 0 DT 21 | |battle 4 NN 22 | |station 11 NN 23 | |is 19 VBZ 24 | |heavily 22 RB 25 | |shielded 30 VBN 26 | |and 39 CC 27 | |carries 43 VBZ 28 | |a 51 DT 29 | |firepower 53 NN 30 | |greater 63 JJR 31 | |than 71 IN 32 | |half 76 PDT 33 | |the 81 DT 34 | |star 85 NN 35 | |fleet 90 NN 36 | |. 95 . 37 | |Its 97 PRP$ 38 | |defenses 101 NNS 39 | |are 110 VBP 40 | |designed 114 VBN 41 | |around 123 IN 42 | |a 130 DT 43 | |direct 132 JJ 44 | |, 138 , 45 | |large-scale 140 JJ 46 | |assault 152 NN 47 | |. 159 . 48 | |A 161 DT 49 | |small 163 JJ 50 | |one-man 169 JJ 51 | |fighter 177 NN 52 | |should 185 MD 53 | |be 192 VB 54 | |able 195 JJ 55 | |to 200 TO 56 | |penetrate 203 VB 57 | |the 213 DT 58 | |outer 217 JJ 59 | |defense 223 NN 60 | |. 230 .""".stripMargin, 61 | """|Pardon 0 VB 62 | |me 7 PRP 63 | |for 10 IN 64 | |asking 14 VBG 65 | |, 20 , 66 | |sir 22 NN 67 | |, 25 , 68 | |but 27 CC 69 | |what 31 WP 70 | |good 36 JJ 71 | |are 41 VBP 72 | |snub 45 JJ 73 | |fighters 50 NNS 74 | |going 59 VBG 75 | |to 65 TO 76 | |be 68 VB 77 | |against 71 IN 78 | |that 79 DT 79 | |? 
83 .""".stripMargin 80 | ) 81 | 82 | "postagger implementation" should "correctly postag two example sentences" in { 83 | for ((text, expected) <- texts zip taggedTexts) { 84 | val tagged = taggerToTest.postag(defaultTokenizer)(text) 85 | val taggedString = tagged.mkString("\n") 86 | assert(taggedString === expected) 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /tools/segment/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | libraryDependencies ++= loggingDependencies 4 | -------------------------------------------------------------------------------- /tools/segment/src/main/scala/org/allenai/nlpstack/segment/ChalkSentenceSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package segment 3 | 4 | @deprecated("Please use defaultSegmenter instead", "2014-06-24") 5 | class ChalkSentenceSegmenter extends FactorieSegmenter 6 | -------------------------------------------------------------------------------- /tools/segment/src/main/scala/org/allenai/nlpstack/segment/FactorieSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.segment 2 | 3 | import org.allenai.nlpstack.core.{ FactorieUtilities, Segment, Segmenter } 4 | 5 | import cc.factorie.app.nlp._ 6 | import cc.factorie.app.nlp.segment.{ DeterministicSentenceSegmenter, DeterministicTokenizer } 7 | 8 | class FactorieSegmenter extends Segmenter { 9 | /* This is a bit unfortunate. In Factorie, you tokenize first, and then 10 | * segment. In nlpstack, it's the other way around. We solve the problem by 11 | * tokenizing twice, once here to get the sentences, and then again in 12 | * FactorieTokenizer. */ 13 | private val tokenizer = 14 | new DeterministicTokenizer(tokenizeAllDashedWords = true) 15 | private val segmenter = DeterministicSentenceSegmenter 16 | private val map = new MutableDocumentAnnotatorMap ++= 17 | DocumentAnnotatorPipeline.defaultDocumentAnnotationMap 18 | map += tokenizer 19 | map += segmenter 20 | private val pipeline = DocumentAnnotatorPipeline( 21 | map = map.toMap, 22 | prereqs = Nil, 23 | segmenter.postAttrs 24 | ) 25 | 26 | override def segment(document: String): Iterable[Segment] = { 27 | val doc = pipeline.process( 28 | new Document( 29 | FactorieUtilities.replaceUnclosedTag(document) 30 | ) 31 | ) 32 | 33 | for (sentence <- doc.sentences) yield { 34 | new Segment(sentence.documentString, sentence.tokens(0).stringStart) 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tools/segment/src/main/scala/org/allenai/nlpstack/segment/StanfordSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.segment 2 | 3 | import java.util.Properties 4 | 5 | import edu.stanford.nlp.ling.CoreAnnotations 6 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation 7 | import edu.stanford.nlp.pipeline.{ Annotation, StanfordCoreNLP } 8 | import org.allenai.nlpstack.core.{ Segment, Segmenter } 9 | import org.slf4j.bridge.SLF4JBridgeHandler 10 | 11 | import scala.collection.JavaConverters._ 12 | 13 | object StanfordSegmenter extends Segmenter { 14 | // redirect stanford's logging 15 | SLF4JBridgeHandler.removeHandlersForRootLogger() 16 | SLF4JBridgeHandler.install() 17 | 18 | /* This is a bit unfortunate. 
In Stanford, you tokenize first, and then 19 | * segment. In nlpstack, it's the other way around. We solve the problem by 20 | * tokenizing twice, once here to get the sentences, and then again in 21 | * StanfordTokenizer. */ 22 | 23 | private val pipeline = { 24 | val props = new Properties() 25 | props.put("annotators", "tokenize, ssplit") 26 | new StanfordCoreNLP(props) 27 | } 28 | 29 | override def segment(document: String): Iterable[Segment] = { 30 | val annotation = new Annotation(document) 31 | pipeline.annotate(annotation) 32 | annotation.get(classOf[SentencesAnnotation]).asScala.map { sentence => 33 | val start = sentence.get(classOf[CoreAnnotations.CharacterOffsetBeginAnnotation]) 34 | val end = sentence.get(classOf[CoreAnnotations.CharacterOffsetEndAnnotation]) 35 | Segment(document.substring(start, end), start) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tools/segment/src/main/scala/org/allenai/nlpstack/segment/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.Segmenter 4 | 5 | package object segment { 6 | def defaultSegmenter: Segmenter = new FactorieSegmenter 7 | } 8 | -------------------------------------------------------------------------------- /tools/segment/src/main/universal/segment-server.scala: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.segment.ChalkSentenceSegmenter" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx256M" 9 | 10 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/segment/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/segment/src/test/scala/org/allenai/nlpstack/segment/ChalkSentenceSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package segment 3 | 4 | import org.allenai.common.testkit.UnitSpec 5 | import org.allenai.nlpstack.core.Segment 6 | 7 | class ChalkSentencerSpec extends UnitSpec { 8 | val sentencer = new ChalkSentenceSegmenter 9 | val document = "He went to work. He bought a suit. He ate a melon." 
10 | "chalk sentencer" should "properly segment" in { 11 | val segments = sentencer.segment(document).toIndexedSeq 12 | assert(segments(0) === Segment("He went to work.", 0)) 13 | assert(segments(1) === Segment("He bought a suit.", 18)) 14 | assert(segments(2) === Segment("He ate a melon.", 37)) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /tools/segment/src/test/scala/org/allenai/nlpstack/segment/FactorieSegmenterSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.segment 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.Segment 5 | 6 | import org.apache.commons.io.IOUtils 7 | 8 | class FactorieSegmenterSpec extends UnitSpec { 9 | val sentencer = new FactorieSegmenter 10 | val document = "He went to work. He bought a first-class suit. He ate a melon." 11 | 12 | "factorie sentencer" should "properly segment" in { 13 | val segments = sentencer.segment(document).toIndexedSeq 14 | assert(segments(0) === Segment("He went to work.", 0)) 15 | assert(segments(1) === Segment("He bought a first-class suit.", 18)) 16 | assert(segments(2) === Segment("He ate a melon.", 48)) 17 | } 18 | 19 | it should "not throw an exception for a long string" in { 20 | val s = 21 | IOUtils.toString( 22 | this.getClass.getResourceAsStream("/org/allenai/nlpstack/segment/unclosed_tag_test.txt"), 23 | "UTF-8" 24 | ) 25 | sentencer.segment(s) 26 | } 27 | 28 | it should "not interpret dollar symbols as regex backreferences" in { 29 | val s = "<" + "$2" + "x" * 98 30 | sentencer.segment(s) 31 | } 32 | 33 | } -------------------------------------------------------------------------------- /tools/tokenize/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | javaOptions in (Test, test) := Seq("-Xss1m") 4 | 5 | fork in (Test, test) := true 6 | 7 | libraryDependencies ++= loggingDependencies 8 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/FactorieTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import org.allenai.nlpstack.core.{ FactorieUtilities, Format, Tokenizer, Token } 4 | import org.allenai.nlpstack.tokenize.FactorieTokenizer.factorieFormat 5 | 6 | import cc.factorie.app.nlp.{ 7 | Document => FactorieDocument, 8 | Token => FactorieToken, 9 | DocumentAnnotatorPipeline, 10 | MutableDocumentAnnotatorMap 11 | } 12 | import cc.factorie.app.nlp.segment.DeterministicTokenizer 13 | 14 | class FactorieTokenizer extends Tokenizer { 15 | private val tokenizer = 16 | new DeterministicTokenizer(tokenizeAllDashedWords = false) 17 | private val map = new MutableDocumentAnnotatorMap ++= 18 | DocumentAnnotatorPipeline.defaultDocumentAnnotationMap 19 | map += tokenizer 20 | private val pipeline = DocumentAnnotatorPipeline( 21 | map = map.toMap, 22 | prereqs = Nil, 23 | tokenizer.postAttrs 24 | ) 25 | 26 | def tokenize(sentence: String): Seq[Token] = { 27 | val doc = pipeline.process( 28 | new FactorieDocument( 29 | FactorieUtilities.replaceUnclosedTag(sentence) 30 | ) 31 | ) 32 | 33 | factorieFormat.read(doc) 34 | } 35 | } 36 | 37 | object FactorieTokenizer { 38 | object factorieFormat extends Format[Seq[Token], FactorieDocument] { 39 | override def read(from: FactorieDocument): Seq[Token] = 40 | for (section <- 
from.sections; token <- section.tokens) 41 | yield Token(token.string, token.stringStart) 42 | 43 | override def write(from: Seq[Token]): FactorieDocument = { 44 | val factorieDoc = new FactorieDocument(Tokenizer.originalText(from)) 45 | for (token <- from) { 46 | // creating factorie tokens modifies the factorie document 47 | val factorieToken = new FactorieToken( 48 | factorieDoc, 49 | token.offset, 50 | token.offset + token.string.length 51 | ) 52 | factorieToken.attr += token 53 | } 54 | factorieDoc 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/PennTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | 5 | import java.util.regex._ 6 | 7 | /* The PennTokenizer was used to tokenize the Penn Treebank. 8 | * The following is a translation from a sed file. This algorithm 9 | * is entirely deterministic. It is composed of regular expression 10 | * replacements. 11 | * 12 | * @author Michael Schmitz 13 | */ 14 | object PennTokenizer extends Tokenizer { 15 | val replacements = List( 16 | // attempt to get correct directional quotes 17 | ("^\"", "`` "), 18 | //("""([ (\[{<])""", "$1 `` "), 19 | ("""\.\.\.""", " ... "), 20 | ("[,;:@#$%&]", " $0 "), 21 | ("""([^.]\)\([.])([])}>"']*)[ ]*$""", "$1 $2$3 "), // scalastyle:ignore 22 | ("[?!]", " $0 "), 23 | ("""[](){}<>]""", " $0 "), 24 | ("--", " $0 "), 25 | ("$|^", " "), 26 | ("\"", " '' "), 27 | (""" ([^'])' """, " '$1 "), 28 | ("""'([sSmMdD]) """, " '$1 "), 29 | ("'(ll|re|ve|LL|RE|VE) ", " '$1 "), 30 | ("(n't|N'T) ", " $1 ") 31 | ).map { 32 | case (a, b) => 33 | (Pattern.compile(a), b) 34 | } 35 | 36 | def tokenize(sentence: String) = { 37 | val split = replacements.foldRight(sentence) { 38 | case ((t, r), s) => 39 | t.matcher(s).replaceAll(r) 40 | }.trim.split("\\s+") 41 | 42 | Tokenizer.computeOffsets(split, sentence) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/RemoteTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | import org.allenai.nlpstack.core.remote.Remote 5 | 6 | import scala.concurrent.ExecutionContext 7 | 8 | class RemoteTokenizer(val urlString: String)(implicit executionContext: ExecutionContext) 9 | extends Tokenizer with Remote { 10 | def tokenize(sentence: String) = { 11 | val response = post(sentence) 12 | Tokenizer.multilineStringFormat.read(response) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/SimpleEnglishTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | @deprecated("Please use defaultTokenizer instead", "2014-06-19") 4 | class SimpleEnglishTokenizer extends FactorieTokenizer -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/StanfordTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import java.io.StringReader 4 | 
import org.slf4j.bridge.SLF4JBridgeHandler 5 | 6 | import scala.collection.{ mutable, JavaConverters } 7 | 8 | import edu.stanford.nlp.process.PTBTokenizer 9 | import org.allenai.nlpstack.core.{ Token, Tokenizer } 10 | 11 | object StanfordTokenizer extends Tokenizer { 12 | // redirect stanford's logging 13 | SLF4JBridgeHandler.removeHandlersForRootLogger() 14 | SLF4JBridgeHandler.install() 15 | 16 | var averageTokenLength = 6 // low estimates are better 17 | private val tokenizerFactory = PTBTokenizer.factory() 18 | tokenizerFactory.setOptions("untokenizable=allKeep") 19 | 20 | def tokenize(sentence: String) = { 21 | val reader = new StringReader(sentence) 22 | val tokenizer = tokenizerFactory.getTokenizer(reader) 23 | val result = new mutable.ArrayBuffer[Token](sentence.length / averageTokenLength) 24 | 25 | while (tokenizer.hasNext) { 26 | val token = tokenizer.next() 27 | result += Token(sentence.substring(token.beginPosition(), token.endPosition()), token.beginPosition()) 28 | } 29 | result 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/WhitespaceTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | 5 | /* The PennTokenizer was used to tokenize the Penn Treebank. 6 | * The following is a translation from a sed file. This algorithm 7 | * is entirely deterministic. It is composed of regular expression 8 | * replacements. 9 | * 10 | * @author Michael Schmitz 11 | */ 12 | object WhitespaceTokenizer extends Tokenizer { 13 | override def tokenize(string: String) = 14 | Tokenizer.computeOffsets(string.split("\\s+").toSeq, string) 15 | } 16 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | 5 | package object tokenize { 6 | def defaultTokenizer: Tokenizer = StanfordTokenizer 7 | } 8 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/universal/tokenize-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.tokenize.ChalkTokenizer" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx128M" 9 | 10 | . 
"${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/tokenize/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/tokenize/src/test/scala/org/allenai/nlpstack/tokenize/FactorieTokenizerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package chunk 3 | 4 | import org.allenai.nlpstack.tokenize._ 5 | 6 | class FactorieTokenizerSpec extends TokenizerSpec { 7 | override val tokenizedTestSentences = Seq( 8 | """|The 0 9 | |battle 4 10 | |station 11 11 | |is 19 12 | |heavily 22 13 | |shielded 30 14 | |and 39 15 | |carries 43 16 | |a 51 17 | |firepower 53 18 | |greater 63 19 | |than 71 20 | |half 76 21 | |the 81 22 | |star 85 23 | |fleet 90 24 | |. 95 25 | |Its 97 26 | |defenses 101 27 | |are 110 28 | |designed 114 29 | |around 123 30 | |a 130 31 | |direct 132 32 | |, 138 33 | |large 140 34 | |- 145 35 | |scale 146 36 | |assault 152 37 | |. 159 38 | |A 161 39 | |small 163 40 | |one 169 41 | |- 172 42 | |man 173 43 | |fighter 177 44 | |should 185 45 | |be 192 46 | |able 195 47 | |to 200 48 | |penetrate 203 49 | |the 213 50 | |outer 217 51 | |defense 223 52 | |. 230""".stripMargin, 53 | """|Pardon 0 54 | |me 7 55 | |for 10 56 | |asking 14 57 | |, 20 58 | |sir 22 59 | |, 25 60 | |but 27 61 | |what 31 62 | |good 36 63 | |are 41 64 | |snub 45 65 | |fighters 50 66 | |going 59 67 | |to 65 68 | |be 68 69 | |against 71 70 | |that 79 71 | |? 83""".stripMargin 72 | ) 73 | 74 | val tokenizerToTest = new FactorieTokenizer 75 | } 76 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.20-SNAPSHOT" -------------------------------------------------------------------------------- /webapp/README.md: -------------------------------------------------------------------------------- 1 | # Nlpviz 2 | 3 | This project should grow to visualize various NLP tools. For example, 4 | Mark H in his experimentation with polytrees would like to visualize 5 | polytrees. Presently Nlpviz only visualizes dependencies and so it's 6 | largely a wrapper for Whatswrong (https://code.google.com/p/whatswrong/). 7 | 8 | The current functionality was taken from Nlpweb. It's difficult to add 9 | a tool to Nlpweb because it's an old project and it requires setting up 10 | and configuring a server with your NLP tool. This doesn't work for 11 | frequent experimentation. 12 | 13 | There are a multitude of NLP formats out there. Ideally we would 14 | standardize somewhat. I would rather Nlpviz not turn into a tool 15 | that takes in every format out there. Rather, I'd rather have a 16 | separate tool NlpCanonicalize that converts formats into what we 17 | adopt as canonical. 18 | 19 | This tool can either be used from a webpage or used programatically 20 | via POST requests. 21 | 22 | ## Running 23 | 24 | This project uses sbt as the build system. sbt can also be used to run 25 | Nlpviz. 26 | 27 | $ sbt compile 28 | $ sbt run 29 | 30 | Now you should have a HTTP server running at http://localhost:8080. 
To 31 | change the port, edit `src/main/resources/application.conf`. 32 | 33 | ## Future support 34 | 35 | * Polytrees 36 | * SRL frames 37 | -------------------------------------------------------------------------------- /webapp/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "webapp" 4 | 5 | libraryDependencies ++= Seq( 6 | "commons-codec" % "commons-codec" % "1.9", 7 | "org.riedelcastro" % "whatswrong" % "0.2.4" 8 | ) 9 | 10 | dependencyOverrides += "commons-io" % "commons-io" % "2.4" 11 | 12 | addLoggingDependencies(libraryDependencies) 13 | -------------------------------------------------------------------------------- /webapp/conf/deploy.conf: -------------------------------------------------------------------------------- 1 | base = { 2 | prod.deploy.host = "nlpstack.dev.allenai.org" 3 | prod.directory = "/local/deploy/" 4 | } 5 | 6 | base.webapp = { 7 | include "global_deploy.conf" 8 | project = { 9 | name = "webapp" 10 | } 11 | } 12 | 13 | prod = ${base.webapp}${base.prod} 14 | -------------------------------------------------------------------------------- /webapp/conf/global_deploy.conf: -------------------------------------------------------------------------------- 1 | // Baseline config file containing reasonable defaults and documentation of 2 | // fields. 3 | // 4 | // See https://github.com/typesafehub/config/blob/master/HOCON.md for a full 5 | // description of the Typesafe Config language. 6 | // 7 | // An example usage of this file is in example_solver_deploy.conf. 8 | project = { 9 | // SBT project name. Required. 10 | name = null 11 | // The project subdirectory. Optional; if unset, the root directory will be 12 | // used. 13 | subdirectory = ${?project.name} 14 | // Optional branch / commit / tag to checkout before building. 15 | version = null 16 | } 17 | deploy = { 18 | // Hostname to push to. Required. 19 | host = null 20 | // Directory on the remote host to push to. Required. 21 | directory = "/local/deploy/"${?project.name} 22 | // Start / stop script to run after the push is complete. Required. 23 | startup_script = "bin/"${?project.name}".sh" 24 | user = { 25 | // Full path to the ssh keypair to use when connecting to the remote host. 26 | // Required. 27 | ssh_keyfile = null 28 | // Username to connect to the remote host as. Required. 29 | ssh_username = "ec2-user" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /webapp/public/css/main.css: -------------------------------------------------------------------------------- 1 | textarea { 2 | width: 80%; 3 | white-space: pre; 4 | word-wrap: normal; 5 | } 6 | 7 | .nav-tabs a { 8 | cursor: pointer; 9 | } 10 | -------------------------------------------------------------------------------- /webapp/public/img/spinner.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/nlpstack/b41ac75f093842485a24d6540ed417964e85c2fb/webapp/public/img/spinner.gif -------------------------------------------------------------------------------- /webapp/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | NLP Web 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | NLP Web a web site to explore NLP 17 | 18 | 19 | 20 | Run NLP tools over words or sentences. 21 | Visualize serialized data representing NLP processing. 
22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /webapp/public/js/tools.js: -------------------------------------------------------------------------------- 1 | angular.module('tools', ['ui.bootstrap']); 2 | var NlpToolsCtrl = function($scope, $http) { 3 | $scope.model = { } 4 | 5 | $scope.working = true; 6 | $http.get("/api/tools") 7 | .success(function(data, status, headers, config) { 8 | $scope.working = false; 9 | $scope.model.tools = data; 10 | 11 | data.forEach(function (tool) { 12 | $scope.model.toolInfo = {}; 13 | $http.get("/api/tools/" + tool).success(function(data, status, headers, config) { 14 | $scope.model.toolInfo[tool] = data; 15 | }); 16 | }); 17 | }) 18 | .error(function(data, status, headers, config) { 19 | $scope.working = false; 20 | $scope.errorMessage = data; 21 | }); 22 | 23 | $scope.runTool = function(tool) { 24 | $scope.working = true; 25 | $http.post("/api/tools/" + tool, $scope.model[tool]) 26 | .success(function(data, status, headers, config) { 27 | $scope.working = false; 28 | $scope.errorMessage = undefined; 29 | 30 | $scope.response = {}; 31 | $scope.response[tool] = data; 32 | }) 33 | .error(function(data, status, headers, config) { 34 | $scope.working = false; 35 | $scope.response = undefined; 36 | 37 | $scope.errorMessage = data; 38 | }); 39 | } 40 | 41 | $scope.showExample = function(tool) { 42 | $scope.model[tool] = $scope.model.toolInfo[tool].example; 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /webapp/public/js/visualize.js: -------------------------------------------------------------------------------- 1 | angular.module('visualize', ['ui.bootstrap']); 2 | var VisualizeCtrl = function($scope, $http) { 3 | $scope.model = { } 4 | 5 | $scope.showExample = function() { 6 | $scope.model.dependencies = "nsubj(ran-2, Michael-1)\nroot(ROOT-0, ran-2)\n" + 7 | "prt(ran-2, down-3)\ndet(hill-5, the-4)\ndobj(ran-2, hill-5)"; 8 | $scope.visualizeDependencies(); 9 | } 10 | 11 | $scope.visualizeDependencies = function() { 12 | $scope.working = true; 13 | $http.post("/api/visualize/dependencies/base64", $scope.model.dependencies) 14 | .success(function(data, status, headers, config) { 15 | $scope.working = false; 16 | $scope.errorMessage = undefined; 17 | $scope.response = {} 18 | $scope.response.base64 = data; 19 | }) 20 | .error(function(data, status, headers, config) { 21 | $scope.working = false; 22 | $scope.errorMessage = data; 23 | $scope.response = undefined; 24 | }); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /webapp/public/tools.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | NLP Stack Tools 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | NLP Tools a web service to interact with NLP Stack tools 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | Try an example. 27 | Tool implementation is {{ model.toolInfo[tool].impl }}. 
28 | 29 | 30 | 31 | Enter text to {{ tool }}: 32 | 33 | 34 | 35 | 36 | 37 | 38 | {{ text }} 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | {{ errorMessage }} 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /webapp/public/visualize.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Nlpviz 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | Nlpviz a web service to visualize NLP 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | Try an example. 27 | 28 | 29 | 30 | Enter dependencies to visualize: 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | {{ errorMessage }} 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /webapp/src/main/bin/webapp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="org.allenai.nlpstack.webapp.Nlpweb" 4 | JVM_ARGS="-Xmx4g" 5 | 6 | SCRIPT_DIR=`dirname $0` 7 | SHORT_NAME=`basename $0 .sh` 8 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 9 | -------------------------------------------------------------------------------- /webapp/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | nlpstack.webapp { 2 | port = 8062 3 | } 4 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/BasicService.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp 2 | 3 | import spray.routing._ 4 | 5 | trait BasicService extends HttpService { 6 | val staticContentRoot = "public" 7 | 8 | // format: OFF 9 | val basicRoute = 10 | path("") { 11 | get { 12 | getFromFile(staticContentRoot + "/index.html") 13 | } 14 | } ~ 15 | pathPrefix("info") { 16 | // TODO: version route 17 | path("name") { 18 | get { 19 | complete(Nlpweb.name) 20 | } 21 | } 22 | } ~ 23 | get { 24 | unmatchedPath { p => getFromFile(staticContentRoot + p) } 25 | } 26 | // format: ON 27 | } 28 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/Nlpweb.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp 2 | 3 | import akka.actor.{ ActorSystem, Props } 4 | import akka.io.IO 5 | import akka.pattern.ask 6 | import akka.util.Timeout 7 | import com.typesafe.config.ConfigFactory 8 | import spray.can.Http 9 | 10 | import scala.concurrent.duration.DurationInt 11 | 12 | object Nlpweb { 13 | lazy val config = ConfigFactory.load() 14 | val name = "webapp" 15 | 16 | def main(args: Array[String]): Unit = { 17 | // ActorSystem to host the application in. 18 | implicit val system = ActorSystem("webapp") 19 | 20 | // Create and start our service actor. 21 | val service = system.actorOf(Props[NlpwebActor], "webapp-actor") 22 | 23 | // Start a new HTTP server with our service actor as the handler. 24 | { 25 | // Timeout for starting the spray Http server (below). 26 | implicit val timeout = Timeout(30.seconds) 27 | 28 | // IO is a scala object with an apply method that returns an ActorRef. 29 | IO(Http) ? 
Http.Bind( 30 | service, 31 | interface = "0.0.0.0", 32 | port = config.getInt("nlpstack.webapp.port") 33 | ) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/NlpwebActor.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp 2 | 3 | import akka.actor.Actor 4 | import spray.http._ 5 | import spray.routing._ 6 | import spray.util.LoggingContext 7 | 8 | class NlpwebActor extends Actor with BasicService with VisualizationService with ToolService { 9 | 10 | implicit def myExceptionHandler(implicit log: LoggingContext) = 11 | ExceptionHandler { 12 | case e: Exception => 13 | requestUri { uri => 14 | log.error(toString, e) 15 | complete(StatusCodes.InternalServerError -> e.getMessage) 16 | } 17 | } 18 | 19 | // The HttpService trait defines only one abstract member, which connects the 20 | // services environment to the enclosing actor or test. 21 | def actorRefFactory = context 22 | 23 | /** Expire cached page after 60 seconds. */ 24 | val cacheControlMaxAge = HttpHeaders.`Cache-Control`(CacheDirectives.`max-age`(0)) 25 | 26 | // This actor only runs our route, but you could add other things here, like 27 | // request stream processing or timeout handling 28 | def receive = runRoute(basicRoute ~ visualizationRoute ~ toolRoute) 29 | } 30 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/ToolService.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp 2 | 3 | import org.allenai.nlpstack.webapp.tools._ 4 | 5 | import spray.http.StatusCodes 6 | import spray.httpx.SprayJsonSupport 7 | import spray.httpx.marshalling.ToResponseMarshallable.isMarshallable 8 | import spray.json.DefaultJsonProtocol._ // IntelliJ thinks this is unused, but it's not. 9 | import spray.routing.Directive.pimpApply 10 | import spray.routing.HttpService 11 | 12 | trait ToolService extends HttpService with SprayJsonSupport { 13 | val tools = Seq( 14 | SentenceSegmenterTool, 15 | LemmatizerTool, 16 | TokenizerTool, 17 | PostaggerTool, 18 | ChunkerTool, 19 | DependencyParserTool 20 | ) 21 | 22 | // format: OFF 23 | val toolRoute = 24 | pathPrefix("api" / "tools") { 25 | // List available tools in JSON. 26 | pathEnd { 27 | get { 28 | val toolNames = tools map (_.name) 29 | complete(tools map (_.name)) 30 | } 31 | } ~ 32 | path(Segment) { segment => 33 | tools find (_.name == segment) match { 34 | case Some(tool) => 35 | // Give info about this tool. 36 | get { 37 | complete(tool.info) 38 | } ~ 39 | // Process text with this tool. 40 | post { 41 | entity(as[String]) { body => 42 | val sections: Seq[String] = tool.split(body) 43 | val results = sections map tool.apply 44 | complete(results) 45 | } 46 | } 47 | case None => 48 | // Tool not found. 
49 | complete(StatusCodes.BadRequest -> s"Unknown tool: $segment") 50 | } 51 | } 52 | } 53 | // format: ON 54 | } 55 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/ChunkerTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.{ ChunkedToken, Chunker, Writer } 4 | import org.allenai.nlpstack.webapp.Whatswrong._ 5 | 6 | import java.awt.image.BufferedImage 7 | 8 | object ChunkerTool extends Tool("chunk") with StringFormat { 9 | type Output = Seq[ChunkedToken] 10 | 11 | override def info = ToolInfo(Impl.chunker.getClass.getSimpleName, Impl.obamaSentences) 12 | 13 | override def split(input: String) = input split "\n" 14 | override def process(section: String) = { 15 | val tokens = Impl.tokenizer(section) 16 | val postags = Impl.postagger.postagTokenized(tokens) 17 | Impl.chunker.chunkPostagged(postags) 18 | } 19 | override def visualize(output: Output) = { 20 | Seq( 21 | implicitly[Writer[Output, BufferedImage]].write(output) 22 | ) 23 | } 24 | override def stringFormat = Chunker.stringFormat 25 | } 26 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/DependencyParserTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.Writer 4 | import org.allenai.nlpstack.core.parse.graph.DependencyGraph 5 | import org.allenai.nlpstack.webapp.Whatswrong._ 6 | 7 | import java.awt.image.BufferedImage 8 | 9 | object DependencyParserTool extends Tool("dependencies") with StringFormat { 10 | type Output = DependencyGraph 11 | 12 | override def info = ToolInfo(Impl.dependencyParser.getClass.getSimpleName, Impl.obamaSentences) 13 | 14 | override def split(input: String) = input split "\n" 15 | override def process(section: String) = { 16 | val tokens = Impl.tokenizer(section) 17 | val postags = Impl.postagger.postagTokenized(tokens) 18 | Impl.dependencyParser.dependencyGraphPostagged(postags) 19 | } 20 | override def visualize(output: Output) = { 21 | Seq( 22 | implicitly[Writer[Output, BufferedImage]].write(output) 23 | ) 24 | } 25 | override def stringFormat = DependencyGraph.multilineStringFormat 26 | } 27 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/Impl.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.chunk.OpenNlpChunker 4 | import org.allenai.nlpstack.lemmatize.MorphaStemmer 5 | import org.allenai.nlpstack.parse.PolytreeParser 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | import org.allenai.nlpstack.segment.defaultSegmenter 8 | import org.allenai.nlpstack.tokenize.defaultTokenizer 9 | 10 | object Impl { 11 | private[tools] val sentenceSegmenter = defaultSegmenter 12 | private[tools] val tokenizer = defaultTokenizer 13 | private[tools] val lemmatizer = new MorphaStemmer() 14 | private[tools] val postagger = defaultPostagger 15 | private[tools] val chunker = new OpenNlpChunker() 16 | private[tools] val dependencyParser = new PolytreeParser 17 | 18 | val obamaText = "Barack Hussein Obama II is the 44th and current President of the United States, and the first 
African American to hold the office. Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he served as president of the Harvard Law Review. He was a community organizer in Chicago before earning his law degree. He worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School from 1992 to 2004. He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, running unsuccessfully for the United States House of Representatives in 2000." // scalastyle:ignore 19 | val obamaSentences = sentenceSegmenter(obamaText) map (_.text) mkString "\n" 20 | } 21 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/LemmatizerTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.{ Lemmatized, Token } 4 | 5 | object LemmatizerTool extends Tool("lemmatize") { 6 | type Output = Seq[Lemmatized[Token]] 7 | 8 | override def info = ToolInfo(Impl.lemmatizer.getClass.getSimpleName, Impl.obamaSentences) 9 | 10 | override def split(input: String) = input split "\n" 11 | override def process(section: String) = { 12 | val tokens = Impl.tokenizer.tokenize(section) 13 | val postagged = Impl.postagger.postagTokenized(tokens) 14 | postagged map Impl.lemmatizer.lemmatizePostaggedToken 15 | } 16 | override def visualize(output: Output) = Seq.empty 17 | override def format(output: Output) = Seq(output mkString " ") 18 | } -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/PostaggerTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.webapp.Whatswrong._ 4 | import org.allenai.nlpstack.core.{ PostaggedToken, Postagger, Writer } 5 | 6 | import java.awt.image.BufferedImage 7 | 8 | object PostaggerTool extends Tool("postag") with StringFormat { 9 | type Output = Seq[PostaggedToken] 10 | 11 | override def info = ToolInfo(Impl.postagger.getClass.getSimpleName, Impl.obamaSentences) 12 | 13 | override def split(input: String) = input split "\n" 14 | override def process(section: String) = { 15 | val tokens = Impl.tokenizer(section) 16 | Impl.postagger.postagTokenized(tokens) 17 | } 18 | override def visualize(output: Output) = { 19 | Seq( 20 | implicitly[Writer[Output, BufferedImage]].write(output) 21 | ) 22 | } 23 | override def stringFormat = Postagger.multilineStringFormat 24 | } 25 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/SentenceSegmenterTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.Segment 4 | 5 | object SentenceSegmenterTool extends Tool("segment") { 6 | type Output = Seq[Segment] 7 | 8 | override def info = ToolInfo(Impl.sentenceSegmenter.getClass.getSimpleName, Impl.obamaText) 9 | 10 | override def split(input: String) = Seq(input) 11 | override def process(section: String) = Impl.sentenceSegmenter(section).toSeq 12 | override def visualize(output: Output) = Seq.empty 13 | override def format(output: Output) = Seq(output mkString "\n") 14 | } 
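The webapp tools above (LemmatizerTool, PostaggerTool, SentenceSegmenterTool) all wrap the same nlpstack components wired up in Impl. As a usage note, the sketch below drives that pipeline directly with the same calls the tools make (tokenize, postagTokenized, lemmatizePostaggedToken). It is only a sketch: it assumes the nlpstack tokenize, postag, and lemmatize modules, and the models they download, are available on the classpath.

import org.allenai.nlpstack.lemmatize.MorphaStemmer
import org.allenai.nlpstack.postag.defaultPostagger
import org.allenai.nlpstack.tokenize.defaultTokenizer

object PipelineSketch extends App {
  val lemmatizer = new MorphaStemmer()

  val sentence = "He worked as a civil rights attorney in Chicago."
  val tokens = defaultTokenizer.tokenize(sentence)          // as in LemmatizerTool.process
  val postagged = defaultPostagger.postagTokenized(tokens)  // as in PostaggerTool.process
  val lemmatized = postagged map lemmatizer.lemmatizePostaggedToken

  println(lemmatized mkString " ")                          // same formatting as LemmatizerTool.format
}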
-------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/TokenizerTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.{ Tokenizer, Token, Writer } 4 | import org.allenai.nlpstack.webapp.Whatswrong._ 5 | 6 | import java.awt.image.BufferedImage 7 | 8 | object TokenizerTool extends Tool("tokenize") with StringFormat { 9 | type Output = Seq[Token] 10 | 11 | override def info = ToolInfo(Impl.tokenizer.getClass.getSimpleName, Impl.obamaSentences) 12 | 13 | override def split(input: String) = input split "\n" 14 | override def process(section: String) = Impl.tokenizer(section) 15 | override def visualize(output: Output) = { 16 | Seq( 17 | implicitly[Writer[Output, BufferedImage]].write(output) 18 | ) 19 | } 20 | override def stringFormat = Tokenizer.multilineStringFormat 21 | } 22 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/Tool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.nlpstack.core.Format 5 | 6 | import org.apache.commons.codec.binary.Base64OutputStream 7 | import org.apache.commons.io.output.ByteArrayOutputStream 8 | import spray.json.DefaultJsonProtocol.{ StringJsonFormat, jsonFormat2, seqFormat } 9 | 10 | import java.awt.image.BufferedImage 11 | import javax.imageio.ImageIO 12 | 13 | /** A class for representing a tool. 14 | * 15 | * @param name the name of the tool 16 | * @param split how to divide up the input text 17 | * @param process how to process each section of the input text 18 | * @param visualize conversions of the process output to a visualization 19 | * @param format conversions of the process output to a string 20 | */ 21 | abstract class Tool(val name: String) { 22 | type Output 23 | 24 | /** This information is presented on /tools/name. */ 25 | def info: ToolInfo 26 | 27 | /** The input to all tools is a single text box. It may be split up 28 | * as the tool sees fit. For example, a sentence segmenter may not 29 | * want to split the text, but a tokenizer might want to split the 30 | * input by newline. 31 | */ 32 | def split(input: String): Seq[String] 33 | def process(section: String): Output 34 | def visualize(output: Output): Seq[BufferedImage] 35 | def format(output: Output): Seq[String] 36 | 37 | /** Process, visualize, format, and then bundle the results. 
*/ 38 | def apply(section: String): ToolResponse = { 39 | val processed = process(section) 40 | 41 | val visualizations = visualize(processed) 42 | val base64Visualizations = visualizations map { bufferedImage => 43 | Resource.using(new ByteArrayOutputStream()) { baos => 44 | Resource.using(new Base64OutputStream(baos)) { base64os => 45 | ImageIO.write(bufferedImage, "png", base64os) 46 | baos.flush() 47 | new String(baos.toByteArray()) 48 | } 49 | } 50 | } 51 | 52 | ToolResponse(format(processed), base64Visualizations) 53 | } 54 | } 55 | 56 | trait StringFormat { this: Tool => 57 | def stringFormat: Format[Output, String] 58 | def format(output: Output): Seq[String] = Seq(stringFormat.write(output)) 59 | } 60 | 61 | case class ToolInfo(impl: String, example: String) 62 | object ToolInfo { 63 | implicit val toolInfoFormat = jsonFormat2(ToolInfo.apply) 64 | } 65 | 66 | case class ToolResponse(texts: Seq[String], base64Images: Seq[String]) 67 | object ToolResponse { 68 | implicit val toolResponseJsonFormat = jsonFormat2(ToolResponse.apply) 69 | } 70 | -------------------------------------------------------------------------------- /webapp/webapp/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nlpstack-webapp", 3 | "version": "0.0.0", 4 | "description": "Webapp for showcasing nlpstack technologies", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"No test specified\" && exit 0" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/allenai/nlpstack" 12 | }, 13 | "author": "", 14 | "license": "ISC", 15 | "bugs": { 16 | "url": "https://github.com/allenai/nlpstack/issues" 17 | }, 18 | "homepage": "https://github.com/allenai/nlpstack" 19 | } 20 | --------------------------------------------------------------------------------
{{ text }}
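Editor's note: the Tool.scala scaladoc above describes the lifecycle every webapp tool follows: split the raw input, process each section, then visualize and format the result, with apply bundling everything into a ToolResponse (formatted text plus base64-encoded PNG visualizations). The sketch below is not part of the repository; it only illustrates that lifecycle end to end using the LemmatizerTool and ToolResponse definitions shown in this dump. The object name ToolUsageExample and the sample input text are invented for this example.

package org.allenai.nlpstack.webapp.tools

object ToolUsageExample extends App {
  val tool = LemmatizerTool

  // Let the tool split the raw text-box input the way it prefers
  // (LemmatizerTool splits on newlines, one section per line).
  val sections = tool.split("Obama taught constitutional law.\nHe served three terms.")

  // Tool.apply runs process, visualize, and format for one section and
  // bundles the results into a ToolResponse.
  val responses: Seq[ToolResponse] = sections map { section => tool(section) }

  responses foreach { response =>
    response.texts foreach println
    // LemmatizerTool defines no visualization, so this prints 0.
    println(s"${response.base64Images.size} visualization(s)")
  }
}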