├── .gitignore ├── README.md ├── build.sbt ├── cli ├── README.md ├── build.sbt └── src │ └── main │ ├── resources │ ├── application.conf │ └── logback.xml │ └── scala │ └── org │ └── allenai │ └── nlpstack │ └── cli │ ├── ArgumentHeadExtractorMain.scala │ ├── ChunkerMain.scala │ ├── ConstituencyParserMain.scala │ ├── DependencyParserMain.scala │ ├── LineProcessor.scala │ ├── PostaggerMain.scala │ ├── RelationHeadExtractorMain.scala │ ├── SegmenterMain.scala │ ├── SrlMain.scala │ ├── StemmerMain.scala │ └── TokenizerMain.scala ├── conf └── logback.xml ├── project ├── Dependencies.scala ├── build.properties └── plugins.sbt ├── tools ├── chunk │ ├── LICENSE │ ├── build.sbt │ ├── conf │ │ └── deploy.conf │ └── src │ │ ├── main │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── chunk │ │ │ │ ├── OpenNlpChunker.scala │ │ │ │ └── package.scala │ │ └── universal │ │ │ └── chunk-server.sh │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── chunk │ │ └── OpenNlpChunkerSpec.scala ├── core │ ├── LICENSE │ ├── build.sbt │ └── src │ │ ├── main │ │ └── scala │ │ │ └── org │ │ │ └── allenai │ │ │ └── nlpstack │ │ │ └── core │ │ │ ├── ChunkedToken.scala │ │ │ ├── Chunker.scala │ │ │ ├── ConstituencyParser.scala │ │ │ ├── DependencyParser.scala │ │ │ ├── FactorieUtilities.scala │ │ │ ├── Format.scala │ │ │ ├── HashCodeHelper.scala │ │ │ ├── IdentityStemmer.scala │ │ │ ├── Lemmatized.scala │ │ │ ├── PostaggedToken.scala │ │ │ ├── Postagger.scala │ │ │ ├── Segmenter.scala │ │ │ ├── Stemmer.scala │ │ │ ├── Token.scala │ │ │ ├── Tokenizer.scala │ │ │ ├── conf │ │ │ ├── ConfidenceFunction.scala │ │ │ ├── ConfidenceTrainer.scala │ │ │ ├── Feature.scala │ │ │ ├── FeatureSet.scala │ │ │ ├── Labelled.scala │ │ │ ├── Trainer.scala │ │ │ └── impl │ │ │ │ └── LogisticRegression.scala │ │ │ ├── coref │ │ │ └── CorefResolver.scala │ │ │ ├── graph │ │ │ ├── Bipath.scala │ │ │ ├── DirectedEdge.scala │ │ │ ├── Graph.scala │ │ │ └── pattern │ │ │ │ ├── Match.scala │ │ │ │ ├── Matcher.scala │ │ │ │ └── Pattern.scala │ │ │ ├── headword │ │ │ └── HeadExtractor.scala │ │ │ ├── parse │ │ │ └── graph │ │ │ │ ├── Dependency.scala │ │ │ │ ├── DependencyGraph.scala │ │ │ │ ├── DependencyNode.scala │ │ │ │ ├── DependencyPattern.scala │ │ │ │ ├── JoinedDependencyGraph.scala │ │ │ │ ├── JoinedDependencyNode.scala │ │ │ │ ├── TokenDependencyNode.scala │ │ │ │ └── package.scala │ │ │ ├── remote │ │ │ ├── Remote.scala │ │ │ ├── RemoteDependencyParser.scala │ │ │ ├── RemoteSegmenter.scala │ │ │ └── RemoteStemmer.scala │ │ │ ├── repr │ │ │ ├── Chunks.scala │ │ │ ├── Dependencies.scala │ │ │ ├── Document.scala │ │ │ ├── Lemmas.scala │ │ │ ├── Postags.scala │ │ │ ├── Sentence.scala │ │ │ ├── Sentenced.scala │ │ │ └── Tokens.scala │ │ │ ├── srl │ │ │ ├── Frame.scala │ │ │ ├── FrameHierarchy.scala │ │ │ ├── RemoteSrl.scala │ │ │ └── Srl.scala │ │ │ └── typer │ │ │ └── Typer.scala │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── core │ │ ├── ChunkerSpec.scala │ │ ├── CorefResolverSpec.scala │ │ ├── DependencyGraphSpec.scala │ │ ├── DependencyNodeSpec.scala │ │ ├── DependencySpec.scala │ │ ├── FormatSpec.scala │ │ ├── TokenSpec.scala │ │ └── TokenizerSpec.scala ├── headword │ ├── build.sbt │ └── src │ │ ├── main │ │ └── scala │ │ │ ├── JwiTools.scala │ │ │ └── KnowitallHeadExtractor.scala │ │ └── test │ │ └── scala │ │ ├── JwiToolsSpec.scala │ │ └── KnowitallHeadExtractorSpec.scala ├── 
lemmatize │ ├── LICENSE │ ├── build.sbt │ └── src │ │ ├── main │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── lemmatize │ │ │ │ └── MorphaStemmer.scala │ │ └── universal │ │ │ └── lemmatize-server.sh │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── lemmatize │ │ └── MorphaLemmatizerSpec.scala ├── parse │ ├── LICENSE │ ├── build.sbt │ ├── conf │ │ └── deploy.conf │ ├── jvm.sbt │ └── src │ │ ├── main │ │ ├── resources │ │ │ └── featuretaggers.config │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── parse │ │ │ │ ├── BankerProtocol.scala │ │ │ │ ├── FactorieParser.scala │ │ │ │ ├── PolytreeParser.scala │ │ │ │ ├── package.scala │ │ │ │ └── poly │ │ │ │ ├── core │ │ │ │ ├── AnnotatedSentence.scala │ │ │ │ ├── DirectedGraph.scala │ │ │ │ ├── PositionTree.scala │ │ │ │ ├── Sentence.scala │ │ │ │ ├── SentenceTagger.scala │ │ │ │ ├── TaggedSentence.scala │ │ │ │ ├── Token.scala │ │ │ │ ├── TokenTagger.scala │ │ │ │ ├── Util.scala │ │ │ │ └── WordClusters.scala │ │ │ │ ├── decisiontree │ │ │ │ ├── DecisionTree.scala │ │ │ │ ├── DecisionTreeTrainer.scala │ │ │ │ ├── FeatureVector.scala │ │ │ │ ├── FeatureVectorSource.scala │ │ │ │ ├── OmnibusTrainer.scala │ │ │ │ ├── OneVersusAll.scala │ │ │ │ ├── ProbabilisticClassifier.scala │ │ │ │ ├── RandomForest.scala │ │ │ │ └── package.scala │ │ │ │ ├── eval │ │ │ │ ├── Evaluate.scala │ │ │ │ ├── ParseAnalyzer.scala │ │ │ │ ├── ParseBank.scala │ │ │ │ ├── ParseEvaluation.scala │ │ │ │ ├── ParseScore.scala │ │ │ │ └── TaggingEvaluation.scala │ │ │ │ ├── fsm │ │ │ │ ├── ClassificationTask.scala │ │ │ │ ├── EmbeddedClassifier.scala │ │ │ │ ├── FSMTrainingVectorSource.scala │ │ │ │ ├── MarbleBlock.scala │ │ │ │ ├── NbestCorpus.scala │ │ │ │ ├── NbestSearch.scala │ │ │ │ ├── Reranker.scala │ │ │ │ ├── Sculpture.scala │ │ │ │ ├── SculptureCost.scala │ │ │ │ ├── SculptureFeature.scala │ │ │ │ ├── SculptureTrainingVectorSource.scala │ │ │ │ ├── Search.scala │ │ │ │ ├── State.scala │ │ │ │ ├── StateCostFunction.scala │ │ │ │ ├── StateCostFunctionTrainer.scala │ │ │ │ ├── StateFeature.scala │ │ │ │ ├── StateTransition.scala │ │ │ │ ├── TransitionClassifier.scala │ │ │ │ ├── TransitionConstraint.scala │ │ │ │ ├── TransitionSystem.scala │ │ │ │ └── Walk.scala │ │ │ │ ├── ml │ │ │ │ ├── BrownClusters.scala │ │ │ │ ├── FeatureVector.scala │ │ │ │ ├── GoogleNGram.scala │ │ │ │ ├── LinearModel.scala │ │ │ │ ├── TrainingData.scala │ │ │ │ ├── Verbnet.scala │ │ │ │ └── WrapperClassifier.scala │ │ │ │ ├── polyparser │ │ │ │ ├── AdaptiveTraining.scala │ │ │ │ ├── ArcEagerTransitionSystem.scala │ │ │ │ ├── ArcHybridTransitionSystem.scala │ │ │ │ ├── ArcInverter.scala │ │ │ │ ├── DependencyParsingTransitionSystem.scala │ │ │ │ ├── MultiWordTagger.scala │ │ │ │ ├── NbestParser.scala │ │ │ │ ├── Neighborhood.scala │ │ │ │ ├── ParseCache.scala │ │ │ │ ├── ParseFile.scala │ │ │ │ ├── ParsePool.scala │ │ │ │ ├── Parser.scala │ │ │ │ ├── ParserClassificationTask.scala │ │ │ │ ├── ParserConfiguration.scala │ │ │ │ ├── ParserConstraint.scala │ │ │ │ ├── PolytreeParse.scala │ │ │ │ ├── PolytreeParseSource.scala │ │ │ │ ├── RerankingTransitionParser.scala │ │ │ │ ├── StateRef.scala │ │ │ │ ├── TokenFeature.scala │ │ │ │ ├── TokenTransform.scala │ │ │ │ ├── Training.scala │ │ │ │ ├── TransitionParser.scala │ │ │ │ ├── TransitionParserFeature.scala │ │ │ │ └── TransitionParserState.scala │ │ │ │ └── reranking │ │ │ │ ├── NeighborhoodEventStatistic.scala │ │ │ 
│ ├── NeighborhoodExtractor.scala │ │ │ │ ├── NeighborhoodTransform.scala │ │ │ │ ├── OracleReranker.scala │ │ │ │ ├── ParseNodeFeature.scala │ │ │ │ ├── ParseReranker.scala │ │ │ │ ├── ParseRerankerTraining.scala │ │ │ │ ├── ParseRerankingFunction.scala │ │ │ │ ├── PolytreeParseFeature.scala │ │ │ │ └── QualityEstimation.scala │ │ └── universal │ │ │ └── parse-server.sh │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── parse │ │ ├── FactorieParserSpec.scala │ │ ├── PolytreeParserSpec.scala │ │ └── poly │ │ ├── core │ │ ├── PositionTreeSpec.scala │ │ ├── SentenceSpec.scala │ │ ├── SentenceTaggerSpec.scala │ │ ├── TokenSpec.scala │ │ └── TokenTaggerSpec.scala │ │ ├── decisiontree │ │ ├── DecisionTreeSpec.scala │ │ └── OneVersusAllSpec.scala │ │ ├── eval │ │ ├── ParseBankSpec.scala │ │ └── ParseScoreSpec.scala │ │ ├── fsm │ │ ├── ClassificationTaskSpec.scala │ │ ├── SearchSpec.scala │ │ └── TrainingVectorSourceSpec.scala │ │ ├── ml │ │ ├── BrownClustersSpec.scala │ │ ├── FeatureVectorSpec.scala │ │ ├── LinearModelSpec.scala │ │ ├── NgramSetSpec.scala │ │ └── VerbnetSpec.scala │ │ ├── polyparser │ │ ├── ArcEagerConstraintsSpec.scala │ │ ├── ArcEagerTransitionSystemSpec.scala │ │ ├── ArcHybridTransitionSystemSpec.scala │ │ ├── ArcInverterSpec.scala │ │ ├── GoldParseTrainingVectorSourceSpec.scala │ │ ├── GreedySearchSpec.scala │ │ ├── MultiWordTaggerSpec.scala │ │ ├── ParserClassificationTaskSpec.scala │ │ ├── PolytreeParseFeatureSpec.scala │ │ ├── PolytreeParseSourceSpec.scala │ │ ├── PolytreeParseSpec.scala │ │ ├── StateRefSpec.scala │ │ ├── TokenTransformSpec.scala │ │ └── TransitionSpec.scala │ │ └── reranking │ │ ├── NeighborhoodExtractorSpec.scala │ │ └── NeighborhoodTransformSpec.scala ├── postag │ ├── LICENSE │ ├── build.sbt │ └── src │ │ ├── main │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── postag │ │ │ │ ├── FactoriePostagger.scala │ │ │ │ ├── OpenNlpPostagger.scala │ │ │ │ ├── StanfordPostagger.scala │ │ │ │ └── package.scala │ │ └── universal │ │ │ └── postag-server.sh │ │ └── test │ │ ├── resources │ │ └── logback.xml │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── postag │ │ ├── FactoriePostaggerSpec.scala │ │ ├── OpenNlpPostaggerSpec.scala │ │ └── PostaggerSpec.scala ├── segment │ ├── LICENSE │ ├── build.sbt │ └── src │ │ ├── main │ │ ├── scala │ │ │ └── org │ │ │ │ └── allenai │ │ │ │ └── nlpstack │ │ │ │ └── segment │ │ │ │ ├── ChalkSentenceSegmenter.scala │ │ │ │ ├── FactorieSegmenter.scala │ │ │ │ ├── StanfordSegmenter.scala │ │ │ │ └── package.scala │ │ └── universal │ │ │ └── segment-server.scala │ │ └── test │ │ ├── resources │ │ ├── logback.xml │ │ └── org │ │ │ └── allenai │ │ │ └── nlpstack │ │ │ └── segment │ │ │ └── unclosed_tag_test.txt │ │ └── scala │ │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── segment │ │ ├── ChalkSentenceSegmenter.scala │ │ └── FactorieSegmenterSpec.scala └── tokenize │ ├── LICENSE │ ├── build.sbt │ └── src │ ├── main │ ├── scala │ │ └── org │ │ │ └── allenai │ │ │ └── nlpstack │ │ │ └── tokenize │ │ │ ├── FactorieTokenizer.scala │ │ │ ├── PennTokenizer.scala │ │ │ ├── RemoteTokenizer.scala │ │ │ ├── SimpleEnglishTokenizer.scala │ │ │ ├── StanfordTokenizer.scala │ │ │ ├── WhitespaceTokenizer.scala │ │ │ └── package.scala │ └── universal │ │ └── tokenize-server.sh │ └── test │ ├── resources │ ├── logback.xml │ └── org │ │ └── allenai │ │ └── nlpstack │ │ └── tokenize │ │ └── unclosed_tag_test.txt │ └── scala │ └── org │ 
└── allenai │ └── nlpstack │ └── tokenize │ ├── FactorieTokenizerSpec.scala │ └── TokenizerSpec.scala ├── version.sbt └── webapp ├── README.md ├── build.sbt ├── conf ├── deploy.conf └── global_deploy.conf ├── public ├── css │ ├── bootstrap-3.1.1.min.css │ └── main.css ├── img │ └── spinner.gif ├── index.html ├── js │ ├── angular-1.2.13.min.js │ ├── bootstrap-3.1.1.min.js │ ├── jquery-2.0.1.min.js │ ├── tools.js │ ├── ui-bootstrap-tpls-0.10.0.min.js │ └── visualize.js ├── tools.html └── visualize.html ├── src └── main │ ├── bin │ └── webapp.sh │ ├── resources │ └── application.conf │ └── scala │ └── org │ └── allenai │ └── nlpstack │ └── webapp │ ├── BasicService.scala │ ├── Nlpweb.scala │ ├── NlpwebActor.scala │ ├── ToolService.scala │ ├── VisualizationService.scala │ ├── Whatswrong.scala │ └── tools │ ├── ChunkerTool.scala │ ├── DependencyParserTool.scala │ ├── Impl.scala │ ├── LemmatizerTool.scala │ ├── PostaggerTool.scala │ ├── SentenceSegmenterTool.scala │ ├── TokenizerTool.scala │ └── Tool.scala └── webapp └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | .idea/ 4 | -------------------------------------------------------------------------------- /cli/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "nlpstack-cli" 4 | 5 | libraryDependencies ++= Seq( 6 | scopt, 7 | "com.typesafe.akka" %% "akka-actor" % defaultAkkaVersion, 8 | sprayCan, 9 | sprayRouting, 10 | typesafeConfig) 11 | 12 | fork in run := true 13 | 14 | javaOptions += "-Xmx8G" 15 | -------------------------------------------------------------------------------- /cli/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | nlpstack { 2 | tools { 3 | tokenizer { 4 | defaultPort = "14000" 5 | } 6 | segmenter { 7 | defaultPort = "14001" 8 | } 9 | postagger { 10 | defaultPort = "14002" 11 | } 12 | chunker { 13 | defaultPort = "14003" 14 | } 15 | dep-parser { 16 | defaultPort = "14004" 17 | } 18 | argumentheadextractor { 19 | defaultPort = "14005" 20 | } 21 | relationheadextractor { 22 | defaultPort = "14006" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /cli/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/ArgumentHeadExtractorMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.core.headword.HeadExtractor 5 | import org.allenai.nlpstack.headword.KnowitallHeadExtractor 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | import org.allenai.nlpstack.tokenize.defaultTokenizer 8 | 9 | abstract class ArgumentHeadExtractorMain extends LineProcessor("argumentheadextractor") { 10 | def tokenizer: Tokenizer 11 | def postagger: Postagger 12 | def headExtractor: HeadExtractor 13 | 14 | override def process(line: String) = { 15 | val headTokens = headExtractor.argumentHead(tokenizer, postagger)(line) 16 | Postagger.multilineStringFormat.write(headTokens) 17 | } 18 | } 19 | 20 | object 
KnowitallArgumentHeadExtractorMain extends ArgumentHeadExtractorMain { 21 | override val tokenizer = defaultTokenizer 22 | override val postagger = defaultPostagger 23 | override val headExtractor = new KnowitallHeadExtractor() 24 | } -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/ChunkerMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.chunk._ 5 | import org.allenai.nlpstack.postag._ 6 | import org.allenai.nlpstack.tokenize._ 7 | 8 | abstract class ChunkerMain 9 | extends LineProcessor("chunker") { 10 | def tokenizer: Tokenizer 11 | def postagger: Postagger 12 | def chunker: Chunker 13 | 14 | override def process(line: String) = { 15 | val chunkedTokens = chunker.chunk(tokenizer, postagger)(line) 16 | Chunker.multilineStringFormat.write(chunkedTokens) 17 | } 18 | 19 | override def init(config: Config) { 20 | // for timing purposes 21 | chunker.chunk(tokenizer, postagger)("I want to initialize the chunker.") 22 | } 23 | } 24 | 25 | object OpenNlpChunkerMain extends ChunkerMain { 26 | override lazy val tokenizer = defaultTokenizer 27 | override lazy val postagger = defaultPostagger 28 | override lazy val chunker = new OpenNlpChunker() 29 | } 30 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/ConstituencyParserMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core.ConstituencyParser 4 | import org.allenai.nlpstack.parse.FactorieParser 5 | import org.allenai.nlpstack.parse.PolytreeParser 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | import org.allenai.nlpstack.tokenize.defaultTokenizer 8 | 9 | abstract class ConstituencyParserMain extends LineProcessor("constit-parser") { 10 | def constituencyParser: ConstituencyParser 11 | override def process(line: String) = { 12 | constituencyParser.parse(line).toString 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/DependencyParserMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.parse._ 5 | import org.allenai.nlpstack.postag._ 6 | import org.allenai.nlpstack.tokenize._ 7 | 8 | abstract class DependencyParserMain extends LineProcessor("dep-parser") { 9 | def tokenizer: Tokenizer 10 | def postagger: Postagger 11 | def dependencyParser: DependencyParser 12 | 13 | override def init(config: Config) { 14 | // for timing purposes 15 | val tokens = tokenizer("I want to initialize the parser.") 16 | val postagged = postagger.postagTokenized(tokens) 17 | dependencyParser.dependencyGraphPostagged(postagged) 18 | } 19 | 20 | override def process(line: String) = { 21 | val tokens = tokenizer(line) 22 | val postagged = postagger.postagTokenized(tokens) 23 | val dgraph = dependencyParser.dependencyGraphPostagged(postagged) 24 | DependencyParser.multilineStringFormat.write((postagged, dgraph)) 25 | } 26 | } 27 | 28 | object FactorieParserMain extends DependencyParserMain { 29 | override lazy val tokenizer = defaultTokenizer 30 | override lazy val postagger = defaultPostagger 31 | 
override lazy val dependencyParser = new FactorieParser 32 | } 33 | 34 | object PolytreeParserMain extends DependencyParserMain { 35 | override lazy val tokenizer = defaultTokenizer 36 | override lazy val postagger = defaultPostagger 37 | override lazy val dependencyParser = new PolytreeParser 38 | } 39 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/PostaggerMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.tokenize._ 5 | import org.allenai.nlpstack.postag.FactoriePostagger 6 | import org.allenai.nlpstack.postag.StanfordPostagger 7 | 8 | abstract class PostaggerMain extends LineProcessor("postagger") { 9 | def tokenizer: Tokenizer 10 | def postagger: Postagger 11 | override def process(line: String) = { 12 | val postaggedTokens = postagger.postag(tokenizer)(line) 13 | Postagger.multilineStringFormat.write(postaggedTokens) 14 | } 15 | } 16 | 17 | object FactoriePostaggerMain extends PostaggerMain { 18 | override val tokenizer = defaultTokenizer 19 | override val postagger = new FactoriePostagger() 20 | } 21 | 22 | object StanfordPostaggerMain extends PostaggerMain { 23 | override val tokenizer = defaultTokenizer 24 | override val postagger = new StanfordPostagger() 25 | } -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/RelationHeadExtractorMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.core.headword.HeadExtractor 5 | import org.allenai.nlpstack.headword.KnowitallHeadExtractor 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | import org.allenai.nlpstack.tokenize.defaultTokenizer 8 | 9 | abstract class RelationHeadExtractorMain extends LineProcessor("relationheadextractor") { 10 | def tokenizer: Tokenizer 11 | def postagger: Postagger 12 | def headExtractor: HeadExtractor 13 | 14 | override def process(line: String) = { 15 | val headTokens = headExtractor.relationHead(tokenizer, postagger)(line) 16 | Postagger.multilineStringFormat.write(headTokens) 17 | } 18 | } 19 | 20 | object KnowitallRelationHeadExtractorMain extends RelationHeadExtractorMain { 21 | override val tokenizer = defaultTokenizer 22 | override val postagger = defaultPostagger 23 | override val headExtractor = new KnowitallHeadExtractor() 24 | } -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/SegmenterMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.segment.{ StanfordSegmenter, ChalkSentenceSegmenter, FactorieSegmenter } 5 | 6 | abstract class SegmenterMain 7 | extends LineProcessor("segmenter") { 8 | def sentencer: Segmenter 9 | override def process(line: String) = sentencer(line).map(_.text).mkString("\n") 10 | } 11 | 12 | object FactorieSegmenterMain extends SegmenterMain { 13 | override val sentencer = new FactorieSegmenter() 14 | } 15 | 16 | object StanfordSegmenterMain extends SegmenterMain { 17 | override val sentencer = StanfordSegmenter 18 | } 19 | 
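Most of the CLI mains in this directory follow the same pattern: compose a tokenizer (and, where needed, a postagger) with the wrapped tool, then serialize the result with that tool's multilineStringFormat. A minimal, self-contained sketch of the same pipeline used directly as a library (illustrative only, not a file in this repository; it assumes the tokenize, postag, chunk, and core modules are on the classpath):

import org.allenai.nlpstack.chunk.OpenNlpChunker
import org.allenai.nlpstack.core.Chunker
import org.allenai.nlpstack.postag.defaultPostagger
import org.allenai.nlpstack.tokenize.defaultTokenizer

object ChunkPipelineExample extends App {
  // The same steps ChunkerMain.process performs for each input line:
  // tokenize, postag, then chunk.
  val chunker = new OpenNlpChunker()
  val chunkedTokens =
    chunker.chunk(defaultTokenizer, defaultPostagger)("This is a test of the OpenNlp chunker.")
  // One token per output line, as the CLI prints it.
  println(Chunker.multilineStringFormat.write(chunkedTokens))
}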
-------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/SrlMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core.srl._ 4 | import org.allenai.nlpstack.core.DependencyParser 5 | 6 | abstract class SrlMain extends LineProcessor("srl") { 7 | def srl: Srl 8 | 9 | override def process(line: String) = { 10 | val (tokens, dgraph) = DependencyParser.multilineStringFormat.read(line) 11 | (srl(tokens, dgraph) map (_.serialize)).mkString("\n") 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/StemmerMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.lemmatize.MorphaStemmer 5 | 6 | abstract class StemmerMain 7 | extends LineProcessor("stemmer") { 8 | def lemmatizer: Stemmer 9 | override def process(line: String) = line.split("\\s+").map(lemmatizer.stem(_)).mkString(" ") 10 | } 11 | 12 | object MorphaStemmerMain extends StemmerMain { 13 | lazy val lemmatizer = new MorphaStemmer 14 | } 15 | -------------------------------------------------------------------------------- /cli/src/main/scala/org/allenai/nlpstack/cli/TokenizerMain.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.cli 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | import org.allenai.nlpstack.tokenize._ 5 | 6 | abstract class TokenizerMain extends LineProcessor("tokenizer") { 7 | def tokenizer: Tokenizer 8 | override def process(sentence: String) = 9 | Tokenizer.multilineStringFormat.write(tokenizer.tokenize(sentence)) 10 | } 11 | 12 | object FactorieTokenizerMain extends TokenizerMain { 13 | val tokenizer = new FactorieTokenizer() 14 | } 15 | 16 | object PennTokenizerMain extends TokenizerMain { 17 | val tokenizer = PennTokenizer 18 | } 19 | 20 | object WhitespaceTokenizerMain extends TokenizerMain { 21 | val tokenizer = WhitespaceTokenizer 22 | } 23 | 24 | object StanfordTokenizerMain extends TokenizerMain { 25 | val tokenizer = StanfordTokenizer 26 | } 27 | -------------------------------------------------------------------------------- /conf/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import org.allenai.plugins.CoreDependencies 3 | 4 | object Dependencies extends CoreDependencies { 5 | val datastore = "org.allenai" %% "datastore" % "1.0.2" 6 | 7 | val commonsIo = "commons-io" % "commons-io" % "2.4" 8 | 9 | val parserCombinators = "org.scala-lang.modules" %% "scala-parser-combinators" % "1.0.3" 10 | 11 | val clearGroup = "com.clearnlp" 12 | val clearVersion = "2.0.2" 13 | val clear = clearGroup % "clearnlp" % clearVersion 14 | val opennlp = ("org.apache.opennlp" % "opennlp-tools" % "1.5.3" 15 | exclude ("net.sf.jwordnet", "jwnl")) 16 | 17 | val stanfordCoreNlp = "edu.stanford.nlp" % "stanford-corenlp" % "3.4.1" 18 | 19 | val factorie = ("cc.factorie" %% 
"factorie" % "1.1.1" 20 | exclude ("junit", "junit") 21 | exclude ("commons-logging", "commons-logging")) 22 | val factorieWordnet = "cc.factorie.app.nlp" % "wordnet" % "1.0" 23 | 24 | val testingLibraries = Seq(allenAiTestkit % "test") 25 | 26 | val apache2 = "Apache 2.0 " -> url("http://www.opensource.org/licenses/bsd-3-clause") 27 | 28 | val loggingDependencies = Seq( 29 | Logging.slf4jApi, 30 | Logging.logbackCore, 31 | Logging.logbackClassic, 32 | "org.slf4j" % "jcl-over-slf4j" % Logging.slf4jVersion, 33 | "org.slf4j" % "log4j-over-slf4j" % Logging.slf4jVersion, 34 | "org.slf4j" % "jul-to-slf4j" % Logging.slf4jVersion 35 | ) 36 | 37 | val jVerbnet = "edu.mit" % "jverbnet" % "1.2.0.1" 38 | 39 | val jwiWordnet = "edu.mit" % "jwi" % "2.2.3" 40 | 41 | val reming = "com.github.jkinkead" %% "reming-json" % "0.0.9" 42 | 43 | val Overrides = loggingDependencies.toSet 44 | } 45 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.allenai.plugins" % "allenai-sbt-plugins" % "1.4.8") 2 | 3 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 4 | -------------------------------------------------------------------------------- /tools/chunk/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | libraryDependencies ++= loggingDependencies 4 | -------------------------------------------------------------------------------- /tools/chunk/conf/deploy.conf: -------------------------------------------------------------------------------- 1 | chunker = { 2 | project = { 3 | name = "tools-chunk" 4 | } 5 | 6 | deploy = { 7 | startup_script = "bin/chunker-server.sh" 8 | directory = "/local/deploy/tools/chunker" 9 | user.ssh_username = "ec2-user" 10 | } 11 | 12 | deploy.host = "nlp.allenai.org" 13 | } 14 | -------------------------------------------------------------------------------- /tools/chunk/src/main/scala/org/allenai/nlpstack/chunk/OpenNlpChunker.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.chunk 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.nlpstack.core.{ ChunkedToken, Chunker, PostaggedToken } 5 | 6 | import opennlp.tools.chunker.{ ChunkerME, ChunkerModel } 7 | 8 | class OpenNlpChunker extends Chunker { 9 | //Added ThreadLocal to prevent concurrency issues 10 | private final val chunker: ThreadLocal[ChunkerME] = new ThreadLocal[ChunkerME]() { 11 | override protected def initialValue(): ChunkerME = new ChunkerME(OpenNlpChunker.model) 12 | } 13 | 14 | def chunkPostagged(tokens: Seq[PostaggedToken]): Seq[ChunkedToken] = { 15 | // OpenNLP uses : as the postag for hyphens, but we use HYPH, so we change it back before 16 | // sending it to the chunker. 
17 | val fixedTokens = tokens.map { t => 18 | if (t.string == "-") PostaggedToken(t, ":") else t 19 | } 20 | 21 | val chunks = chunker.get().chunk(tokens.map(_.string).toArray, fixedTokens.map(_.postag) 22 | .toArray) 23 | (tokens zip chunks) map { case (token, chunk) => ChunkedToken(token, chunk) } 24 | } 25 | } 26 | 27 | object OpenNlpChunker { 28 | private val defaultModelName = "en-chunker.bin" 29 | private val model = Resource.using(this.getClass.getClassLoader.getResourceAsStream(defaultModelName)) { is => 30 | new ChunkerModel(is) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tools/chunk/src/main/scala/org/allenai/nlpstack/chunk/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.Chunker 4 | 5 | package object chunk { 6 | val defaultChunker: Chunker = new OpenNlpChunker 7 | } -------------------------------------------------------------------------------- /tools/chunk/src/main/universal/chunk-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.chunk.OpenNlpChunker" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx128M" 9 | 10 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/chunk/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/chunk/src/test/scala/org/allenai/nlpstack/chunk/OpenNlpChunkerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package chunk 3 | 4 | import org.allenai.common.testkit.UnitSpec 5 | import org.allenai.nlpstack.postag._ 6 | import org.allenai.nlpstack.tokenize._ 7 | 8 | class OpenNlpChunkerSpec extends UnitSpec { 9 | "chunker" should "correctly chunk an example sentence" in { 10 | val text = "This is a test of the OpenNlp chunker." 11 | val tokenizer = defaultTokenizer 12 | val postagger = new OpenNlpPostagger 13 | val chunker = new OpenNlpChunker 14 | 15 | val chunked = chunker.chunk(tokenizer, postagger)(text) 16 | chunked.mkString("; ") === "This 0 DT B-NP; is 5 VBZ B-VP; a 8 DT B-NP; test 10 NN I-NP; of 15 IN B-PP; the 18 DT B-NP; OpenNlp 22 NNP I-NP; chunker 30 NN I-NP; . 37 . O" 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /tools/core/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, University of Washington 2 | BSD 3-clause License / BSD Modified License / New BSD License 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 
9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the nor the 13 | names of its contributors may be used to endorse or promote products 14 | derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /tools/core/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "nlpstack-core" 4 | 5 | licenses := Seq(apache2) 6 | 7 | libraryDependencies ++= Seq( 8 | parserCombinators, 9 | // for remotes 10 | "net.databinder.dispatch" %% "dispatch-core" % "0.11.2") 11 | 12 | dependencyOverrides ++= Set( 13 | "org.scala-lang.modules" %% "scala-xml" % "1.0.2") 14 | 15 | libraryDependencies ++= loggingDependencies 16 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/ConstituencyParser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | /** A constituency parser turns a sentence into a constituency 4 | * tree, a structure that is somewhat like chunking but 5 | * hierarchical. 6 | */ 7 | trait ConstituencyParser { 8 | def parse(string: String): ParseTree 9 | } 10 | 11 | /** A representation of the constituency parse. */ 12 | abstract class ParseTree( 13 | val token: String, var index: Int, val children: Array[ParseTree] 14 | ) extends Iterable[ParseTree] { 15 | 16 | /** Prints the tree in Penn treebank format. 
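 * For example, a two-word sentence might render as (S (NP (PRP It)) (VP (VBZ works))),
 * while a leaf node renders as its bare token (illustrative; exact labels depend on the parser).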
*/ 17 | override def toString() = 18 | if (children.size == 0) { 19 | token 20 | } else { 21 | "(" + token + " " + children.map(child => child.toString).mkString(" ") + ")" 22 | } 23 | 24 | def value = token 25 | 26 | def iterator = { 27 | def preorder(node: ParseTree): List[ParseTree] = { 28 | node :: node.children.toList.flatMap(preorder(_)) 29 | } 30 | preorder(this).iterator 31 | } 32 | 33 | def print(): Unit = { 34 | def print(tree: ParseTree, indent: Int) { 35 | if (tree.children.isEmpty) { 36 | println(" " * indent + "(" + tree.token + ")") 37 | } else { 38 | println(" " * indent + "(" + tree.token) 39 | tree.children.foreach { tree => print(tree, indent + 2) } 40 | println(" " * indent + ")") 41 | } 42 | } 43 | 44 | print(this, 0) 45 | } 46 | 47 | def printDOT(writer: java.lang.Appendable) { 48 | def quote(string: String) = "\"" + string + "\"" 49 | def nodeString(node: ParseTree) = node.token 50 | val indent = " " * 2 51 | 52 | writer.append("digraph g {\n") 53 | 54 | for (node <- this) { 55 | val shape = node match { 56 | case node: ParseTreePhrase => "box" 57 | case node: ParseTreePostag => "invtriangle" 58 | case node: ParseTreeToken => "circle" 59 | } 60 | writer.append(indent + node.index + " [label=" + quote(nodeString(node)) + 61 | ", shape=" + quote(shape) + "]\n") 62 | } 63 | 64 | for (node <- this) { 65 | for (child <- node.children) { 66 | writer.append(indent + node.index.toString + " -> " + child.index.toString + "\n") 67 | } 68 | } 69 | writer.append("}") 70 | } 71 | } 72 | 73 | class ParseTreePhrase( 74 | token: String, index: Int, 75 | children: Array[ParseTree] 76 | ) extends ParseTree(token, index, children) 77 | 78 | class ParseTreePostag( 79 | token: String, 80 | index: Int, 81 | children: Array[ParseTree] 82 | ) extends ParseTree(token, index, children) 83 | 84 | class ParseTreeToken( 85 | token: String, 86 | index: Int, 87 | children: Array[ParseTree] 88 | ) extends ParseTree(token, index, children) 89 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/FactorieUtilities.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import java.util.regex.Matcher 4 | 5 | /** Shared utilities for making Factorie work. These are probably not generally 6 | * useful. 7 | */ 8 | object FactorieUtilities { 9 | // Factorie's tokenizer crashes on unclosed XML tags. To work around this, we 10 | // detect unclosed tags, and replace the opening < with a space. 11 | private val unclosedTagRegex = "<([^>]{100})".r 12 | def replaceUnclosedTag(s: String): String = { 13 | val replaced = unclosedTagRegex.replaceAllIn(s, m => Matcher.quoteReplacement(" " + m.group(1))) 14 | // Have to do this repeatedly for the case of "foo << barbarbarbar..." 
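    // (each replacement can expose a new match, e.g. the second '<' above once the first
    // has been blanked out, so we recurse until the string stops changing)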
15 | if (replaced == s) s else replaceUnclosedTag(replaced) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Format.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import scala.util.Try 4 | 5 | trait Writer[F, T] { 6 | def write(from: F): T 7 | } 8 | 9 | trait Reader[F, T] { 10 | def read(from: F): T 11 | def readTry(from: F): Try[T] = Try(this.read(from)) 12 | } 13 | 14 | trait Format[F, T] extends Writer[F, T] with Reader[T, F] { 15 | def roundtrip(f: F) = read(write(f)) 16 | def reverseRoundtrip(t: T) = write(read(t)) 17 | } 18 | 19 | object Format { 20 | object stringQuoter extends Quoter(Set('"')) 21 | 22 | class Quoter(val chars: Set[Char]) { 23 | def this(charString: String) = this(charString.toSet) 24 | 25 | def quote(s: String): String = { 26 | val escapedBackslashes = s.replace("\\", "\\\\") 27 | chars.foldLeft(escapedBackslashes)((unreplaced: String, char: Char) => 28 | unreplaced.replace(char.toString, "\\" + char)) 29 | } 30 | 31 | def unquote(s: String): String = { 32 | val escapedBackslashes = chars.foldLeft(s)((quoted: String, char: Char) => 33 | quoted.replace("\\" + char, char.toString)) 34 | escapedBackslashes.replace("\\\\", "\\") 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/HashCodeHelper.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | /** This object provides a function to generate a hash code 4 | * out of multiple hashable parts. 5 | * 6 | * @author Michael Schmitz 7 | */ 8 | object HashCodeHelper { 9 | def apply(parts: Any*): Int = this.apply(41)(parts: _*) 10 | def apply(prime: Int)(parts: Any*): Int = { 11 | var code = 0; 12 | for (part <- parts) { 13 | code = prime * code + part.hashCode 14 | } 15 | 16 | code 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/IdentityStemmer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | /** A trivial stemmer that doesn't apply a stemming algorithm. */ 4 | object IdentityStemmer extends Stemmer { 5 | override def stem(word: String) = word 6 | 7 | implicit def instance: Stemmer = IdentityStemmer 8 | } 9 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Lemmatized.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | case class Lemmatized[+T <: Token](token: T, lemma: String) 4 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Segmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.immutable.Interval 4 | 5 | import spray.json.DefaultJsonProtocol._ 6 | 7 | /** A sentencer breaks text into sentences. 
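 * For example, segmenting "He left. She stayed." would typically yield two Segments,
 * serialized as "He left.@0" and "She stayed.@9", where the number is the character
 * offset of each sentence in the original document.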
8 | */ 9 | abstract class Segmenter { 10 | def apply(document: String) = segment(document) 11 | 12 | def segmentTexts(document: String) = { 13 | this.segment(document).map(_.text) 14 | } 15 | 16 | def segment(document: String): Iterable[Segment] 17 | } 18 | 19 | case class Segment(text: String, offset: Int) { 20 | override def toString = serialize 21 | 22 | def interval = Interval.open(offset, offset + text.length) 23 | def length = text.length 24 | 25 | def serialize = text + "@" + offset 26 | } 27 | 28 | object Segment { 29 | private[this] val segmentRegex = """(.+)@(\d+)""".r 30 | def deserialize(pickled: String): Segment = { 31 | pickled match { 32 | case segmentRegex(string, offset) => new Segment(string, offset.toInt) 33 | case s => throw new MatchError("Could not deserialize: " + s) 34 | } 35 | } 36 | 37 | implicit val segmentJsonFormat = jsonFormat2(Segment.apply) 38 | } 39 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Stemmer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | /** A stemmer takes a string token and produces a normalized form. */ 4 | abstract class Stemmer { 5 | def apply(word: String) = lemmatize(word) 6 | 7 | /** Apply the stemming algorithm. */ 8 | def stem(word: String): String 9 | 10 | /** Stem a token without a postag. */ 11 | def stemToken[T <: Token](token: T) = Lemmatized(token, this.stem(token.string)) 12 | 13 | /** Apply the normalizing algorithm and then the stemming algorithm. */ 14 | def lemmatize(word: String) = this.stem(Stemmer.normalize(word)) 15 | 16 | /** Lemmatize a token without a postag. */ 17 | def lemmatizeToken[T <: Token](token: T) = Lemmatized(token, this.lemmatize(token.string)) 18 | } 19 | 20 | trait PostaggedStemmer { 21 | /** Some stemmers can take advantage of postags. */ 22 | def stem(word: String, postag: String): String 23 | 24 | /** Apply the normalizing algorithm and then the stemming algorithm with postag. */ 25 | def lemmatize(word: String, postag: String) = this.stem(Stemmer.normalize(word), postag) 26 | 27 | /** Stem a token with a postag. */ 28 | def stemPostaggedToken[T <: PostaggedToken](token: T): Lemmatized[T] = 29 | Lemmatized(token, this.stem(token.string, token.postag)) 30 | 31 | /** Lemmatize a token with a postag. */ 32 | def lemmatizePostaggedToken[T <: PostaggedToken](token: T): Lemmatized[T] = 33 | Lemmatized(token, this.lemmatize(token.string, token.postag)) 34 | } 35 | 36 | object Stemmer { 37 | /** Special characters to remove. */ 38 | val remove = """[()\[\].,;:"']""".r; 39 | 40 | /** Remove special characters and lowercase the string. */ 41 | def normalize(word: String) = Stemmer.remove.replaceAllIn( 42 | word.trim.toLowerCase, "" 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/Token.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.immutable.Interval 4 | 5 | import spray.json._ 6 | 7 | /** The most simple representation of a token. A token has a string 8 | * and a character offset in the original text. 
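 * For example, in the text "Hello world" the token "world" has offset 6, and
 * Token.stringFormat writes it as "world 6".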
9 | * 10 | * @param string the string of the token 11 | * @param offset the character offset of the token in the source sentence 12 | */ 13 | class Token(val string: String, val offset: Int) { 14 | override def toString = Token.stringFormat.write(this) 15 | 16 | override def hashCode = HashCodeHelper(this.string, this.offset) 17 | def canEqual(that: Token) = that.isInstanceOf[Token] 18 | override def equals(that: Any) = that match { 19 | case that: Token => (that canEqual this) && 20 | this.string == that.string && 21 | this.offset == that.offset 22 | case _ => false 23 | } 24 | 25 | @deprecated("Use offsets instead.", "2.4.0") 26 | def interval = offsets 27 | 28 | def offsets = Interval.open(offset, offset + string.length) 29 | } 30 | 31 | object Token { 32 | def apply(string: String, offset: Int) = new Token(string, offset) 33 | def unapply(token: Token): Option[(String, Int)] = Some((token.string, token.offset)) 34 | 35 | object stringFormat extends Format[Token, String] { 36 | val tokenRegex = "(.*?) +([^ ]*)".r 37 | def write(token: Token): String = token.string + " " + token.offset 38 | def read(pickled: String): Token = { 39 | val (string, offset) = pickled match { 40 | case tokenRegex(string, offset) => (string, offset.toInt) 41 | case _ => throw new MatchError("Error parsing token: " + pickled) 42 | } 43 | Token(string, offset) 44 | } 45 | } 46 | 47 | implicit object tokenJsonFormat extends RootJsonFormat[Token] { 48 | def write(t: Token) = JsObject( 49 | "string" -> JsString(t.string), 50 | "offset" -> JsNumber(t.offset) 51 | ) 52 | 53 | def read(value: JsValue) = value.asJsObject.getFields("string", "offset") match { 54 | case Seq(JsString(string), JsNumber(offset)) => 55 | Token.apply(string, offset.toInt) 56 | case _ => throw new DeserializationException("Token expected.") 57 | } 58 | } 59 | 60 | def rebuildString(tokens: Iterable[Token]): String = { 61 | val str = new StringBuilder 62 | for (token <- tokens) { 63 | if (str.length < token.offset) { 64 | str.append(" " * (token.offset - str.length)) 65 | } 66 | str.replace(token.offset, token.offset + token.string.length, token.string) 67 | } 68 | str.mkString 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/ConfidenceFunction.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | import org.allenai.common.Resource.using 4 | 5 | import java.io.{ BufferedOutputStream, File, FileOutputStream, OutputStream } 6 | 7 | /** A confidence function for ranking how likely an extraction is correct. 8 | * 9 | * @tparam E the extraction to rank 10 | * @param featureSet the features to use 11 | */ 12 | abstract class ConfidenceFunction[E](val featureSet: FeatureSet[E, Double]) 13 | extends Function[E, Double] { 14 | def apply(that: E): Double 15 | 16 | def save(output: OutputStream): Unit 17 | def saveFile(file: File) { 18 | using(new BufferedOutputStream(new FileOutputStream(file))) { stream => 19 | this.save(stream) 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/ConfidenceTrainer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | /** A trainer for a confidence function. 
4 | * 5 | * @tparam E the extraction the confidence function will rank 6 | * @param featureSet the features to use 7 | */ 8 | abstract class ConfidenceTrainer[E](features: FeatureSet[E, Double]) 9 | extends Trainer[E, Double](features) { 10 | override val apply = train _ 11 | override def train(examples: Iterable[Labelled[E]]): ConfidenceFunction[E] 12 | } 13 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/Feature.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | import scala.language.implicitConversions 4 | 5 | /** An abstract representation for a feature used by the 6 | * confidence function. 7 | * 8 | * @param name a human-readable name for this feature 9 | */ 10 | abstract class Feature[E, V](val name: String) extends Function[E, V] { 11 | def apply(that: E): V 12 | } 13 | 14 | object Feature { 15 | /** A convenience factory method for creating a Feature from 16 | * an anonymous function. 17 | */ 18 | def from[E, V](name: String, f: E => V) = new Feature[E, V](name) { 19 | override def apply(that: E): V = f(that) 20 | } 21 | 22 | implicit def booleanToDouble[E](feature: Feature[E, Boolean]) = 23 | new Feature[E, Double](feature.name) { 24 | override def apply(item: E) = { 25 | if (feature(item)) { 26 | 1.0 27 | } else { 28 | 0.0 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/FeatureSet.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | import scala.collection.immutable.SortedMap 4 | 5 | /** FeatureSet represents a set of features on T that can be 6 | * represented as a double. 7 | * 8 | * @param featureMap a lookup for the features 9 | */ 10 | class FeatureSet[T, V](val featureMap: SortedMap[String, Feature[T, V]]) { 11 | def this() = this(SortedMap.empty[String, Feature[T, V]]) 12 | 13 | def apply(name: String) = featureMap(name) 14 | 15 | def featureNames(): Seq[String] = 16 | featureMap.keys.toSeq 17 | 18 | def numFeatures(): Int = 19 | featureNames.size 20 | 21 | def vectorize(example: T): Seq[V] = 22 | featureNames.map({ name => 23 | val featureFunction = featureMap(name) 24 | featureFunction(example) 25 | })(scala.collection.breakOut) 26 | } 27 | 28 | object FeatureSet { 29 | val binaryClass = true 30 | 31 | def apply[T, V](features: Iterable[Feature[T, V]]): FeatureSet[T, V] = { 32 | new FeatureSet[T, V](SortedMap.empty[String, Feature[T, V]] ++ 33 | features.map(feature => (feature.name, feature))) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/Labelled.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | /** A representation of a labelled extraction. 
4 | * 5 | * @param label whether this extraction is true or false 6 | * @param item the item labelled 7 | */ 8 | case class Labelled[E](label: Boolean, item: E) 9 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/conf/Trainer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.conf 2 | 3 | abstract class Trainer[E, V](val features: FeatureSet[E, V]) { 4 | val apply = train _ 5 | def train(examples: Iterable[Labelled[E]]): Function[E, V] 6 | } 7 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/coref/CorefResolver.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.coref 2 | 3 | import org.allenai.nlpstack.core.Format.Quoter 4 | import org.allenai.nlpstack.core.{ PostaggedToken, Format, Token } 5 | import org.allenai.nlpstack.core.parse.graph.{ DependencyNode, DependencyGraph } 6 | 7 | import scala.util.matching.Regex 8 | import java.util.regex.Pattern 9 | 10 | case class Referent( 11 | val references: Seq[DependencyNode], 12 | val mainReference: Option[DependencyNode] 13 | ) 14 | 15 | abstract class CorefResolver[T <: Token] { 16 | def resolveCoreferences(postaggedParse: (Seq[T], DependencyGraph)): Seq[Referent] 17 | } 18 | 19 | object CorefResolver { 20 | object multilineStringFormat extends StringFormat("\n") 21 | object singlelineStringFormat extends StringFormat(";") 22 | 23 | class StringFormat(val separator: String) 24 | extends Format[(DependencyGraph, Seq[Referent]), String] { 25 | private val dgraphStringFormat = new DependencyGraph.StringFormat(separator) 26 | 27 | private val regex = 28 | new Regex("""^\((.*[^)])\)( refers to (.*))?$""", "list", "_", "mainRef") 29 | 30 | override def read(from: String): (DependencyGraph, Seq[Referent]) = { 31 | val parts = from.split(Pattern.quote(separator * 2), 2) 32 | require(parts.length == 2) 33 | val (dgraphString, corefString) = (parts(0), parts(1)) 34 | 35 | val dgraph = dgraphStringFormat.read(dgraphString) 36 | 37 | val coref = corefString.split(Pattern.quote(separator)).map(s => { 38 | val m = regex.findFirstMatchIn(s) 39 | require(m.isDefined) 40 | val stringReferences = m.get.group("list").split(Pattern.quote(", ")) 41 | val references = stringReferences map DependencyNode.stringFormat.read 42 | 43 | val mainReference = m.get.group("mainRef") match { 44 | case null => None 45 | case mainRefString => Some(DependencyNode.stringFormat.read(mainRefString)) 46 | } 47 | 48 | Referent(references, mainReference) 49 | }) 50 | 51 | (dgraph, coref) 52 | } 53 | 54 | override def write(from: (DependencyGraph, Seq[Referent])): String = { 55 | val (dgraph, coref) = from 56 | dgraphStringFormat.write(dgraph) + 57 | separator + 58 | separator + 59 | coref.map(r => 60 | "(%s)".format(r.references.mkString(", ")) + (r.mainReference match { 61 | case None => "" 62 | case Some(node) => " refers to %s".format(node) 63 | })).mkString(separator) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/graph/Bipath.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.graph 2 | 3 | import org.allenai.nlpstack.core.graph.Graph._ 4 | 5 | /** A representation of a path 
through a graph. The path is represented 6 | * by a list of directed edges. 7 | * 8 | * @author Michael Schmitz 9 | */ 10 | class Bipath[T](val path: List[DirectedEdge[T]]) { 11 | require(path != null) 12 | 13 | // extend Object 14 | override def toString = "[" + path.mkString(", ") + "]"; 15 | def canEqual(that: Any) = that.isInstanceOf[Bipath[_]] 16 | override def equals(that: Any) = that match { 17 | case that: Bipath[_] => (that canEqual this) && that.path == this.path 18 | case _ => false 19 | } 20 | override def hashCode = 37 * (path.hashCode + 1) 21 | 22 | /** the undirected edges of the path */ 23 | def edges: Set[Edge[T]] = path.foldRight[Set[Edge[T]]](Set()) { 24 | case (item, set) => set + item.edge 25 | } 26 | 27 | /** the unique vertices along the path */ 28 | def nodes: List[T] = path.head.start :: path.map(_.end) 29 | 30 | /** the first vertex in the path */ 31 | def start: T = path.head.start 32 | 33 | /** collapse edges in the path that match `pred` */ 34 | def collapse(pred: Edge[T] => Boolean, merge: (T, T) => T) = { 35 | if (path.forall(dep => pred(dep.edge))) { 36 | this 37 | } else { 38 | val array = path.toArray 39 | for (i <- array.indices) { 40 | val current = array(i) 41 | if (pred(current.edge)) { 42 | // TODO: sorted 43 | val merged = merge(current.start, current.end) 44 | if (current.isInstanceOf[UpEdge[_]]) { 45 | if (array.indices contains (i + 1)) { 46 | array(i + 1) = array(i + 1).switchStart(merged) 47 | } 48 | 49 | if (array.indices contains (i - 1)) { 50 | array(i - 1) = array(i - 1).switchEnd(merged) 51 | } 52 | } else if (current.isInstanceOf[DownEdge[_]]) { 53 | if (array.indices contains (i + 1)) { 54 | array(i + 1).switchStart(merged) 55 | } 56 | 57 | if (array.indices contains (i - 1)) { 58 | array(i - 1) = array(i - 1).switchEnd(merged) 59 | } 60 | } 61 | } 62 | } 63 | 64 | new Bipath(array.filter(dep => !pred(dep.edge)).toList) 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/graph/pattern/Match.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.graph.pattern 2 | 3 | import org.allenai.nlpstack.core.graph.{ Bipath, DirectedEdge, Graph } 4 | 5 | /** A representation of a match of a pattern in a graph. 
6 | * 7 | * @author Michael Schmitz 8 | */ 9 | class Match[T]( 10 | /** the pattern that was applied */ 11 | val pattern: Pattern[T], 12 | /** the matched path through the graph */ 13 | val bipath: Bipath[T], 14 | /** the pattern groups in the match */ 15 | val nodeGroups: Map[String, Match.NodeGroup[T]], 16 | val edgeGroups: Map[String, Match.EdgeGroup[T]] 17 | ) { 18 | // extend Object 19 | override def toString = bipath.toString + ": " + nodeGroups.toString + " and " + 20 | edgeGroups.toString 21 | 22 | def groups: Map[String, Match.Group] = nodeGroups ++ edgeGroups 23 | 24 | def nodes: Iterable[T] = bipath.nodes 25 | def edges: Iterable[Graph.Edge[T]] = bipath.edges 26 | } 27 | 28 | object Match { 29 | sealed abstract class Group(val text: String) 30 | case class NodeGroup[T](node: T, matchText: String) extends Group(matchText) 31 | case class EdgeGroup[T](dedge: DirectedEdge[T], matchText: String) extends Group(matchText) 32 | } 33 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/headword/HeadExtractor.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.headword 2 | 3 | import org.allenai.nlpstack.core.{ PostaggedToken, Postagger, Tokenizer } 4 | 5 | trait HeadExtractor { 6 | 7 | /** Given a string representing a relation, will return those 8 | * tokens comprising the headword(s) of the relation, possibly empty if 9 | * headword(s) couldn't be determined. 10 | */ 11 | def relationHead( 12 | tokenizer: Tokenizer, postagger: Postagger 13 | )(relation: String): Seq[PostaggedToken] 14 | 15 | /** Given a string representing an argument, will return those 16 | * tokens comprising the headword(s) of the relation, possibly empty if 17 | * headword(s) couldn't be determined. 18 | */ 19 | def argumentHead( 20 | tokenizer: Tokenizer, postagger: Postagger 21 | )(argument: String): Seq[PostaggedToken] 22 | 23 | /** Given a Seq[PostaggedToken] representing a relation, will return those 24 | * tokens comprising the headword(s) of the relation, possibly empty if 25 | * headword(s) couldn't be determined. 26 | */ 27 | def relationHead(tokens: Seq[PostaggedToken]): Seq[PostaggedToken] 28 | 29 | /** Given a Seq[PostaggedToken] representing an argument, will return those 30 | * tokens comprising the headword(s) of the argument, possibly empty if 31 | * headword(s) couldn't be determined. 
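 * (For a noun phrase such as "the big red barn" the expected head would be the single
 * token "barn"; the exact result depends on the extractor implementation.)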
32 | */ 33 | def argumentHead(tokens: Seq[PostaggedToken]): Seq[PostaggedToken] 34 | } 35 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/Dependency.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse.graph 2 | 3 | import org.allenai.nlpstack.core.Format 4 | 5 | import scala.util.matching.Regex 6 | 7 | object Dependency { 8 | val Serialized = 9 | new Regex("""(\p{Graph}+)\(\s*(\p{Graph}*?-\d\d*?),\s*(\p{Graph}*?-\d\d*)\s*\)""") 10 | 11 | implicit object DependencyOrdering extends Ordering[Dependency] { 12 | def compare(a: Dependency, b: Dependency) = { 13 | def tupled(x: Dependency) = (x.dest.id, x.source.id) 14 | 15 | implicitly[Ordering[(Int, Int)]].compare(tupled(a), tupled(b)) 16 | } 17 | } 18 | 19 | object stringFormat extends Format[Dependency, String] { 20 | def write(dep: Dependency): String = { 21 | dep.label + "(" + DependencyNode.stringFormat.write(dep.source) + ", " + 22 | DependencyNode.stringFormat.write(dep.dest) + ")" 23 | } 24 | 25 | def read(pickled: String): Dependency = try { 26 | val Serialized(label, source, dest) = pickled 27 | new Dependency( 28 | DependencyNode.stringFormat.read(source), 29 | DependencyNode.stringFormat.read(dest), 30 | label 31 | ) 32 | } catch { 33 | case e: Throwable => 34 | throw new Dependency.SerializationException( 35 | "could not deserialize dependency: " + pickled, 36 | e 37 | ) 38 | } 39 | } 40 | 41 | @deprecated("Use stringFormat instead.", "2.4.5") 42 | def deserialize(string: String) = stringFormat.read(string) 43 | 44 | class SerializationException(message: String, cause: Throwable) 45 | extends RuntimeException(message, cause) 46 | } 47 | 48 | object Dependencies { 49 | def serialize(deps: Iterable[Dependency]) = { 50 | deps.iterator.map { 51 | Dependency.stringFormat.write(_) 52 | }.mkString("; ") 53 | } 54 | def deserialize(string: String): Seq[Dependency] = string.split("""\s*(?:;|\n)\s*"""). 55 | map(Dependency.stringFormat.read(_)) 56 | 57 | object DependencyOrdering extends Ordering[Dependency] { 58 | def compare(a: Dependency, b: Dependency) = { 59 | def tuplize(dep: Dependency) = 60 | (dep.source.id, dep.dest.id, dep.label) 61 | implicitly[Ordering[(Int, Int, String)]].compare(tuplize(a), tuplize(b)) 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/DependencyNode.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse.graph 2 | 3 | import org.allenai.nlpstack.core.Format 4 | 5 | import spray.json.DefaultJsonProtocol._ 6 | 7 | import scala.util.matching.Regex 8 | 9 | /** A representation for a node in the graph of dependencies. A node 10 | * represents one or more adjacent tokens in the source sentence. 
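
A small usage sketch for the string serialization defined below; the behaviour matches the round-trip checks in DependencyNodeSpec later in this listing:

import org.allenai.nlpstack.core.parse.graph.DependencyNode

val node = DependencyNode(9, "reflection")
val pickled = DependencyNode.stringFormat.write(node) // "reflection-9"
assert(DependencyNode.stringFormat.read(pickled) == node)
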
11 | */ 12 | case class DependencyNode(val id: Int, val string: String) { 13 | require(string != null) 14 | 15 | // extend Object 16 | override def toString() = s"$string-$id" 17 | } 18 | 19 | object DependencyNode { 20 | implicit object DependencyNodeOrdering extends Ordering[DependencyNode] { 21 | def compare(a: DependencyNode, b: DependencyNode) = a.id compare b.id 22 | } 23 | 24 | object stringFormat extends Format[DependencyNode, String] { 25 | val Serialized = new Regex("""(\p{Graph}*?)-(\d\d*)""") 26 | def write(node: DependencyNode): String = { 27 | val cleanText = node.string.replaceAll("[[_()][^\\p{Graph}]]", "") 28 | Iterator(cleanText, node.id).mkString("-") 29 | } 30 | 31 | def read(pickled: String): DependencyNode = { 32 | val (text, id) = pickled match { 33 | case Serialized(text, id) => (text, id) 34 | case _ => throw new MatchError("Could not split pickled node into parts: " + pickled) 35 | } 36 | 37 | new DependencyNode(id.toInt, text) 38 | } 39 | } 40 | 41 | implicit val dependencyNodeJsonFormat = jsonFormat2(DependencyNode.apply) 42 | 43 | @deprecated("Use StringFormat instead.", "2.4.5") 44 | def deserialize(string: String) = { 45 | stringFormat.read(string) 46 | } 47 | 48 | class SerializationException(message: String, cause: Throwable) 49 | extends RuntimeException(message, cause) 50 | } 51 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/JoinedDependencyNode.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse.graph 2 | 3 | import org.allenai.common.immutable.Interval 4 | import org.allenai.nlpstack.core.graph.Graph 5 | 6 | /** A representation for a node in the graph of dependencies. A node 7 | * represents one or more adjacent tokens in the source sentence. 8 | */ 9 | case class JoinedDependencyNode(val ids: Seq[Int], val strings: Seq[String]) { 10 | require(!ids.isEmpty) 11 | require(!strings.isEmpty) 12 | 13 | def string = strings.mkString(" ") 14 | 15 | def span = Interval.closed(ids.min, ids.max) 16 | 17 | // extend Object 18 | override def toString() = s"${strings.mkString(" ")}-${ids.mkString(",")}" 19 | } 20 | 21 | object JoinedDependencyNode { 22 | def from(node: DependencyNode) = JoinedDependencyNode(Seq(node.id), Seq(node.string)) 23 | 24 | /** Merge nodes that correspond to adjacent tokens. 
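
Before the merge helper documented below, a quick sketch of the accessors defined on JoinedDependencyNode above (ids and strings must be non-empty):

import org.allenai.nlpstack.core.parse.graph.{ DependencyNode, JoinedDependencyNode }

val joined = JoinedDependencyNode(Seq(6, 7), Seq("New", "York"))
joined.string   // "New York"
joined.span     // Interval.closed(6, 7)
joined.toString // "New York-6,7"

// Lift a single DependencyNode into a JoinedDependencyNode.
JoinedDependencyNode.from(DependencyNode(3, "city"))
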
25 | * 26 | * @throws IllegalArgumentException there is no superior of the set 27 | * @return the superior node of the set 28 | */ 29 | implicit def directedMerge( 30 | graph: Graph[JoinedDependencyNode] 31 | )(nodes: Traversable[JoinedDependencyNode]) = { 32 | if (nodes.isEmpty) throw new IllegalArgumentException("argument nodes empty") 33 | val sorted = nodes.toList.sortBy(_.span) 34 | val strings = sorted.map(_.string) 35 | 36 | // ensure the nodes are adjacent in the source sentence 37 | // or at least that the spans are 38 | val spans = sorted.map(_.span) 39 | if (!(Interval.span(spans) forall (point => spans.exists(span => span contains point)))) { 40 | throw new IllegalArgumentException("A set of non-adjacent intervals cannot be merged: " + 41 | nodes.mkString(", ")) 42 | } 43 | 44 | new JoinedDependencyNode(sorted.flatMap(_.ids).sorted, strings) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/TokenDependencyNode.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse.graph 2 | 3 | import org.allenai.nlpstack.core._ 4 | 5 | /** A representation for a node in the graph of dependencies. A node 6 | * represents one or more adjacent tokens in the source sentence. 7 | */ 8 | case class TokenDependencyNode(val id: Int, val lemmatizedToken: Lemmatized[PostaggedToken]) { 9 | def string = token.string 10 | def postag = token.postag 11 | def lemma = lemmatizedToken.lemma 12 | 13 | def token: PostaggedToken = lemmatizedToken.token 14 | 15 | // extend Object 16 | override def toString() = s"$string-$id" 17 | } 18 | 19 | object TokenDependencyNode { 20 | def from(tokens: Seq[Lemmatized[PostaggedToken]])(node: DependencyNode) = 21 | TokenDependencyNode(node.id, tokens(node.id)) 22 | } 23 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/parse/graph/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.parse 2 | 3 | import org.allenai.nlpstack.core.graph.Graph.Edge 4 | 5 | package object graph { 6 | type Dependency = Edge[DependencyNode] 7 | } 8 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/remote/Remote.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.remote 2 | 3 | import dispatch.{ Http, as, url } 4 | 5 | import scala.concurrent.duration.DurationInt 6 | import scala.concurrent.{ Await, ExecutionContext } 7 | 8 | trait Remote { 9 | def urlString: String 10 | def timeout = 5.minutes 11 | 12 | val svc = url(urlString) 13 | 14 | def post(string: String)(implicit executor: ExecutionContext) = 15 | Await.result(Http(svc << string OK as.String), timeout) 16 | } 17 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/remote/RemoteDependencyParser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.remote 2 | 3 | import org.allenai.nlpstack.core.DependencyParser 4 | 5 | import scala.concurrent.ExecutionContext 6 | 7 | class RemoteDependencyParser( 8 | val urlString: String 9 | )(implicit executionContext: ExecutionContext) extends 
Remote { 10 | def dependencyGraph(sentence: String) = { 11 | val response = post(sentence) 12 | 13 | DependencyParser.multilineStringFormat.read(response) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/remote/RemoteSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.remote 2 | 3 | import org.allenai.nlpstack.core.{ Segment, Segmenter } 4 | 5 | import scala.concurrent.ExecutionContext 6 | 7 | class RemoteSegmenter( 8 | val urlString: String 9 | )(implicit executionContext: ExecutionContext) extends Segmenter with Remote { 10 | def segment(sentence: String) = { 11 | val response = this.post(sentence) 12 | response.split("\\n").map(Segment.deserialize)(scala.collection.breakOut) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/remote/RemoteStemmer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.remote 2 | 3 | import org.allenai.nlpstack.core.Stemmer 4 | 5 | import scala.concurrent.ExecutionContext 6 | 7 | class RemoteStemmer( 8 | val urlString: String 9 | )(implicit executionContext: ExecutionContext) extends Stemmer with Remote { 10 | override def stem(word: String) = { 11 | post(word) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Chunks.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.common.immutable.Interval 4 | import org.allenai.nlpstack.core.ChunkedToken 5 | 6 | trait ChunksSupertrait extends PostagsSupertrait { 7 | this: Sentence => 8 | 9 | type token <: ChunkedToken 10 | 11 | def chunks: Seq[String] = tokens.map(_.chunk) 12 | def chunkIntervals: Seq[(String, Interval)] = 13 | org.allenai.nlpstack.core.Chunker.intervals(tokens) 14 | } 15 | 16 | trait Chunks extends ChunksSupertrait { 17 | this: Sentence => 18 | 19 | type token = ChunkedToken 20 | } 21 | 22 | trait Chunker extends Chunks { 23 | this: Sentence => 24 | 25 | def tokenizer: org.allenai.nlpstack.core.Tokenizer 26 | def postagger: org.allenai.nlpstack.core.Postagger 27 | def chunker: org.allenai.nlpstack.core.Chunker 28 | 29 | override lazy val tokens: Seq[ChunkedToken] = 30 | chunker.chunk(tokenizer, postagger)(this.text) 31 | } 32 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Document.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | class Document(val text: String) { 4 | override def toString = { 5 | if (text.length > 80) { 6 | s"Document(${text.take(80) + "..."})" 7 | } else { 8 | s"Document($text)" 9 | } 10 | } 11 | 12 | def canEqual(that: Document) = that.isInstanceOf[Document] 13 | override def equals(that: Any) = that match { 14 | case that: Document => (that canEqual this) && this.text == that.text 15 | } 16 | override def hashCode = text.hashCode 17 | } 18 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Lemmas.scala: 
-------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.nlpstack.core.Stemmer 4 | 5 | trait Lemmas { 6 | tokenized: TokensSupertrait => 7 | 8 | def lemmatizedTokens: Seq[org.allenai.nlpstack.core.Lemmatized[token]] 9 | } 10 | 11 | trait Lemmatizer extends Lemmas { 12 | tokenized: TokensSupertrait => 13 | 14 | def lemmatizer: Stemmer 15 | 16 | override lazy val lemmatizedTokens: Seq[org.allenai.nlpstack.core.Lemmatized[token]] = 17 | tokenized.tokens map lemmatizer.lemmatizeToken 18 | } 19 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Postags.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.nlpstack.core.PostaggedToken 4 | 5 | trait PostagsSupertrait extends TokensSupertrait { 6 | this: Sentence => 7 | 8 | type token <: PostaggedToken 9 | 10 | def postags: Seq[String] = tokens.map(_.postag) 11 | } 12 | 13 | trait Postags extends PostagsSupertrait { 14 | this: Sentence => 15 | 16 | type token = PostaggedToken 17 | } 18 | 19 | trait Postagger extends Postags { 20 | this: Sentence => 21 | def tokenizer: org.allenai.nlpstack.core.Tokenizer 22 | def postagger: org.allenai.nlpstack.core.Postagger 23 | 24 | override lazy val tokens: Seq[PostaggedToken] = 25 | postagger.postag(tokenizer)(this.text) 26 | } 27 | 28 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Sentence.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | class Sentence(val text: String) { 4 | override def toString = s"Sentence($text)" 5 | 6 | def canEqual(that: Sentence) = that.isInstanceOf[Sentence] 7 | override def equals(that: Any) = that match { 8 | case that: Sentence => (that canEqual this) && this.text == that.text 9 | } 10 | override def hashCode = text.hashCode 11 | } 12 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Sentenced.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.nlpstack.core.Segmenter 4 | 5 | case class DocumentSentence[S <: Sentence](sentence: S, offset: Int) 6 | 7 | trait Sentenced[S <: Sentence] { 8 | this: Document => 9 | 10 | def sentences: Stream[DocumentSentence[S]] 11 | } 12 | 13 | trait Sentencer[S <: Sentence] extends Sentenced[S] { 14 | this: Document => 15 | 16 | def constructor(text: String): S 17 | def sentencer: Segmenter 18 | 19 | override lazy val sentences: Stream[DocumentSentence[S]] = 20 | sentencer(text).toStream.map { segment => 21 | DocumentSentence(constructor(segment.text), segment.offset) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/repr/Tokens.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.repr 2 | 3 | import org.allenai.nlpstack.core.Token 4 | 5 | trait TokensSupertrait { 6 | this: Sentence => 7 | type token <: Token 8 | 9 | def tokens: Seq[token] 10 | 11 | def strings: Seq[String] = tokens.map(_.string) 12 | } 13 | 14 | trait 
Tokens extends TokensSupertrait { 15 | this: Sentence => 16 | type token = Token 17 | } 18 | 19 | trait Tokenizer extends Tokens { 20 | this: Sentence => 21 | 22 | def tokenizer: org.allenai.nlpstack.core.Tokenizer 23 | 24 | override lazy val tokens: Seq[Token] = 25 | tokenizer.tokenize(text) 26 | } 27 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/srl/RemoteSrl.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.srl 2 | 3 | import org.allenai.nlpstack.core.DependencyParser 4 | import org.allenai.nlpstack.core.parse.graph._ 5 | import org.allenai.nlpstack.core.PostaggedToken 6 | import org.allenai.nlpstack.core.remote.Remote 7 | 8 | import scala.concurrent.ExecutionContext 9 | 10 | class RemoteSrl(val urlString: String)(implicit executionContext: ExecutionContext) 11 | extends Srl with Remote { 12 | def apply(tokens: Seq[PostaggedToken], dgraph: DependencyGraph) = { 13 | val response = this.post(DependencyParser.multilineStringFormat.write(tokens -> dgraph)) 14 | if (response.isEmpty) { 15 | Seq.empty 16 | } else { 17 | response.split("\\n").map(Frame.deserialize(dgraph))(scala.collection.breakOut) 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/srl/Srl.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.srl 2 | 3 | import org.allenai.nlpstack.core.parse.graph.DependencyGraph 4 | import org.allenai.nlpstack.core.PostaggedToken 5 | 6 | abstract class Srl { 7 | def apply(tokens: Seq[PostaggedToken], graph: DependencyGraph): Seq[Frame] 8 | } 9 | -------------------------------------------------------------------------------- /tools/core/src/main/scala/org/allenai/nlpstack/core/typer/Typer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core.typer 2 | 3 | import org.allenai.common.immutable.Interval 4 | import org.allenai.nlpstack.core.Token 5 | 6 | abstract class Typer[E <: Token] { 7 | def name: String 8 | def source: String 9 | 10 | def apply(seq: Seq[E]): Seq[Type] 11 | } 12 | 13 | abstract class Type { 14 | def name: String 15 | def source: String 16 | def tokenInterval: Interval 17 | def text: String 18 | 19 | def matchText[E <: Token](seq: Seq[E]): String = 20 | seq.iterator.slice(tokenInterval.start, tokenInterval.end).map(_.string).mkString(" ") 21 | def tokens[E <: Token](seq: Seq[E]): Seq[E] = seq.slice(tokenInterval.start, tokenInterval.end) 22 | } 23 | 24 | object Type { 25 | def apply(name: String, source: String, tokenInterval: Interval, text: String): Type = { 26 | this.create(name, source, tokenInterval, text) 27 | } 28 | 29 | def create(name: String, source: String, tokenInterval: Interval, text: String): Type = { 30 | TypeImpl(name, source, tokenInterval, text) 31 | } 32 | 33 | private case class TypeImpl( 34 | val name: String, 35 | val source: String, 36 | val tokenInterval: Interval, 37 | val text: String 38 | ) extends Type 39 | } 40 | -------------------------------------------------------------------------------- /tools/core/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 
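
Taken together, the repr traits listed above let a caller assemble a Sentence whose annotations come from pluggable analyzers. A minimal sketch follows; the default* analyzer instances are assumed to live in the tokenize, postag and chunk modules (by analogy with defaultDependencyParser in the parse package object later in this listing), so treat those names as illustrative:

import org.allenai.nlpstack.core.repr._

val sentence = new Sentence("John walks down the hall.") with Chunker {
  val tokenizer = org.allenai.nlpstack.tokenize.defaultTokenizer // assumed instance
  val postagger = org.allenai.nlpstack.postag.defaultPostagger   // assumed instance
  val chunker = org.allenai.nlpstack.chunk.defaultChunker        // assumed instance
}

sentence.strings        // token strings: Seq("John", "walks", "down", "the", "hall", ".")
sentence.postags        // one POS tag per token
sentence.chunkIntervals // (chunk label, token interval) pairs

Because tokens is declared as a lazy val in the Chunker trait, nothing is computed until one of these fields is first accessed.
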
-------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/ChunkerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class ChunkerSpec extends UnitSpec { 6 | "chunker" should "determine intervals correctly." in { 7 | val strings = "John very quickly ran away from the deep blue reflection in the mirror .".split(" ") 8 | val postags = "NNP RB RB VBD RB IN DT JJ JJ NN IN DT NN .".split(" ") 9 | val chunks = "B-NP B-ADVP B-ADVP B-VP B-ADVP B-PP B-NP I-NP I-NP I-NP B-PP B-NP I-NP O".split(" ") 10 | 11 | val text = "John very quickly ran away from the deep blue reflection in the mirror." 12 | val tokens = Tokenizer.computeOffsets(strings, text) 13 | val chunkedTokens = Chunker.tokensFrom(chunks, postags, tokens) 14 | 15 | Chunker.intervals(chunkedTokens).map(_.toString) should contain theSameElementsAs ( 16 | List( 17 | "(NP,{0})", 18 | "(ADVP,{1})", 19 | "(ADVP,{2})", 20 | "(VP,{3})", 21 | "(ADVP,{4})", 22 | "(PP,{5})", 23 | "(NP,[6, 10))", 24 | "(PP,{10})", 25 | "(NP,[11, 13))", 26 | "(O,{13})" 27 | ) 28 | ) 29 | } 30 | 31 | it should "join of" in { 32 | val strings = "John 's dog ate at the University of Washington".split(" ") 33 | val postags = "NNP POS NN VBD IN DT NNP IN NNP".split(" ") 34 | val chunks = "B-NP B-NP I-NP V-BP B-PP B-NP I-NP B-PP B-NP".split(" ") 35 | 36 | val text = "John's dog ate at the University of Washington." 37 | val tokens = Tokenizer.computeOffsets(strings, text) 38 | val chunkedTokens = Chunker.tokensFrom(chunks, postags, tokens) 39 | 40 | Chunker.joinOf(chunkedTokens).map(_.chunk).mkString(" ") === 41 | "B-NP B-NP I-NP V-BP B-PP B-NP I-NP I-NP I-NP" 42 | } 43 | 44 | it should "join possessives" in { 45 | val strings = "John 's dog ate at the University of Washington".split(" ") 46 | val postags = "NNP POS NN VBD IN DT NNP IN NNP".split(" ") 47 | val chunks = "B-NP B-NP I-NP V-BP B-PP B-NP I-NP B-PP B-NP".split(" ") 48 | 49 | val text = "John's dog ate at the University of Washington." 
50 | val tokens = Tokenizer.computeOffsets(strings, text) 51 | val chunkedTokens = Chunker.tokensFrom(chunks, postags, tokens) 52 | 53 | Chunker.joinPos(chunkedTokens).map(_.chunk).mkString(" ") === 54 | "B-NP I-NP I-NP V-BP B-PP B-NP I-NP B-PP B-NP" 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/CorefResolverSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.coref.{ CorefResolver, Referent } 5 | import org.allenai.nlpstack.core.parse.graph.DependencyGraph 6 | 7 | class CorefResolverSpec extends UnitSpec { 8 | "CorefResolverSerialization" should "round trip through serialization" in { 9 | val dgraphString = 10 | """|det(amphibians-3, The-1) 11 | |amod(amphibians-3, first-2) 12 | |nn(.-25, amphibians-3) 13 | |vmod(amphibians-3, evolved-4) 14 | |aux(move-6, to-5) 15 | |xcomp(evolved-4, move-6) 16 | |prep(move-6, out-7) 17 | |pcomp(out-7, of-8) 18 | |nsubj(.-25, the-9) 19 | |nn(.-25, water-10) 20 | |cc(water-10, and-11) 21 | |nn(land-13, colonize-12) 22 | |conj(water-10, land-13) 23 | |punct(water-10, ,-14) 24 | |cc(water-10, but-15) 25 | |nsubj(had-17, they-16) 26 | |rcmod(water-10, had-17) 27 | |aux(return-19, to-18) 28 | |xcomp(had-17, return-19) 29 | |prep(return-19, to-20) 30 | |det(water-22, the-21) 31 | |pobj(to-20, water-22) 32 | |aux(reproduce-24, to-23) 33 | |vmod(water-22, reproduce-24) 34 | |root(ROOT-0, .-25)""".stripMargin 35 | val dgraph = DependencyGraph.multilineStringFormat.read(dgraphString) 36 | 37 | // minus 1 because the dgraph's serialization format increases the numbers 38 | // by one 39 | val amphibians = dgraph.nodeById(3 - 1).get 40 | val they = dgraph.nodeById(16 - 1).get 41 | 42 | for ( 43 | format <- Seq(CorefResolver.multilineStringFormat, CorefResolver.singlelineStringFormat); 44 | mainReference <- Seq(Some(amphibians), None) 45 | ) { 46 | val coref = Seq(Referent(Seq(amphibians, they), mainReference)) 47 | val corefString = format.write((dgraph, coref)) 48 | val newCoref = format.read(corefString) 49 | assert((dgraph, coref) === newCoref) 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/DependencyNodeSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.parse.graph.DependencyNode 5 | 6 | class DependencyNodeSpec extends UnitSpec { 7 | "DependencyNode" should "round trip through string serialization when it contains a hyphen" in { 8 | val pickledDepNode = "Co-Redemptrix-13" 9 | val depNode = DependencyNode.stringFormat.read(pickledDepNode) 10 | val repickled = DependencyNode.stringFormat.write(depNode) 11 | 12 | assert(pickledDepNode === repickled) 13 | } 14 | 15 | "DependencyNode" should "round trip through json serialization" in { 16 | val node = new DependencyNode(4, "Michael") 17 | val pickled = DependencyNode.dependencyNodeJsonFormat.write(node) 18 | val unpickled = DependencyNode.dependencyNodeJsonFormat.read(pickled) 19 | 20 | assert(node === unpickled) 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- 
/tools/core/src/test/scala/org/allenai/nlpstack/core/DependencySpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.parse.graph.Dependency 5 | 6 | class DependencySpec extends UnitSpec { 7 | "Dependency" should "round trip through serialization" in { 8 | val pickledDep = "det(reflection-9, the-6)" 9 | val dep = Dependency.stringFormat.read(pickledDep) 10 | val repickled = Dependency.stringFormat.write(dep) 11 | 12 | assert(pickledDep === repickled) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/FormatSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.Format.Quoter 5 | 6 | class FormatSpec extends UnitSpec { 7 | "stringQuoter" should "quote strings" in { 8 | assert(Format.stringQuoter.quote("A 3\" diameter") === "A 3\\\" diameter") 9 | assert(Format.stringQuoter.quote("C:\\Windows\\System32") === "C:\\\\Windows\\\\System32") 10 | } 11 | 12 | "custom Quoter" should "quote strings" in { 13 | val q = new Quoter(";\"") 14 | val unquoted = "To be; Or \\not\\ \"to be\"" 15 | val quoted = "To be\\; Or \\\\not\\\\ \\\"to be\\\"" 16 | assert(q.quote(unquoted) == quoted) 17 | assert(q.unquote(quoted) == unquoted) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /tools/core/src/test/scala/org/allenai/nlpstack/core/TokenizerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class TokenizerSpecTest extends UnitSpec { 6 | "tokenizer" should "compute offsets correctly and infer the original text" in { 7 | val sentence = "John walks down the hall." 8 | val tokens = Tokenizer.computeOffsets(Seq("John", "walks", "down", "the", "hall", "."), sentence) 9 | 10 | // make sure offsets were computed correctly 11 | assert(tokens.map(_.offsets.start) === Seq(0, 5, 11, 16, 20, 24)) 12 | 13 | // make sure we can go back to the original sentence 14 | assert(Tokenizer.originalText(tokens) === sentence) 15 | } 16 | 17 | it should "compute offsets correctly and infer the original text when there is a leading space" in { 18 | val sentence = " John walks down the hall." 19 | val tokens = Tokenizer.computeOffsets(Seq("John", "walks", "down", "the", "hall", "."), sentence) 20 | 21 | // make sure offsets were computed correctly 22 | assert(tokens.map(_.offsets.start) === Seq(2, 7, 13, 18, 22, 26)) 23 | 24 | // make sure we can go back to the original sentence 25 | assert(Tokenizer.originalText(tokens) === sentence) 26 | } 27 | 28 | it should "trim original text correctly when a start offset is specified" in { 29 | val sentence = " John walks down the hall." 30 | val trimmedSentence = "John walks down the hall." 
31 | val tokens = Tokenizer.computeOffsets(Seq("John", "walks", "down", "the", "hall", "."), sentence) 32 | 33 | // make sure offsets were computed correctly 34 | assert(tokens.map(_.offsets.start) === Seq(2, 7, 13, 18, 22, 26)) 35 | 36 | // make sure we can go back to the original sentence 37 | assert(Tokenizer.originalText(tokens, tokens.head.offset) === trimmedSentence) 38 | } 39 | 40 | it should "throw an exception if tokens are out of order" in { 41 | val tokens = Seq( 42 | new Token("large-scale", 0), 43 | new Token("large", 0), 44 | new Token("scale", 6) 45 | ) 46 | 47 | a[IllegalArgumentException] should be thrownBy { 48 | Tokenizer.originalText(tokens, 10) 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/headword/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | libraryDependencies ++= loggingDependencies 4 | -------------------------------------------------------------------------------- /tools/headword/src/test/scala/JwiToolsSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.headword 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class JwiToolsSpec extends UnitSpec { 6 | 7 | val jwiTools = new JwiTools() 8 | 9 | "JwiTools" should "correctly stem a word" in { 10 | val word = "elephants" 11 | val stem = jwiTools.stem(word) 12 | assert(stem === "elephant") 13 | } 14 | 15 | it should "throw an exception if wordnet path is invalid" in { 16 | a[IllegalArgumentException] should be thrownBy { 17 | new JwiTools("foo/bar") 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tools/lemmatize/LICENSE: -------------------------------------------------------------------------------- 1 | This licence accompanied the original LEX files from which Morpha.java was 2 | generated. 3 | 4 | All supplemental code is copyright under Apache 2.0. 5 | 6 | Copyright (c) 1995-2001 University of Sheffield, University of Sussex 7 | All rights reserved. 8 | 9 | Redistribution and use of source and derived binary forms are permitted 10 | provided that: 11 | - they are not used in commercial products 12 | - the above copyright notice and this paragraph are duplicated in 13 | all such forms 14 | - any documentation, advertising materials, and other materials 15 | related to such distribution and use acknowledge that the software 16 | was developed by Kevin Humphreys and John 17 | Carroll and Guido Minnen 18 | and refer to the following related 19 | publication: 20 | 21 | Guido Minnen, John Carroll and Darren Pearce. 2000. Robust, Applied 22 | Morphological Generation. In Proceedings of the First International 23 | Natural Language Generation Conference (INLG), Mitzpe Ramon, Israel. 24 | 201-208. 25 | 26 | The name of University of Sheffield may not be used to endorse or 27 | promote products derived from this software without specific prior 28 | written permission. 29 | 30 | This software is provided "as is" and without any express or 31 | implied warranties, including, without limitation, the implied 32 | warranties of merchantibility and fitness for a particular purpose. 33 | 34 | If you make any changes, the authors would appreciate it 35 | if you sent them details of what you have done. 
36 | -------------------------------------------------------------------------------- /tools/lemmatize/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | libraryDependencies ++= loggingDependencies 4 | -------------------------------------------------------------------------------- /tools/lemmatize/src/main/scala/org/allenai/nlpstack/lemmatize/MorphaStemmer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.lemmatize 2 | 3 | import org.allenai.nlpstack.core.{ PostaggedStemmer, Stemmer } 4 | 5 | import edu.washington.cs.knowitall.morpha.{ MorphaStemmer => MorphaStem } 6 | 7 | /** This stemmer handles many cases, but the JFlex is 5 MB. */ 8 | class MorphaStemmer extends Stemmer with PostaggedStemmer { 9 | private val whitespace = "\\s".r 10 | 11 | private def stem(word: String, stemmer: (String => String)) = 12 | if (whitespace.findFirstMatchIn(word).isDefined) { 13 | word 14 | } else { 15 | stemmer(word) 16 | } 17 | 18 | def stem(word: String) = stem(word, MorphaStem.stemToken(_)) 19 | override def stem(word: String, postag: String) = 20 | stem(word, MorphaStem.stemToken(_, postag)) 21 | } 22 | 23 | /** MorphaStemmer is threadsafe. Clients can use this global instance. */ 24 | object MorphaStemmer extends MorphaStemmer 25 | -------------------------------------------------------------------------------- /tools/lemmatize/src/main/universal/lemmatize-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.lemmatize.MorphaStemmer" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx128M" 9 | 10 | . 
"${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/lemmatize/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/lemmatize/src/test/scala/org/allenai/nlpstack/lemmatize/MorphaLemmatizerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.lemmatize 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class MorphaLemmatizerSpec extends UnitSpec { 6 | "lemmatizer" should "correctly lemmatize a word" in { 7 | val word = "ate" 8 | val lemma = MorphaStemmer.lemmatize(word) 9 | assert(lemma === "eat") 10 | } 11 | 12 | it should "not lemmatize a word with spaces" in { 13 | val wordWithSpace = "29 1/2" 14 | assert(MorphaStemmer.lemmatize(wordWithSpace) === wordWithSpace) 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /tools/parse/build.sbt: -------------------------------------------------------------------------------- 1 | parallelExecution in ThisBuild := false 2 | 3 | javaOptions += "-XX:ReservedCodeCacheSize=512M" 4 | 5 | javaOptions += "-Xmx16G" 6 | 7 | // uncomment if you want to train the parser 8 | javaOptions += "-Xss800m" 9 | 10 | fork in test := true 11 | -------------------------------------------------------------------------------- /tools/parse/conf/deploy.conf: -------------------------------------------------------------------------------- 1 | clearParser = { 2 | 3 | project = { 4 | subdirectory = "parse/clear" 5 | name = "parse-clear" 6 | } 7 | 8 | deploy = { 9 | startup_script = "bin/clear-parse-server.sh" 10 | directory = "/local/deploy/nlptools-clear-parser" 11 | user.ssh_username = "ec2-user" 12 | } 13 | 14 | 15 | // For now set this on the command line via -Ddeploy.host=ec2-54-200-156-107.us-west-2.compute.amazonaws.com 16 | // TODO is to get a real aname for nlptools and configure it here. 
17 | // deploy.host = 18 | } 19 | -------------------------------------------------------------------------------- /tools/parse/jvm.sbt: -------------------------------------------------------------------------------- 1 | fork := true 2 | -------------------------------------------------------------------------------- /tools/parse/src/main/resources/featuretaggers.config: -------------------------------------------------------------------------------- 1 | verbnet { 2 | group: "org.allenai.nlp.resources" 3 | name: "verbnet-3.2" 4 | version: 1 5 | } 6 | googleUnigram { 7 | group: "org.allenai.nlp.resources" 8 | name: "googleNgramsNodes-20130501-freq1000Filtered" 9 | version: 1 10 | features: [ "depLabel", "posTag" ] 11 | } 12 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.DependencyParser 4 | 5 | package object parse { 6 | val defaultDependencyParser: DependencyParser = new FactorieParser 7 | } 8 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/core/AnnotatedSentence.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.{ FeatureName, FeatureVector } 4 | import reming.DefaultJsonProtocol._ 5 | 6 | /** An AnnotatedSentence is a sentence whose tokens are each annotated with a feature 7 | * vector. 8 | * 9 | * @param sentence the unannotated sentence 10 | * @param annotation an indexed sequence, of which the nth element is the feature vector for 11 | * the nth token of the sentence 12 | */ 13 | case class AnnotatedSentence(sentence: Sentence, annotation: IndexedSeq[FeatureVector]) 14 | 15 | object AnnotatedSentence { 16 | implicit val annotatedSentenceJsonFormat = jsonFormat2(AnnotatedSentence.apply) 17 | 18 | /** Converts a TaggedSentence into an AnnotatedSentence by making simple features from 19 | * the tags. 20 | * 21 | * @param tagged the original tagged sentence 22 | * @return an annotated sentence (with feature vectors derived from the tags) 23 | */ 24 | def annotate(tagged: TaggedSentence): AnnotatedSentence = { 25 | AnnotatedSentence( 26 | tagged.sentence, 27 | Range(0, tagged.sentence.size) map { tokenIndex => 28 | FeatureVector( 29 | tagged.tags.getOrElse(tokenIndex, Set[TokenTag]()).toSeq 30 | map { tag => 31 | FeatureName(Seq(tag.name, tag.value)) -> 1.0 32 | } 33 | ) 34 | } 35 | ) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/core/TaggedSentence.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.nlpstack.parse.poly.fsm.{ Sculpture, SculptureSource } 4 | 5 | /** A TaggedSentence is a sentence accompanied by a map that assigns tags to its tokens. 6 | * 7 | * Specifically, the `tags` field maps each token index to a set of TokenTag objects corresponding 8 | * to that token. 
9 | * 10 | * @param sentence the untagged sentence 11 | * @param tags maps each token index to a set of TokenTag objects 12 | */ 13 | case class TaggedSentence(sentence: Sentence, tags: Map[Int, Set[TokenTag]]) extends Sculpture { 14 | override val marbleBlock = sentence 15 | } 16 | 17 | /** A data source for TaggedSentence objects. */ 18 | trait TaggedSentenceSource extends SculptureSource with SentenceSource { 19 | def taggedSentenceIterator: Iterator[TaggedSentence] 20 | 21 | override def sculptureIterator: Iterator[Sculpture] = taggedSentenceIterator 22 | 23 | override def sentenceIterator: Iterator[Sentence] = taggedSentenceIterator map { taggedSentence => 24 | taggedSentence.sentence 25 | } 26 | } 27 | 28 | /** A TaggedSentenceSource derived from a SentenceSource. 29 | * 30 | * Tokens are tagged with a specified property from their `properties` field. 31 | * 32 | * @param sentenceSource the sentence source to derive the tagged sentences from 33 | * @param propertyName the token property to use as the "tag" 34 | */ 35 | case class DerivedTaggedSentenceSource( 36 | sentenceSource: SentenceSource, 37 | propertyName: Symbol 38 | ) extends TaggedSentenceSource { 39 | 40 | override def taggedSentenceIterator: Iterator[TaggedSentence] = { 41 | for { 42 | sentence <- sentenceSource.sentenceIterator 43 | } yield { 44 | TaggedSentence( 45 | sentence, 46 | (sentence.tokens.zipWithIndex map { 47 | case (tok, index) => 48 | (index, tok.getProperty(propertyName) map { prop => TokenTag(propertyName, prop) }) 49 | }).toMap 50 | ) 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/core/Util.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.nlpstack.core.{ 5 | Token => NlpStackToken, 6 | PostaggedToken, 7 | Postagger, 8 | Tokenizer 9 | } 10 | import reming.{ JsonFormat, JsonParser } 11 | 12 | import java.io.{ File, InputStream, PushbackInputStream } 13 | import java.net.URL 14 | import java.util.zip.GZIPInputStream 15 | 16 | import scala.io.BufferedSource 17 | 18 | object Util { 19 | def readFromFile[T: JsonFormat](filename: String): T = { 20 | readFromUrl(new File(filename).toURI.toURL) 21 | } 22 | 23 | def readFromUrl[T: JsonFormat](url: URL): T = { 24 | Resource.using(url.openStream()) { readFromStream[T] } 25 | } 26 | 27 | def readFromStream[T: JsonFormat](stream: InputStream): T = { 28 | val headerLength = 2 29 | val pbStream = new PushbackInputStream(stream, headerLength) 30 | val header = new Array[Byte](headerLength) 31 | val readBytes = pbStream.read(header, 0, headerLength) 32 | pbStream.unread(header, 0, readBytes) 33 | 34 | val isZipped = 35 | (readBytes == headerLength) && 36 | (header(0) == GZIPInputStream.GZIP_MAGIC.toByte) && 37 | (header(1) == (GZIPInputStream.GZIP_MAGIC >> 8).toByte) 38 | 39 | val uncompressedStream = 40 | if (isZipped) { 41 | new GZIPInputStream(pbStream) 42 | } else { 43 | pbStream 44 | } 45 | 46 | JsonParser.read[T](new BufferedSource(uncompressedStream)) 47 | } 48 | 49 | /** Uses an NlpStack postagger to tag a Sentence object. 
50 | * 51 | * @param sentence the Sentence to tag 52 | * @param posTagger the nlpstack postagger to use 53 | * @return a map from Sentence token indices to their POS tags 54 | */ 55 | def getPostaggedTokens(sentence: Sentence, posTagger: Postagger): Map[Int, PostaggedToken] = { 56 | val words: IndexedSeq[String] = sentence.tokens.tail map { tok => tok.word.name } 57 | val nlpStackTokens: IndexedSeq[NlpStackToken] = 58 | Tokenizer.computeOffsets(words, words.mkString).toIndexedSeq 59 | (posTagger.postagTokenized(nlpStackTokens).zipWithIndex map { 60 | case (taggedTok, index) => 61 | (index + 1, taggedTok) 62 | }).toMap 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/decisiontree/OmnibusTrainer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.decisiontree 2 | 3 | class OmnibusTrainer() 4 | extends ProbabilisticClassifierTrainer { 5 | 6 | val dtTrainer = new RandomForestTrainer(0, 12, 0.1f, MultinomialGainMetric(0.5f), numThreads = 6) 7 | val rfTrainer = new RandomForestTrainer(0, 12, 0.1f, MultinomialGainMetric(0.5f), numThreads = 6) 8 | 9 | override def apply(data: FeatureVectorSource): ProbabilisticClassifier = { 10 | val trainer = data.classificationTask.filenameFriendlyName match { 11 | case name if name.startsWith("dt-") => 12 | dtTrainer 13 | case _ => 14 | rfTrainer 15 | } 16 | trainer(data) 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/decisiontree/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly 2 | 3 | /** Implements C4.5 decision trees for integral labels and attributes. 4 | * 5 | * Main class to use is [[org.allenai.nlpstack.parse.poly.decisiontree.DecisionTree]]. 6 | * Use the companion object to build the tree. 7 | * Then use [[org.allenai.nlpstack.parse.poly.decisiontree.DecisionTree.classify( )]] 8 | * or 9 | * [[org.allenai.nlpstack.parse.poly.decisiontree.DecisionTree.outcomeDistribution( )]] 10 | * to do prediction. 11 | * 12 | * The tree takes data in the form of 13 | * [[org.allenai.nlpstack.parse.poly.decisiontree.FeatureVectors]]. 14 | * This is a container for a collection of 15 | * [[org.allenai.nlpstack.parse.poly.decisiontree.FeatureVector]] objects. 16 | * 17 | * Implementations of these are 18 | * [[org.allenai.nlpstack.parse.poly.decisiontree.SparseVector]] 19 | * or 20 | * [[org.allenai.nlpstack.parse.poly.decisiontree.DenseVector]]. 21 | */ 22 | package object decisiontree { 23 | 24 | } 25 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/eval/Evaluate.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.eval 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser._ 4 | import scopt.OptionParser 5 | 6 | private case class EvaluateConfig(candidateFilename: String = "", goldFilename: String = "") 7 | 8 | object Evaluate { 9 | 10 | /** Command-line for evaluating a set of parses against a gold set. 
11 | * 12 | * Usage: Evaluate [options] 13 | * 14 | * -c | --candidate 15 | * the file containing the candidate parses (CoNLL-X format) 16 | * -g | --gold 17 | * the file containing the gold parses (CoNLL-X format) 18 | * 19 | * @param args see above 20 | */ 21 | def main(args: Array[String]) { 22 | val optionParser = new OptionParser[EvaluateConfig]("Evaluate") { 23 | opt[String]('c', "candidate") required () valueName ("") action 24 | { (x, c) => c.copy(candidateFilename = x) } text ("the file containing the candidate " + 25 | "parses (CoNLL-X format)") 26 | opt[String]('g', "gold") required () valueName ("") action 27 | { (x, c) => c.copy(goldFilename = x) } text ("the file containing the gold " + 28 | "parses (CoNLL-X format)") 29 | } 30 | val config: EvaluateConfig = optionParser.parse(args, EvaluateConfig()).get 31 | val fileFormat: PolytreeParseFileFormat = ConllX(true) 32 | val candidateParses = 33 | InMemoryPolytreeParseSource( 34 | (PolytreeParse.fromFile(config.candidateFilename, fileFormat) map { Some(_) }).flatten.toSeq 35 | ) 36 | 37 | val goldParseBank = 38 | ParseBank.createParseBankFromSource( 39 | InMemoryPolytreeParseSource(PolytreeParse.fromFile(config.goldFilename, fileFormat).toSeq) 40 | ) 41 | ParseEvaluation.performStandardEvaluation(candidateParses, goldParseBank) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/MarbleBlock.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | /** A MarbleBlock is an unstructured input corresponding to a start state of a finite-state 4 | * machine. The goal of the finite-state machine is to find a final state (which correponds 5 | * to a Sculpture, i.e. a structured output). 6 | * 7 | * As an example, consider a transition-based parser. A MarbleBlock would be a sentence to be 8 | * parsed, whereas a Sculpture would be a parse tree for that sentence. 9 | */ 10 | trait MarbleBlock 11 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/NbestCorpus.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.core.Util 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A sequence of (scored) sculptures. */ 8 | case class NbestList(scoredSculptures: Iterable[(Sculpture, Double)]) 9 | 10 | object NbestList { 11 | implicit val jsFormat = jsonFormat1(NbestList.apply) 12 | } 13 | 14 | /** A sequence of NbestLists. */ 15 | 16 | case class NbestCorpus(nbestLists: Iterable[NbestList]) 17 | 18 | object NbestCorpus { 19 | implicit val jsFormat = jsonFormat1(NbestCorpus.apply) 20 | 21 | def loadNbestCorpus(filename: String): NbestCorpus = Util.readFromFile(filename) 22 | } 23 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/NbestSearch.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import scala.collection.mutable 4 | 5 | /** Finds the best n greedy paths through a finite-state machine. 
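
A sketch of how NbestSearch is driven; the cost function and the initial state are produced elsewhere (by a trained cost function factory and a transition system), so they are taken here as inputs:

import org.allenai.nlpstack.parse.poly.fsm.{ NbestList, NbestSearch, State, StateCostFunction }

// Return up to ten scored goal states (as Sculptures) reachable from initialState.
def tenBest(costFunction: StateCostFunction, initialState: State): NbestList =
  new NbestSearch(costFunction).find(initialState, maxDesiredWalks = 10)
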
6 | * 7 | * @param costFunction the cost function to use to evaluate transitions from a given state 8 | */ 9 | class NbestSearch( 10 | costFunction: StateCostFunction, 11 | timeout: Int = NbestSearch.defaultTimeout 12 | ) { 13 | 14 | // Right now, we use a rather generous "qualifying cost delta" of 10000.0, to make sure that 15 | // most reasonable alternatives are remembered by the nostalgic parser. 16 | val baseParser: NostalgicSearch = new NostalgicSearch(costFunction, 10000.0) 17 | 18 | /** Finds the best n greedy paths through a finite-state machine. 19 | * 20 | * @param initialState the initial state in the finite-state machine 21 | * @param maxDesiredWalks the number of walks desired (i.e. n) 22 | * @param constraints a set of constraints that must be satisfied by returned paths 23 | * @return an n-best list containing n greedy paths through the FSM 24 | */ 25 | def find(initialState: State, maxDesiredWalks: Int, 26 | constraints: Set[TransitionConstraint] = Set()): NbestList = { 27 | 28 | val queue = mutable.PriorityQueue[ScoredWalk]()( 29 | Ordering.by({ walk: ScoredWalk => -walk.score }) 30 | ) 31 | var results: Seq[ScoredWalk] = Seq() 32 | var iterNumber: Int = 0 33 | queue.enqueue(ScoredWalk(Walk(initialState, Seq()), 0.0)) 34 | while (queue.nonEmpty && results.size < maxDesiredWalks && iterNumber < timeout) { 35 | iterNumber += 1 36 | val scoredWalk: ScoredWalk = queue.dequeue() 37 | if (scoredWalk.walk.isGoal) { 38 | results = scoredWalk +: results 39 | } else { 40 | val (mementos, _) = 41 | baseParser.getPromisingWalks(scoredWalk.walk, scoredWalk.score, constraints) 42 | mementos.headOption match { 43 | case Some(memento) => 44 | if (memento.walk.isGoal) { 45 | results = memento +: results 46 | queue ++= mementos.tail 47 | } else { 48 | queue ++= mementos 49 | } 50 | case _ => 51 | } 52 | } 53 | } 54 | val allWalks: Seq[ScoredWalk] = results 55 | NbestList( 56 | (allWalks map { scoredWalk => 57 | scoredWalk.walk.finalState flatMap { state => 58 | state.asSculpture 59 | } map { sculpture => 60 | (sculpture, scoredWalk.score) 61 | } 62 | }).flatten 63 | ) 64 | } 65 | } 66 | 67 | object NbestSearch { 68 | val defaultTimeout = 1000 69 | } 70 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/Sculpture.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser.{ PolytreeParse } 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A Sculpture is a structured output corresponding to a final state of a finite-state 8 | * machine, whose goal is to transform an unstructured input (a MarbleBlock) into a 9 | * structured output. 10 | * 11 | * As an example, consider a transition-based parser. A MarbleBlock would be a sentence to be 12 | * parsed, whereas a Sculpture would be a parse tree for that sentence. 13 | */ 14 | trait Sculpture { 15 | def marbleBlock: MarbleBlock 16 | } 17 | 18 | object Sculpture { 19 | private implicit val polytreeParseFormat = jsonFormat4(PolytreeParse.apply) 20 | implicit val sculptureJsonFormat = parentFormat[Sculpture](childFormat[PolytreeParse, Sculpture]) 21 | } 22 | 23 | /** An interface for a Sculpture data source. */ 24 | trait SculptureSource { 25 | 26 | /** Returns a use-once iterator over all sculptures in the data source. 
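
An in-memory implementation of this interface is a one-liner, mirroring InMemoryStateSource in State.scala below (illustrative, not part of the original sources):

import org.allenai.nlpstack.parse.poly.fsm.{ Sculpture, SculptureSource }

case class InMemorySculptureSource(sculptures: Iterable[Sculpture]) extends SculptureSource {
  override def sculptureIterator: Iterator[Sculpture] = sculptures.iterator
}
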
*/ 27 | def sculptureIterator: Iterator[Sculpture] 28 | } 29 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/SculptureCost.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | abstract class SculptureCost extends (Sculpture => Double) 4 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/SculptureFeature.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureVector 4 | 5 | /** A SculptureFeature computes a feature vector corresponding to a given sculpture. */ 6 | abstract class SculptureFeature extends (Sculpture => FeatureVector) 7 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/SculptureTrainingVectorSource.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | /** A SculptureTrainingVectorSource reduces a sculpture to a set of feature vectors for 4 | * classifier training. 5 | * 6 | * Essentially, we derive the transition states that lead to the gold sculpture. 7 | * Each of these states becomes a feature vector, 8 | * labeled with the transition executed from that state in the gold sculpture. 9 | * 10 | * One of the constructor arguments is a TaskIdentifer. This will dispatch the feature vectors 11 | * to train different classifiers. For instance, if taskIdentifier(state) != 12 | * taskIdentifier(state2), then their respective feature vectors (i.e. feature(state) and 13 | * feature(state2)) will be used to train different classifiers. 14 | * 15 | * @param trainingSculptures the data source for the training sculptures 16 | * @param transitionSystemFactory the transition system factory to use (for generating states) 17 | * @param baseCostFunctionFactory a trained cost function factory to adapt (optional) 18 | */ 19 | case class SculptureTrainingVectorSource( 20 | trainingSculptures: SculptureSource, 21 | transitionSystemFactory: TransitionSystemFactory, 22 | baseCostFunctionFactory: Option[StateCostFunctionFactory] = None 23 | ) 24 | extends FSMTrainingVectorSource(transitionSystemFactory, baseCostFunctionFactory) { 25 | 26 | def getVectorIterator: Iterator[FSMTrainingVector] = { 27 | for { 28 | taggedSentence <- trainingSculptures.sculptureIterator 29 | vector <- generateVectors(taggedSentence) 30 | } yield vector 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/State.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser.TransitionParserState 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A state of a finite-state machine. 
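
A small sketch of the intended contract: only a final state can be turned back into a structured output.

import org.allenai.nlpstack.parse.poly.fsm.{ Sculpture, State }

// Convert a state into its structured output, but only once the walk is complete.
def finish(state: State): Option[Sculpture] =
  if (state.isFinal) state.asSculpture else None
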
*/ 8 | trait State { 9 | val isFinal: Boolean 10 | def asSculpture: Option[Sculpture] 11 | } 12 | 13 | object State { 14 | private implicit val transitionParserStateFormat = jsonFormat8(TransitionParserState.apply) 15 | implicit val stateJsonFormat = parentFormat[State](childFormat[TransitionParserState, State]) 16 | } 17 | 18 | /** A StateCost maps a state to a cost. */ 19 | trait StateCost extends (Option[State] => Double) 20 | 21 | trait StateSource { 22 | /** Generates an iterator over State objects. 23 | * 24 | * @return a use-once iterator over State objects 25 | */ 26 | def getStateIterator: Iterator[State] 27 | } 28 | 29 | /** A StateSource that keeps all its states in memory. */ 30 | case class InMemoryStateSource(states: Iterable[State]) extends StateSource { 31 | override def getStateIterator: Iterator[State] = states.iterator 32 | } 33 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/StateCostFunctionTrainer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureName 4 | 5 | /** A StateCostFunctionTrainer trains a StateCostFunction from data. Training is 6 | * triggered during construction, after which the .costFunction field contains the trained 7 | * TransitionCostFunctionAndClassifier. 8 | * 9 | * @param trainingVectorSource a source of training vectors 10 | */ 11 | abstract class StateCostFunctionTrainer( 12 | transitionSystemFactory: TransitionSystemFactory, trainingVectorSource: FSMTrainingVectorSource 13 | ) { 14 | 15 | /** The trained cost function factory. */ 16 | def costFunctionFactory: StateCostFunctionFactory 17 | 18 | protected val featureNames: List[FeatureName] = 19 | FSMTrainingVectorSource.collectFeatureNames(trainingVectorSource).toList 20 | 21 | protected val featureNameToIndex: Map[FeatureName, Int] = featureNames.zipWithIndex.toMap 22 | 23 | protected val transitions: IndexedSeq[StateTransition] = 24 | FSMTrainingVectorSource.collectTransitions(trainingVectorSource).toIndexedSeq 25 | } 26 | 27 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/StateFeature.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureVector 4 | import org.allenai.nlpstack.parse.poly.polyparser._ 5 | 6 | import reming.LazyFormat 7 | import reming.DefaultJsonProtocol._ 8 | 9 | /** A StateFeature computes a feature vector corresponding to a given parser state. 
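
An illustrative custom StateFeature; the FeatureName and FeatureVector constructor shapes are inferred from their use in AnnotatedSentence earlier in this listing, so treat the exact signatures as assumptions:

import org.allenai.nlpstack.parse.poly.fsm.{ State, StateFeature }
import org.allenai.nlpstack.parse.poly.ml.{ FeatureName, FeatureVector }

// Emits a single indicator feature recording whether the state is final.
object IsFinalFeature extends StateFeature {
  override def apply(state: State): FeatureVector =
    FeatureVector(Seq(FeatureName(Seq(Symbol("isFinal"), Symbol(state.isFinal.toString))) -> 1.0))
}
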
*/ 10 | abstract class StateFeature extends (State => FeatureVector) 11 | 12 | object StateFeature { 13 | private implicit val tokenTransformFeatureFormat = jsonFormat2(TokenTransformFeature.apply) 14 | private implicit val offlineTokenFeatureFormat = jsonFormat2(OfflineTokenFeature.apply) 15 | private implicit val tokenCardinalityFeatureFormat = jsonFormat1(TokenCardinalityFeature.apply) 16 | 17 | implicit object StateFeatureJsonFormat extends LazyFormat[StateFeature] { 18 | private implicit val featureUnionFormat = jsonFormat1(FeatureUnion.apply) 19 | 20 | override val delegate = parentFormat[StateFeature]( 21 | childFormat[TokenTransformFeature, StateFeature], 22 | childFormat[OfflineTokenFeature, StateFeature], 23 | childFormat[TokenCardinalityFeature, StateFeature], 24 | childFormat[FeatureUnion, StateFeature] 25 | ) 26 | } 27 | } 28 | 29 | /** A FeatureUnion simply merges the output of a list of features. 30 | * 31 | * @param features a list of the features we want to merge into a single feature 32 | */ 33 | case class FeatureUnion(val features: Iterable[StateFeature]) 34 | extends StateFeature { 35 | 36 | override def apply(state: State): FeatureVector = { 37 | features map { f => 38 | f(state) 39 | } reduce { (m1, m2) => 40 | FeatureVector(m1.values ++ m2.values) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/StateTransition.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser._ 4 | import reming.DefaultJsonProtocol._ 5 | 6 | abstract class StateTransition extends (Option[State] => Option[State]) { 7 | val name: String 8 | override def toString: String = name 9 | } 10 | 11 | object StateTransition { 12 | def applyTransitionSequence( 13 | initialState: State, 14 | transitions: Seq[StateTransition] 15 | ): Option[State] = { 16 | 17 | transitions.foldLeft(Option(initialState)) { (state, transition) => transition(state) } 18 | } 19 | 20 | def applicable(transition: StateTransition, state: Option[State]): Boolean = { 21 | transition(state) != None 22 | } 23 | 24 | private implicit val arcEagerShiftFormat = jsonFormat0(() => ArcEagerShift) 25 | private implicit val arcEagerReduceFormat = jsonFormat0(() => ArcEagerReduce) 26 | private implicit val arcHybridShiftFormat = jsonFormat0(() => ArcHybridShift) 27 | private implicit val fallbackFormat = jsonFormat0(() => Fallback) 28 | private implicit val leftArcFormat = jsonFormat1(ArcEagerLeftArc.apply) 29 | private implicit val rightArcFormat = jsonFormat1(ArcEagerRightArc.apply) 30 | private implicit val hybridLeftArcFormat = jsonFormat1(ArcHybridLeftArc.apply) 31 | private implicit val hybridRightArcFormat = jsonFormat1(ArcHybridRightArc.apply) 32 | private implicit val leftLabelArcFormat = jsonFormat1(LabelLeftArc.apply) 33 | private implicit val rightLabelArcFormat = jsonFormat1(LabelRightArc.apply) 34 | //private implicit val tagTokenFormat = jsonFormat1(AssignTag.apply) 35 | 36 | implicit val stateTransitionJsonFormat = parentFormat[StateTransition]( 37 | childFormat[ArcEagerShift.type, StateTransition]("Sh"), 38 | childFormat[ArcEagerReduce.type, StateTransition]("Re"), 39 | childFormat[ArcHybridShift.type, StateTransition]("HySh"), 40 | childFormat[Fallback.type, StateTransition]("Fb"), 41 | childFormat[ArcEagerLeftArc, StateTransition]("Lt"), 42 | 
childFormat[ArcEagerRightArc, StateTransition]("Rt"), 43 | childFormat[ArcHybridLeftArc, StateTransition]("HyLt"), 44 | childFormat[ArcHybridRightArc, StateTransition]("HyRt"), 45 | //childFormat[AssignTag, StateTransition]("Tag"), 46 | childFormat[LabelLeftArc, StateTransition]("LtLbl"), 47 | childFormat[LabelRightArc, StateTransition]("RtLbl") 48 | ) 49 | } 50 | 51 | case object Fallback extends StateTransition { 52 | 53 | override def apply(state: Option[State]): Option[State] = None 54 | 55 | override val name: String = "Fb" 56 | } 57 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/TransitionClassifier.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureVector 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A TransitionClassifier maps Transitions to probabilities. */ 8 | abstract class TransitionClassifier { 9 | 10 | /** Returns the most probable Transition according to .getDistribution(featureVector). 11 | * 12 | * @param featureVector the feature vector to use to compute the distribution 13 | * @return the most probable Transition, given the argument feature vector 14 | */ 15 | def classify(featureVector: FeatureVector): StateTransition 16 | 17 | /** Given the argument feature vector, this assigns a probability to a set of Transitions. 18 | * 19 | * @param featureVector the feature vector to use to compute the distribution 20 | * @return a probability distribution over Transitions 21 | */ 22 | def getDistribution(featureVector: FeatureVector): Map[StateTransition, Float] 23 | 24 | } 25 | 26 | /** Companion class for serializing TransitionClassifier instances. */ 27 | object TransitionClassifier { 28 | private implicit val embeddedClassifierFormat = jsonFormat4(EmbeddedClassifier.apply) 29 | 30 | implicit val transitionClassifierJsonFormat = parentFormat[TransitionClassifier]( 31 | childFormat[EmbeddedClassifier, TransitionClassifier] 32 | ) 33 | } 34 | 35 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/TransitionConstraint.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.polyparser.{ 4 | RequestedCpos, 5 | RequestedArc, 6 | ForbiddenArcLabel, 7 | ForbiddenEdge 8 | } 9 | import reming.DefaultJsonProtocol._ 10 | 11 | /** A TransitionConstraint returns true if a given transition is illegal to 12 | * apply in a given state. 
13 | */ 14 | trait TransitionConstraint 15 | 16 | object TransitionConstraint { 17 | private implicit val forbiddenEdgeFormat = jsonFormat2(ForbiddenEdge.apply) 18 | private implicit val forbiddenArcLabelFormat = jsonFormat3(ForbiddenArcLabel.apply) 19 | private implicit val requestedArcFormat = jsonFormat3(RequestedArc.apply) 20 | private implicit val requestedCposFormat = jsonFormat2(RequestedCpos.apply) 21 | implicit val parserConstraintFormat = parentFormat[TransitionConstraint]( 22 | childFormat[ForbiddenEdge, TransitionConstraint], 23 | childFormat[ForbiddenArcLabel, TransitionConstraint], 24 | childFormat[RequestedArc, TransitionConstraint], 25 | childFormat[RequestedCpos, TransitionConstraint] 26 | ) 27 | } 28 | 29 | /** A ConstraintInterpretation tells you whether a transition is inapplicable in a given state. 30 | * 31 | * Specifically, it is a function that takes a (state, transition) pair, and returns true 32 | * if the transition is inapplicable. 33 | */ 34 | trait ConstraintInterpretation extends ((State, StateTransition) => Boolean) 35 | 36 | /** The TrivialConstraintInterpretation returns false for any state/transition pair. 37 | * 38 | * This means that transitions are always considered applicable. 39 | */ 40 | class TrivialConstraintInterpretation extends ConstraintInterpretation { 41 | def apply(state: State, transition: StateTransition): Boolean = false 42 | } 43 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/TransitionSystem.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.FeatureVector 4 | import org.allenai.nlpstack.parse.poly.polyparser.{ 5 | ArcEagerTransitionSystemFactory, 6 | ArcHybridTransitionSystemFactory 7 | } 8 | 9 | import reming.DefaultJsonProtocol._ 10 | 11 | trait TransitionSystem { 12 | val taskIdentifier: TaskIdentifier 13 | def initialState(constraints: Seq[TransitionConstraint]): Option[State] 14 | def guidedCostFunction(goldObj: Sculpture): Option[StateCostFunction] 15 | def computeFeature(state: State): FeatureVector 16 | def toSculpture(state: State): Option[Sculpture] 17 | def interpretConstraint(constraint: TransitionConstraint): ((State, StateTransition) => Boolean) 18 | } 19 | 20 | object TransitionSystem { 21 | def trivialConstraint(state: State, transition: StateTransition): Boolean = false 22 | } 23 | 24 | /** A TransitionSystemFactory is a factory that constructs marbleblock-specific transition 25 | * systems. For instance, in parsing, this would create a transition system for each input 26 | * sentence that you want to parse. 
27 | */ 28 | trait TransitionSystemFactory { 29 | def buildTransitionSystem( 30 | marbleBlock: MarbleBlock, 31 | constraints: Set[TransitionConstraint] 32 | ): TransitionSystem 33 | } 34 | 35 | object TransitionSystemFactory { 36 | private implicit val arcHybridFormat = jsonFormat1(ArcHybridTransitionSystemFactory.apply) 37 | private implicit val arcEagerFormat = jsonFormat1(ArcEagerTransitionSystemFactory.apply) 38 | //private implicit val postaggerFormat = jsonFormat1(PostaggerTransitionSystemFactory.apply) 39 | implicit val transitionSystemFactoryJsonFormat = parentFormat[TransitionSystemFactory]( 40 | childFormat[ArcHybridTransitionSystemFactory, TransitionSystemFactory], 41 | childFormat[ArcEagerTransitionSystemFactory, TransitionSystemFactory] 42 | //childFormat[PostaggerTransitionSystemFactory, TransitionSystemFactory] 43 | ) 44 | } 45 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/fsm/Walk.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import reming.DefaultJsonProtocol._ 4 | 5 | /** A WalkStep is a single step in an FSM walk. 6 | * 7 | * @param state the current state 8 | * @param transition the transition to take 9 | * 10 | */ 11 | case class WalkStep(state: State, transition: StateTransition) 12 | 13 | object WalkStep { 14 | implicit val jsFormat = jsonFormat2(WalkStep.apply) 15 | } 16 | 17 | /** A Walk is a walk through a finite-state machine. 18 | * 19 | * @param initialState the state in which we begin 20 | * @param steps the sequence of steps we take from the initial state 21 | */ 22 | case class Walk(initialState: State, steps: Seq[WalkStep]) { 23 | 24 | /** The sequence of transitions taken during this walk (in order). */ 25 | lazy val transitions = steps map { case WalkStep(_, transition) => transition } 26 | 27 | /** The sequence of states encountered during this walk (in order). */ 28 | lazy val states: Seq[State] = { 29 | finalState match { 30 | case Some(reachableState) => 31 | val walkStates: Seq[State] = steps map { step => step.state } 32 | walkStates :+ reachableState 33 | case None => Seq() 34 | } 35 | } 36 | 37 | /** Returns the state that results from executing the steps of this walk, starting 38 | * from the initial state. 39 | */ 40 | lazy val finalState: Option[State] = { 41 | if (steps.isEmpty) { 42 | Some(initialState) 43 | } else { 44 | (transitions.last)(Some(steps.last.state)) 45 | } 46 | } 47 | 48 | /** Returns whether this walk ends up in a goal state. */ 49 | lazy val isGoal: Boolean = { 50 | finalState match { 51 | case Some(state) => state.isFinal 52 | case _ => false 53 | } 54 | } 55 | 56 | override def toString: String = { 57 | "[" + (steps map { _.transition }).mkString(" ") + "]" 58 | } 59 | } 60 | 61 | object Walk { 62 | implicit val jsFormat = jsonFormat2(Walk.apply) 63 | } 64 | 65 | /** A ScoredWalk attaches a score to a Walk.
66 | * 67 | * @param walk the unscored Walk 68 | * @param score the floating-point score 69 | */ 70 | case class ScoredWalk(walk: Walk, score: Double) 71 | 72 | object ScoredWalk { 73 | implicit val jsFormat = jsonFormat2(ScoredWalk.apply) 74 | } 75 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/ml/FeatureVector.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import reming.DefaultJsonProtocol._ 4 | 5 | /** The name of a feature, represented as a list of Symbols. 6 | * 7 | * @param symbols the list of symbols comprising the feature name 8 | */ 9 | case class FeatureName(symbols: Seq[Symbol]) { 10 | override def toString(): String = { 11 | (symbols map { sym => sym.name }).mkString(".") 12 | } 13 | } 14 | 15 | object FeatureName { 16 | implicit val jsFormat = jsonFormat1(FeatureName.apply) 17 | } 18 | 19 | /** A mapping from feature names to values. 20 | * 21 | * Unspecified feature names are assumed to correspond to a value of zero. 22 | * 23 | * @param values the map from feature names to values 24 | */ 25 | case class FeatureVector(values: Seq[(FeatureName, Double)]) { 26 | 27 | @transient lazy val featureNames = values map { _._1 } 28 | 29 | @transient lazy val featureMap = values.toMap 30 | 31 | /** Returns the value of the specified feature name. 32 | * 33 | * Note that this returns zero if the feature is not present in the map. 34 | * 35 | * @param name the feature name of interest 36 | * @return the value assigned to that feature name 37 | */ 38 | def getFeatureValue(name: FeatureName): Double = { 39 | featureMap.getOrElse(name, 0.0) 40 | } 41 | 42 | override def toString(): String = { 43 | "[" + (values map { 44 | case (featureName, featureValue) => 45 | f"${featureName} -> $featureValue%.3f" 46 | }).mkString(" ") + "]" 47 | } 48 | } 49 | 50 | object FeatureVector { 51 | implicit val jsFormat = jsonFormat1(FeatureVector.apply) 52 | 53 | /** Takes the difference between two feature vectors. 54 | * 55 | * @param vec1 first vector 56 | * @param vec2 second vector 57 | * @return the difference vector (first - second) 58 | */ 59 | def subtractVectors(vec1: FeatureVector, vec2: FeatureVector): FeatureVector = { 60 | FeatureVector(((vec1.featureNames ++ vec2.featureNames) map { featureName => 61 | (featureName, vec1.getFeatureValue(featureName) 62 | - vec2.getFeatureValue(featureName)) 63 | }).toMap.toSeq) 64 | } 65 | 66 | /** Merges two feature vectors. 67 | * 68 | * In case of conflict, values in the first vector are preferred. 69 | * 70 | * @param vec1 first vector 71 | * @param vec2 second vector 72 | * @return the merged vector 73 | */ 74 | def mergeVectors(vec1: FeatureVector, vec2: FeatureVector): FeatureVector = { 75 | FeatureVector((vec2.values.toMap ++ vec1.values.toMap).toSeq) 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/ml/LinearModel.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.nlpstack.parse.poly.core.Util 4 | 5 | import reming.DefaultJsonProtocol._ 6 | 7 | /** A weighted linear combination of features. 
8 | * 9 | * @param coefficients map from feature names to weight coefficients 10 | */ 11 | case class LinearModel(val coefficients: Seq[(FeatureName, Double)]) { 12 | 13 | @transient val coefficientMap = coefficients.toMap 14 | 15 | /** Returns the coefficient corresponding to the specified feature name. 16 | * 17 | * For unspecified coefficients, zero is returned. 18 | * 19 | * @param featureName the feature name of interest 20 | * @return the coefficient corresponding to the specified feature name 21 | */ 22 | def getCoefficient(featureName: FeatureName): Double = { 23 | coefficientMap.getOrElse(featureName, 0.0) 24 | } 25 | 26 | /** Computes the weighted linear combination, given the feature values in the argument vector. 27 | * 28 | * @param featureVector the feature vector of interest 29 | * @return the weighted linear combination 30 | */ 31 | def score(featureVector: FeatureVector): Double = { 32 | def add(x: Double, y: Double): Double = { x + y } 33 | (featureVector.featureNames map { featureName => 34 | getCoefficient(featureName) * featureVector.getFeatureValue(featureName) 35 | }).fold(0.0)(add) 36 | } 37 | } 38 | 39 | object LinearModel { 40 | implicit val jsFormat = jsonFormat1(LinearModel.apply) 41 | 42 | def loadLinearModel(filename: String): LinearModel = Util.readFromFile[LinearModel](filename) 43 | } 44 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/ml/TrainingData.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import reming.DefaultJsonProtocol._ 4 | 5 | /** Maps feature names to integers. Useful for serializing TrainingData instances for 6 | * consumption by command-line machine learning tools. 7 | * 8 | * @param featureNames an indexed sequence of feature names 9 | */ 10 | case class FeatureEncoding(featureNames: IndexedSeq[FeatureName]) { 11 | @transient lazy val featureNameToIndex: Map[FeatureName, Int] = featureNames.zipWithIndex.toMap 12 | } 13 | 14 | object FeatureEncoding { 15 | implicit val jsFormat = jsonFormat1(FeatureEncoding.apply) 16 | } 17 | 18 | /** Abstraction for a set of labeled feature vectors. 19 | * 20 | * Provides various serialization options for different machine learning tools. 21 | * 22 | * @param labeledVectors a sequence of feature vectors labeled with integer outcomes 23 | */ 24 | case class TrainingData(labeledVectors: Iterable[(FeatureVector, Int)]) { 25 | 26 | /** The set of feature names found in the training data. */ 27 | lazy val featureNames: Set[FeatureName] = { 28 | val featureNameSets: Iterable[Set[FeatureName]] = (labeledVectors map { 29 | case (fvec, _) => 30 | fvec.featureNames.toSet 31 | }) 32 | featureNameSets.fold(Set[FeatureName]())((x: Set[FeatureName], y: Set[FeatureName]) => 33 | x union y) 34 | } 35 | 36 | /** Expresses this training data in "SVMlight" format, which is 37 | * <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info> 38 | * <target> .=. +1 | -1 | 0 | <float> 39 | * <feature> .=. <integer> | "qid" 40 | * <value> .=. <float> 41 | * <info> .=. <string>
42 | * 43 | * @param signature the signature to use for encoding feature names as integer 44 | * @return the training data in SVMlight format 45 | */ 46 | def asSvmLight(signature: FeatureEncoding): String = { 47 | (labeledVectors map { 48 | case (fvec: FeatureVector, label) => 49 | val sortedValues: Seq[(Int, Double)] = (fvec.values.toSeq map { 50 | case (featureName, featureValue) => 51 | (signature.featureNameToIndex(featureName), featureValue) 52 | }).sortBy(_._1) 53 | val featureString = (sortedValues map { 54 | case (featureIndex, featureValue) => 55 | s"${featureIndex}:${featureValue}" 56 | }).mkString(" ") 57 | s"${svmLightLabel(label)} ${featureString}" 58 | }).mkString("\n") 59 | } 60 | 61 | protected def svmLightLabel(label: Double): String = s"${label}" 62 | } 63 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/ArcInverter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | /** The ArcInverter takes a PolytreeParse and inverts arcs whose labels are in the argument set 4 | * `inverseArcLabels`. Note that this operation should only affect the `children` field of a 5 | * PolytreeParse, since the other fields only care about the underlying undirected tree. 6 | * 7 | * The purpose of this class is to convert standard dependency parses into polytree 8 | * dependency parses. For instance, we may wish to invert all arcs x ---> y for which 9 | * the arc label is 'det (effectively this would invert the relationship between a determiner 10 | * and its noun to say that the determiner "requires" the noun, rather than vice-versa). 11 | * 12 | * @param inverseArcLabels the set of arc labels to invert 13 | */ 14 | class ArcInverter(val inverseArcLabels: Set[ArcLabel]) extends (PolytreeParse => PolytreeParse) { 15 | 16 | /** Inverts the arcs whose labels are in `inverseArcLabels` 17 | * 18 | * @param parse the polytree parse we want to transform 19 | * @return a new polytree parse, with the specified arcs inverted 20 | */ 21 | def apply(parse: PolytreeParse): PolytreeParse = { 22 | 23 | // for each node, determine the neighbors for which the arcs should be inverted 24 | val invertibleNeighbors: Vector[Set[Int]] = for { 25 | labeledNeighbors <- parse.arclabels 26 | } yield for { 27 | (neighbor, label) <- labeledNeighbors if isInvertible(label) 28 | } yield neighbor 29 | 30 | // compute the new children using an XOR operation 31 | val newChildren: Vector[Set[Int]] = for { 32 | (neighbors, children) <- invertibleNeighbors.zip(parse.children) 33 | } yield ((neighbors diff children) union (children diff neighbors)) 34 | 35 | PolytreeParse(parse.sentence, parse.breadcrumb, newChildren, parse.arclabels) 36 | } 37 | 38 | def isInvertible(arcLabel: ArcLabel): Boolean = { 39 | val stanLabel = arcLabel match { 40 | case dpLabel: DependencyParsingArcLabel => 41 | dpLabel.stanLabel 42 | case _ => 43 | arcLabel.toSymbol 44 | } 45 | inverseArcLabels.contains(SingleSymbolArcLabel(stanLabel)) 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/Neighborhood.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import reming.DefaultJsonProtocol._ 4 | 5 | /** A Neighborhood is a sequence of token 
indices, generally referring to a parse tree. 6 | * 7 | * For instance, one might want to consider neighborhoods like: 8 | * - a node and its children 9 | * - a node and its parents 10 | * - a node and its breadcrumb 11 | * 12 | * @param tokens a sequence of token indices, usually associated in some way 13 | * (see NeighborhoodExtractor instances for examples of such associations) 14 | */ 15 | case class Neighborhood(tokens: Seq[Int]) 16 | 17 | object Neighborhood { 18 | implicit val neighborhoodJsonFormat = jsonFormat1(Neighborhood.apply) 19 | } 20 | 21 | /** A data source for neighborhoods. */ 22 | trait NeighborhoodSource { 23 | /** Returns an iterator over the neighborhoods in this data source. 24 | * 25 | * @return an iterator over the neighborhoods in this data source 26 | */ 27 | def getNeighborhoodIterator(): Iterator[(PolytreeParse, Neighborhood)] 28 | } 29 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/ParsePool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import java.io.{ File, PrintWriter } 4 | 5 | import org.allenai.common.Resource 6 | import org.allenai.nlpstack.parse.poly.fsm.NbestList 7 | 8 | import scala.io.Source 9 | import scala.util.Random 10 | 11 | import reming.{ CompactPrinter, JsonParser } 12 | import reming.DefaultJsonProtocol._ 13 | 14 | /** A ParsePool is a collection of parse candidates for the same input sentence. 15 | * 16 | * @param parses a sequence of parse trees 17 | */ 18 | case class ParsePool(parses: Iterable[(PolytreeParse, Double)]) { 19 | def toNbestList: NbestList = { 20 | NbestList(parses) 21 | } 22 | 23 | @transient lazy val indexedParses = parses.toIndexedSeq 24 | 25 | def chooseRandomParse: PolytreeParse = { 26 | indexedParses(Random.nextInt(indexedParses.size))._1 27 | } 28 | } 29 | 30 | object ParsePool { 31 | implicit val jsFormat = jsonFormat1(ParsePool.apply) 32 | } 33 | 34 | /** A data source for ParsePool objects. 
*/ 35 | trait ParsePoolSource { 36 | def poolIterator: Iterator[ParsePool] 37 | } 38 | 39 | case class InMemoryParsePoolSource(inputIterator: Iterator[ParsePool]) extends ParsePoolSource { 40 | 41 | private val cachedPools = inputIterator.toIterable 42 | 43 | override def poolIterator: Iterator[ParsePool] = { 44 | cachedPools.iterator 45 | } 46 | } 47 | 48 | case class FileBasedParsePoolSource(filename: String) extends ParsePoolSource { 49 | 50 | override def poolIterator: Iterator[ParsePool] = { 51 | val lines: Iterator[String] = Source.fromFile(filename).getLines 52 | lines map { line => 53 | JsonParser.read[ParsePool](line) 54 | } 55 | } 56 | } 57 | 58 | object FileBasedParsePoolSource { 59 | 60 | def writePools(pools: Iterator[ParsePool], filename: String) { 61 | Resource.using(new PrintWriter(new File(filename))) { writer => 62 | for (pool <- pools) { 63 | CompactPrinter.printTo(writer, pool) 64 | } 65 | } 66 | } 67 | } 68 | 69 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/Parser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.parse.poly.fsm.TransitionConstraint 5 | import org.allenai.nlpstack.tokenize.defaultTokenizer 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | 8 | import java.io.InputStream 9 | 10 | import org.allenai.nlpstack.parse.poly.core 11 | import org.allenai.nlpstack.parse.poly.core.{ WordClusters, Sentence, NexusToken } 12 | 13 | object Parser { 14 | 15 | /** Loads a parser from its file. 16 | * 17 | * @param filename the JSON configuration file or model prefix 18 | * @return the parser initialized from the file 19 | */ 20 | def loadParser(filename: String): TransitionParser = { 21 | TransitionParser.load(filename) 22 | } 23 | 24 | /** Loads a parser from its configuration file. Also allows you to specify a set of 25 | * "gold" parses to cache (the cache is checked before the base parser is utilized) 26 | * 27 | * @param filename the JSON configuration file or model prefix 28 | * @param parsesToCache a sequence of "gold" parses to cache 29 | * @return the initialized parser 30 | */ 31 | def loadParserWithCache(filename: String, parsesToCache: Iterator[PolytreeParse]): TransitionParser = { 32 | val fallbackParser = loadParser(filename) 33 | ParseCache(parsesToCache.toSeq, fallbackParser) 34 | } 35 | 36 | /** Loads a parser from an InputStream of a models file 37 | * @param inputStream stream of models config file 38 | * @return the parser initialized from the input stream 39 | */ 40 | def loadParser(inputStream: InputStream): TransitionParser = { 41 | TransitionParser.loadFromStream(inputStream) 42 | } 43 | 44 | private val tokenizer = defaultTokenizer 45 | private val postagger = defaultPostagger 46 | 47 | /** Tokenizes (and tags) an untokenized sentence. 
48 | * 49 | * @param text the untokenized sentence 50 | * @return a sequence of tokens 51 | */ 52 | def tokenizeSentence(text: String): Seq[core.Token] = { 53 | val postagged: Seq[PostaggedToken] = postagger.postag(tokenizer)(text) 54 | NexusToken +: (postagged map { 55 | case tok => 56 | core.Token( 57 | word = Symbol(tok.string), 58 | Map( 59 | 'autoPos -> 60 | Set(Symbol(tok.postag)), 61 | 'autoCpos -> 62 | Set(Symbol(WordClusters.ptbToUniversalPosTag.getOrElse(tok.postag, tok.postag))) 63 | ) 64 | ) 65 | }) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/ParserConfiguration.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import java.io.{ PrintWriter, File, InputStream } 4 | import java.net.URL 5 | 6 | import org.allenai.common.Resource._ 7 | import org.allenai.nlpstack.parse.poly.fsm.{ 8 | RerankingFunction, 9 | StateCostFunction, 10 | StateCostFunctionFactory 11 | } 12 | import reming.DefaultJsonProtocol._ 13 | 14 | /** Contains the key components of a parser (for serialization purposes). 15 | * 16 | * @param parsingCostFunctionFactory the cost function factory for the transition parser 17 | * @param rerankingFunction the cost function for parse reranking 18 | * @param parsingNbestSize the nbest size to generate for reranking 19 | */ 20 | case class ParserConfiguration( 21 | parsingCostFunctionFactory: StateCostFunctionFactory, 22 | rerankingFunction: RerankingFunction, 23 | parsingNbestSize: Int 24 | ) 25 | 26 | object ParserConfiguration { 27 | implicit val parserConfigurationFormat = jsonFormat3(ParserConfiguration.apply) 28 | } 29 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/ParserConstraint.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.nlpstack.parse.poly.fsm.TransitionConstraint 4 | 5 | /** A ForbiddenEdge constraint designates a transition as illegal if it would directly create 6 | * an arc (in either direction) between the tokens at the given indices. 7 | * 8 | * Note that argument order does not matter for the constructor. 9 | * 10 | * @param token1 index of the first token 11 | * @param token2 index of the second token 12 | */ 13 | case class ForbiddenEdge(token1: Int, token2: Int) extends TransitionConstraint 14 | 15 | /** A ForbiddenArcLabel constraint designates a transition as illegal if it would directly 16 | * create an arc (in either direction) with the specified label between the tokens at the given 17 | * indices. It also implicitly creates a RequestedArc constraint for the specified arc 18 | * (basically it says that we DO want an arc between the specified indices, just not with this 19 | * label). 20 | * 21 | * Note that argument order (of the token indices) does not matter for the constructor. 22 | * 23 | * @param token1 index of the first token 24 | * @param token2 index of the second token 25 | * @param arcLabel label that is forbidden between the two tokens 26 | */ 27 | case class ForbiddenArcLabel(token1: Int, token2: Int, 28 | arcLabel: Symbol) extends TransitionConstraint 29 | 30 | /** A RequestedArc constraint requests that the output parse MUST contain the requested arc. 
31 | * 32 | * The arc is specified using the index of the token at the arc's head followed by the index of 33 | * the token at the arc's tail. 34 | * 35 | * Note: currently this constraint does not pay attention to the arc direction, nor the arc 36 | * label. It only enforces that there is some edge between the two specified tokens. 37 | * 38 | * @param token1 index of the first token 39 | * @param token2 index of the second token 40 | * @param arcLabel desired label for the arc 41 | */ 42 | case class RequestedArc(token1: Int, token2: Int, 43 | arcLabel: Option[Symbol] = None) extends TransitionConstraint 44 | 45 | /** A RequestedCpos constraint specifies the coarse part-of-speech tag of a particular token. 46 | * This means that in the returned parse, the 'cpos property for that token will correspond 47 | * to the requested coarse tag. 48 | * 49 | * @param tokenIndex index of the desired token 50 | * @param cpos desired coarse tag for the token 51 | */ 52 | case class RequestedCpos(tokenIndex: Int, cpos: Symbol) extends TransitionConstraint 53 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/polyparser/RerankingTransitionParser.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.nlpstack.parse.poly.core.Sentence 4 | import org.allenai.nlpstack.parse.poly.fsm._ 5 | 6 | /** Uses the parser model to create an n-best list, then chooses the best parse from this n-best 7 | * list (according to the reranking function). 8 | * 9 | * @param config configuration object for the parser 10 | */ 11 | case class RerankingTransitionParser(config: ParserConfiguration) extends TransitionParser { 12 | 13 | @transient val reranker: Reranker = new Reranker(config.rerankingFunction) 14 | 15 | def parseWithScore( 16 | sentence: Sentence, 17 | constraints: Set[TransitionConstraint] = Set(), 18 | doFastApproximation: Boolean = false 19 | ): Option[(PolytreeParse, Double)] = { 20 | 21 | val parsingCostFunction = 22 | config.parsingCostFunctionFactory.buildCostFunction(sentence, constraints) 23 | val baseParser = new NbestSearch(parsingCostFunction) 24 | val nbestList: Option[NbestList] = 25 | parsingCostFunction.transitionSystem.initialState( 26 | constraints.toSeq 27 | ) map { initState => 28 | val nbestSize = // do full reranking only in the absence of constraints 29 | if (constraints.nonEmpty) { 1 } else if (doFastApproximation) { 2 } else { config.parsingNbestSize } 30 | baseParser.find(initState, nbestSize, constraints) 31 | } 32 | val mappedNbestList: Option[NbestList] = nbestList map { x => 33 | NbestList(x.scoredSculptures) 34 | } 35 | val candidate: Option[(Sculpture, Double)] = mappedNbestList flatMap { nbList => 36 | reranker.rerankWithScore(nbList) 37 | } 38 | candidate match { 39 | case Some((parse: PolytreeParse, cost)) => 40 | Some((parse, cost)) 41 | case _ => None 42 | } 43 | } 44 | def parse( 45 | sentence: Sentence, 46 | constraints: Set[TransitionConstraint] = Set() 47 | ): Option[PolytreeParse] = { 48 | 49 | parseWithScore(sentence, constraints) map { case (parse, _) => parse } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/reranking/ParseNodeFeature.scala: -------------------------------------------------------------------------------- 1 | package
org.allenai.nlpstack.parse.poly.reranking 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.{ FeatureName, FeatureVector } 4 | import org.allenai.nlpstack.parse.poly.polyparser.PolytreeParse 5 | 6 | import reming.LazyFormat 7 | import reming.DefaultJsonProtocol._ 8 | 9 | /** Maps a selected node of a parse tree into a feature vector. */ 10 | abstract class ParseNodeFeature extends ((PolytreeParse, Int) => FeatureVector) 11 | 12 | object ParseNodeFeature { 13 | implicit object ParseNodeFeatureJsonFormat extends LazyFormat[ParseNodeFeature] { 14 | private implicit val parseNodeFeatureUnionFormat = jsonFormat1(ParseNodeFeatureUnion.apply) 15 | 16 | private implicit val transformedNeighborhoodFeatureFormat = 17 | jsonFormat2(TransformedNeighborhoodFeature.apply) 18 | 19 | override val delegate = parentFormat[ParseNodeFeature]( 20 | childFormat[ParseNodeFeatureUnion, ParseNodeFeature], 21 | childFormat[TransformedNeighborhoodFeature, ParseNodeFeature] 22 | ) 23 | } 24 | } 25 | 26 | /** A ParseNodeFeatureUnion merges the output of a list of features. 27 | * 28 | * @param features a list of the features we want to merge into a single feature 29 | */ 30 | case class ParseNodeFeatureUnion( 31 | features: Seq[ParseNodeFeature] 32 | ) extends ParseNodeFeature { 33 | 34 | override def apply(parse: PolytreeParse, token: Int): FeatureVector = { 35 | features map (f => f(parse, token)) reduce ((m1, m2) => FeatureVector.mergeVectors(m1, m2)) 36 | } 37 | } 38 | 39 | /** A TransformedNeighborhoodFeature creates a feature vector from a set of neighborhood 40 | * extractors and transforms. 41 | * 42 | * @param neighborhoodExtractors the neighborhood extractors you want to apply to each parse node 43 | * @param transforms the transforms you want to apply to the extracted neighborhoods 44 | */ 45 | case class TransformedNeighborhoodFeature( 46 | neighborhoodExtractors: Seq[(String, NeighborhoodExtractor)], 47 | transforms: Seq[(String, NeighborhoodTransform)] 48 | ) extends ParseNodeFeature { 49 | 50 | override def apply(parse: PolytreeParse, token: Int): FeatureVector = { 51 | FeatureVector( 52 | for { 53 | (extractorName, extractor) <- neighborhoodExtractors 54 | neighborhood <- extractor(parse, token) 55 | (transformName, transform) <- transforms 56 | transformedNeighborhood <- transform(parse, neighborhood) 57 | } yield { 58 | val featureName = (Seq(extractorName, transformName) map { x => Symbol(x) }) ++ 59 | transformedNeighborhood.symbols 60 | FeatureName(featureName) -> 1.0 61 | } 62 | ) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/reranking/ParseRerankingFunction.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.reranking 2 | 3 | import org.allenai.nlpstack.parse.poly.eval.ParseScore 4 | import org.allenai.nlpstack.parse.poly.fsm._ 5 | import org.allenai.nlpstack.parse.poly.ml.LinearModel 6 | import org.allenai.nlpstack.parse.poly.polyparser.PolytreeParse 7 | 8 | /** Rescores a parse tree according to a specified scoring function. 
9 | * 10 | * @param scoringFunction the desired scoring function 11 | */ 12 | case class ParseRerankingFunction(scoringFunction: ParseScore) 13 | extends RerankingFunction { 14 | 15 | override def apply(sculpture: Sculpture, baseCost: Double): Double = { 16 | sculpture match { 17 | case parse: PolytreeParse => 1.0 - scoringFunction(parse) 18 | case _ => 1.0 19 | } 20 | } 21 | } 22 | 23 | /** Rescores a parse tree based on a linear combination of features. 24 | * 25 | * @param feature computes a feature vector from the parse tree 26 | * @param linearModel computes a linear combination of the computed features 27 | */ 28 | case class LinearParseRerankingFunction( 29 | feature: PolytreeParseFeature, 30 | linearModel: Option[LinearModel] 31 | ) extends RerankingFunction { 32 | 33 | override def apply(sculpture: Sculpture, baseCost: Double): Double = { 34 | sculpture match { 35 | case parse: PolytreeParse => 36 | linearModel.get.score(feature(parse, baseCost)) 37 | case _ => Double.PositiveInfinity 38 | } 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /tools/parse/src/main/scala/org/allenai/nlpstack/parse/poly/reranking/PolytreeParseFeature.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.reranking 2 | 3 | import org.allenai.nlpstack.parse.poly.ml.{ 4 | FeatureName => MLFeatureName, 5 | FeatureVector => MLFeatureVector 6 | } 7 | import org.allenai.nlpstack.parse.poly.polyparser.PolytreeParse 8 | 9 | import reming.LazyFormat 10 | import reming.DefaultJsonProtocol._ 11 | 12 | /** Maps a scored parse into a feature vector. */ 13 | abstract class PolytreeParseFeature extends ((PolytreeParse, Double) => MLFeatureVector) 14 | 15 | object PolytreeParseFeature { 16 | implicit object PolytreeParseFeatureJsonFormat extends LazyFormat[PolytreeParseFeature] { 17 | implicit val polytreeParseFeatureUnionFormat = jsonFormat1(PolytreeParseFeatureUnion.apply) 18 | implicit val baseParserScoreFeatureFormat = jsonFormat0(() => BaseParserScoreFeature) 19 | implicit val sentenceLengthFeatureFormat = jsonFormat0(() => SentenceLengthFeature) 20 | 21 | override val delegate = parentFormat[PolytreeParseFeature]( 22 | childFormat[PolytreeParseFeatureUnion, PolytreeParseFeature], 23 | childFormat[BaseParserScoreFeature.type, PolytreeParseFeature], 24 | childFormat[SentenceLengthFeature.type, PolytreeParseFeature] 25 | ) 26 | } 27 | } 28 | 29 | /** Simply passes along the length of the sentence as a feature. */ 30 | case object SentenceLengthFeature extends PolytreeParseFeature { 31 | 32 | override def apply(parse: PolytreeParse, score: Double): MLFeatureVector = { 33 | MLFeatureVector(Seq(MLFeatureName(List(name)) -> parse.sentence.tokens.tail.size)) 34 | } 35 | 36 | val name: Symbol = 'sentLen 37 | } 38 | 39 | /** Simply passes along the original score of the parse as a feature. */ 40 | case object BaseParserScoreFeature extends PolytreeParseFeature { 41 | 42 | override def apply(parse: PolytreeParse, score: Double): MLFeatureVector = { 43 | MLFeatureVector(Seq(MLFeatureName(List(name)) -> score)) 44 | } 45 | 46 | val name: Symbol = 'baseParserScore 47 | } 48 | 49 | /** A PolytreeParseFeatureUnion merges the output of a list of features. 
50 | * 51 | * @param features a list of the features we want to merge into a single feature 52 | */ 53 | case class PolytreeParseFeatureUnion( 54 | val features: Seq[PolytreeParseFeature] 55 | ) extends PolytreeParseFeature { 56 | 57 | override def apply(parse: PolytreeParse, score: Double): MLFeatureVector = { 58 | features map (f => f(parse, score)) reduce ((m1, m2) => MLFeatureVector.mergeVectors(m1, m2)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /tools/parse/src/main/universal/parse-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.parse.ClearDependencyParserMain" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx3G" 9 | 10 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/parse/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/FactorieParserSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.parse.graph.DependencyGraph 5 | import org.allenai.nlpstack.postag.defaultPostagger 6 | import org.allenai.nlpstack.tokenize.defaultTokenizer 7 | 8 | class FactorieParserSpec extends UnitSpec { 9 | private def parseTreeString(text: String) = { 10 | val tokens = defaultTokenizer.tokenize(text) 11 | val postaggedTokens = defaultPostagger.postagTokenized(tokens) 12 | 13 | val parser = new FactorieParser 14 | val parseTree = parser.dependencyGraphPostagged(postaggedTokens) 15 | 16 | DependencyGraph.multilineStringFormat.write(parseTree) 17 | } 18 | 19 | /* 20 | * When these tests fail with anything but an exception, it's a judgement call 21 | * whether the trees that the parser produces are valid parses or whether this 22 | * is a genuine error. If in doubt, consult your favorite linguist, but by and 23 | * large, don't worry too much about accuracy here. This is not a quality test 24 | * suite. 25 | */ 26 | 27 | "FactorieParser" should "correctly parse a simple sentence" in { 28 | val parseTreeStr = parseTreeString("A waffle is like a pancake with a syrup trap.") 29 | val expectedParseTreeStr = 30 | """|det(waffle-2, A-1) 31 | |nsubj(is-3, waffle-2) 32 | |root(ROOT-0, is-3) 33 | |prep(is-3, like-4) 34 | |det(pancake-6, a-5) 35 | |pobj(like-4, pancake-6) 36 | |prep(pancake-6, with-7) 37 | |det(trap-10, a-8) 38 | |nn(trap-10, syrup-9) 39 | |pobj(with-7, trap-10) 40 | |punct(is-3, .-11)""".stripMargin 41 | assert(parseTreeStr === expectedParseTreeStr) 42 | } 43 | 44 | it should "correctly parse a complicated sentence" in { 45 | // This sentence has two roots when it comes out of Factorie. 
46 | val parseTreeStr = parseTreeString("Big investment banks refused to step up to the plate, traders say.") 47 | val expectedParseTreeStr = 48 | """|amod(banks-3, Big-1) 49 | |nn(banks-3, investment-2) 50 | |nsubj(refused-4, banks-3) 51 | |root(ROOT-0, refused-4) 52 | |aux(step-6, to-5) 53 | |xcomp(refused-4, step-6) 54 | |prt(step-6, up-7) 55 | |prep(step-6, to-8) 56 | |det(plate-10, the-9) 57 | |pobj(to-8, plate-10) 58 | |punct(say-13, ,-11) 59 | |nsubj(say-13, traders-12) 60 | |punct(say-13, .-14)""".stripMargin 61 | assert(parseTreeStr === expectedParseTreeStr) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/core/SentenceSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.common.immutable.Interval 4 | import org.allenai.common.testkit.UnitSpec 5 | 6 | class SentenceSpec extends UnitSpec { 7 | // scalastyle:off 8 | 9 | ".initializeFromWhitespaceSeparatedString" should "give the correct sentence" in { 10 | Sentence.initializeFromWhitespaceSeparatedString("This is input .") shouldBe 11 | Sentence(IndexedSeq(NexusToken, Token('This), Token('is), Token('input), Token(Symbol(".")))) 12 | } 13 | 14 | it should "ignore leading and trailing whitespace" in { 15 | Sentence.initializeFromWhitespaceSeparatedString(" This is input . ") shouldBe 16 | Sentence(IndexedSeq(NexusToken, Token('This), Token('is), Token('input), Token(Symbol(".")))) 17 | } 18 | 19 | "Initializing a sentence" should "give the correct paren intervals for sent1" in { 20 | val sent1 = Sentence.initializeFromWhitespaceSeparatedString("we saw black cats") 21 | sent1.parenIntervals shouldBe Set.empty 22 | } 23 | 24 | it should "give the correct paren intervals for sent2" in { 25 | val sent2 = Sentence.initializeFromWhitespaceSeparatedString( 26 | "with the help of animals ( insects and birds ) flowers can be pollinated ( fertilized ) ." 27 | ) 28 | sent2.parenIntervals shouldBe Set(Interval.closed(6, 10), Interval.closed(15, 17)) 29 | } 30 | 31 | it should "give the correct paren intervals for sent3" in { 32 | val sent3 = Sentence.initializeFromWhitespaceSeparatedString( 33 | "with the help of animals ( ( insects ) and birds ) flowers can " + 34 | "be pollinated ( fertilized ) ." 35 | ) 36 | sent3.parenIntervals shouldBe 37 | Set(Interval.closed(7, 9), Interval.closed(6, 12), Interval.closed(17, 19)) 38 | } 39 | 40 | it should "give the correct paren intervals for sent4" in { 41 | val sent4 = Sentence.initializeFromWhitespaceSeparatedString( 42 | "with the help of animals insects ) and birds ) flowers can " + 43 | "be pollinated ( fertilized ." 
44 | ) 45 | sent4.parenIntervals shouldBe 46 | Set(Interval.closed(0, 7), Interval.closed(0, 10), Interval.closed(15, 17)) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/core/TokenSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class TokenSpec extends UnitSpec { 6 | // scalastyle:off 7 | 8 | ".getProperty" should "return the empty set for an undefined property" in { 9 | Token.create("the").getProperty('unknownProperty) shouldBe Set() 10 | } 11 | 12 | ".getDeterministicProperty" should "return the correct answer" in { 13 | val tok = Token.create("the", coarsePos = Some("DET"), finePos = Some("DT")) 14 | tok.updateProperty('myProperty, Set('good)) 15 | tok.getDeterministicProperty('cpos) shouldEqual 'DET 16 | } 17 | 18 | it should "return Token.propertyNotFound" in { 19 | val tok = Token.create("the").updateProperty('definite, Set('yes)) 20 | tok.getDeterministicProperty('indefinite) shouldEqual Token.propertyNotFound 21 | } 22 | 23 | ".updateProperty" should "override the previous value" in { 24 | val tok = Token.create("the").updateProperty('definite, Set('yes)) 25 | tok.getDeterministicProperty('definite) shouldEqual Symbol("yes") 26 | tok.updateProperty('definite, Set('no)).getDeterministicProperty('definite) shouldEqual 'no 27 | } 28 | 29 | ".extendProperty" should "extend the previous value" in { 30 | val tok = Token.create("the").extendProperty('definite, 'yes) 31 | tok.getProperty('definite) shouldBe Set('yes) 32 | tok.extendProperty('definite, 'no).getProperty('definite) shouldBe Set('yes, 'no) 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/core/TokenTaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.core 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class TokenTaggerSpec extends UnitSpec { 6 | // scalastyle:off 7 | 8 | "LexicalPropertiesTagger" should "give the correct tags" in { 9 | val sent = Sentence.initializeFromWhitespaceSeparatedString("apple and blueberry pie") 10 | LexicalPropertiesTagger.tag(Token(Symbol("hello"))) shouldBe Set() 11 | LexicalPropertiesTagger.tag(Token(Symbol("Hello"))) shouldBe Set( 12 | TokenTag(LexicalPropertiesTagger.taggerName, 'firstCap), 13 | TokenTag(LexicalPropertiesTagger.taggerName, 'existsCap) 14 | ) 15 | LexicalPropertiesTagger.tag(Token(Symbol("HELLO"))) shouldBe Set( 16 | TokenTag(LexicalPropertiesTagger.taggerName, 'firstCap), 17 | TokenTag(LexicalPropertiesTagger.taggerName, 'existsCap), 18 | TokenTag(LexicalPropertiesTagger.taggerName, 'allCaps) 19 | ) 20 | LexicalPropertiesTagger.tag(Token(Symbol("HELLO22"))) shouldBe Set( 21 | TokenTag(LexicalPropertiesTagger.taggerName, 'firstCap), 22 | TokenTag(LexicalPropertiesTagger.taggerName, 'existsCap), 23 | TokenTag(LexicalPropertiesTagger.taggerName, 'existsNum) 24 | ) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/decisiontree/DecisionTreeSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.decisiontree 2 | 3 | import 
org.allenai.common.testkit.UnitSpec 4 | 5 | class DecisionTreeSpec extends UnitSpec { 6 | 7 | val decisionTree1 = DecisionTree( 8 | outcomes = Seq(0, 1), 9 | child = Vector( 10 | Map(0 -> 1, 1 -> 2), // node 0 11 | Map(), // node 1 12 | Map(0 -> 3, 1 -> 4), // node 2 13 | Map(), // node 3 14 | Map() 15 | ), // node 4 16 | splittingFeature = Vector( 17 | Some(35), // node 0 18 | None, // node 1 19 | Some(20), // node 2 20 | None, // node 3 21 | None 22 | ), // node 4 23 | outcomeHistograms = Vector( 24 | Map(0 -> 45, 1 -> 55), // node 0 25 | Map(0 -> 29, 1 -> 9), // node 1 26 | Map(0 -> 16, 1 -> 46), // node 2 27 | Map(0 -> 5, 1 -> 10), // node 3 28 | Map(0 -> 11, 1 -> 36) 29 | ) // node 4 30 | ) 31 | 32 | "DecisionTree.outcomeHistogram" should "get node 3's histogram" in { 33 | val fv = SparseVector(outcome = None, numFeatures = 100, trueFeatures = Set(34, 35)) 34 | decisionTree1.outcomeHistogram(fv) shouldBe Map(0 -> 5, 1 -> 10) 35 | } 36 | 37 | it should "get node 1's histogram" in { 38 | val fv = SparseVector(outcome = None, numFeatures = 100, trueFeatures = Set(34, 20)) 39 | decisionTree1.outcomeHistogram(fv) shouldBe Map(0 -> 29, 1 -> 9) 40 | } 41 | 42 | "DecisionTree.allFeatures" should "return 20 and 35 for decisionTree1" in { 43 | decisionTree1.allFeatures shouldBe Set(20, 35) 44 | } 45 | 46 | "DecisionTree.outcomeDistribution" should "get node 3's smoothed distribution" in { 47 | val fv = SparseVector(outcome = None, numFeatures = 100, trueFeatures = Set(34, 20)) 48 | (math floor decisionTree1.outcomeDistribution(fv)._1.dist(0) * 1000) / 1000 shouldBe 0.755 49 | (math floor decisionTree1.outcomeDistribution(fv)._1.dist(1) * 1000) / 1000 shouldBe 0.244 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/fsm/TrainingVectorSourceSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.fsm 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class TrainingVectorSourceSpec extends UnitSpec { 6 | 7 | } 8 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/BrownClustersSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.parse.poly.core.{ Token, Sentence } 5 | 6 | class BrownClustersSpec extends UnitSpec { 7 | 8 | val clusters1 = BrownClusters.fromStringMap(Map( 9 | ("apple", "00"), 10 | ("cherry", "00"), 11 | ("banana", "01"), 12 | ("carrot", "100"), 13 | ("beet", "101"), 14 | ("turnip", "101"), 15 | ("celery", "11") 16 | ), Map()) 17 | 18 | val clusters2 = BrownClusters.fromStringMap(Map( 19 | ("apple", "10"), 20 | ("beet", "01") 21 | ), Map()) 22 | 23 | val sentence1 = Sentence(IndexedSeq(Token('apple), Token('and), Token('cherry), Token('beet))) 24 | 25 | "BrownClusters.getAllClusters" should "return the correct answer" in { 26 | clusters1.getAllClusters('turnip).size shouldBe 4 27 | clusters1.getAllClusters('turnip) shouldBe clusters1.getAllClusters('beet) 28 | clusters1.getAllClusters('turnip) == clusters1.getAllClusters('carrot) shouldBe false 29 | } 30 | 31 | it should "return zero for an unknown word" in { 32 | clusters1.getAllClusters('rutabaga) shouldBe List(Symbol("0")) 33 | } 34 | /* 35 | "Sentence.taggedWithBrownClusters" 
should "return the correct answer" in { 36 | sentence1.taggedWithBrownClusters(Seq(clusters1, clusters2)) shouldBe 37 | Sentence(Seq( 38 | Token('apple, Map('brown0 -> Set(Symbol("0"), Symbol("00")), 39 | 'brown1 -> Set(Symbol("1"), Symbol("10")))), 40 | Token('and, Map('brown0 -> Set[Symbol](), 41 | 'brown1 -> Set[Symbol]())), 42 | Token('cherry, Map('brown0 -> Set(Symbol("0"), Symbol("00")), 43 | 'brown1 -> Set[Symbol]())), 44 | Token('beet, Map('brown0 -> Set(Symbol("1"), Symbol("10"), Symbol("101")), 45 | 'brown1 -> Set(Symbol("0"), Symbol("01")))))) 46 | } 47 | */ 48 | } 49 | 50 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/FeatureVectorSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | import reming.{ CompactPrinter, JsonParser } 6 | 7 | class FeatureVectorSpec extends UnitSpec { 8 | // scalastyle:off 9 | 10 | val nameA = FeatureName(List('a)) 11 | val nameB = FeatureName(List('b)) 12 | val nameC = FeatureName(List('c)) 13 | 14 | "Calling .getFeatureValue" should "return the correct value" in { 15 | val vec1 = FeatureVector(Seq(nameA -> 0.5, nameB -> 0.3)) 16 | vec1.getFeatureValue(nameA) shouldBe 0.5 17 | vec1.getFeatureValue(nameB) shouldBe 0.3 18 | } 19 | 20 | it should "return zero for an unrecognized feature name" in { 21 | val vec1 = FeatureVector(Seq(nameA -> 0.5, nameB -> 0.3)) 22 | vec1.getFeatureValue(nameC) shouldBe 0 23 | } 24 | 25 | "Calling subtractVectors" should "return the correct difference vector" in { 26 | FeatureVector.subtractVectors( 27 | FeatureVector(Seq(nameA -> 1, nameB -> 5)), 28 | FeatureVector(Seq(nameA -> 3, nameC -> 4)) 29 | ).featureMap shouldBe 30 | Map(nameA -> -2, nameB -> 5, nameC -> -4) 31 | } 32 | 33 | "Calling mergeVectors" should "prioritize mappings in the first vector" in { 34 | FeatureVector.mergeVectors( 35 | FeatureVector(Seq(nameA -> 0.5, nameB -> 0.3)), 36 | FeatureVector(Seq(nameA -> 0.7, nameC -> 0.4)) 37 | ).featureMap shouldBe 38 | Map(nameA -> 0.5, nameB -> 0.3, nameC -> 0.4) 39 | } 40 | 41 | "Serializing a FeatureVector" should "preserve the vector" in { 42 | val vec1 = FeatureVector(Seq(nameA -> 0.5, nameB -> 0.3)) 43 | JsonParser.read[FeatureVector](CompactPrinter.printToString(vec1)) shouldBe vec1 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/LinearModelSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class LinearModelSpec extends UnitSpec { 6 | 7 | val nameA = FeatureName(List('a)) 8 | val nameB = FeatureName(List('b)) 9 | val nameC = FeatureName(List('c)) 10 | val model1 = LinearModel(Seq((nameA, -2.0), (nameB, 3.0))) 11 | 12 | "Calling .getCoefficient" should "return the correct value" in { 13 | model1.getCoefficient(nameA) shouldBe -2.0 14 | model1.getCoefficient(nameB) shouldBe 3.0 15 | } 16 | 17 | it should "return zero for unspecified coefficients" in { 18 | model1.getCoefficient(nameC) shouldBe 0 19 | } 20 | 21 | "Calling .score" should "return the correct score" in { 22 | model1.score(FeatureVector(Seq(nameA -> 6.0, nameB -> 5.0, nameC -> -3.0))) shouldBe 3.0 23 | } 24 | } 25 | 
-------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/NgramSetSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.parse.poly.core.{ Token, Sentence } 5 | import java.io.File 6 | 7 | class NgramSetSpec extends UnitSpec { 8 | 9 | /* 10 | val ngramSet1 = NgramSet.initializeFromUnderscoreSeparatedTerms( 11 | Seq("graduated_cylinder", "Bunsen_burner", "oneword") 12 | ) 13 | 14 | 15 | "BrownClusters.getAllClusters" should "return the correct answer" in { 16 | val sentence = Sentence.initializeFromWhitespaceSeparatedString( 17 | "My Bunsen burner is better than Bunsen and his graduated cylinder" 18 | ) 19 | ngramSet1.identifyNgrams(sentence) shouldBe Set((2, 4), (10, 12)) 20 | } 21 | 22 | it should "correctly handle one-word ngrams" in { 23 | val sentence = Sentence.initializeFromWhitespaceSeparatedString( 24 | "This oneword should be easy ." 25 | ) 26 | ngramSet1.identifyNgrams(sentence) shouldBe Set((2, 3)) 27 | } 28 | 29 | it should "correctly handle the beginning-of-sentence edge case" in { 30 | val sentence = Sentence.initializeFromWhitespaceSeparatedString( 31 | "Bunsen burner ? I hardly know her ." 32 | ) 33 | println(s"ngramSet: ${ngramSet1.prefixes}") 34 | ngramSet1.identifyNgrams(sentence) shouldBe Set((1, 3)) 35 | } 36 | */ 37 | } 38 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/ml/VerbnetSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.ml 2 | 3 | import org.allenai.common.Logging 4 | import org.allenai.common.testkit.UnitSpec 5 | 6 | import com.typesafe.config.ConfigFactory 7 | import java.io.File 8 | 9 | import org.allenai.nlpstack.parse.poly.core.SentenceTagger 10 | 11 | class VerbnetSpec extends UnitSpec with Logging { 12 | 13 | val taggersConfig = ConfigFactory.parseFile(new File(SentenceTagger.taggersConfigFile)) 14 | val verbnetConfig = taggersConfig.getConfig("verbnet") 15 | val groupName = verbnetConfig.getString("group") 16 | val artifactName = verbnetConfig.getString("name") 17 | val version = verbnetConfig.getInt("version") 18 | val verbnet = new Verbnet(groupName, artifactName, version) 19 | 20 | "VerbnetUtil.getVerbnetClasses" should 21 | "return the correct answer for verbs present in VerbNet" in { 22 | verbnet.getVerbnetClassNames("roar") shouldBe Set( 23 | Symbol("run-51.3.2"), 24 | Symbol("weather-57"), 25 | Symbol("animal_sounds-38"), 26 | Symbol("manner_speaking-37.3"), 27 | Symbol("sound_emission-43.2") 28 | ) 29 | verbnet.getVerbnetClassNames("boast") shouldBe Set(Symbol("complain-37.8")) 30 | verbnet.getVerbnetClassNames("synthesize") shouldBe Set(Symbol("create-26.4")) 31 | verbnet.getVerbnetClassNames("run") shouldBe Set( 32 | Symbol("swarm-47.5.1-1"), 33 | Symbol("meander-47.7"), 34 | Symbol("carry-11.4"), 35 | Symbol("preparing-26.3-1"), 36 | Symbol("run-51.3.2-2-1"), 37 | Symbol("bump-18.4") 38 | ) 39 | } 40 | 41 | "VerbnetUtil.getVerbnetClasses" should 42 | "return the correct answer for words NOT present in VerbNet" in { 43 | verbnet.getVerbnetClassNames("synthesis") shouldBe Set() 44 | verbnet.getVerbnetClassNames("apple") shouldBe Set() 45 | } 46 | 47 | } 48 | 
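For readers wondering what the taggers configuration read by VerbnetSpec above must contain: the spec only requires a verbnet block with group, name, and version keys (see its getConfig/getString/getInt calls). The sketch below builds such a config in code purely for illustration; the values are hypothetical placeholders, not the real datastore coordinates.

import com.typesafe.config.ConfigFactory

object VerbnetConfigSketch extends App {
  // Hypothetical config with the same shape VerbnetSpec expects to find on disk.
  val taggersConfig = ConfigFactory.parseString(
    """verbnet {
      |  group = "org.example.group"   # placeholder
      |  name = "example-verbnet"      # placeholder
      |  version = 1                   # placeholder
      |}""".stripMargin
  )
  val verbnetConfig = taggersConfig.getConfig("verbnet")
  println(verbnetConfig.getString("group"))
  println(verbnetConfig.getString("name"))
  println(verbnetConfig.getInt("version"))
}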
-------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/polyparser/ArcInverterSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | 5 | class ArcInverterSpec extends UnitSpec { 6 | 7 | "Calling apply" should "give back a modified parse for a simple parse" in { 8 | val inverter: ArcInverter = 9 | new ArcInverter( 10 | Set(SingleSymbolArcLabel('det), SingleSymbolArcLabel('amod), SingleSymbolArcLabel('prep)) 11 | ) 12 | inverter(PolytreeParseTestData.parse1) shouldBe PolytreeParseTestData.parse1b 13 | } 14 | 15 | it should "give back the same parse with no inverting labels" in { 16 | val inverter: ArcInverter = new ArcInverter(Set()) 17 | inverter(PolytreeParseTestData.parse1) shouldBe PolytreeParseTestData.parse1 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/polyparser/MultiWordTaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.parse.poly.core.{ Sentence, Token } 5 | 6 | class MultiWordTaggerSpec extends UnitSpec { 7 | // scalastyle:off 8 | 9 | import MultiWordTagger.{ mweSymbol, symbolFor, mweValue } 10 | 11 | val string = "I like pizza because of the Ninja Turtles" 12 | val sentence = Sentence(string.split(" ").map(x => Token(Symbol(x)))) 13 | val mwe1 = IndexedSeq('pizza, 'because, 'of) 14 | val mwe2 = IndexedSeq('because, 'of) 15 | val mwe3 = IndexedSeq('I, 'like) 16 | val mwe4 = IndexedSeq('Ninja, 'Turtles) 17 | val mwe5 = IndexedSeq('Turtles) 18 | val dictionary = Set(mwe1, mwe2, mwe3, mwe4, mwe5) 19 | //val tagger = MultiWordTagger(dictionary) 20 | //val got = tagger(sentence) 21 | 22 | // Empty property map 23 | val propNone = Map.empty[Symbol, String] 24 | 25 | // Property map containing "part of mwe" property 26 | val mweProp = Map(mweSymbol -> mweValue) 27 | 28 | // Expected property map for a token in the given mwe 29 | def propFor(mwe: IndexedSeq[Symbol]) = mweProp + (symbolFor(mwe) -> mweValue) 30 | 31 | /* 32 | "MultiWordTagger" should "predict properties correctly" in { 33 | val expected = Seq( 34 | propFor(mwe3), // I 35 | propFor(mwe3), // like 36 | propFor(mwe1), // pizza 37 | propFor(mwe1) ++ propFor(mwe2), // because 38 | propFor(mwe1) ++ propFor(mwe2), // of 39 | propNone, // the 40 | propFor(mwe4), // Ninja 41 | propFor(mwe4) ++ propFor(mwe5)) // Turtles 42 | val predicted = tagger(sentence).tokens.map(_.properties) 43 | assert(expected == predicted) 44 | } 45 | */ 46 | } 47 | -------------------------------------------------------------------------------- /tools/parse/src/test/scala/org/allenai/nlpstack/parse/poly/polyparser/PolytreeParseFeatureSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.parse.poly.polyparser 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.parse.poly.core.{ Sentence, NexusToken, Token } 5 | import org.allenai.nlpstack.parse.poly.ml.{ FeatureVector => MLFeatureVector, FeatureName => MLFeatureName } 6 | import org.allenai.nlpstack.parse.poly.reranking.BaseParserScoreFeature 7 | 8 | class 
PolytreeParseFeatureSpec extends UnitSpec { 9 | // scalastyle:off 10 | 11 | /** This represents the following polytree parse: 12 | * format: OFF 13 | * 14 | * NEXUS_0 15 | * | 16 | * | the_1-- 17 | * | \ 18 | * | -->cat_2 19 | * \ / 20 | * -----> sat_3-- 21 | * / 22 | * by_4 -- 23 | * \ 24 | * --> me_5 25 | * 26 | * format: ON 27 | */ 28 | val parse1 = PolytreeParse( 29 | sentence = Sentence(Vector(NexusToken, Token('the), Token('cat), Token('sat), 30 | Token('by), Token('me))), 31 | breadcrumb = Vector(-1, 2, 3, 0, 3, 4), 32 | children = Vector(Set(3), Set(2), Set(), Set(2), Set(3, 5), Set()), 33 | arclabels = 34 | Vector( 35 | Set((3, SingleSymbolArcLabel('root))), 36 | Set((2, SingleSymbolArcLabel('det))), 37 | Set((1, SingleSymbolArcLabel('det)), (3, SingleSymbolArcLabel('nsubj))), 38 | Set((0, SingleSymbolArcLabel('root)), (2, SingleSymbolArcLabel('nsubj)), 39 | (4, SingleSymbolArcLabel('prep))), 40 | Set((3, SingleSymbolArcLabel('prep)), (5, SingleSymbolArcLabel('pobj))), 41 | Set((4, SingleSymbolArcLabel('pobj))) 42 | ) 43 | ) 44 | 45 | "Calling the .apply method of BaseParserScoreFeature" should "return the correct value" in { 46 | val featureName = BaseParserScoreFeature.name 47 | BaseParserScoreFeature(parse1, 12.0) shouldBe MLFeatureVector(Seq( 48 | MLFeatureName(List(featureName)) -> 12.0 49 | )) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tools/postag/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | dependencyOverrides += "org.apache.commons" % "commons-compress" % "1.8" 4 | 5 | libraryDependencies ++= loggingDependencies 6 | -------------------------------------------------------------------------------- /tools/postag/src/main/scala/org/allenai/nlpstack/postag/FactoriePostagger.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | import org.allenai.nlpstack.core._ 4 | import org.allenai.nlpstack.postag.FactoriePostagger.factorieFormat 5 | import org.allenai.nlpstack.tokenize.FactorieTokenizer 6 | import org.allenai.datastore.Datastore 7 | 8 | import cc.factorie.app.nlp.{ Document => FactorieDocument } 9 | import cc.factorie.app.nlp.pos.OntonotesForwardPosTagger 10 | import cc.factorie.app.nlp.pos.PennPosTag 11 | 12 | /** This is thread-safe. The only thing we call on OntonotesForwardPosTagger is 13 | * predict(), and predict() is threadsafe. I don't know about the other methods 14 | * on OntonotesForwardPosTagger. 
15 | */ 16 | class FactoriePostagger extends Postagger { 17 | val tagger = FactoriePostagger.tagger 18 | 19 | override def postagTokenized(tokens: Seq[Token]): Seq[PostaggedToken] = { 20 | val factorieDoc = FactorieTokenizer.factorieFormat.write(tokens) 21 | val factorieTokens = factorieDoc.tokens.toSeq 22 | 23 | tagger.predict(factorieTokens) // modifies factorieTokens 24 | 25 | factorieFormat.read(factorieDoc) 26 | } 27 | } 28 | 29 | object FactoriePostagger { 30 | private val tagger = 31 | new OntonotesForwardPosTagger( 32 | Datastore.filePath( 33 | "cc.factorie.app.nlp.pos", 34 | "OntonotesForwardPosTagger.factorie", 35 | 1 36 | ).toUri.toURL 37 | ) 38 | 39 | object factorieFormat extends Format[Seq[PostaggedToken], FactorieDocument] { 40 | override def read(from: FactorieDocument): Seq[PostaggedToken] = 41 | from.tokens.map(t => PostaggedToken( 42 | tagger.tokenAnnotationString(t), 43 | t.string, 44 | t.stringStart 45 | )).toSeq 46 | 47 | override def write(from: Seq[PostaggedToken]): FactorieDocument = { 48 | val factorieDoc = FactorieTokenizer.factorieFormat.write(from) 49 | require(factorieDoc.tokenCount == from.size) 50 | (from, factorieDoc.tokens).zipped.foreach((token, factorieToken) => { 51 | factorieToken.attr += new PennPosTag(factorieToken, token.postag) 52 | factorieToken.attr += token 53 | }) 54 | factorieDoc 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tools/postag/src/main/scala/org/allenai/nlpstack/postag/OpenNlpPostagger.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.nlpstack.core._ 5 | 6 | import opennlp.tools.postag.{ POSTaggerME, POSModel } 7 | 8 | class OpenNlpPostagger extends Postagger { 9 | private val postagger = new POSTaggerME(OpenNlpPostagger.model) 10 | 11 | override def postagTokenized(tokens: Seq[Token]): Seq[PostaggedToken] = { 12 | val postags = postagger.tag(tokens.iterator.map(_.string).toArray) 13 | (tokens zip postags).map { 14 | case (token, postag) => 15 | val fixedPostag = if (token.string == "-") "HYPH" else postag 16 | PostaggedToken(token, fixedPostag) 17 | } 18 | } 19 | } 20 | 21 | object OpenNlpPostagger { 22 | private val defaultModelName = "en-pos-maxent.bin" 23 | private val model = 24 | Resource.using(this.getClass.getClassLoader.getResourceAsStream(defaultModelName)) { is => 25 | new POSModel(is) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tools/postag/src/main/scala/org/allenai/nlpstack/postag/StanfordPostagger.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | import edu.stanford.nlp.ling.CoreLabel 4 | import edu.stanford.nlp.tagger.maxent.MaxentTagger 5 | 6 | import org.allenai.nlpstack.core._ 7 | import org.allenai.datastore.Datastore 8 | 9 | import java.net.URL 10 | import scala.collection.JavaConverters._ 11 | 12 | class StanfordPostagger( 13 | val tagger: MaxentTagger 14 | ) extends Postagger { 15 | 16 | def this() = this(StanfordPostagger.loadDefaultModel()) 17 | 18 | override def postagTokenized(tokens: Seq[Token]): Seq[PostaggedToken] = { 19 | val labels = tokens.map { token => 20 | val corelabel = new CoreLabel() 21 | corelabel.setWord(token.string) 22 | corelabel 23 | } 24 | val postags = tagger.tagSentence(labels.asJava).asScala.map(_.tag()) 25 | 26 | (tokens zip postags).map { 
27 | case (token, postag) => 28 | PostaggedToken(token, postag) 29 | } 30 | } 31 | } 32 | 33 | object StanfordPostagger { 34 | def loadDefaultModel(): MaxentTagger = { 35 | val filePath = Datastore.directoryPath( 36 | "edu.stanford.nlp.models.pos-tagger", 37 | "english-left3words-3.4.1", 38 | 1 39 | ) 40 | new MaxentTagger(filePath.toString + "/english-left3words/english-left3words-distsim.tagger") 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tools/postag/src/main/scala/org/allenai/nlpstack/postag/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.Postagger 4 | 5 | package object postag { 6 | val defaultPostagger: Postagger = new FactoriePostagger 7 | } 8 | -------------------------------------------------------------------------------- /tools/postag/src/main/universal/postag-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.postag.OpenNlpPostagger" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx128M" 9 | 10 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/postag/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/postag/src/test/scala/org/allenai/nlpstack/postag/FactoriePostaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | class FactoriePostaggerSpec extends PostaggerSpec { 4 | val taggerToTest = new FactoriePostagger 5 | } 6 | -------------------------------------------------------------------------------- /tools/postag/src/test/scala/org/allenai/nlpstack/postag/OpenNlpPostaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package postag 3 | 4 | class OpenNlpPostaggerSpec extends PostaggerSpec { 5 | // The OpenNLP postagger disagrees about the tags for "Pardon", where it is wrong, and "snub", 6 | // where the difference is acceptable. 7 | protected override def taggedTexts = Seq( 8 | super.taggedTexts(0), 9 | """|Pardon 0 NNP 10 | |me 7 PRP 11 | |for 10 IN 12 | |asking 14 VBG 13 | |, 20 , 14 | |sir 22 NN 15 | |, 25 , 16 | |but 27 CC 17 | |what 31 WP 18 | |good 36 JJ 19 | |are 41 VBP 20 | |snub 45 NN 21 | |fighters 50 NNS 22 | |going 59 VBG 23 | |to 65 TO 24 | |be 68 VB 25 | |against 71 IN 26 | |that 79 DT 27 | |? 
83 .""".stripMargin 28 | ) 29 | 30 | val taggerToTest = new OpenNlpPostagger 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tools/postag/src/test/scala/org/allenai/nlpstack/postag/PostaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.postag 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.Postagger 5 | import org.allenai.nlpstack.tokenize.defaultTokenizer 6 | 7 | abstract class PostaggerSpec extends UnitSpec { 8 | def taggerToTest: Postagger 9 | 10 | protected val texts = Seq( 11 | """|The battle station is heavily shielded and carries a firepower greater 12 | |than half the star fleet. Its defenses are designed around a direct, 13 | |large-scale assault. A small one-man fighter should be able to 14 | |penetrate the outer defense.""".stripMargin, 15 | """|Pardon me for asking, sir, but what good are snub fighters going to be 16 | |against that?""".stripMargin 17 | ) 18 | 19 | protected def taggedTexts = Seq( 20 | """|The 0 DT 21 | |battle 4 NN 22 | |station 11 NN 23 | |is 19 VBZ 24 | |heavily 22 RB 25 | |shielded 30 VBN 26 | |and 39 CC 27 | |carries 43 VBZ 28 | |a 51 DT 29 | |firepower 53 NN 30 | |greater 63 JJR 31 | |than 71 IN 32 | |half 76 PDT 33 | |the 81 DT 34 | |star 85 NN 35 | |fleet 90 NN 36 | |. 95 . 37 | |Its 97 PRP$ 38 | |defenses 101 NNS 39 | |are 110 VBP 40 | |designed 114 VBN 41 | |around 123 IN 42 | |a 130 DT 43 | |direct 132 JJ 44 | |, 138 , 45 | |large-scale 140 JJ 46 | |assault 152 NN 47 | |. 159 . 48 | |A 161 DT 49 | |small 163 JJ 50 | |one-man 169 JJ 51 | |fighter 177 NN 52 | |should 185 MD 53 | |be 192 VB 54 | |able 195 JJ 55 | |to 200 TO 56 | |penetrate 203 VB 57 | |the 213 DT 58 | |outer 217 JJ 59 | |defense 223 NN 60 | |. 230 .""".stripMargin, 61 | """|Pardon 0 VB 62 | |me 7 PRP 63 | |for 10 IN 64 | |asking 14 VBG 65 | |, 20 , 66 | |sir 22 NN 67 | |, 25 , 68 | |but 27 CC 69 | |what 31 WP 70 | |good 36 JJ 71 | |are 41 VBP 72 | |snub 45 JJ 73 | |fighters 50 NNS 74 | |going 59 VBG 75 | |to 65 TO 76 | |be 68 VB 77 | |against 71 IN 78 | |that 79 DT 79 | |? 
83 .""".stripMargin 80 | ) 81 | 82 | "postagger implementation" should "correctly postag two example sentences" in { 83 | for ((text, expected) <- texts zip taggedTexts) { 84 | val tagged = taggerToTest.postag(defaultTokenizer)(text) 85 | val taggedString = tagged.mkString("\n") 86 | assert(taggedString === expected) 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /tools/segment/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | libraryDependencies ++= loggingDependencies 4 | -------------------------------------------------------------------------------- /tools/segment/src/main/scala/org/allenai/nlpstack/segment/ChalkSentenceSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package segment 3 | 4 | @deprecated("Please use defaultSegmenter instead", "2014-06-24") 5 | class ChalkSentenceSegmenter extends FactorieSegmenter 6 | -------------------------------------------------------------------------------- /tools/segment/src/main/scala/org/allenai/nlpstack/segment/FactorieSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.segment 2 | 3 | import org.allenai.nlpstack.core.{ FactorieUtilities, Segment, Segmenter } 4 | 5 | import cc.factorie.app.nlp._ 6 | import cc.factorie.app.nlp.segment.{ DeterministicSentenceSegmenter, DeterministicTokenizer } 7 | 8 | class FactorieSegmenter extends Segmenter { 9 | /* This is a bit unfortunate. In Factorie, you tokenize first, and then 10 | * segment. In nlpstack, it's the other way around. We solve the problem by 11 | * tokenizing twice, once here to get the sentences, and then again in 12 | * FactorieTokenizer. */ 13 | private val tokenizer = 14 | new DeterministicTokenizer(tokenizeAllDashedWords = true) 15 | private val segmenter = DeterministicSentenceSegmenter 16 | private val map = new MutableDocumentAnnotatorMap ++= 17 | DocumentAnnotatorPipeline.defaultDocumentAnnotationMap 18 | map += tokenizer 19 | map += segmenter 20 | private val pipeline = DocumentAnnotatorPipeline( 21 | map = map.toMap, 22 | prereqs = Nil, 23 | segmenter.postAttrs 24 | ) 25 | 26 | override def segment(document: String): Iterable[Segment] = { 27 | val doc = pipeline.process( 28 | new Document( 29 | FactorieUtilities.replaceUnclosedTag(document) 30 | ) 31 | ) 32 | 33 | for (sentence <- doc.sentences) yield { 34 | new Segment(sentence.documentString, sentence.tokens(0).stringStart) 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tools/segment/src/main/scala/org/allenai/nlpstack/segment/StanfordSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.segment 2 | 3 | import java.util.Properties 4 | 5 | import edu.stanford.nlp.ling.CoreAnnotations 6 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation 7 | import edu.stanford.nlp.pipeline.{ Annotation, StanfordCoreNLP } 8 | import org.allenai.nlpstack.core.{ Segment, Segmenter } 9 | import org.slf4j.bridge.SLF4JBridgeHandler 10 | 11 | import scala.collection.JavaConverters._ 12 | 13 | object StanfordSegmenter extends Segmenter { 14 | // redirect stanford's logging 15 | SLF4JBridgeHandler.removeHandlersForRootLogger() 16 | SLF4JBridgeHandler.install() 17 | 18 | /* This is a bit unfortunate. 
In Stanford, you tokenize first, and then 19 | * segment. In nlpstack, it's the other way around. We solve the problem by 20 | * tokenizing twice, once here to get the sentences, and then again in 21 | * StanfordTokenizer. */ 22 | 23 | private val pipeline = { 24 | val props = new Properties() 25 | props.put("annotators", "tokenize, ssplit") 26 | new StanfordCoreNLP(props) 27 | } 28 | 29 | override def segment(document: String): Iterable[Segment] = { 30 | val annotation = new Annotation(document) 31 | pipeline.annotate(annotation) 32 | annotation.get(classOf[SentencesAnnotation]).asScala.map { sentence => 33 | val start = sentence.get(classOf[CoreAnnotations.CharacterOffsetBeginAnnotation]) 34 | val end = sentence.get(classOf[CoreAnnotations.CharacterOffsetEndAnnotation]) 35 | Segment(document.substring(start, end), start) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tools/segment/src/main/scala/org/allenai/nlpstack/segment/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.Segmenter 4 | 5 | package object segment { 6 | def defaultSegmenter: Segmenter = new FactorieSegmenter 7 | } 8 | -------------------------------------------------------------------------------- /tools/segment/src/main/universal/segment-server.scala: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.segment.ChalkSentenceSegmenter" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx256M" 9 | 10 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/segment/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/segment/src/test/scala/org/allenai/nlpstack/segment/ChalkSentenceSegmenter.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package segment 3 | 4 | import org.allenai.common.testkit.UnitSpec 5 | import org.allenai.nlpstack.core.Segment 6 | 7 | class ChalkSentencerSpec extends UnitSpec { 8 | val sentencer = new ChalkSentenceSegmenter 9 | val document = "He went to work. He bought a suit. He ate a melon." 
10 | "chalk sentencer" should "properly segment" in { 11 | val segments = sentencer.segment(document).toIndexedSeq 12 | assert(segments(0) === Segment("He went to work.", 0)) 13 | assert(segments(1) === Segment("He bought a suit.", 18)) 14 | assert(segments(2) === Segment("He ate a melon.", 37)) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /tools/segment/src/test/scala/org/allenai/nlpstack/segment/FactorieSegmenterSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.segment 2 | 3 | import org.allenai.common.testkit.UnitSpec 4 | import org.allenai.nlpstack.core.Segment 5 | 6 | import org.apache.commons.io.IOUtils 7 | 8 | class FactorieSegmenterSpec extends UnitSpec { 9 | val sentencer = new FactorieSegmenter 10 | val document = "He went to work. He bought a first-class suit. He ate a melon." 11 | 12 | "factorie sentencer" should "properly segment" in { 13 | val segments = sentencer.segment(document).toIndexedSeq 14 | assert(segments(0) === Segment("He went to work.", 0)) 15 | assert(segments(1) === Segment("He bought a first-class suit.", 18)) 16 | assert(segments(2) === Segment("He ate a melon.", 48)) 17 | } 18 | 19 | it should "not throw an exception for a long string" in { 20 | val s = 21 | IOUtils.toString( 22 | this.getClass.getResourceAsStream("/org/allenai/nlpstack/segment/unclosed_tag_test.txt"), 23 | "UTF-8" 24 | ) 25 | sentencer.segment(s) 26 | } 27 | 28 | it should "not interpret dollar symbols as regex backreferences" in { 29 | val s = "<" + "$2" + "x" * 98 30 | sentencer.segment(s) 31 | } 32 | 33 | } -------------------------------------------------------------------------------- /tools/tokenize/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | javaOptions in (Test, test) := Seq("-Xss1m") 4 | 5 | fork in (Test, test) := true 6 | 7 | libraryDependencies ++= loggingDependencies 8 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/FactorieTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import org.allenai.nlpstack.core.{ FactorieUtilities, Format, Tokenizer, Token } 4 | import org.allenai.nlpstack.tokenize.FactorieTokenizer.factorieFormat 5 | 6 | import cc.factorie.app.nlp.{ 7 | Document => FactorieDocument, 8 | Token => FactorieToken, 9 | DocumentAnnotatorPipeline, 10 | MutableDocumentAnnotatorMap 11 | } 12 | import cc.factorie.app.nlp.segment.DeterministicTokenizer 13 | 14 | class FactorieTokenizer extends Tokenizer { 15 | private val tokenizer = 16 | new DeterministicTokenizer(tokenizeAllDashedWords = false) 17 | private val map = new MutableDocumentAnnotatorMap ++= 18 | DocumentAnnotatorPipeline.defaultDocumentAnnotationMap 19 | map += tokenizer 20 | private val pipeline = DocumentAnnotatorPipeline( 21 | map = map.toMap, 22 | prereqs = Nil, 23 | tokenizer.postAttrs 24 | ) 25 | 26 | def tokenize(sentence: String): Seq[Token] = { 27 | val doc = pipeline.process( 28 | new FactorieDocument( 29 | FactorieUtilities.replaceUnclosedTag(sentence) 30 | ) 31 | ) 32 | 33 | factorieFormat.read(doc) 34 | } 35 | } 36 | 37 | object FactorieTokenizer { 38 | object factorieFormat extends Format[Seq[Token], FactorieDocument] { 39 | override def read(from: FactorieDocument): Seq[Token] = 40 | for (section <- 
from.sections; token <- section.tokens) 41 | yield Token(token.string, token.stringStart) 42 | 43 | override def write(from: Seq[Token]): FactorieDocument = { 44 | val factorieDoc = new FactorieDocument(Tokenizer.originalText(from)) 45 | for (token <- from) { 46 | // creating factorie tokens modifies the factorie document 47 | val factorieToken = new FactorieToken( 48 | factorieDoc, 49 | token.offset, 50 | token.offset + token.string.length 51 | ) 52 | factorieToken.attr += token 53 | } 54 | factorieDoc 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/PennTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | 5 | import java.util.regex._ 6 | 7 | /* The PennTokenizer was used to tokenize the Penn Treebank. 8 | * The following is a translation from a sed file. This algorithm 9 | * is entirely deterministic. It is composed of regular expression 10 | * replacements. 11 | * 12 | * @author Michael Schmitz 13 | */ 14 | object PennTokenizer extends Tokenizer { 15 | val replacements = List( 16 | // attempt to get correct directional quotes 17 | ("^\"", "`` "), 18 | //("""([ (\[{<])""", "$1 `` "), 19 | ("""\.\.\.""", " ... "), 20 | ("[,;:@#$%&]", " $0 "), 21 | ("""([^.]\)\([.])([])}>"']*)[ ]*$""", "$1 $2$3 "), // scalastyle:ignore 22 | ("[?!]", " $0 "), 23 | ("""[](){}<>]""", " $0 "), 24 | ("--", " $0 "), 25 | ("$|^", " "), 26 | ("\"", " '' "), 27 | (""" ([^'])' """, " '$1 "), 28 | ("""'([sSmMdD]) """, " '$1 "), 29 | ("'(ll|re|ve|LL|RE|VE) ", " '$1 "), 30 | ("(n't|N'T) ", " $1 ") 31 | ).map { 32 | case (a, b) => 33 | (Pattern.compile(a), b) 34 | } 35 | 36 | def tokenize(sentence: String) = { 37 | val split = replacements.foldRight(sentence) { 38 | case ((t, r), s) => 39 | t.matcher(s).replaceAll(r) 40 | }.trim.split("\\s+") 41 | 42 | Tokenizer.computeOffsets(split, sentence) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/RemoteTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | import org.allenai.nlpstack.core.remote.Remote 5 | 6 | import scala.concurrent.ExecutionContext 7 | 8 | class RemoteTokenizer(val urlString: String)(implicit executionContext: ExecutionContext) 9 | extends Tokenizer with Remote { 10 | def tokenize(sentence: String) = { 11 | val response = post(sentence) 12 | Tokenizer.multilineStringFormat.read(response) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/SimpleEnglishTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | @deprecated("Please use defaultTokenizer instead", "2014-06-19") 4 | class SimpleEnglishTokenizer extends FactorieTokenizer -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/StanfordTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import java.io.StringReader 4 | 
import org.slf4j.bridge.SLF4JBridgeHandler 5 | 6 | import scala.collection.{ mutable, JavaConverters } 7 | 8 | import edu.stanford.nlp.process.PTBTokenizer 9 | import org.allenai.nlpstack.core.{ Token, Tokenizer } 10 | 11 | object StanfordTokenizer extends Tokenizer { 12 | // redirect stanford's logging 13 | SLF4JBridgeHandler.removeHandlersForRootLogger() 14 | SLF4JBridgeHandler.install() 15 | 16 | var averageTokenLength = 6 // low estimates are better 17 | private val tokenizerFactory = PTBTokenizer.factory() 18 | tokenizerFactory.setOptions("untokenizable=allKeep") 19 | 20 | def tokenize(sentence: String) = { 21 | val reader = new StringReader(sentence) 22 | val tokenizer = tokenizerFactory.getTokenizer(reader) 23 | val result = new mutable.ArrayBuffer[Token](sentence.length / averageTokenLength) 24 | 25 | while (tokenizer.hasNext) { 26 | val token = tokenizer.next() 27 | result += Token(sentence.substring(token.beginPosition(), token.endPosition()), token.beginPosition()) 28 | } 29 | result 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/WhitespaceTokenizer.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.tokenize 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | 5 | /* The PennTokenizer was used to tokenize the Penn Treebank. 6 | * The following is a translation from a sed file. This algorithm 7 | * is entirely deterministic. It is composed of regular expression 8 | * replacements. 9 | * 10 | * @author Michael Schmitz 11 | */ 12 | object WhitespaceTokenizer extends Tokenizer { 13 | override def tokenize(string: String) = 14 | Tokenizer.computeOffsets(string.split("\\s+").toSeq, string) 15 | } 16 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/scala/org/allenai/nlpstack/tokenize/package.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | 3 | import org.allenai.nlpstack.core.Tokenizer 4 | 5 | package object tokenize { 6 | def defaultTokenizer: Tokenizer = StanfordTokenizer 7 | } 8 | -------------------------------------------------------------------------------- /tools/tokenize/src/main/universal/tokenize-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="edu.knowitall.tool.tokenize.ChalkTokenizer" 4 | 5 | SCRIPT_DIR=`dirname $0` 6 | SHORT_NAME=`basename $0 .sh` 7 | APP_ROOT="$SCRIPT_DIR/.." 8 | JVM_ARGS="-Xmx128M" 9 | 10 | . 
"${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 11 | -------------------------------------------------------------------------------- /tools/tokenize/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tools/tokenize/src/test/scala/org/allenai/nlpstack/tokenize/FactorieTokenizerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack 2 | package chunk 3 | 4 | import org.allenai.nlpstack.tokenize._ 5 | 6 | class FactorieTokenizerSpec extends TokenizerSpec { 7 | override val tokenizedTestSentences = Seq( 8 | """|The 0 9 | |battle 4 10 | |station 11 11 | |is 19 12 | |heavily 22 13 | |shielded 30 14 | |and 39 15 | |carries 43 16 | |a 51 17 | |firepower 53 18 | |greater 63 19 | |than 71 20 | |half 76 21 | |the 81 22 | |star 85 23 | |fleet 90 24 | |. 95 25 | |Its 97 26 | |defenses 101 27 | |are 110 28 | |designed 114 29 | |around 123 30 | |a 130 31 | |direct 132 32 | |, 138 33 | |large 140 34 | |- 145 35 | |scale 146 36 | |assault 152 37 | |. 159 38 | |A 161 39 | |small 163 40 | |one 169 41 | |- 172 42 | |man 173 43 | |fighter 177 44 | |should 185 45 | |be 192 46 | |able 195 47 | |to 200 48 | |penetrate 203 49 | |the 213 50 | |outer 217 51 | |defense 223 52 | |. 230""".stripMargin, 53 | """|Pardon 0 54 | |me 7 55 | |for 10 56 | |asking 14 57 | |, 20 58 | |sir 22 59 | |, 25 60 | |but 27 61 | |what 31 62 | |good 36 63 | |are 41 64 | |snub 45 65 | |fighters 50 66 | |going 59 67 | |to 65 68 | |be 68 69 | |against 71 70 | |that 79 71 | |? 83""".stripMargin 72 | ) 73 | 74 | val tokenizerToTest = new FactorieTokenizer 75 | } 76 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.20-SNAPSHOT" -------------------------------------------------------------------------------- /webapp/README.md: -------------------------------------------------------------------------------- 1 | # Nlpviz 2 | 3 | This project should grow to visualize various NLP tools. For example, 4 | Mark H in his experimentation with polytrees would like to visualize 5 | polytrees. Presently Nlpviz only visualizes dependencies and so it's 6 | largely a wrapper for Whatswrong (https://code.google.com/p/whatswrong/). 7 | 8 | The current functionality was taken from Nlpweb. It's difficult to add 9 | a tool to Nlpweb because it's an old project and it requires setting up 10 | and configuring a server with your NLP tool. This doesn't work for 11 | frequent experimentation. 12 | 13 | There are a multitude of NLP formats out there. Ideally we would 14 | standardize somewhat. I would rather Nlpviz not turn into a tool 15 | that takes in every format out there. Rather, I'd rather have a 16 | separate tool NlpCanonicalize that converts formats into what we 17 | adopt as canonical. 18 | 19 | This tool can either be used from a webpage or used programatically 20 | via POST requests. 21 | 22 | ## Running 23 | 24 | This project uses sbt as the build system. sbt can also be used to run 25 | Nlpviz. 26 | 27 | $ sbt compile 28 | $ sbt run 29 | 30 | Now you should have a HTTP server running at http://localhost:8080. 
To 31 | change the port, edit `src/main/resources/application.conf`. 32 | 33 | ## Future support 34 | 35 | * Polytrees 36 | * SRL frames 37 | -------------------------------------------------------------------------------- /webapp/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "webapp" 4 | 5 | libraryDependencies ++= Seq( 6 | "commons-codec" % "commons-codec" % "1.9", 7 | "org.riedelcastro" % "whatswrong" % "0.2.4" 8 | ) 9 | 10 | dependencyOverrides += "commons-io" % "commons-io" % "2.4" 11 | 12 | addLoggingDependencies(libraryDependencies) 13 | -------------------------------------------------------------------------------- /webapp/conf/deploy.conf: -------------------------------------------------------------------------------- 1 | base = { 2 | prod.deploy.host = "nlpstack.dev.allenai.org" 3 | prod.directory = "/local/deploy/" 4 | } 5 | 6 | base.webapp = { 7 | include "global_deploy.conf" 8 | project = { 9 | name = "webapp" 10 | } 11 | } 12 | 13 | prod = ${base.webapp}${base.prod} 14 | -------------------------------------------------------------------------------- /webapp/conf/global_deploy.conf: -------------------------------------------------------------------------------- 1 | // Baseline config file containing reasonable defaults and documentation of 2 | // fields. 3 | // 4 | // See https://github.com/typesafehub/config/blob/master/HOCON.md for a full 5 | // description of the Typesafe Config language. 6 | // 7 | // An example usage of this file is in example_solver_deploy.conf. 8 | project = { 9 | // SBT project name. Required. 10 | name = null 11 | // The project subdirectory. Optional; if unset, the root directory will be 12 | // used. 13 | subdirectory = ${?project.name} 14 | // Optional branch / commit / tag to checkout before building. 15 | version = null 16 | } 17 | deploy = { 18 | // Hostname to push to. Required. 19 | host = null 20 | // Directory on the remote host to push to. Required. 21 | directory = "/local/deploy/"${?project.name} 22 | // Start / stop script to run after the push is complete. Required. 23 | startup_script = "bin/"${?project.name}".sh" 24 | user = { 25 | // Full path to the ssh keypair to use when connecting to the remote host. 26 | // Required. 27 | ssh_keyfile = null 28 | // Username to connect to the remote host as. Required. 29 | ssh_username = "ec2-user" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /webapp/public/css/main.css: -------------------------------------------------------------------------------- 1 | textarea { 2 | width: 80%; 3 | white-space: pre; 4 | word-wrap: normal; 5 | } 6 | 7 | .nav-tabs a { 8 | cursor: pointer; 9 | } 10 | -------------------------------------------------------------------------------- /webapp/public/img/spinner.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/nlpstack/b41ac75f093842485a24d6540ed417964e85c2fb/webapp/public/img/spinner.gif -------------------------------------------------------------------------------- /webapp/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | NLP Web 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | NLP Web a web site to explore NLP 17 | 18 | 19 | 20 | Run NLP tools over words or sentences. 21 | Visualize serialized data representing NLP processing. 
22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /webapp/public/js/tools.js: -------------------------------------------------------------------------------- 1 | angular.module('tools', ['ui.bootstrap']); 2 | var NlpToolsCtrl = function($scope, $http) { 3 | $scope.model = { } 4 | 5 | $scope.working = true; 6 | $http.get("/api/tools") 7 | .success(function(data, status, headers, config) { 8 | $scope.working = false; 9 | $scope.model.tools = data; 10 | 11 | data.forEach(function (tool) { 12 | $scope.model.toolInfo = {}; 13 | $http.get("/api/tools/" + tool).success(function(data, status, headers, config) { 14 | $scope.model.toolInfo[tool] = data; 15 | }); 16 | }); 17 | }) 18 | .error(function(data, status, headers, config) { 19 | $scope.working = false; 20 | $scope.errorMessage = data; 21 | }); 22 | 23 | $scope.runTool = function(tool) { 24 | $scope.working = true; 25 | $http.post("/api/tools/" + tool, $scope.model[tool]) 26 | .success(function(data, status, headers, config) { 27 | $scope.working = false; 28 | $scope.errorMessage = undefined; 29 | 30 | $scope.response = {}; 31 | $scope.response[tool] = data; 32 | }) 33 | .error(function(data, status, headers, config) { 34 | $scope.working = false; 35 | $scope.response = undefined; 36 | 37 | $scope.errorMessage = data; 38 | }); 39 | } 40 | 41 | $scope.showExample = function(tool) { 42 | $scope.model[tool] = $scope.model.toolInfo[tool].example; 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /webapp/public/js/visualize.js: -------------------------------------------------------------------------------- 1 | angular.module('visualize', ['ui.bootstrap']); 2 | var VisualizeCtrl = function($scope, $http) { 3 | $scope.model = { } 4 | 5 | $scope.showExample = function() { 6 | $scope.model.dependencies = "nsubj(ran-2, Michael-1)\nroot(ROOT-0, ran-2)\n" + 7 | "prt(ran-2, down-3)\ndet(hill-5, the-4)\ndobj(ran-2, hill-5)"; 8 | $scope.visualizeDependencies(); 9 | } 10 | 11 | $scope.visualizeDependencies = function() { 12 | $scope.working = true; 13 | $http.post("/api/visualize/dependencies/base64", $scope.model.dependencies) 14 | .success(function(data, status, headers, config) { 15 | $scope.working = false; 16 | $scope.errorMessage = undefined; 17 | $scope.response = {} 18 | $scope.response.base64 = data; 19 | }) 20 | .error(function(data, status, headers, config) { 21 | $scope.working = false; 22 | $scope.errorMessage = data; 23 | $scope.response = undefined; 24 | }); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /webapp/public/tools.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | NLP Stack Tools 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | NLP Tools a web service to interact with NLP Stack tools 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | Try an example. 27 | Tool implementation is {{ model.toolInfo[tool].impl }}. 
28 | 29 | 30 | 31 | Enter text to {{ tool }}: 32 | 33 | 34 | 35 | 36 | 37 | 38 | {{ text }} 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | {{ errorMessage }} 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /webapp/public/visualize.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Nlpviz 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | Nlpviz a web service to visualize NLP 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | Try an example. 27 | 28 | 29 | 30 | Enter dependencies to visualize: 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | {{ errorMessage }} 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /webapp/src/main/bin/webapp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLASS_NAME="org.allenai.nlpstack.webapp.Nlpweb" 4 | JVM_ARGS="-Xmx4g" 5 | 6 | SCRIPT_DIR=`dirname $0` 7 | SHORT_NAME=`basename $0 .sh` 8 | . "${SCRIPT_DIR}/run-class.sh" "$CLASS_NAME" "$SHORT_NAME" "$@" 9 | -------------------------------------------------------------------------------- /webapp/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | nlpstack.webapp { 2 | port = 8062 3 | } 4 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/BasicService.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp 2 | 3 | import spray.routing._ 4 | 5 | trait BasicService extends HttpService { 6 | val staticContentRoot = "public" 7 | 8 | // format: OFF 9 | val basicRoute = 10 | path("") { 11 | get { 12 | getFromFile(staticContentRoot + "/index.html") 13 | } 14 | } ~ 15 | pathPrefix("info") { 16 | // TODO: version route 17 | path("name") { 18 | get { 19 | complete(Nlpweb.name) 20 | } 21 | } 22 | } ~ 23 | get { 24 | unmatchedPath { p => getFromFile(staticContentRoot + p) } 25 | } 26 | // format: ON 27 | } 28 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/Nlpweb.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp 2 | 3 | import akka.actor.{ ActorSystem, Props } 4 | import akka.io.IO 5 | import akka.pattern.ask 6 | import akka.util.Timeout 7 | import com.typesafe.config.ConfigFactory 8 | import spray.can.Http 9 | 10 | import scala.concurrent.duration.DurationInt 11 | 12 | object Nlpweb { 13 | lazy val config = ConfigFactory.load() 14 | val name = "webapp" 15 | 16 | def main(args: Array[String]): Unit = { 17 | // ActorSystem to host the application in. 18 | implicit val system = ActorSystem("webapp") 19 | 20 | // Create and start our service actor. 21 | val service = system.actorOf(Props[NlpwebActor], "webapp-actor") 22 | 23 | // Start a new HTTP server with our service actor as the handler. 24 | { 25 | // Timeout for starting the spray Http server (below). 26 | implicit val timeout = Timeout(30.seconds) 27 | 28 | // IO is a scala object with an apply method that returns an ActorRef. 29 | IO(Http) ? 
Http.Bind( 30 | service, 31 | interface = "0.0.0.0", 32 | port = config.getInt("nlpstack.webapp.port") 33 | ) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/NlpwebActor.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp 2 | 3 | import akka.actor.Actor 4 | import spray.http._ 5 | import spray.routing._ 6 | import spray.util.LoggingContext 7 | 8 | class NlpwebActor extends Actor with BasicService with VisualizationService with ToolService { 9 | 10 | implicit def myExceptionHandler(implicit log: LoggingContext) = 11 | ExceptionHandler { 12 | case e: Exception => 13 | requestUri { uri => 14 | log.error(toString, e) 15 | complete(StatusCodes.InternalServerError -> e.getMessage) 16 | } 17 | } 18 | 19 | // The HttpService trait defines only one abstract member, which connects the 20 | // services environment to the enclosing actor or test. 21 | def actorRefFactory = context 22 | 23 | /** Expire cached page after 60 seconds. */ 24 | val cacheControlMaxAge = HttpHeaders.`Cache-Control`(CacheDirectives.`max-age`(0)) 25 | 26 | // This actor only runs our route, but you could add other things here, like 27 | // request stream processing or timeout handling 28 | def receive = runRoute(basicRoute ~ visualizationRoute ~ toolRoute) 29 | } 30 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/ToolService.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp 2 | 3 | import org.allenai.nlpstack.webapp.tools._ 4 | 5 | import spray.http.StatusCodes 6 | import spray.httpx.SprayJsonSupport 7 | import spray.httpx.marshalling.ToResponseMarshallable.isMarshallable 8 | import spray.json.DefaultJsonProtocol._ // IntelliJ thinks this is unused, but it's not. 9 | import spray.routing.Directive.pimpApply 10 | import spray.routing.HttpService 11 | 12 | trait ToolService extends HttpService with SprayJsonSupport { 13 | val tools = Seq( 14 | SentenceSegmenterTool, 15 | LemmatizerTool, 16 | TokenizerTool, 17 | PostaggerTool, 18 | ChunkerTool, 19 | DependencyParserTool 20 | ) 21 | 22 | // format: OFF 23 | val toolRoute = 24 | pathPrefix("api" / "tools") { 25 | // List available tools in JSON. 26 | pathEnd { 27 | get { 28 | val toolNames = tools map (_.name) 29 | complete(tools map (_.name)) 30 | } 31 | } ~ 32 | path(Segment) { segment => 33 | tools find (_.name == segment) match { 34 | case Some(tool) => 35 | // Give info about this tool. 36 | get { 37 | complete(tool.info) 38 | } ~ 39 | // Process text with this tool. 40 | post { 41 | entity(as[String]) { body => 42 | val sections: Seq[String] = tool.split(body) 43 | val results = sections map tool.apply 44 | complete(results) 45 | } 46 | } 47 | case None => 48 | // Tool not found. 
49 | complete(StatusCodes.BadRequest -> s"Unknown tool: $segment") 50 | } 51 | } 52 | } 53 | // format: ON 54 | } 55 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/ChunkerTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.{ ChunkedToken, Chunker, Writer } 4 | import org.allenai.nlpstack.webapp.Whatswrong._ 5 | 6 | import java.awt.image.BufferedImage 7 | 8 | object ChunkerTool extends Tool("chunk") with StringFormat { 9 | type Output = Seq[ChunkedToken] 10 | 11 | override def info = ToolInfo(Impl.chunker.getClass.getSimpleName, Impl.obamaSentences) 12 | 13 | override def split(input: String) = input split "\n" 14 | override def process(section: String) = { 15 | val tokens = Impl.tokenizer(section) 16 | val postags = Impl.postagger.postagTokenized(tokens) 17 | Impl.chunker.chunkPostagged(postags) 18 | } 19 | override def visualize(output: Output) = { 20 | Seq( 21 | implicitly[Writer[Output, BufferedImage]].write(output) 22 | ) 23 | } 24 | override def stringFormat = Chunker.stringFormat 25 | } 26 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/DependencyParserTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.Writer 4 | import org.allenai.nlpstack.core.parse.graph.DependencyGraph 5 | import org.allenai.nlpstack.webapp.Whatswrong._ 6 | 7 | import java.awt.image.BufferedImage 8 | 9 | object DependencyParserTool extends Tool("dependencies") with StringFormat { 10 | type Output = DependencyGraph 11 | 12 | override def info = ToolInfo(Impl.dependencyParser.getClass.getSimpleName, Impl.obamaSentences) 13 | 14 | override def split(input: String) = input split "\n" 15 | override def process(section: String) = { 16 | val tokens = Impl.tokenizer(section) 17 | val postags = Impl.postagger.postagTokenized(tokens) 18 | Impl.dependencyParser.dependencyGraphPostagged(postags) 19 | } 20 | override def visualize(output: Output) = { 21 | Seq( 22 | implicitly[Writer[Output, BufferedImage]].write(output) 23 | ) 24 | } 25 | override def stringFormat = DependencyGraph.multilineStringFormat 26 | } 27 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/Impl.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.chunk.OpenNlpChunker 4 | import org.allenai.nlpstack.lemmatize.MorphaStemmer 5 | import org.allenai.nlpstack.parse.PolytreeParser 6 | import org.allenai.nlpstack.postag.defaultPostagger 7 | import org.allenai.nlpstack.segment.defaultSegmenter 8 | import org.allenai.nlpstack.tokenize.defaultTokenizer 9 | 10 | object Impl { 11 | private[tools] val sentenceSegmenter = defaultSegmenter 12 | private[tools] val tokenizer = defaultTokenizer 13 | private[tools] val lemmatizer = new MorphaStemmer() 14 | private[tools] val postagger = defaultPostagger 15 | private[tools] val chunker = new OpenNlpChunker() 16 | private[tools] val dependencyParser = new PolytreeParser 17 | 18 | val obamaText = "Barack Hussein Obama II is the 44th and current President of the United States, and the first 
African American to hold the office. Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he served as president of the Harvard Law Review. He was a community organizer in Chicago before earning his law degree. He worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School from 1992 to 2004. He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, running unsuccessfully for the United States House of Representatives in 2000." // scalastyle:ignore 19 | val obamaSentences = sentenceSegmenter(obamaText) map (_.text) mkString "\n" 20 | } 21 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/LemmatizerTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.{ Lemmatized, Token } 4 | 5 | object LemmatizerTool extends Tool("lemmatize") { 6 | type Output = Seq[Lemmatized[Token]] 7 | 8 | override def info = ToolInfo(Impl.lemmatizer.getClass.getSimpleName, Impl.obamaSentences) 9 | 10 | override def split(input: String) = input split "\n" 11 | override def process(section: String) = { 12 | val tokens = Impl.tokenizer.tokenize(section) 13 | val postagged = Impl.postagger.postagTokenized(tokens) 14 | postagged map Impl.lemmatizer.lemmatizePostaggedToken 15 | } 16 | override def visualize(output: Output) = Seq.empty 17 | override def format(output: Output) = Seq(output mkString " ") 18 | } -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/PostaggerTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.webapp.Whatswrong._ 4 | import org.allenai.nlpstack.core.{ PostaggedToken, Postagger, Writer } 5 | 6 | import java.awt.image.BufferedImage 7 | 8 | object PostaggerTool extends Tool("postag") with StringFormat { 9 | type Output = Seq[PostaggedToken] 10 | 11 | override def info = ToolInfo(Impl.postagger.getClass.getSimpleName, Impl.obamaSentences) 12 | 13 | override def split(input: String) = input split "\n" 14 | override def process(section: String) = { 15 | val tokens = Impl.tokenizer(section) 16 | Impl.postagger.postagTokenized(tokens) 17 | } 18 | override def visualize(output: Output) = { 19 | Seq( 20 | implicitly[Writer[Output, BufferedImage]].write(output) 21 | ) 22 | } 23 | override def stringFormat = Postagger.multilineStringFormat 24 | } 25 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/SentenceSegmenterTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.Segment 4 | 5 | object SentenceSegmenterTool extends Tool("segment") { 6 | type Output = Seq[Segment] 7 | 8 | override def info = ToolInfo(Impl.sentenceSegmenter.getClass.getSimpleName, Impl.obamaText) 9 | 10 | override def split(input: String) = Seq(input) 11 | override def process(section: String) = Impl.sentenceSegmenter(section).toSeq 12 | override def visualize(output: Output) = Seq.empty 13 | override def format(output: Output) = Seq(output mkString "\n") 14 | } 
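The webapp tools above (LemmatizerTool, PostaggerTool, SentenceSegmenterTool) all wrap the same nlpstack components wired up in Impl. As a usage note, the sketch below drives that pipeline directly with the same calls the tools make (tokenize, postagTokenized, lemmatizePostaggedToken). It is only a sketch: it assumes the nlpstack tokenize, postag, and lemmatize modules, and the models they download, are available on the classpath.

import org.allenai.nlpstack.lemmatize.MorphaStemmer
import org.allenai.nlpstack.postag.defaultPostagger
import org.allenai.nlpstack.tokenize.defaultTokenizer

object PipelineSketch extends App {
  val lemmatizer = new MorphaStemmer()

  val sentence = "He worked as a civil rights attorney in Chicago."
  val tokens = defaultTokenizer.tokenize(sentence)          // as in LemmatizerTool.process
  val postagged = defaultPostagger.postagTokenized(tokens)  // as in PostaggerTool.process
  val lemmatized = postagged map lemmatizer.lemmatizePostaggedToken

  println(lemmatized mkString " ")                          // same formatting as LemmatizerTool.format
}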
-------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/TokenizerTool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.nlpstack.core.{ Tokenizer, Token, Writer } 4 | import org.allenai.nlpstack.webapp.Whatswrong._ 5 | 6 | import java.awt.image.BufferedImage 7 | 8 | object TokenizerTool extends Tool("tokenize") with StringFormat { 9 | type Output = Seq[Token] 10 | 11 | override def info = ToolInfo(Impl.tokenizer.getClass.getSimpleName, Impl.obamaSentences) 12 | 13 | override def split(input: String) = input split "\n" 14 | override def process(section: String) = Impl.tokenizer(section) 15 | override def visualize(output: Output) = { 16 | Seq( 17 | implicitly[Writer[Output, BufferedImage]].write(output) 18 | ) 19 | } 20 | override def stringFormat = Tokenizer.multilineStringFormat 21 | } 22 | -------------------------------------------------------------------------------- /webapp/src/main/scala/org/allenai/nlpstack/webapp/tools/Tool.scala: -------------------------------------------------------------------------------- 1 | package org.allenai.nlpstack.webapp.tools 2 | 3 | import org.allenai.common.Resource 4 | import org.allenai.nlpstack.core.Format 5 | 6 | import org.apache.commons.codec.binary.Base64OutputStream 7 | import org.apache.commons.io.output.ByteArrayOutputStream 8 | import spray.json.DefaultJsonProtocol.{ StringJsonFormat, jsonFormat2, seqFormat } 9 | 10 | import java.awt.image.BufferedImage 11 | import javax.imageio.ImageIO 12 | 13 | /** A class for representing a tool. 14 | * 15 | * @param name the name of the tool 16 | * @param split how to divide up the input text 17 | * @param process how to process each section of the input text 18 | * @param visualize conversions of the process output to a visualization 19 | * @param format conversions of the process output to a string 20 | */ 21 | abstract class Tool(val name: String) { 22 | type Output 23 | 24 | /** This information is presented on /tools/name. */ 25 | def info: ToolInfo 26 | 27 | /** The input to all tools is a single text box. It may be split up 28 | * as the tool sees fit. For example, a sentence segmenter may not 29 | * want to split the text, but a tokenizer might want to split the 30 | * input by newline. 31 | */ 32 | def split(input: String): Seq[String] 33 | def process(section: String): Output 34 | def visualize(output: Output): Seq[BufferedImage] 35 | def format(output: Output): Seq[String] 36 | 37 | /** Process, visualize, format, and then bundle the results. 
*/ 38 | def apply(section: String): ToolResponse = { 39 | val processed = process(section) 40 | 41 | val visualizations = visualize(processed) 42 | val base64Visualizations = visualizations map { bufferedImage => 43 | Resource.using(new ByteArrayOutputStream()) { baos => 44 | Resource.using(new Base64OutputStream(baos)) { base64os => 45 | ImageIO.write(bufferedImage, "png", base64os) 46 | baos.flush() 47 | new String(baos.toByteArray()) 48 | } 49 | } 50 | } 51 | 52 | ToolResponse(format(processed), base64Visualizations) 53 | } 54 | } 55 | 56 | trait StringFormat { this: Tool => 57 | def stringFormat: Format[Output, String] 58 | def format(output: Output): Seq[String] = Seq(stringFormat.write(output)) 59 | } 60 | 61 | case class ToolInfo(impl: String, example: String) 62 | object ToolInfo { 63 | implicit val toolInfoFormat = jsonFormat2(ToolInfo.apply) 64 | } 65 | 66 | case class ToolResponse(texts: Seq[String], base64Images: Seq[String]) 67 | object ToolResponse { 68 | implicit val toolResponseJsonFormat = jsonFormat2(ToolResponse.apply) 69 | } 70 | -------------------------------------------------------------------------------- /webapp/webapp/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nlpstack-webapp", 3 | "version": "0.0.0", 4 | "description": "Webapp for showcasing nlpstack technologies", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"No test specified\" && exit 0" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/allenai/nlpstack" 12 | }, 13 | "author": "", 14 | "license": "ISC", 15 | "bugs": { 16 | "url": "https://github.com/allenai/nlpstack/issues" 17 | }, 18 | "homepage": "https://github.com/allenai/nlpstack" 19 | } 20 | --------------------------------------------------------------------------------
{{ text }}
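Editor's note: the Tool.scala scaladoc above describes the lifecycle every webapp tool follows: split the raw input, process each section, then visualize and format the result, with apply bundling everything into a ToolResponse (formatted text plus base64-encoded PNG visualizations). The sketch below is not part of the repository; it only illustrates that lifecycle end to end using the LemmatizerTool and ToolResponse definitions shown in this dump. The object name ToolUsageExample and the sample input text are invented for this example.

package org.allenai.nlpstack.webapp.tools

object ToolUsageExample extends App {
  val tool = LemmatizerTool

  // Let the tool split the raw text-box input the way it prefers
  // (LemmatizerTool splits on newlines, one section per line).
  val sections = tool.split("Obama taught constitutional law.\nHe served three terms.")

  // Tool.apply runs process, visualize, and format for one section and
  // bundles the results into a ToolResponse.
  val responses: Seq[ToolResponse] = sections map { section => tool(section) }

  responses foreach { response =>
    response.texts foreach println
    // LemmatizerTool defines no visualization, so this prints 0.
    println(s"${response.base64Images.size} visualization(s)")
  }
}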