├── .dockerignore ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── app ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── logback.xml │ └── scala │ │ └── edu │ │ └── knowitall │ │ ├── ollie │ │ ├── OllieCli.scala │ │ └── SentenceIterator.scala │ │ └── openparse │ │ ├── OpenParseCli.scala │ │ ├── OpenParseGui.scala │ │ └── gui │ │ ├── Dot.scala │ │ ├── ExtractionEntry.scala │ │ ├── Parser.scala │ │ └── Sentence.scala │ └── test │ └── resources │ └── logback-test.xml ├── core ├── build.sbt ├── here.txt ├── pom.xml ├── project │ └── plugins.sbt ├── scripts │ ├── applypatterns.sh │ ├── build_templates.sh │ ├── create_patterns.sh │ ├── create_test_train.sh │ ├── extractor.sh │ └── keep_common_patterns.sh └── src │ ├── main │ ├── resources │ │ └── edu │ │ │ └── knowitall │ │ │ ├── ollie │ │ │ ├── cognitiveWords.txt │ │ │ ├── communicationWords.txt │ │ │ ├── confidence │ │ │ │ └── default-classifier.txt │ │ │ └── prefixWords.txt │ │ │ └── openparse │ │ │ ├── categories │ │ │ ├── location.txt │ │ │ └── person.txt │ │ │ └── openparse.model │ └── scala │ │ └── edu │ │ └── knowitall │ │ ├── common │ │ └── enrich │ │ │ └── Traversable.scala │ │ ├── ollie │ │ ├── DependencyGraphExtras.scala │ │ ├── NaryExtraction.scala │ │ ├── Ollie.scala │ │ ├── OllieExtraction.scala │ │ ├── OllieExtractionInstance.scala │ │ ├── ScoredOllieExtractionInstance.scala │ │ ├── confidence │ │ │ ├── OllieConfidenceFunction.scala │ │ │ ├── OllieFeatureEvaluation.scala │ │ │ ├── OllieFeatureSet.scala │ │ │ └── train │ │ │ │ ├── CrossValidateConfidence.scala │ │ │ │ └── TrainOllieConfidence.scala │ │ └── output │ │ │ └── BratOutput.scala │ │ └── openparse │ │ ├── AnalyzePatterns.scala │ │ ├── BuildPatterns.scala │ │ ├── ExtractorPattern.scala │ │ ├── GraphExpansions.scala │ │ ├── OpenParse.scala │ │ ├── bootstrap │ │ ├── FilterTargetExtractions.scala │ │ ├── FindCommon.scala │ │ ├── FindTargetArguments.scala │ │ └── FindTargetExtractions.scala │ │ ├── eval │ │ ├── 
GroupScoredBy.scala │ │ ├── PrecisionYield.scala │ │ ├── RankPatterns.scala │ │ ├── Score.scala │ │ └── StatisticalSignificance.scala │ │ ├── extract │ │ ├── Extraction.scala │ │ ├── GeneralExtractor.scala │ │ ├── PatternExtractor.scala │ │ ├── SpecificExtractor.scala │ │ └── TemplateExtractor.scala │ │ └── template │ │ ├── BuildTemplates.scala │ │ ├── CountsToConfidence.scala │ │ ├── GeneralizeTemplate.scala │ │ └── PassiveReflections.scala │ └── test │ ├── resources │ └── logback-test.xml │ └── scala │ └── edu │ └── knowitall │ ├── common │ └── enrich │ │ └── TraversableSpecTest.scala │ ├── ollie │ ├── DependencyGraphExtrasSpec.scala │ └── confidence │ │ └── OllieFeatureSetSpec.scala │ └── openparse │ ├── BuildPatternsSpec.scala │ ├── ExtractorPatternSpec.scala │ ├── OllieSpec.scala │ ├── OpenParseSpec.scala │ └── PatternExtractorSpec.scala ├── data └── training.tsv ├── example ├── pom.xml └── src │ └── main │ ├── java │ └── example │ │ └── JavaOllieWrapper.java │ ├── resouces │ └── logback.xml │ └── scala │ └── ollie │ └── Example.scala └── pom.xml /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | .dockerignore 3 | .gitignore 4 | .git 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .cache 3 | .classpath 4 | .project 5 | .settings 6 | engmalt.linear.mco 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - "2.9.2" 4 | jdk: 5 | - oraclejdk7 6 | - openjdk7 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.5.2-jdk-7 2 | 3 | WORKDIR /stage 4 | 5 | COPY ./ 
/stage/ 6 | RUN curl http://www.maltparser.org/mco/english_parser/engmalt.linear-1.7.mco > /stage/engmalt.linear-1.7.mco 7 | RUN mvn clean package 8 | 9 | CMD ["java", "-Xmx512m", "-jar", "ollie-app-1.0.1-SNAPSHOT.jar"] 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Ollie Software License Agreement 2 | 3 | Ollie Software 4 | (C) 2011-2012, University of Washington. All rights reserved. 5 | US patent number 7,877,343 and 12/970,155 patent pending 6 | 7 | The University of Washington (UW), Professor Mausam, Michael Schmitz, Robert 8 | Bart, and Stephen Soderland, (Developers) give permission for you and your 9 | laboratory (University) to use Ollie. Ollie is a system that extracts 10 | relational triples from text. Ollie is protected by a United States copyright 11 | and patents. The National Science Foundation supported work on Ollie. Under 12 | University of Washington's patents 7,877,343 (issued) and 12/970,155 (patent 13 | pending), the UW grants to you the non-exclusive right to use patent claims 14 | practiced by the University of Washington's Ollie software solely for 15 | non-commercial purposes and as long as you comply with the terms of this Ollie 16 | Software License Agreement. UW and the Developers allow you to copy and modify 17 | Ollie for non-commercial purposes, and to distribute modifications through 18 | GitHub or directly to the University of Washington, on the following 19 | conditions: 20 | 21 | 22 | 1. Ollie is not used for any commercial purposes, or as part of a system 23 | which has commercial purposes. 24 | 25 | 26 | 2. Any software derived from Ollie must carry prominent notices stating that 27 | you modified it along with the date modified. 
The derivative must also carry 28 | prominent notices stating that it is released under this Ollie Software 29 | License Agreement 30 | 31 | If you wish to obtain Ollie or to obtain any patent rights for any commercial 32 | purposes, you will need to contact the University of Washington to see if 33 | rights are available and to negotiate a commercial license and pay a fee. This 34 | includes, but is not limited to, using Ollie to provide services to outside 35 | parties for a fee. In that case please contact: 36 | 37 | UW Center for Commercialization 38 | University of Washington 39 | 4311 11th Ave. NE, 40 | Suite 500 Seattle, WA 98105-4608 41 | 42 | Phone: (206) 543-3970 43 | Email: license@u.washington.edu 44 | 45 | 46 | 3. You retain in Ollie and any modifications to Ollie, the copyright, 47 | trademark, patent or other notices pertaining to Ollie as provided by UW. 48 | 49 | 50 | 4. You provide the Developers with feedback on the use of the Ollie software 51 | in your research, and that the Developers and UW are permitted to use any 52 | information you provide in making changes to the Ollie software. All bug 53 | reports and technical questions shall be sent to: afader@cs.washington.edu. 54 | Modifications may be communicated through GitHub pull requests at: 55 | 56 | https://github.com/knowitall/ 57 | 58 | 59 | 5. You acknowledge that the Developers, UW and its licensees may develop 60 | modifications to Ollie that may be substantially similar to your modifications 61 | of Ollie, and that the Developers, UW and its licensees shall not be 62 | constrained in any way by you in UW's or its licensees' use or management of 63 | such modifications. 
You acknowledge the right of the Developers and UW to 64 | prepare and publish modifications to Ollie that may be substantially similar 65 | or functionally equivalent to your modifications and improvements, and if you 66 | obtain patent protection for any modification or improvement to Ollie you 67 | agree not to allege or enjoin infringement of your patent by the Developers, 68 | the UW or by any of UW's licensees obtaining modifications or improvements to 69 | Ollie from the University of Washington or the Developers. 70 | 71 | 72 | 6. If utilization of the Ollie software results in outcomes which will be 73 | published, please specify the version of Ollie you used and cite the UW 74 | Developers. 75 | 76 | @inproceedings{ollie-emnlp12, 77 | author = {Mausam and Michael Schmitz and Robert Bart and 78 | Stephen Soderland and Oren Etzioni}, 79 | title = {Open Language Learning for Information Extraction}, 80 | booktitle = {Proceedings of Conference on Empirical Methods in 81 | Natural Language Processing and Computational Natural 82 | Language Learning (EMNLP-CONLL)}, 83 | year = {2012} 84 | } 85 | 86 | 87 | 7. Any risk associated with using the Ollie software at your organization is 88 | with you and your organization. Ollie is experimental in nature and is made 89 | available as a research courtesy "AS IS," without obligation by UW to provide 90 | accompanying services or support. 91 | 92 | 93 | UW AND THE AUTHORS EXPRESSLY DISCLAIM ANY AND ALL WARRANTIES REGARDING THE 94 | SOFTWARE, WHETHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES 95 | PERTAINING TO MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ollie 2 | 3 | Ollie is a program that automatically identifies and extracts binary 4 | relationships from English sentences. 
Ollie is designed for Web-scale 5 | information extraction, where target relations are not specified in advance. 6 | 7 | Ollie is our second-generation information extraction system . Whereas ReVerb operates on flat sequences 9 | of tokens, Ollie works with the tree-like (graph with only small cycles) 10 | representation using Stanford's compression of the dependencies. This allows 11 | Ollie to capture expression that ReVerb misses, such as long-range relations. 12 | 13 | Ollie also captures context that modifies a binary relation. Presently Ollie 14 | handles attribution (He said/she believes) and enabling conditions (if X 15 | then). 16 | 17 | ## Quick Start 18 | 19 | ### Docker 20 | 21 | You can now run Ollie with a single Docker command. 22 | 23 | ``` 24 | docker run -it schmmd/ollie:latest 25 | ``` 26 | 27 | To configure Ollie, you can drop into a bash shell with `docker run -it schmmd/ollie:latest /bin/bash` 28 | and run Ollie from the command line. 29 | 30 | ### Local Machine 31 | 32 | If you want to run Ollie on a small amount of text without modifying the source 33 | code, you can use an executable file that can be run from the command line. 34 | Please note that Ollie was built using Scala 2.9 and so it requires Java 7. 35 | Follow these steps to get started: 36 | 37 | 1. Download the latest Ollie binary from 38 | http://knowitall.cs.washington.edu/ollie/ollie-app-latest.jar. 39 | 40 | 2. Download the linear English MaltParser model (engmalt.linear-1.7.mco) from 41 | http://www.maltparser.org/mco/english_parser/engmalt.html 42 | and place it in the same directory as Ollie. 43 | 44 | 3. Run `java -Xmx512m -jar ollie-app-latest.jar yourfile.txt`. The input file 45 | should contain one sentence per line unless `--split` is specified. Omit 46 | the input file for an interactive console. 47 | 48 | ## Examples 49 | 50 | ### Enabling Condition 51 | 52 | An enabling condition is a condition that needs to be met for the extraction to 53 | be true. 
Certain words demark an enabling condition, such as "if" and "when". 54 | Ollie captures enabling conditions if they are present. 55 | 56 | sentence: If I slept past noon, I'd be late for work. 57 | extraction: (I; 'd be late for; work)[enabler=If I slept past noon] 58 | 59 | ### Attribution 60 | 61 | An attribution clause specifies an entity that asserted an extraction and a 62 | verb that specifies the expression. Ollie captures attributions if they are 63 | present. 64 | 65 | sentence: Some people say Barack Obama was not born in the United States. 66 | extraction: (Barack Obama; was not born in; the United States)[attrib=Some people say] 67 | 68 | sentence: Early astronomers believe that the earth is the center of the universe. 69 | extraction: (the earth; is the center of; the universe)[attrib=Early astronomers believe] 70 | 71 | ### Relational noun 72 | 73 | Some relations are expressed without verbs. Ollie can capture these as well as 74 | verb-mediated relations. 75 | 76 | sentence: Microsoft co-founder Bill Gates spoke at a conference on Monday. 77 | extraction: (Bill Gates; be co-founder of; Microsoft) 78 | 79 | 80 | ### N-ary extractions 81 | 82 | Often times similar relations will specify different aspects of the same event. 83 | Since Ollie captures long-range relations it can capture N-ary extractions by 84 | collapsing extractions where the relation phrase only differs by the 85 | preposition. 86 | 87 | sentence: I learned that the 2012 Sasquatch music festival is scheduled for May 25th until May 28th. 88 | extraction: (the 2012 Sasquatch music festival; is scheduled for; May 25th) 89 | extraction: (the 2012 Sasquatch music festival; is scheduled until; May 28th) 90 | nary: (the 2012 Sasquatch music festival; is scheduled; [for May 25th; to May 28th]) 91 | 92 | ## Building 93 | 94 | Building Ollie from source requires Apache Maven (). 95 | First, clone or download the Ollie source from GitHub. 
Run this command in the 96 | top-level source folder to download the required dependencies, compile, and 97 | create a single jar file. 98 | 99 | mvn clean package 100 | 101 | The compiled class files will be put in the base directory. The single 102 | executable jar file will be written to `ollie-app-VERSION.jar` where `VERSION` 103 | is the version number. 104 | 105 | ## Command Line Interface 106 | 107 | Once you have built Ollie, you can run it from the command line. 108 | 109 | java -Xmx512m -jar ollie-app-VERSION.jar yourfile.txt 110 | 111 | Omit the input file for an interactive console. 112 | 113 | Ollie takes sentences, one-per-line as input or splits text into sentences if 114 | `--split` is specified. Run Ollie with `--usage` to see full usage. 115 | 116 | The Ollie command line tool has a few output formats. The output format is 117 | specified by `--output-format` and a valid format: 118 | 119 | 1. The `interactive` format that is meant to be easily human readable. 120 | 2. The `tabbed` format is mean to be easily parsable. A header will be output 121 | as the first row to label the columns. 122 | 3. `tabbedsingle` is similar to `tabbed` but the extraction is output as (arg1; relation; 123 | arg2) in a single column. 124 | 4. The `serialized` is meant to be fully deserialized into an 125 | `OllieExtractionInstance` class. 126 | 127 | ## Graphical Interface 128 | 129 | Ollie works ontop of a subcomponent called OpenParse. The distinction is 130 | largely technical; OpenParse does not handle attribution and enabling condition 131 | and uses a coarser confidence metric. You can use a GUI application to 132 | visualize the OpenParse extractions in a parse tree. To use it, you will need 133 | to have [graphviz](http://www.graphviz.org/) installed. 
You can run the GUI 134 | with: 135 | 136 | java -Xms512M -Xmx1g -cp ollie-app-VERSION.jar edu.knowitall.openparse.OpenParseGui 137 | 138 | By default, this application will look for graphviz's `dot` program at 139 | `/usr/bin/dot`. You can specify a location with the `--graphviz` parameter. 140 | 141 | You can try out your own models with `Options->Load Model...`. To see an 142 | example model, look at `openparse.model` in `src/main/resources`. Your model 143 | may have one or more patterns in it. If you want to see pattern matches 144 | (without node expansion) instead of triple extractions, you can choose to show 145 | the raw match with `Options->Raw Matches`. This will allow you to use patterns 146 | that do not capture an arg1, rel, and arg2. 147 | 148 | ## Parsers 149 | 150 | Ollie is packaged to use Malt Parser, one of the fastest dependency parsers 151 | available. You will need the model file (`engmalt.linear-1.7.mco`) in the 152 | directory the application is run from or you will need to specify its location 153 | with the `--malt-model` parameter. Malt Parser models are available online. 154 | 155 | http://www.maltparser.org/mco/english_parser/engmalt.html 156 | 157 | Ollie works with any other parser in the `nlptools` project. For example, it 158 | is easy to swap out Malt for Stanford's parser. Stanford's parser is not a 159 | part of the Ollie distribution by default because of licensing conflicts, but 160 | the Stanford parser was used as the execution parser for the results in the 161 | paper. Malt Parser was used to bootstrap the patterns. We are interested 162 | in Clear parser as an alternative, but it's not a trivial change because Clear 163 | uses a slightly different dependency representation. 164 | 165 | ## Using Eclipse 166 | 167 | To modify the Ollie source code in Eclipse, use the [M2Eclipse 168 | plugin](http://www.sonatype.org/m2eclipse/) along with 169 | [ScalaIDE](http://scala-ide.org/). 
You can then import the project using 170 | the following. 171 | 172 | File > Import > Existing Maven Projects 173 | 174 | ## Including Ollie as a Dependency 175 | 176 | Add the following as a Maven dependency. 177 | 178 | edu.washington.cs.knowitall.ollie 179 | ollie-core_2.9.2 180 | [1.0.0, ) 181 | 182 | The best way to find the latest version is to browse [Maven Central](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22edu.washington.cs.knowitall%22). 183 | 184 | `ollie-core` does not include a way to parse sentences. You will need to use a 185 | parser supplied by the [nlptools](https://github.com/knowitall/nlptools) 186 | project. The source for for `ollie-app` is an excellent example of a project 187 | using `ollie-core` as a dependency. `ollie-app` supplies a parser from 188 | [nlptools](https://github.com/knowitall/nlptools). 189 | 190 | There is an example project that uses Ollie in the `example` folder of the 191 | source distribution. 192 | 193 | ## Training the Confidence Function 194 | 195 | While Ollie comes with a trained confidence function, it is possible to retrain 196 | the confidence function. First, you need to run Ollie over a set of sentences 197 | and store the output in the *serialized* format. 198 | 199 | echo "Michael rolled down the hill." | java -jar ollie-app-1.0.0-SNAPSHOT.jar --serialized --output toannotate.tsv 200 | 201 | Next you need to annotate the extractions. Modify the output file and 202 | **change** the first column to a binary annotation--`1` for correct and `0` for 203 | wrong. Your final file will look similar to `ollie/data/training.tsv`. Now 204 | run the logistic regression trainer. 205 | 206 | java -cp ollie-app-1.0.0-SNAPSHOT.jar edu.washington.cs.knowitall.ollie.confidence.train.TrainOllieConfidence toannotate.tsv 207 | 208 | ## Concurrency 209 | 210 | When operating at web scale, parallelism is essential. While the base Ollie 211 | extractor is immutable and thread safe, the parser may not be thread safe. 
I 212 | do not know whether Malt parser is thread safe. 213 | 214 | ## FAQ 215 | 216 | 1. How fast is Ollie? 217 | 218 | You should really benchmark Ollie yourself, but on my computer (a new computer in 2011), Ollie processed 5000 high-quality web sentences in 56 seconds, or 89 sentences per second, in a single thread. Ollie is easily parallelizable and the Ollie extractor itself is threadsafe (see Concurrency section). 219 | 220 | ## Contact 221 | 222 | To contact the UW about Ollie, email knowit-ollie@cs.washington.edu. 223 | 224 | ## Citing Ollie 225 | If you use Ollie in your academic work, please cite Ollie with the following 226 | BibTeX citation: 227 | 228 | @inproceedings{ollie-emnlp12, 229 | author = {Mausam and Michael Schmitz and Robert Bart and Stephen Soderland and Oren Etzioni}, 230 | title = {Open Language Learning for Information Extraction}, 231 | booktitle = {Proceedings of Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CONLL)}, 232 | year = {2012} 233 | } 234 | -------------------------------------------------------------------------------- /app/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | edu.washington.cs.knowitall.ollie 5 | ollie-app 6 | ollie-app 7 | 1.0.1-SNAPSHOT 8 | 9 | edu.washington.cs.knowitall 10 | knowitall-oss 11 | 1.0.2 12 | 13 | 14 | UTF-8 15 | 2.4.0 16 | 17 | 18 | 19 | org.scala-lang 20 | scala-swing 21 | 2.9.2 22 | 23 | 24 | edu.washington.cs.knowitall.ollie 25 | ollie-core_2.9.2 26 | 1.0.2 27 | 28 | 29 | edu.washington.cs.knowitall.nlptools 30 | nlptools-parse-malt_2.9.2 31 | ${nlptools.version} 32 | 33 | 34 | edu.washington.cs.knowitall.nlptools 35 | nlptools-parse-stanford_2.9.2 36 | ${nlptools.version} 37 | 38 | 39 | edu.washington.cs.knowitall.nlptools 40 | nlptools-sentence-opennlp_2.9.2 41 | ${nlptools.version} 42 | 43 | 44 | junit 45 | junit 46 | 4.11 47 | test 48 | 49 | 50 | batik 51 | 
batik-swing 52 | 1.6-1 53 | 54 | 55 | org.specs2 56 | specs2_2.9.2 57 | 1.12.3 58 | test 59 | 60 | 61 | 62 | ch.qos.logback 63 | logback-classic 64 | 1.0.9 65 | 66 | 67 | ch.qos.logback 68 | logback-core 69 | 1.0.9 70 | 71 | 72 | 73 | src/main/scala 74 | src/test/scala 75 | 76 | 77 | 78 | net.alchim31.maven 79 | scala-maven-plugin 80 | 3.1.1 81 | 82 | 83 | -deprecation 84 | -unchecked 85 | 86 | 87 | 88 | 89 | 90 | compile 91 | testCompile 92 | 93 | 94 | 95 | 96 | 97 | maven-assembly-plugin 98 | 99 | ${project.build.directory}/../.. 100 | false 101 | 102 | 103 | edu.knowitall.ollie.OllieCli 104 | 105 | 106 | 107 | 108 | 109 | distro-assembly 110 | package 111 | 112 | single 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /app/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /app/src/main/scala/edu/knowitall/ollie/SentenceIterator.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie 2 | 3 | import edu.knowitall.tool.segment.Segmenter 4 | 5 | class SentenceIterator(sentencer: Segmenter, private var lines: BufferedIterator[String]) extends Iterator[String] { 6 | var sentences: Iterator[String] = Iterator.empty 7 | 8 | lines.dropWhile(_.trim.isEmpty) 9 | 10 | def nextSentences = { 11 | val (paragraph, rest) = lines.span(!_.trim.isEmpty) 12 | lines = rest.dropWhile(_.trim.isEmpty).buffered 13 | sentencer.segmentTexts(paragraph.mkString(" ")).iterator.buffered 14 | } 15 | 16 | def hasNext: Boolean = { 17 | if (sentences.hasNext) { 18 | true 19 | } 20 | else if (!lines.hasNext) { 21 | false 22 | } 23 | else { 24 | sentences = nextSentences 25 | sentences.hasNext 26 | } 27 | 
} 28 | 29 | def next: String = { 30 | if (sentences.hasNext) { 31 | sentences.next() 32 | } 33 | else { 34 | sentences = nextSentences 35 | sentences.next() 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /app/src/main/scala/edu/knowitall/openparse/OpenParseCli.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse 2 | 3 | import java.io.{PrintWriter, File} 4 | import java.net.URL 5 | 6 | import scala.collection.Set 7 | import scala.io.Source 8 | 9 | import org.slf4j.LoggerFactory 10 | 11 | import edu.knowitall.collection.immutable.graph.pattern.Match 12 | import edu.knowitall.collection.immutable.graph.Graph 13 | import edu.knowitall.common.Resource.using 14 | import edu.knowitall.common.Timing 15 | import edu.knowitall.tool.parse.MaltParser 16 | import edu.knowitall.openparse.OpenParse.validMatch 17 | import edu.knowitall.openparse.extract.{TemplateExtractor, PatternExtractorType, PatternExtractor, GeneralExtractor, Extraction, DetailedExtraction} 18 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph} 19 | 20 | import scopt.OptionParser 21 | 22 | object OpenParseCli { 23 | val logger = LoggerFactory.getLogger(this.getClass) 24 | 25 | abstract class Settings { 26 | def modelUrl: URL 27 | def outputFile: Option[File] 28 | def sentenceFile: File 29 | 30 | def confidenceThreshold: Double 31 | def expandArguments: Boolean 32 | def verbose: Boolean 33 | 34 | def parallel: Boolean 35 | def invincible: Boolean 36 | } 37 | 38 | def main(args: Array[String]) { 39 | object settings extends Settings { 40 | var modelUrl: URL = OpenParse.defaultModelUrl 41 | var outputFile: Option[File] = None 42 | var sentenceFile: File = null 43 | 44 | var confidenceThreshold = 0.0; 45 | var expandArguments: Boolean = true 46 | var verbose: Boolean = false 47 | 48 | var parallel: Boolean = false 49 | var invincible: Boolean = false 50 | } 51 | 52 | 
val parser = new OptionParser("openparse-cli") { 53 | arg("sentences", "sentence file", { path: String => 54 | val file = new File(path) 55 | require(file.exists, "file does not exist: " + path) 56 | settings.sentenceFile = file 57 | }) 58 | opt(Some("m"), "model", "", "model file", { path: String => 59 | val file = new File(path) 60 | require(file.exists, "file does not exist: " + path) 61 | settings.modelUrl = file.toURI.toURL 62 | }) 63 | doubleOpt(Some("t"), "threshold", "", "confident threshold for shown extractions", { t: Double => settings.confidenceThreshold = t }) 64 | opt("o", "output", "output file (otherwise stdout)", { path => settings.outputFile = Some(new File(path)) }) 65 | 66 | opt("x", "expand-arguments", "expand extraction arguments", { settings.expandArguments = true }) 67 | opt("v", "verbose", "", { settings.verbose = true }) 68 | 69 | opt("p", "parallel", "", { settings.parallel = true }) 70 | opt("invincible", "", { settings.invincible = true }) 71 | } 72 | 73 | if (parser.parse(args)) { 74 | logger.info("args: " + args.mkString(" ")) 75 | run(settings) 76 | } 77 | } 78 | 79 | def run(settings: Settings) { 80 | val parser = new MaltParser 81 | def parse(line: String): Option[DependencyGraph] = { 82 | Some(parser.dependencyGraph(line)) 83 | } 84 | 85 | val other = new OpenParse.Settings { 86 | var modelUrl = settings.modelUrl 87 | var outputFile = settings.outputFile 88 | var sentenceFile = settings.sentenceFile 89 | var confidenceThreshold = settings.confidenceThreshold 90 | val duplicates = false 91 | var expandArguments = settings.expandArguments 92 | val showAll = false 93 | var verbose = settings.verbose 94 | val collapseVB = false 95 | var parallel = settings.parallel 96 | var invincible = settings.invincible 97 | } 98 | 99 | OpenParse.run(other, parse) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /app/src/main/scala/edu/knowitall/openparse/gui/Dot.scala: 
-------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse.gui 2 | 3 | import edu.knowitall.openparse.extract.TemplateExtractor 4 | import edu.knowitall.common.Resource.using 5 | import edu.knowitall.tool.parse.graph.DependencyGraph 6 | import edu.knowitall.tool.parse.graph.DependencyNode 7 | import java.io.IOException 8 | import scala.swing.Dialog 9 | import scala.io.Source 10 | import java.io.InputStream 11 | import java.io.OutputStream 12 | import java.io.PrintWriter 13 | import java.io.File 14 | 15 | /** Code pertaining to rendering and converting DOT graphs. */ 16 | object Dot { 17 | def dot2svg(graphvizFile: Option[File], dotgraph: String) = { 18 | import sys.process.ProcessIO 19 | 20 | trait InputHandler[A] { 21 | def handle(a: A)(input: OutputStream) 22 | } 23 | 24 | trait OutputHandler[A] { 25 | def handle(output: InputStream) 26 | def value: A 27 | } 28 | 29 | val errHandler = new OutputHandler[String] { 30 | var value: String = null 31 | 32 | def handle(out: InputStream) { 33 | value = Source.fromInputStream(out).mkString 34 | out.close() 35 | } 36 | } 37 | 38 | val inputHandler = new InputHandler[String] { 39 | def handle(a: String)(os: OutputStream) { 40 | val pw = new PrintWriter(os) 41 | pw write a 42 | pw.close() 43 | } 44 | } 45 | 46 | val outputHandler = new OutputHandler[String] { 47 | var value: String = null 48 | 49 | def handle(out: InputStream) { 50 | value = Source.fromInputStream(out).mkString 51 | out.close() 52 | } 53 | } 54 | val io = new ProcessIO(inputHandler.handle(dotgraph), outputHandler.handle, errHandler.handle, false) 55 | 56 | val process = graphvizFile match { 57 | case Some(file) => sys.process.Process(file.getAbsolutePath, Seq("-T", "svg")) 58 | case None => sys.process.Process("dot", Seq("-T", "svg")) 59 | } 60 | 61 | val proc = try (process run io) 62 | catch { 63 | case e: IOException => 64 | Dialog.showMessage(message = e.getMessage() + ". 
You may need to install graphviz and add it to the PATH variable, or specify the path to the dot program using the '--graphviz' argument.", messageType = Dialog.Message.Error) 65 | throw e 66 | } 67 | 68 | proc.exitValue() match { 69 | case 0 => outputHandler.value 70 | case x => sys.error("Dot exited with error code: " + x + " with output:\n" + errHandler.value) 71 | } 72 | } 73 | 74 | def svg2xml(svgString: String, nodeClickEvent: String=>Unit) = { 75 | import org.apache.batik.dom.svg.SVGDOMImplementation; 76 | import org.apache.batik.util.XMLResourceDescriptor 77 | import org.apache.batik.dom.svg.SAXSVGDocumentFactory 78 | 79 | val uri = SVGDOMImplementation.SVG_NAMESPACE_URI; 80 | 81 | val doc = using(new java.io.StringReader(svgString)) { reader => 82 | val parser = XMLResourceDescriptor.getXMLParserClassName(); 83 | val f = new SAXSVGDocumentFactory(parser); 84 | f.createSVGDocument(uri, reader); 85 | } 86 | 87 | val gs = doc.getElementsByTagNameNS(uri, "g") 88 | for (i <- 0 until gs.getLength) { 89 | val g = gs.item(i) 90 | val attributes = g.getAttributes 91 | val clazz = attributes.getNamedItem("class").getNodeValue 92 | 93 | if (clazz == "node") { 94 | val children = g.getChildNodes 95 | for (j <- 0 until children.getLength) { 96 | val child = children.item(j) 97 | if (child.getNodeName == "title") { 98 | val text = child.getFirstChild.getNodeValue 99 | 100 | import org.w3c.dom.events._ 101 | g.asInstanceOf[EventTarget].addEventListener("click", 102 | new EventListener() { 103 | def handleEvent(e: Event) { nodeClickEvent(text) } 104 | }, 105 | true); 106 | } 107 | } 108 | } 109 | } 110 | 111 | doc 112 | } 113 | 114 | def dotgraph(dgraph: DependencyGraph, nodes: Set[DependencyNode]) = { 115 | val nodeStyle = nodes.map((_, "style=filled,color=lightblue")) 116 | dgraph.dot(dgraph.text, nodeStyle.toMap, Map.empty) 117 | } 118 | 119 | def dotgraph(dgraph: DependencyGraph, extraction: ExtractionEntry) = { 120 | def originalNodes(nodes: Iterable[DependencyNode]) 
= nodes.map { node => 121 | dgraph.nodes.find(_.indices == node.indices).get 122 | } 123 | 124 | val title = "\\n" + dgraph.text + "\\n" + extraction.toString + "\\n" + extraction.`match`.pattern.toStringF((s: String) => if (s.length < 60) s else s.take(20) + "...") + 125 | (extraction.extractor match { case ex: TemplateExtractor => "\\n" + ex.template case _ => "" }) 126 | 127 | // nodes 128 | val darkNodes = extraction.`match`.nodeGroups 129 | val lightNodes = originalNodes(extraction.nodes).toSet -- originalNodes(darkNodes.map(_._2.node)) 130 | val filledNodes = (lightNodes zip Stream.continually("style=filled,fillcolor=lightgray")) ++ 131 | (darkNodes.map { nodeGroup => 132 | val style = "style=filled,fillcolor=" + (nodeGroup._1 match { 133 | case "rel" => "salmon1" 134 | case "arg1" | "arg2" => "lightblue" 135 | case "slot0" | "slot1" | "slot2" | "slot3" => "seashell" 136 | case _ => "yellow" 137 | }) 138 | 139 | (nodeGroup._2.node, style) 140 | }) 141 | 142 | // edges 143 | val solidEdges = extraction.edges.toSet 144 | 145 | val nodeStyle = filledNodes 146 | val edgeStyle = (solidEdges zip Stream.continually("style=filled")) ++ 147 | ((dgraph.graph.edges.toSet -- solidEdges.toSet) zip Stream.continually("style=dotted,color=gray")) 148 | 149 | dgraph.dot(title, nodeStyle.toMap, edgeStyle.toMap) 150 | } 151 | } -------------------------------------------------------------------------------- /app/src/main/scala/edu/knowitall/openparse/gui/ExtractionEntry.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse.gui 2 | 3 | import edu.knowitall.collection.immutable.graph.pattern.Match 4 | import edu.knowitall.tool.parse.graph.DependencyNode 5 | import edu.knowitall.openparse.extract.PatternExtractor 6 | import edu.knowitall.openparse.extract.DetailedExtraction 7 | 8 | /** 9 | * A more generic representation of an extraction. 
10 | * 11 | * This is needed to allow for raw matches, which do 12 | * not have an arg1, rel, etc. 13 | */ 14 | case class ExtractionEntry( 15 | confidence: Option[Double], 16 | `match`: Match[DependencyNode], 17 | nodes: Set[DependencyNode], 18 | extractor: PatternExtractor, 19 | parser: Parser.ParserEnum, 20 | string: String = "", 21 | correct: Option[Boolean]) { 22 | 23 | /** 24 | * Convenient constructor for instantiating from 25 | * an OpenParse extraction. 26 | */ 27 | def this(confidence: Double, extraction: DetailedExtraction, parser: Parser.ParserEnum, correct: Option[Boolean] = None) = this(Some(confidence), extraction.`match`, extraction.nodes.toSet, extraction.extractor, parser, extraction.toString, correct) 28 | 29 | def edges = `match`.edges 30 | 31 | def annotate(correct: Boolean) = this.copy(correct = Some(correct)) 32 | def unannotate = this.copy(correct = None) 33 | 34 | private def goldString = { 35 | correct match { 36 | case Some(true) => "+ " 37 | case Some(false) => "- " 38 | case None => "" 39 | } 40 | } 41 | 42 | override def toString = confidence.map("%1.4f:" format _).getOrElse("") + goldString + string 43 | } -------------------------------------------------------------------------------- /app/src/main/scala/edu/knowitall/openparse/gui/Parser.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse.gui 2 | 3 | import edu.knowitall.tool.parse.DependencyParser 4 | import edu.knowitall.tool.parse.MaltParser 5 | import edu.knowitall.tool.parse.graph.Dependencies 6 | import edu.knowitall.tool.parse.graph.DependencyGraph 7 | 8 | /** An enumerator for parser options */ 9 | object Parser extends Enumeration { 10 | type ParserEnum = Value 11 | 12 | val Deserialize = Value("Deserialize") 13 | val Stanford = Value("Stanford") 14 | val MaltL = Value("Malt (Linear)") 15 | val MaltPoly = Value("Malt (Poly)") 16 | 17 | def default = MaltL 18 | 19 | def load(parserType: ParserEnum): 
(ParserEnum, DependencyParser) = parserType match { 20 | case Parser.Stanford => (parserType, new edu.knowitall.tool.parse.StanfordParser) 21 | case Parser.MaltL => (parserType, new MaltParser()) 22 | case Parser.MaltPoly => (parserType, new MaltParser(modelUrl = new java.io.File("engmalt.poly-1.7.mco").toURI.toURL)) 23 | case Parser.Deserialize => (parserType, new DependencyParser() { 24 | override def dependencies(input: String) = Dependencies.deserialize(input) 25 | override def dependencyGraph(input: String) = DependencyGraph.deserialize(input) 26 | }) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /app/src/main/scala/edu/knowitall/openparse/gui/Sentence.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse.gui 2 | 3 | import scala.util.control.Exception.catching 4 | 5 | import edu.knowitall.tool.parse.graph.DependencyGraph 6 | import edu.knowitall.tool.parse.graph.DependencyGraph.SerializationException 7 | import edu.knowitall.tool.parse.graph.DependencyGraph.deserialize 8 | 9 | /** A representation of the input sentence. 
*/ 10 | sealed abstract class Sentence 11 | object Sentence { 12 | case class Text(text: String) extends Sentence { 13 | override def toString = text 14 | } 15 | case class Graph(dgraph: DependencyGraph) extends Sentence { 16 | override def toString = dgraph.serialize 17 | } 18 | 19 | def apply(string: String): Sentence = { 20 | import DependencyGraph._ 21 | 22 | catching(classOf[SerializationException]).opt { 23 | deserialize(string) 24 | } match { 25 | case Some(dgraph) => Graph(dgraph) 26 | case None => Text(string) 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /app/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /core/build.sbt: -------------------------------------------------------------------------------- 1 | organization := "edu.washington.cs.knowitall.ollie" 2 | 3 | name := "ollie-core" 4 | 5 | description := "Wrapper and implementation for extractors of chunked sentences." 
6 | 7 | version := "1.0.4-SNAPSHOT" 8 | 9 | crossScalaVersions := Seq("2.9.2", "2.10.1") 10 | 11 | scalaVersion <<= crossScalaVersions { (vs: Seq[String]) => vs.head } 12 | 13 | libraryDependencies ++= Seq( 14 | "edu.washington.cs.knowitall.nlptools" %% "nlptools-core" % "2.4.1", 15 | "edu.washington.cs.knowitall.nlptools" %% "nlptools-conf-breeze" % "2.4.1", 16 | "edu.washington.cs.knowitall.nlptools" %% "nlptools-stem-morpha" % "2.4.1", 17 | "org.slf4j" % "slf4j-api" % "1.7.2", 18 | "org.scalaz" %% "scalaz-core" % "7.0.0", 19 | "ch.qos.logback" % "logback-classic" % "1.0.9" % "test", 20 | "ch.qos.logback" % "logback-core" % "1.0.9" % "test", 21 | "junit" % "junit" % "4.11" % "test", 22 | "org.specs2" %% "specs2" % "1.12.3" % "test") 23 | 24 | scalacOptions ++= Seq("-unchecked", "-deprecation") 25 | 26 | licenses := Seq("Ollie Software License Agreement" -> url("https://raw.github.com/knowitall/ollie/master/LICENSE")) 27 | 28 | homepage := Some(url("http://ollie.cs.washington.edu")) 29 | 30 | publishMavenStyle := true 31 | 32 | resolvers += "Sonatype OSS Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots" 33 | 34 | publishTo <<= version { (v: String) => 35 | val nexus = "https://oss.sonatype.org/" 36 | if (v.trim.endsWith("SNAPSHOT")) 37 | Some("snapshots" at nexus + "content/repositories/snapshots") 38 | else 39 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 40 | } 41 | 42 | pomExtra := ( 43 | 44 | https://github.com/knowitall/ollie 45 | scm:git://github.com/knowitall/ollie.git 46 | scm:git:git@github.com:knowitall/ollie.git 47 | HEAD 48 | 49 | 50 | 51 | Michael Schmitz 52 | 53 | 54 | Robert Bart 55 | 56 | ) 57 | -------------------------------------------------------------------------------- /core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | edu.washington.cs.knowitall 6 | knowitall-oss 7 | 1.0.2 8 | 9 | edu.washington.cs.knowitall.ollie 10 | 
ollie-core_2.9.2 11 | 1.0.4-SNAPSHOT 12 | ollie-core 13 | Ollie is an open information extractor for binary relations. 14 | 15 | https://github.com/knowitall/ollie 16 | scm:git://github.com/knowitall/ollie.git 17 | scm:git:git@github.com:knowitall/ollie.git 18 | HEAD 19 | 20 | 21 | 22 | Ollie Software License Agreement 23 | https://raw.github.com/knowitall/ollie/master/LICENSE 24 | repo 25 | 26 | 27 | 28 | University of Washington CSE 29 | http://cs.washington.edu/ 30 | 31 | 32 | 33 | Michael Schmitz 34 | 35 | 36 | Robert Bart 37 | 38 | 39 | 2012 40 | 41 | UTF-8 42 | 2.4.1 43 | 44 | 45 | 46 | edu.washington.cs.knowitall.nlptools 47 | nlptools-core_2.9.2 48 | ${nlptools.version} 49 | 50 | 51 | edu.washington.cs.knowitall.nlptools 52 | nlptools-stem-morpha_2.9.2 53 | ${nlptools.version} 54 | 55 | 56 | edu.washington.cs.knowitall.nlptools 57 | nlptools-conf-breeze_2.9.2 58 | ${nlptools.version} 59 | 60 | 61 | org.scalaz 62 | scalaz-core_2.9.2 63 | 7.0.0 64 | 65 | 66 | 67 | org.slf4j 68 | slf4j-api 69 | 1.7.2 70 | 71 | 72 | ch.qos.logback 73 | logback-classic 74 | 1.0.9 75 | test 76 | 77 | 78 | ch.qos.logback 79 | logback-core 80 | 1.0.9 81 | test 82 | 83 | 84 | 85 | junit 86 | junit 87 | 4.11 88 | test 89 | 90 | 91 | org.specs2 92 | specs2_2.9.2 93 | 1.12.3 94 | test 95 | 96 | 97 | 98 | src/main/scala 99 | src/test/scala 100 | 101 | 102 | net.alchim31.maven 103 | scala-maven-plugin 104 | 3.1.1 105 | 106 | 107 | 108 | compile 109 | testCompile 110 | doc-jar 111 | 112 | 113 | 114 | 115 | 116 | -deprecation 117 | -unchecked 118 | 119 | 120 | -Xms128m 121 | -Xmx1024m 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /core/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("sbt-plugin-releases", new URL("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases")) (Resolver.ivyStylePatterns) 2 | 3 | 
addSbtPlugin("com.jsuereth" % "xsbt-gpg-plugin" % "0.6") 4 | -------------------------------------------------------------------------------- /core/scripts/applypatterns.sh: -------------------------------------------------------------------------------- 1 | # 1 -- patterns 2 | # 2 -- sentences 3 | mvn -q -e -f ../pom.xml compile exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.PatternExtractor -Dexec.args="--patterns $1 --sentences $2" 4 | -------------------------------------------------------------------------------- /core/scripts/build_templates.sh: -------------------------------------------------------------------------------- 1 | mkdir "$1/templates/" 2 | mvn exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.BuildTemplates -Dexec.args="$1/raw/patterned.txt $1/templates/templates.txt --reltemplates $HOME/public/read/reltemplates.txt --debug $1/templates/" 3 | -------------------------------------------------------------------------------- /core/scripts/create_patterns.sh: -------------------------------------------------------------------------------- 1 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.BuildTreePatterns -Dexec.args="$1/raw/parsed.txt $1/raw/patterned-all.txt -p --length 3" 2> $1/raw/patterned-all.log 2 | -------------------------------------------------------------------------------- /core/scripts/create_test_train.sh: -------------------------------------------------------------------------------- 1 | # 1 -- lda directory 2 | ROWS="$1/raw/patterned.txt" 3 | TEST="$1/raw/test.txt" 4 | TRAIN="$1/raw/train.txt" 5 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.lda.CreateTestSet -Dexec.args="$ROWS $TEST $TRAIN" 6 | 7 | -------------------------------------------------------------------------------- /core/scripts/extractor.sh: -------------------------------------------------------------------------------- 1 | echo "$*" 2 | mvn -q -e exec:java 
-Dexec.mainClass=edu.washington.cs.knowitall.pattern.OpenParse -Dexec.args="$*" 3 | -------------------------------------------------------------------------------- /core/scripts/keep_common_patterns.sh: -------------------------------------------------------------------------------- 1 | # 1 -- lda directory 2 | cut -f5 "$1/raw/patterned-all.txt" | sort | uniq -c | sort -nr > "$1/raw/patterns.txt" 3 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.KeepCommonPatterns -Dexec.args="$1/raw/patterned-all.txt 10" > "$1/raw/patterned.txt" 4 | -------------------------------------------------------------------------------- /core/src/main/resources/edu/knowitall/ollie/cognitiveWords.txt: -------------------------------------------------------------------------------- 1 | accept 2 | admit 3 | affirm 4 | aim 5 | allow 6 | apprehend 7 | assert 8 | attest 9 | aver 10 | avouch 11 | avow 12 | believe 13 | claim 14 | comprehend 15 | confirm 16 | conjecture 17 | consider 18 | contend 19 | define 20 | deny 21 | describe 22 | discover 23 | doubt 24 | dream 25 | envisage 26 | expect 27 | fathom 28 | feel 29 | follow 30 | foreknow 31 | foresee 32 | foretell 33 | grant 34 | grasp 35 | guarantee 36 | guess 37 | hold 38 | hope 39 | identify 40 | imagine 41 | infer 42 | intend 43 | know 44 | maintain 45 | mean 46 | misapprehend 47 | misconstrue 48 | misinterpret 49 | misunderstand 50 | observe 51 | plan 52 | portray 53 | presume 54 | prophesy 55 | propose 56 | reaffirm 57 | realize 58 | recognize 59 | recollect 60 | remember 61 | report 62 | represent 63 | repute 64 | reveal 65 | see 66 | show 67 | speculate 68 | suppose 69 | surmise 70 | suspect 71 | swear 72 | think 73 | trust 74 | understand 75 | vaticinate 76 | visualize 77 | wish 78 | yen -------------------------------------------------------------------------------- /core/src/main/resources/edu/knowitall/ollie/communicationWords.txt: 
-------------------------------------------------------------------------------- 1 | acknowledge 2 | acquaint 3 | add 4 | advise 5 | affirm 6 | allege 7 | announce 8 | apprise 9 | articulate 10 | believe 11 | blab 12 | blurt 13 | claim 14 | comment 15 | communicate 16 | confess 17 | confide 18 | confirm 19 | consider 20 | convey 21 | corroborate 22 | declare 23 | deem 24 | demonstrate 25 | disclose 26 | divulge 27 | elaborate 28 | elucidate 29 | establish 30 | esteem 31 | exclaim 32 | explain 33 | explicate 34 | expound 35 | feel 36 | illustrate 37 | imagine 38 | inform 39 | insinuate 40 | insist 41 | intimate 42 | justify 43 | know 44 | leak 45 | lecture 46 | mention 47 | moralize 48 | narrate 49 | note 50 | notify 51 | observe 52 | pose 53 | preach 54 | proclaim 55 | promulgate 56 | propose 57 | prove 58 | rant 59 | rate 60 | read 61 | reaffirm 62 | recite 63 | reckon 64 | recount 65 | reiterate 66 | relate 67 | relay 68 | remark 69 | remember 70 | remind 71 | repeat 72 | reply 73 | report 74 | respond 75 | retort 76 | reveal 77 | say 78 | see 79 | show 80 | sniff 81 | speak 82 | state 83 | suppose 84 | suspect 85 | talk 86 | teach 87 | tell 88 | testify 89 | theorize 90 | think 91 | update 92 | utter 93 | venture 94 | verify 95 | view 96 | voice 97 | write -------------------------------------------------------------------------------- /core/src/main/resources/edu/knowitall/ollie/confidence/default-classifier.txt: -------------------------------------------------------------------------------- 1 | args start and end with noun 0.030924657084179144 2 | rel ends with of 0.1013506657501542 3 | arg1 contains pronoun 0.19630801348782667 4 | arg2 contains pronoun -0.13341646099789348 5 | long relation -0.3547145229191737 6 | gap of 10 in rel -0.34306426484946456 7 | vacuous extraction -0.6389807893982924 8 | nn edges in pattern 0.9130032848389 9 | arg1 is proper 0.07933280909554899 10 | Intercept 0.0 11 | sentence begins with arg1 -0.1588407285556643 12 | if right 
before arg1 -1.2206208992816086 13 | arg2 is proper -0.04306420146120506 14 | arg2 borders appositive -0.0017006187220647805 15 | rel contains gerund -0.26200297625650837 16 | arg1 borders appositive -0.13448972417475485 17 | noun-verb-noun in arg1 0.0 18 | prep right after arg2 0.19212879336967245 19 | prep in arg2 0.16539493294341892 20 | arg2 contains infinitive -0.0 21 | prep mismatch in pattern -0.20092201136389673 22 | sentence is imperative 0.11745202578145564 23 | hyp words in rel -0.1449927441123399 24 | sentence ends with arg2 0.11610654106632967 25 | noun-verb-noun in arg2 0.07217080739835992 26 | rel is contiguous 0.12562188545360878 27 | non-contiguous rel -0.1849662870655201 28 | semantic constraints in pattern -0.4343558913425681 29 | openparse confidence 0.43411514029724824 30 | arg1 bad characters -0.40339032821185783 31 | sentence starts with extraction 0.18854224217974247 32 | arg2 bad characters -0.009939551407472108 33 | rel contains verb 0.4757113580400253 34 | rel starts with be 0.0 35 | prep right before arg1 -0.2350155331052106 36 | sentence has question mark 0.0 37 | arg2 before arg1 -0.35791735399208685 38 | arg2 before rel -0.023882392179128745 39 | rel bad characters -0.11794120943690224 40 | -------------------------------------------------------------------------------- /core/src/main/resources/edu/knowitall/ollie/prefixWords.txt: -------------------------------------------------------------------------------- 1 | after 2 | although 3 | because 4 | before 5 | but 6 | however 7 | if 8 | once 9 | that 10 | though 11 | when 12 | whenever 13 | whether 14 | where 15 | while 16 | would -------------------------------------------------------------------------------- /core/src/main/resources/edu/knowitall/openparse/categories/location.txt: -------------------------------------------------------------------------------- 1 | abbacy 2 | abode 3 | abutment 4 | abysm 5 | abyss 6 | acme 7 | addition 8 | address 9 | aerie 10 | aerospace 11 | aery 12 
| aim 13 | air 14 | airhead 15 | airspace 16 | airway 17 | ambiance 18 | ambience 19 | anchorage 20 | angle 21 | anomaly 22 | antapex 23 | antinode 24 | antipodes 25 | aperture 26 | apex 27 | aphelion 28 | apoapsis 29 | apogee 30 | apojove 31 | apolune 32 | aposelene 33 | approach 34 | apron 35 | archbishopric 36 | archdeaconry 37 | archdiocese 38 | archduchy 39 | area 40 | arena 41 | arrowhead 42 | ashram 43 | asthenosphere 44 | atelier 45 | atmosphere 46 | axil 47 | axis 48 | azimuth 49 | back 50 | backside 51 | backwater 52 | backwoods 53 | backyard 54 | bailiwick 55 | bakehouse 56 | bakery 57 | bakeshop 58 | barb 59 | barony 60 | barren 61 | barrio 62 | barycenter 63 | base 64 | basin 65 | battlefield 66 | battlefront 67 | battleground 68 | beachhead 69 | beak 70 | bearing 71 | beat 72 | bed 73 | bedground 74 | bedside 75 | beehive 76 | beeline 77 | beginning 78 | belly 79 | bellybutton 80 | belt 81 | bent 82 | benthos 83 | berm 84 | berth 85 | bight 86 | bilge 87 | bilges 88 | bilocation 89 | bindery 90 | biosphere 91 | birthplace 92 | bishopric 93 | bitthead 94 | bivouac 95 | block 96 | boatyard 97 | bookbindery 98 | boondocks 99 | border 100 | borderland 101 | borderline 102 | borough 103 | bottom 104 | bound 105 | boundary 106 | bounds 107 | bourn 108 | bourne 109 | bowels 110 | breach 111 | breadbasket 112 | break 113 | brickfield 114 | brickyard 115 | bridgehead 116 | brink 117 | brokerage 118 | brow 119 | buffer 120 | bull 121 | burg 122 | bush 123 | cabstand 124 | caliphate 125 | cambium 126 | camp 127 | campground 128 | campong 129 | campsite 130 | campus 131 | canthus 132 | canton 133 | cap 134 | capital 135 | capitulum 136 | carrefour 137 | casbah 138 | cavern 139 | cavity 140 | cell 141 | cemetery 142 | center 143 | centerfield 144 | central 145 | centre 146 | centrex 147 | centroid 148 | chap 149 | chapiter 150 | charnel 151 | chasm 152 | checkpoint 153 | chink 154 | chokepoint 155 | chromosphere 156 | churchyard 157 | circle 158 | circuit 159 | 
circumference 160 | circus 161 | city 162 | clear 163 | clearing 164 | cleft 165 | cloverleaf 166 | coalfield 167 | coastline 168 | col 169 | colliery 170 | colony 171 | columbarium 172 | common 173 | commons 174 | commonwealth 175 | commune 176 | community 177 | compartment 178 | confluence 179 | conurbation 180 | core 181 | corium 182 | corncob 183 | corner 184 | corneum 185 | cornfield 186 | country 187 | countryside 188 | county 189 | course 190 | court 191 | cowtown 192 | crack 193 | cradle 194 | cranny 195 | crawlspace 196 | creamery 197 | crenel 198 | crenelle 199 | crest 200 | crevasse 201 | crevice 202 | crinion 203 | croft 204 | crosscut 205 | crossing 206 | crossroad 207 | crossway 208 | crotch 209 | crown 210 | crud 211 | crust 212 | crypt 213 | cubbyhole 214 | culmination 215 | curtilage 216 | cusp 217 | cuticle 218 | dairy 219 | danger 220 | dark 221 | darkness 222 | dateline 223 | dec 224 | declination 225 | defile 226 | delimitation 227 | demarcation 228 | demesne 229 | den 230 | department 231 | dependency 232 | depth 233 | derivation 234 | derma 235 | dermis 236 | desert 237 | desktop 238 | destination 239 | determinant 240 | development 241 | diamond 242 | diastema 243 | dig 244 | diocese 245 | dip 246 | direction 247 | distance 248 | district 249 | divide 250 | dockside 251 | dockyard 252 | dogleg 253 | domain 254 | domicile 255 | dominion 256 | dooryard 257 | downtown 258 | drop 259 | duchy 260 | dukedom 261 | dump 262 | dumpsite 263 | earldom 264 | earreach 265 | earshot 266 | earth 267 | east 268 | ecliptic 269 | edge 270 | edging 271 | element 272 | emirate 273 | empire 274 | emptiness 275 | empyrean 276 | encampment 277 | enclave 278 | enclosure 279 | end 280 | endpoint 281 | entrepot 282 | environment 283 | environs 284 | eparchy 285 | epicenter 286 | epicentre 287 | epidermis 288 | episcopate 289 | epitope 290 | equator 291 | equinoctial 292 | equinox 293 | exaltation 294 | exarchate 295 | excavation 296 | exchange 297 | exosphere 298 | 
expanse 299 | exterior 300 | extreme 301 | extremity 302 | extremum 303 | exurbia 304 | eye 305 | eyeshot 306 | eyrie 307 | eyry 308 | face 309 | fairground 310 | fairway 311 | farm 312 | farmland 313 | farmplace 314 | farmstead 315 | fatherland 316 | faubourg 317 | fault 318 | faulting 319 | fiefdom 320 | field 321 | fingertip 322 | finish 323 | firebreak 324 | fireguard 325 | fireside 326 | firmament 327 | fishery 328 | fissure 329 | flies 330 | floor 331 | flowerbed 332 | fluke 333 | flyway 334 | focus 335 | foot 336 | foothold 337 | foramen 338 | forefront 339 | forepart 340 | forge 341 | fork 342 | fountainhead 343 | fracture 344 | fringe 345 | front 346 | frontier 347 | funfair 348 | gaff 349 | gap 350 | garden 351 | gasfield 352 | gasworks 353 | geosphere 354 | ghetto 355 | glade 356 | glassworks 357 | goal 358 | goldfield 359 | gorge 360 | grainfield 361 | grange 362 | grassland 363 | grave 364 | graveyard 365 | green 366 | greenbelt 367 | greenway 368 | gridiron 369 | ground 370 | grounds 371 | grove 372 | gulf 373 | habitat 374 | habitation 375 | hairline 376 | hamlet 377 | hand 378 | hangout 379 | harbor 380 | harborage 381 | harbour 382 | harbourage 383 | hatchery 384 | haunt 385 | haven 386 | hayfield 387 | head 388 | heading 389 | headspring 390 | headwater 391 | hearing 392 | heart 393 | hearth 394 | heartland 395 | heath 396 | heathland 397 | heaven 398 | heavens 399 | heel 400 | heights 401 | heliopause 402 | heliosphere 403 | hell 404 | hellhole 405 | hem 406 | hemisphere 407 | hemline 408 | here 409 | heronry 410 | hiatus 411 | hideaway 412 | hideout 413 | high 414 | hilltop 415 | hilum 416 | hinterland 417 | hip 418 | hipline 419 | hole 420 | hollow 421 | holy 422 | home 423 | homeland 424 | hometown 425 | horizon 426 | horst 427 | hotbed 428 | hotspot 429 | house 430 | hub 431 | hydathode 432 | hydrosphere 433 | imperium 434 | inclination 435 | inferno 436 | infield 437 | innersole 438 | inside 439 | insole 440 | interchange 441 | interface 442 
| interior 443 | intersection 444 | ionosphere 445 | ironworks 446 | irredenta 447 | irridenta 448 | isarithm 449 | island 450 | isobar 451 | isochrone 452 | isoclinal 453 | isogone 454 | isogram 455 | isohel 456 | isopleth 457 | isotherm 458 | itinerary 459 | job 460 | junction 461 | jungle 462 | junkyard 463 | jurisdiction 464 | justiciary 465 | juxtaposition 466 | kampong 467 | kasbah 468 | key 469 | khanate 470 | kingdom 471 | knothole 472 | kraal 473 | lab 474 | laboratory 475 | lair 476 | land 477 | landmark 478 | landscape 479 | landscaping 480 | latitude 481 | launderette 482 | laundry 483 | lawn 484 | layer 485 | lea 486 | lead 487 | leak 488 | lee 489 | leeward 490 | left 491 | leftfield 492 | lenticel 493 | ley 494 | lie 495 | light 496 | limb 497 | limit 498 | line 499 | lineation 500 | lithosphere 501 | locale 502 | locality 503 | location 504 | locus 505 | longitude 506 | lookout 507 | lot 508 | loxodrome 509 | luff 510 | lumberyard 511 | mandate 512 | mandatory 513 | mansion 514 | mantle 515 | march 516 | marchland 517 | mare 518 | maria 519 | mastaba 520 | mastabah 521 | masthead 522 | matrix 523 | mausoleum 524 | maximum 525 | meadow 526 | mecca 527 | medina 528 | medium 529 | meeting 530 | megalopolis 531 | meridian 532 | mesosphere 533 | mete 534 | metropolis 535 | micropyle 536 | midair 537 | midden 538 | middle 539 | midfield 540 | midland 541 | midpoint 542 | midst 543 | midstream 544 | midway 545 | minefield 546 | minimum 547 | molding 548 | monument 549 | moorage 550 | mooring 551 | motherland 552 | moulding 553 | mouth 554 | municipality 555 | nadir 556 | nape 557 | navel 558 | necropolis 559 | neighborhood 560 | neighbourhood 561 | nest 562 | nib 563 | nidus 564 | nirvana 565 | node 566 | nombril 567 | nook 568 | north 569 | northeast 570 | northland 571 | northwest 572 | notch 573 | nucha 574 | nucleus 575 | oasis 576 | occident 577 | oilfield 578 | omphalos 579 | omphalus 580 | open 581 | opening 582 | orbit 583 | orchard 584 | orient 
585 | origin 586 | orphrey 587 | outback 588 | outdoors 589 | outfield 590 | outline 591 | outport 592 | outpost 593 | outside 594 | outskirt 595 | outskirts 596 | outsole 597 | outstation 598 | overhead 599 | overlook 600 | ozonosphere 601 | paddy 602 | paint 603 | palaestra 604 | palate 605 | palatinate 606 | palestra 607 | pallium 608 | pampas 609 | panhandle 610 | paradise 611 | parallel 612 | parcel 613 | paries 614 | parish 615 | park 616 | parkland 617 | part 618 | parterre 619 | parting 620 | parts 621 | pass 622 | pasture 623 | pastureland 624 | patch 625 | patchboard 626 | pate 627 | path 628 | patisserie 629 | patriarchate 630 | pattern 631 | peak 632 | penetralia 633 | perch 634 | perforation 635 | periapsis 636 | perigee 637 | perigon 638 | perihelion 639 | perijove 640 | perilune 641 | periselene 642 | pesthole 643 | photosphere 644 | piazza 645 | pigeonhole 646 | piggery 647 | pike 648 | pinnacle 649 | pinpoint 650 | piscary 651 | piste 652 | pit 653 | pitch 654 | place 655 | plantation 656 | plate 657 | playground 658 | plaza 659 | pleasance 660 | plot 661 | plugboard 662 | pocket 663 | point 664 | pole 665 | poll 666 | polls 667 | pool 668 | pore 669 | port 670 | position 671 | possession 672 | post 673 | pottery 674 | pouch 675 | prairie 676 | precinct 677 | prefecture 678 | premises 679 | presence 680 | preserve 681 | princedom 682 | principality 683 | property 684 | proprioceptor 685 | protectorate 686 | provenance 687 | provenience 688 | province 689 | proximity 690 | puddle 691 | pueblo 692 | punctum 693 | pupil 694 | purlieu 695 | qibla 696 | quadrant 697 | quarter 698 | radius 699 | railhead 700 | railyard 701 | ranch 702 | range 703 | rathole 704 | reach 705 | realm 706 | rear 707 | rearward 708 | refuge 709 | region 710 | rendezvous 711 | rent 712 | repair 713 | repository 714 | reservation 715 | reserve 716 | residence 717 | resort 718 | retreat 719 | rhumb 720 | rift 721 | right 722 | rightfield 723 | rip 724 | roads 725 | roadside 726 | 
roadstead 727 | rockery 728 | rooftop 729 | rookery 730 | root 731 | rootage 732 | ropewalk 733 | rotary 734 | rough 735 | round 736 | roundabout 737 | roundhouse 738 | route 739 | sac 740 | sack 741 | saddle 742 | saddleback 743 | saddlery 744 | safety 745 | sanctuary 746 | sanctum 747 | sandlot 748 | savanna 749 | savannah 750 | scenario 751 | scene 752 | scenery 753 | schoolyard 754 | scissure 755 | scour 756 | scrapheap 757 | scrubland 758 | scruff 759 | seafront 760 | seam 761 | seaport 762 | seascape 763 | seat 764 | section 765 | sector 766 | see 767 | seedbed 768 | selvage 769 | selvedge 770 | semidesert 771 | semitropics 772 | separation 773 | sepulcher 774 | sepulchre 775 | sepulture 776 | setting 777 | settlement 778 | shadow 779 | shantytown 780 | sheeprun 781 | sheepwalk 782 | sheet 783 | sheikdom 784 | sheikhdom 785 | shift 786 | shipside 787 | shipyard 788 | shire 789 | shop 790 | shoreline 791 | short 792 | shoulder 793 | showplace 794 | shrubbery 795 | side 796 | sign 797 | silhouette 798 | site 799 | situation 800 | skyline 801 | skyway 802 | slack 803 | slip 804 | slit 805 | slot 806 | slum 807 | smithy 808 | snag 809 | snow 810 | sodom 811 | soil 812 | sole 813 | solitude 814 | somewhere 815 | source 816 | south 817 | southeast 818 | southland 819 | southwest 820 | spa 821 | space 822 | spearhead 823 | spearpoint 824 | sphere 825 | spike 826 | split 827 | spoor 828 | spot 829 | sprawl 830 | spread 831 | spring 832 | square 833 | stage 834 | stand 835 | state 836 | station 837 | steps 838 | stoma 839 | stomate 840 | stop 841 | stopover 842 | stratosphere 843 | stratum 844 | stretch 845 | studio 846 | subdivision 847 | substrate 848 | substratum 849 | subtopia 850 | subtropics 851 | suburb 852 | suburbia 853 | sultanate 854 | summit 855 | superstrate 856 | superstratum 857 | surface 858 | surround 859 | surroundings 860 | suzerainty 861 | swath 862 | switchboard 863 | tack 864 | tannery 865 | tape 866 | target 867 | taxistand 868 | tear 869 | tee 
870 | telomere 871 | tendency 872 | tenderloin 873 | terminal 874 | termination 875 | terminus 876 | terrain 877 | terreplein 878 | territory 879 | theater 880 | theatre 881 | there 882 | thermosphere 883 | thick 884 | tiltyard 885 | timberline 886 | tip 887 | tiptoe 888 | tiptop 889 | tomb 890 | tonsure 891 | top 892 | topiary 893 | town 894 | township 895 | track 896 | tract 897 | trail 898 | trailhead 899 | treetop 900 | trend 901 | trichion 902 | tropic 903 | tropics 904 | tropopause 905 | troposphere 906 | trusteeship 907 | turf 908 | turnery 909 | umbilicus 910 | underbelly 911 | underside 912 | undersurface 913 | unknown 914 | upside 915 | uptown 916 | vacancy 917 | vacuity 918 | vacuum 919 | vantage 920 | variation 921 | vault 922 | veld 923 | veldt 924 | vent 925 | venue 926 | verge 927 | vertex 928 | viceroyalty 929 | vicinity 930 | view 931 | viewpoint 932 | village 933 | vinery 934 | vineyard 935 | viscounty 936 | void 937 | volcano 938 | wall 939 | ward 940 | warren 941 | washhouse 942 | waste 943 | wasteland 944 | wasteyard 945 | waterfront 946 | waterline 947 | watermark 948 | watershed 949 | waterworks 950 | wavefront 951 | way 952 | wayside 953 | weald 954 | wedge 955 | welkin 956 | wellhead 957 | wellspring 958 | west 959 | wheatfield 960 | whereabouts 961 | wild 962 | wilderness 963 | window 964 | windward 965 | wing 966 | wire 967 | wold 968 | woodlet 969 | work 970 | workplace 971 | workshop 972 | workspace 973 | yard 974 | yardarm 975 | zenith 976 | zodiac 977 | zone 978 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/common/enrich/Traversable.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall 2 | package common 3 | package enrich 4 | 5 | import edu.knowitall.collection.immutable.Bag 6 | 7 | import scalaz._ 8 | import Scalaz._ 9 | import Monoid._ 10 | 11 | /** 12 | * Enrichments for traversables. 
13 | * 14 | * @author Michael Schmitz 15 | */ 16 | object Traversables { 17 | implicit def traversableOnceTo[T](as: TraversableOnce[T]): SuperTraversableOnce[T] = new SuperTraversableOnce[T](as) 18 | 19 | implicit def traversableOncePairIntTo[T](as: TraversableOnce[(T, Int)]): SuperTraversableOncePairInt[T] = new SuperTraversableOncePairInt[T](as) 20 | 21 | implicit def traversableOncePairTo[T, U](as: TraversableOnce[(T, U)]): SuperTraversableOncePair[T, U] = new SuperTraversableOncePair[T, U](as) 22 | } 23 | 24 | sealed class SuperTraversableOnce[T](value: TraversableOnce[T]) { 25 | def histogram: Map[T, Int] = { 26 | value.foldLeft(Map[T, Int]()) { (m, c) => 27 | m.updated(c, m.getOrElse(c, 0) + 1) 28 | } 29 | } 30 | } 31 | 32 | sealed class SuperTraversableOncePairInt[T](value: TraversableOnce[(T, Int)]) { 33 | import Traversables._ 34 | def mergeHistograms: Map[T, Int] = value.mergeKeys(_ + _) 35 | } 36 | 37 | sealed class SuperTraversableOncePair[T, U](value: TraversableOnce[(T, U)]) { 38 | def mergeKeys(implicit mon: Semigroup[U]): Map[T, U] = { 39 | value.foldLeft(Map[T, U]()) { 40 | case (map, (k, v)) => 41 | map + (k -> (map.get(k).map(_ |+| v).getOrElse(v))) 42 | } 43 | } 44 | 45 | def mergeKeys[F[_]](implicit monoid: Monoid[F[U]]): Map[T, F[U]] = { 46 | value.foldLeft(Map[T, F[U]]()) { 47 | case (map, (k, v)) => 48 | val pure = monoid.zero 49 | map + (k -> (map.get(k).map(_ |+| pure).getOrElse(pure))) 50 | } 51 | } 52 | 53 | def mergeKeys(merge: (U, U) => U): Map[T, U] = { 54 | value.foldLeft(Map[T, U]()) { 55 | case (map, (k, v)) => 56 | map + (k -> map.get(k).map(merge(_, v)).getOrElse(v)) 57 | } 58 | } 59 | 60 | def toListMultimap: Map[T, List[U]] = { 61 | value.foldLeft(Map[T, List[U]]().withDefaultValue(List.empty[U])) { 62 | case (map, (k, v)) => 63 | map + (k -> (v :: map(k))) 64 | } 65 | } 66 | 67 | def toSetMultimap: Map[T, Set[U]] = { 68 | value.foldLeft(Map[T, Set[U]]().withDefaultValue(Set.empty[U])) { 69 | case (map, (k, v)) => 70 | map + (k 
-> (map(k) + v)) 71 | } 72 | } 73 | 74 | def toBagMultimap: Map[T, Bag[U]] = { 75 | value.foldLeft(Map[T, Bag[U]]().withDefaultValue(Bag.empty[U])) { 76 | case (map, (k, v)) => 77 | val bag = map(k) 78 | map + (k -> (bag + v)) 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/DependencyGraphExtras.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie 2 | 3 | import edu.knowitall.tool.parse.graph.DependencyGraph 4 | import edu.knowitall.tool.parse.graph.Dependency 5 | import edu.knowitall.tool.parse.graph.DependencyNode 6 | import edu.knowitall.collection.immutable.Interval 7 | import edu.knowitall.tool.tokenize.Tokenizer 8 | import edu.knowitall.collection.immutable.graph.Graph 9 | 10 | class DependencyGraphExtras(dgraph: DependencyGraph) { 11 | private def graph = dgraph.graph 12 | 13 | def passiveVoice: Iterable[DependencyGraph] = { 14 | require(dgraph.nodes forall (_.indices.length == 1)) 15 | 16 | // look for passive constructions 17 | val activeVoices = this.graph.vertices.filter { v => 18 | (v.postag startsWith "VB") && 19 | (dgraph.dependencies exists {edge => edge.label == "nsubj" && edge.source == v}) && 20 | (dgraph.dependencies exists {edge => edge.label == "dobj" && edge.source == v}) 21 | } 22 | 23 | activeVoices map { v => 24 | val nsubj = dgraph.dependencies.find(edge => edge.label == "nsubj" && edge.source == v).get 25 | val dobj = dgraph.dependencies.find(edge => edge.label == "dobj" && edge.source == v).get 26 | val nsubjInterval = Interval.span(dgraph.graph.inferiors(nsubj.dest).map(_.indices)) 27 | val dobjInterval = Interval.span(dgraph.graph.inferiors(dobj.dest).map(_.indices)) 28 | 29 | val nsubjpass = new Dependency(v, dobj.dest, "nsubjpass") 30 | 31 | val by = new DependencyNode("by", "IN", dobjInterval.start, -1) 32 | val prep = new Dependency(v, by, "prep") 33 | val pobj = new 
Dependency(by, nsubj.dest, "pobj") 34 | val was = new DependencyNode("was", "VBD", v.indices.start, -1) 35 | val auxpass = new Dependency(nsubj.source, was, "auxpass") 36 | 37 | // adjust the edges 38 | var edges: Iterable[Dependency] = dgraph.dependencies 39 | edges = edges.toSet - nsubj - dobj + prep + pobj + auxpass + nsubjpass 40 | // adjust for the "by" node 41 | def nodeMap = { (v: DependencyNode) => 42 | var interval = v.indices 43 | if (v.indices.start >= by.indices.start && v != by) interval = DependencyGraphExtras.shift(interval, 1) 44 | if (v.indices.start >= was.indices.start && v != was) interval = DependencyGraphExtras.shift(interval, 1) 45 | new DependencyNode(v.text, v.postag, interval, v.offset) 46 | } 47 | edges = edges.map { e => e mapNodes nodeMap } 48 | 49 | edges = DependencyGraphExtras.swapOrders(edges, graph.inferiors(nsubj.dest) map nodeMap, graph.inferiors(dobj.dest) map nodeMap) 50 | 51 | // create the new graph 52 | val newGraph = new DependencyGraph(edges.flatMap(_.vertices), edges) 53 | val text = newGraph.nodes.iterator.map(_.text).mkString(" ") 54 | 55 | // compute the correct offsets 56 | val offsets = Tokenizer.computeOffsets(newGraph.nodes.iterator.map(_.text).toList, text) 57 | val nodeOffsetTransformation = 58 | ((newGraph.graph.vertices.iterator zip offsets.iterator) map {case (node, token) => node -> new DependencyNode(node.text, node.postag, node.indices, token.offset)}).toMap 59 | 60 | newGraph map nodeOffsetTransformation 61 | } 62 | } 63 | 64 | def activeVoice: Iterable[DependencyGraph] = { 65 | require(dgraph.nodes forall (_.indices.length == 1)) 66 | 67 | // look for active constructions 68 | val passiveVoices = this.graph.vertices.filter { v => 69 | if (!(v.postag startsWith "VB") && 70 | (dgraph.dependencies exists {edge => edge.label == "nsubjpass" && edge.source == v}) && 71 | (dgraph.dependencies exists (edge => edge.label == "auxpass" && edge.source == v))) 72 | false 73 | else { 74 | dgraph.dependencies.find(e => 
e.label == "prep" && e.source == v && e.dest.text == "by") match { 75 | case None => false 76 | case Some(prep) => dgraph.dependencies.exists(e => e.source == prep.dest && e.label == "pobj") 77 | } 78 | } 79 | (dgraph.dependencies exists {edge => edge.label == "prep" && edge.source == v}) 80 | } 81 | 82 | passiveVoices map { v => 83 | val nsubjpass = dgraph.dependencies.find(edge => edge.label == "nsubjpass" && edge.source == v).get 84 | val prep = dgraph.dependencies.find(edge => edge.label == "prep" && edge.source == v && edge.dest.text == "by" && dgraph.dependencies.exists(e => e.source == edge.dest && e.label == "pobj")).get 85 | val pobj = dgraph.dependencies.find(edge => edge.label == "pobj" && edge.source == prep.dest).get 86 | val auxpass = dgraph.dependencies.find(edge => edge.label == "auxpass" && edge.source == v).get 87 | 88 | val nsubj = new Dependency(v, pobj.dest, "nsubj") 89 | val dobj = new Dependency(v, nsubjpass.dest, "dobj") 90 | 91 | // adjust the edges 92 | var edges: Iterable[Dependency] = dgraph.dependencies 93 | edges = edges.toSet - nsubjpass - auxpass - prep - pobj + nsubj + dobj 94 | edges = DependencyGraphExtras.swapOrders(edges, graph.inferiors(nsubjpass.dest), graph.inferiors(pobj.dest)) 95 | 96 | val nodes = scala.collection.immutable.SortedSet.empty[DependencyNode] ++ edges.flatMap(_.nodes) 97 | val nodeMap = nodes.iterator.zipWithIndex.map{case (node, i) => node -> new DependencyNode(node.text, node.postag, Interval.singleton(i), -1)}.toMap 98 | edges = edges.map(_ mapNodes nodeMap) 99 | 100 | // create the new graph 101 | val newGraph = new DependencyGraph(edges.flatMap(_.vertices), edges) 102 | val text = newGraph.nodes.iterator.map(_.text).mkString(" ") 103 | 104 | // compute the correct offsets 105 | val offsets = Tokenizer.computeOffsets(newGraph.nodes.iterator.map(_.text).toList, text) 106 | val nodeOffsetTransformation = 107 | ((newGraph.graph.vertices.iterator zip offsets.iterator) map {case (node, token) => node -> new 
DependencyNode(node.text, node.postag, node.indices, token.offset)}).toMap 108 | 109 | newGraph map nodeOffsetTransformation 110 | } 111 | } 112 | 113 | def switchVoice: Iterable[DependencyGraph] = { 114 | passiveVoice ++ activeVoice 115 | } 116 | } 117 | 118 | object DependencyGraphExtras { 119 | private def shift(interval: Interval, by: Int) = Interval.open(interval.start + by, interval.end + by) 120 | 121 | private def swapOrders(edges: Iterable[Dependency], left: scala.collection.Set[DependencyNode], right: scala.collection.Set[DependencyNode]) = { 122 | val leftInterval = Interval.span(left.map(_.indices)) 123 | val rightInterval = Interval.span(right.map(_.indices)) 124 | 125 | require(leftInterval.end <= rightInterval.start) 126 | 127 | val leftOffset = left.iterator.map(_.offset).max 128 | val rightOffset = right.iterator.map(_.offset).min 129 | 130 | val tokensBetween = rightInterval.start - leftInterval.end + 1 131 | val charsBetween = rightOffset - leftOffset 132 | 133 | edges.map(e => e.mapNodes(v => 134 | if (left contains v) new DependencyNode(v.text, v.postag, DependencyGraphExtras.shift(v.indices, tokensBetween), v.offset + charsBetween) 135 | else if (right contains v) new DependencyNode(v.text, v.postag, DependencyGraphExtras.shift(v.indices, -tokensBetween), v.offset - charsBetween) 136 | else v)) 137 | } 138 | } -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/NaryExtraction.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie 2 | 3 | import scala.Option.option2Iterable 4 | import scala.collection.SortedSet 5 | import scala.collection.immutable 6 | 7 | import edu.knowitall.collection.immutable.Interval 8 | import edu.knowitall.openparse.extract.DetailedExtraction 9 | import edu.knowitall.openparse.extract.Extraction 10 | import edu.knowitall.openparse.extract.Extraction.AdverbialModifier 11 | import 
edu.knowitall.openparse.extract.Extraction.ClausalComponent 12 | import edu.knowitall.tool.parse.graph.DependencyNode 13 | import edu.knowitall.tool.postag.Postagger 14 | 15 | /** 16 | * Represents a part {arg1, rel, arg2} of an extraction. 17 | * 18 | * @param string the representation of the part 19 | * @param interval the interval of the part in the source sentence 20 | */ 21 | class ExtractionPart(val string: String, val interval: Interval) extends Ordered[ExtractionPart] { 22 | override def compare(that: ExtractionPart) = 23 | this.interval compare that.interval 24 | 25 | override def toString = string.replaceAll("/", "") 26 | } 27 | 28 | /** 29 | * Represents a possible suffix for an extended extraction. 30 | * For example, in the sentence "He ate from 7 until 10." 31 | * there are two suffixes: "from 7" and "until 10". 32 | * 33 | * @param string the text of the suffix 34 | * @param interval the interval of the suffix in the source sentence 35 | * @param confidence the confidence of the suffix 36 | */ 37 | class Suffix( 38 | text: String, 39 | nodes: SortedSet[DependencyNode], 40 | val confidence: Double) 41 | extends Extraction.Part(nodes, text) { 42 | override def toString = ("%1.4f" format confidence) + "/\"" + super.toString + "\"" 43 | 44 | /** Annote the suffix with a type. */ 45 | def annotate(string: String) = 46 | new AnnotatedSuffix(this, string) 47 | } 48 | 49 | /** 50 | * Represents a possible suffix for an extended extraction 51 | * along with an annotation. 52 | * 53 | * For example, in the sentence "He ate from 7 until 10." 54 | * there are two suffixes: "from 7" and "until 10". 
55 | * 56 | * @param string the text of the suffix 57 | * @param interval the interval of the suffix in the source sentence 58 | * @param confidence the confidence of the suffix 59 | * @param annotation an annotation for the suffix 60 | */ 61 | class AnnotatedSuffix( 62 | text: String, 63 | nodes: SortedSet[DependencyNode], 64 | confidence: Double, 65 | val annotation: String) 66 | extends Suffix(text, nodes, confidence) { 67 | def this(suffix: Suffix, annotation: String) = 68 | this(suffix.text, suffix.nodes, suffix.confidence, annotation) 69 | override def toString = annotation + "/" + super.toString 70 | } 71 | 72 | /** 73 | * A representaiton of an n-ary extraction, i.e. 74 | * 75 | * (Michael, ran, to the store, on Monday, at 2 PM) 76 | * 77 | * N-ary extractions have multiple secondary arguments (objects) 78 | * and these arguments include the preposition. 79 | * 80 | * @param arg1 the first argument 81 | * @param rel the relation 82 | * @param suffixes the suffixes 83 | * @param clausals a clause restricting this extraction to a context 84 | * @param modifier a modifier for this extraction (i.e. attribution) 85 | * 86 | * @author Michael Schmitz 87 | */ 88 | class NaryExtraction(val arg1: Extraction.Part, val rel: Extraction.Part, val suffixes: Seq[Suffix], val attributions: Seq[Attribution] = Seq.empty, val enablers: Seq[EnablingCondition] = Seq.empty) { 89 | override def toString = 90 | "(" + arg1.text + ", " + rel.text + ", " + suffixes.map(_.text).mkString(", ") + ")" 91 | } 92 | 93 | object NaryExtraction { 94 | implicit object SuffixOrdering extends Ordering[Suffix] { 95 | def compare(x: Suffix, y: Suffix) = x.span.compare(y.span) 96 | } 97 | 98 | /** 99 | * Create extended extractions from a collection of extractions 100 | * from the same sentence. 
101 | */ 102 | def from(extrs: Iterable[(Double, OllieExtractionInstance)]): Iterable[NaryExtraction] = { 103 | // keep extractions that end with a one-word preposition 104 | val prepositionEnding = extrs.filter { 105 | case (conf, inst) => 106 | Postagger.simplePrepositions(inst.extr.rel.text drop (1 + inst.extr.rel.text lastIndexOf ' ')) 107 | } 108 | 109 | // break off the preposition 110 | case class BrokenExtraction(rel: String, preposition: String, extr: (Double, OllieExtraction)) 111 | val split: Iterable[BrokenExtraction] = prepositionEnding.map { 112 | case (conf, inst) => 113 | val preps = Postagger.prepositions.filter(inst.extr.rel.text endsWith _) 114 | val longest = preps.maxBy(_.length) 115 | BrokenExtraction(inst.extr.rel.text.dropRight(longest.length + 1), longest, (conf, inst.extr)) 116 | } 117 | 118 | // group by the arg1 and text 119 | split groupBy { 120 | case BrokenExtraction(rel, preposition, (conf, extr)) => 121 | (extr.arg1.text, rel) 122 | } filter (_._2.size > 1) map { 123 | case ((arg1, rel), extrs) => 124 | val suffixes: immutable.SortedSet[Suffix] = extrs.map { 125 | case BrokenExtraction(rel, prep, (conf, extr)) => 126 | new Suffix(prep + " " + extr.arg2.text, extr.arg2.nodes, conf) 127 | }(scala.collection.breakOut) 128 | 129 | val first = extrs.head.extr._2 130 | val argument1 = new Extraction.Part(first.arg1.nodes, arg1) 131 | val relation = new Extraction.Part(first.rel.nodes, rel) 132 | 133 | val attributions = extrs.flatMap(_.extr._2.attribution).toSet.toSeq 134 | val enablers = extrs.flatMap(_.extr._2.enabler).toSet.toSeq 135 | 136 | new NaryExtraction(argument1, relation, suffixes.toSeq, enablers = enablers, attributions = attributions) 137 | } 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/Ollie.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie 2 | 3 | import 
scala.io.Source 4 | import edu.knowitall.collection.immutable.Interval 5 | import edu.knowitall.common.Resource.using 6 | import edu.knowitall.openparse.OpenParse 7 | import edu.knowitall.openparse.extract.DetailedExtraction 8 | import edu.knowitall.tool.parse.graph.DependencyGraph 9 | import edu.knowitall.tool.stem.MorphaStemmer 10 | import edu.knowitall.tool.stem.Stemmer 11 | 12 | /** Ollie is an Open Information Extractor that produces binary extractions 13 | * with context. The constructor takes an OpenParse instance. Ollie extends 14 | * OpenParse's extractions with enabling conditions and attributions. There 15 | * is also a trained confidence function for OllieExtractions. 16 | * 17 | * @author Michael Schmitz 18 | */ 19 | class Ollie(val openparse: OpenParse) { 20 | val stemmer = new MorphaStemmer 21 | 22 | /** Construct with the default model. */ 23 | def this() = this(OpenParse.withDefaultModel(OpenParse.Configuration(confidenceThreshold = 0.005))) 24 | 25 | def apply(dgraph: DependencyGraph): Iterable[OllieExtractionInstance] = 26 | extract(dgraph) 27 | 28 | /** 29 | * primary method for getting extractions 30 | */ 31 | def extract(dgraph: DependencyGraph): Iterable[OllieExtractionInstance] = { 32 | val openparseExtrs = openparse.extract(dgraph) 33 | 34 | for { 35 | (conf, extr) <- openparseExtrs 36 | enabler = enablingAdverbialClauseHelper(extr) 37 | attribution = attribClausalComponentHelper(extr) 38 | } yield new OllieExtractionInstance( 39 | new OllieExtraction(extr.arg1, extr.rel, extr.arg2, conf, enabler, attribution), dgraph, extr.extractor) 40 | } 41 | 42 | /** Identify enabling condition, i.e. "if it's raining..." 
*/ 43 | private def enablingAdverbialClauseHelper(extr: DetailedExtraction): Option[EnablingCondition] = { 44 | extr.modifier map { modifier => 45 | val prefix = modifier.contents.nodes.head.text 46 | val phrase = modifier.contents.nodes.iterator.drop(1).map(_.text).mkString(" ") 47 | 48 | new EnablingCondition(prefix, phrase, modifier.contents.span) 49 | } 50 | } 51 | 52 | /** Identify attributions from clausal components, i.e. "He said..." */ 53 | private def attribClausalComponentHelper(extr: DetailedExtraction): Option[Attribution] = { 54 | extr.clausal flatMap { clausal => 55 | // find the first verb in the clausal rel 56 | clausal.rel.nodes.find(_.postag.startsWith("VB")).flatMap { node => 57 | val normalized = stemmer.stem(node.text.toLowerCase()) 58 | if (Ollie.communicationWords.contains(normalized) || Ollie.cognitiveWords.contains(normalized)) { 59 | val clausalArgInterval = Interval.span(clausal.arg.nodes.map(_.indices)) 60 | val clausalRelInterval = Interval.span(clausal.rel.nodes.map(_.indices)) 61 | Some(new Attribution( 62 | clausal.arg.text, 63 | clausal.arg.span, 64 | clausal.rel.text, 65 | clausal.rel.span)) 66 | } else None 67 | } 68 | } 69 | } 70 | } 71 | 72 | object Ollie { 73 | implicit def stemmer: Stemmer = MorphaStemmer 74 | 75 | /** A collection of verbs used for communication, i.e. "said" */ 76 | val communicationWords = using(Source.fromInputStream(classOf[Ollie].getResource("communicationWords.txt").openStream())) { source => 77 | source.getLines.toSet 78 | } 79 | 80 | /** A collection of verbs used for beliefs, i.e. "think" */ 81 | val cognitiveWords = using(Source.fromInputStream(classOf[Ollie].getResource("cognitiveWords.txt").openStream())) { source => 82 | source.getLines.toSet 83 | } 84 | 85 | /** A collection of prefixes used for enabling conditions, i.e. 
"if" and "when" */ 86 | val enablerPrefixes = using(Source.fromInputStream(classOf[Ollie].getResource("prefixWords.txt").openStream())) { source => 87 | source.getLines.toSet 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/OllieExtraction.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie 2 | 3 | import scala.Option.option2Iterable 4 | import scala.collection.breakOut 5 | 6 | import edu.knowitall.collection.immutable.Interval 7 | import edu.knowitall.common.HashCodeHelper 8 | import edu.knowitall.openparse.extract.Extraction.Part 9 | import edu.knowitall.tool.parse.graph.DependencyNode 10 | 11 | /** A base representation for additional context around an extraction. */ 12 | sealed abstract class Context { 13 | def text: String 14 | def interval: Interval 15 | } 16 | 17 | /** A representation for an enabling condition. 18 | * An example of an enabling condition is "if it's raining". 19 | */ 20 | case class EnablingCondition( 21 | /** The enabling condition word, i.e. "if" */ 22 | val prefix: String, 23 | /** The rest of the enabling condition, i.e. 
"it's raining" */ 24 | val phrase: String, 25 | /** The token interval of the enabling condition */ 26 | override val interval: Interval) extends Context { 27 | override def text = prefix + " " + phrase 28 | 29 | def serialize: String = Seq(prefix, phrase, interval.start.toString, interval.last.toString).map(_.replaceAll("_", "_UNSC_")).mkString("_") 30 | } 31 | 32 | object EnablingCondition { 33 | def deserialize(string: String) = { 34 | val Array(prefix, phrase, intervalStart, intervalLast) = try (string.split("_")) 35 | catch { 36 | case e => throw new RuntimeException("could not deserialize EnablingCondition: " + string, e); 37 | } 38 | new EnablingCondition(prefix, phrase, Interval.closed(intervalStart.toInt, intervalLast.toInt)) 39 | } 40 | } 41 | 42 | /** A representation for an attribution. 43 | * An example of an is "Obama believes". 44 | */ 45 | case class Attribution( 46 | /** The argument of the attribution, i.e. "Obama" */ 47 | val arg: String, 48 | /** The token interval of the argument of the attribution */ 49 | val argInterval: Interval, 50 | /** The relation of the attribution, i.e. 
"believes" */ 51 | val rel: String, 52 | /** The token interval of the relation of the attribution */ 53 | override val interval: Interval) extends Context { 54 | override def text = arg + " " + rel 55 | 56 | def serialize: String = { 57 | val fields = Seq(arg, rel, argInterval.start.toString, argInterval.last.toString, interval.start.toString, interval.last.toString) 58 | fields.map(_.replaceAll("_", "_UNSC_")).mkString("_") 59 | } 60 | } 61 | 62 | object Attribution { 63 | def deserialize(string: String) = { 64 | val Array(arg, rel, argIntervalStart, argIntervalLast, relIntervalStart, relIntervalLast) = try (string.split("_")) 65 | catch { 66 | case e => throw new RuntimeException("could not deserialize Attribution: " + string, e); 67 | } 68 | val argInterval = Interval.closed(argIntervalStart.toInt, argIntervalLast.toInt) 69 | val relInterval = Interval.closed(relIntervalStart.toInt, relIntervalLast.toInt) 70 | 71 | new Attribution(arg, argInterval, rel, relInterval) 72 | } 73 | } 74 | 75 | /** A representation of an Ollie extraction, i.e. we could get the following 76 | * extraction from the example sentence. 77 | * 78 | * {{{ 79 | * When I'm dreaming David Bowie sings that Ziggy sucked up into his mind. 80 | * (Ziggy, sucked up, into his mind)[attribution = "David Bowie") 81 | * }}} 82 | */ 83 | class OllieExtraction( 84 | /** The first argument (subject) of the extraction, i.e. "Ziggy" */ 85 | val arg1: Part, 86 | /** The relation of the extraction, i.e. "sucked up" */ 87 | val rel: Part, 88 | /** The second argument (object) of the extraction, i.e. "into his mind" */ 89 | val arg2: Part, 90 | /** The confidence value from OpenParse. */ 91 | private[ollie] val openparseConfidence: Double, 92 | /** The enabling condition, if any. I.e. "When I'm dreaming" */ 93 | val enabler: Option[EnablingCondition], 94 | /** The attribution, if any. I.e. 
"David Bowie sings that" */ 95 | val attribution: Option[Attribution]) { 96 | 97 | import OllieExtraction.{serializePart, deserializePart} 98 | 99 | override def equals(that: Any) = that match { 100 | case that: OllieExtraction => 101 | this.arg1 == that.arg1 && 102 | this.rel == that.rel && 103 | this.arg2 == that.arg2 && 104 | this.enabler == that.enabler && 105 | this.attribution == that.attribution && 106 | this.openparseConfidence == that.openparseConfidence 107 | case _ => false 108 | } 109 | 110 | override def hashCode = HashCodeHelper( 111 | this.arg1, 112 | this.rel, 113 | this.arg2, 114 | this.enabler, 115 | this.attribution, 116 | this.openparseConfidence) 117 | 118 | def tabSerialize: String = { 119 | val enablerString = enabler match { 120 | case Some(enablingCondition) => enablingCondition.serialize 121 | case None => "None" 122 | } 123 | val attrString = attribution match { 124 | case Some(attr) => attr.serialize 125 | case None => "None" 126 | } 127 | 128 | val fieldStrings = Seq(arg1, rel, arg2).map(serializePart(_)) ++ Seq("%.05f".format(openparseConfidence), enablerString, attrString) 129 | fieldStrings.map(_.replaceAll("\t", "_TAB_")).mkString("\t") 130 | } 131 | 132 | /** The full text of this extraction. */ 133 | def text = Iterable(arg1.text, rel.text, arg2.text).mkString(" ") 134 | 135 | /** All the nodes in this extraction. */ 136 | def nodes = arg1.nodes ++ rel.nodes ++ arg2.nodes 137 | 138 | /** The spanning interval of the nodes in this extraction. 
*/ 139 | def span = Interval.span(nodes.map(_.indices)) 140 | 141 | override def toString = { 142 | val extentions = Iterable( 143 | enabler.map("enabler="+_.text), 144 | attribution.map("attrib="+_.text)).flatten match { 145 | case Nil => "" 146 | case list => list.mkString("[", ";", "]") 147 | } 148 | "(%s; %s; %s)".format(arg1.text, rel.text, arg2.text) + extentions 149 | } 150 | } 151 | 152 | object OllieExtraction { 153 | def tabDelimitedColumns = Seq("Arg1Part", "RelPart", "Arg2Part", "Confidence", "Enabler", "Attribution").mkString("\t") 154 | 155 | def tabDeserialize(array: Seq[String]): (OllieExtraction, Seq[String]) = { 156 | array match { 157 | case Seq(arg1Part, relPart, arg2Part, openparseConfString, enablerString, attrString, rest @ _*) => { 158 | val parts = Seq(arg1Part, relPart, arg2Part) map deserializePart 159 | val enabler = if (enablerString.equals("None")) None else Some(EnablingCondition.deserialize(enablerString)) 160 | val attribution = if (attrString.equals("None")) None else Some(Attribution.deserialize(attrString)) 161 | val extr = new OllieExtraction(parts(0), parts(1), parts(2), openparseConfString.toDouble, enabler, attribution) 162 | (extr, rest) 163 | } 164 | } 165 | } 166 | 167 | def tabDeserialize(s: String): OllieExtraction = { 168 | val (extr, rest) = tabDeserialize(s.split("\t")) 169 | require(rest.isEmpty) 170 | extr 171 | } 172 | 173 | def serializePart(part: Part): String = { 174 | val serializedNodes = part.nodes.iterator.map(_.serialize).mkString("; ") 175 | Iterable(part.text, serializedNodes).mkString(" ;;; ") 176 | } 177 | 178 | def deserializePart(string: String): Part = { 179 | val Array(partText, partNodes) = try (string.split("\\s*;;;\\s*")) 180 | catch { 181 | case e => throw new RuntimeException("could not deserialize Extraction.Part: " + string, e); 182 | } 183 | 184 | val nodesSortedSet: scala.collection.SortedSet[DependencyNode] = 185 | try 
(partNodes.split("\\s*;\\s*").map(DependencyNode.deserialize(_))(breakOut)) 186 | catch { 187 | case e => throw new RuntimeException("could not deserialize Extraction.Part: " + string, e); 188 | } 189 | 190 | new Part(nodesSortedSet, partText) 191 | } 192 | } 193 | 194 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/OllieExtractionInstance.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie 2 | 3 | import edu.knowitall.common.HashCodeHelper 4 | import edu.knowitall.openparse.extract.PatternExtractor 5 | import edu.knowitall.tool.parse.graph.DependencyGraph 6 | import scala.util.matching.Regex 7 | 8 | /** OllieExtractionInstance represents an extraction coupled with 9 | * its source sentence. 10 | */ 11 | class OllieExtractionInstance( 12 | /** The associated extraction. */ 13 | val extr: OllieExtraction, 14 | /** The associated sentence. */ 15 | val sent: DependencyGraph, 16 | /** The extractor used. */ 17 | val pat: PatternExtractor) { 18 | 19 | override def equals(that: Any) = that match { 20 | case that: OllieExtractionInstance => this.extr == that.extr && this.sent == that.sent 21 | case _ => false 22 | } 23 | override def hashCode = HashCodeHelper(extr, sent) 24 | 25 | def extraction = extr 26 | def sentence = sent 27 | def pattern = pat 28 | 29 | private val passivePatternRegex = new Regex("""^\{arg1:?\w*\} dobj> \{arg2:?\w*\}""") 40 | /** Report if this extraction is an active construction. 41 | * This is a crude measure so false should not be taken to mean 42 | * that it is not active. 43 | * 44 | * An extraction is active if it has a valid passive formulation 45 | * by swapping the arguments and modifying the relation (adding "be" 46 | * and "by"). 
47 | */ 48 | def active: Boolean = 49 | activePatternRegex.pattern.matcher(pat.pattern.serialize).matches() 50 | 51 | def tabSerialize: String = { 52 | val serializedGraph = sent.serialize 53 | val serializedExtr = extr.tabSerialize 54 | Seq(serializedGraph, pat.tabSerialize, serializedExtr).mkString("\t") 55 | } 56 | } 57 | 58 | object OllieExtractionInstance { 59 | def tabDeserialize(string: String): OllieExtractionInstance = { 60 | val array = string.split('\t') 61 | 62 | val (extr, rest) = tabDeserialize(array) 63 | require(rest.isEmpty) 64 | 65 | extr 66 | } 67 | 68 | def tabDeserialize(array: Seq[String]): (OllieExtractionInstance, Seq[String]) = { 69 | try { 70 | val Seq(serializedGraph, r0 @ _*) = array 71 | 72 | val graph = DependencyGraph.deserialize(serializedGraph) 73 | val (pat, r1) = PatternExtractor.tabDeserialize(r0) 74 | val (extr, r2) = OllieExtraction.tabDeserialize(r1) 75 | 76 | (new OllieExtractionInstance(extr, graph, pat), r2) 77 | } catch { 78 | case e => throw new IllegalArgumentException("Could not tab deserialize: " + array.mkString("\t"), e) 79 | } 80 | } 81 | 82 | val numFinder = "[0-9]+".r 83 | } 84 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/ScoredOllieExtractionInstance.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie 2 | 3 | import edu.knowitall.tool.conf.Labelled 4 | 5 | /** OllieExtractionInstance represents a boolean score coupled with 6 | * an extraciton instance. 
7 | * 8 | * @param score the label for this extraction 9 | * @param inst the extraction instance labelled 10 | */ 11 | class ScoredOllieExtractionInstance( 12 | val score: Boolean, 13 | val inst: OllieExtractionInstance) extends Labelled[OllieExtractionInstance](score, inst) { 14 | 15 | override def toString = score + ":" + inst.extr 16 | 17 | def tabSerialize: String = { 18 | Iterable(if (score) 1 else 0, inst.extr.toString, inst.tabSerialize).mkString("\t") 19 | } 20 | } 21 | 22 | object ScoredOllieExtractionInstance { 23 | def tabDeserialize(string: String): ScoredOllieExtractionInstance = { 24 | try { 25 | val Array(scoreString, _, rest @ _*) = string.split('\t') 26 | 27 | val score = 28 | if (scoreString == "1") true 29 | else if (scoreString == "0") false 30 | else throw new IllegalArgumentException("bad score: " + scoreString) 31 | val (inst, r2) = OllieExtractionInstance.tabDeserialize(rest) 32 | 33 | require(r2.isEmpty) 34 | 35 | new ScoredOllieExtractionInstance(score, inst) 36 | } catch { 37 | case e => throw new IllegalArgumentException("could not tab deserialize: " + string, e) 38 | } 39 | } 40 | 41 | val numFinder = "[0-9]+".r 42 | } 43 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/confidence/OllieConfidenceFunction.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie.confidence 2 | 3 | import java.io.InputStream 4 | import java.net.URL 5 | import java.util.Scanner 6 | 7 | import scala.collection.mutable 8 | 9 | import org.slf4j.LoggerFactory 10 | 11 | import edu.knowitall.common.Resource.using 12 | import edu.knowitall.ollie.OllieExtractionInstance 13 | import edu.knowitall.tool.conf.FeatureSet 14 | import edu.knowitall.tool.conf.impl.LogisticRegression 15 | 16 | /** An implementation of logistic regression of features that can be 17 | * represented as a double. 
*/ 18 | 19 | object OllieConfidenceFunction { 20 | val logger = LoggerFactory.getLogger(classOf[OllieIndependentConfFunction]) 21 | 22 | type OllieIndependentConfFunction = LogisticRegression[OllieExtractionInstance] 23 | 24 | val defaultModelUrl = Option(this.getClass.getResource("default-classifier.txt")).getOrElse { 25 | throw new IllegalArgumentException("Could not load confidence function resource.") 26 | } 27 | 28 | def loadDefaultClassifier(): OllieIndependentConfFunction = { 29 | fromUrl(OllieFeatureSet, defaultModelUrl) 30 | } 31 | 32 | def fromUrl(featureSet: FeatureSet[OllieExtractionInstance, Double], url: URL): OllieIndependentConfFunction = { 33 | LogisticRegression.fromUrl(featureSet, url) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/confidence/OllieFeatureEvaluation.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie.confidence 2 | 3 | import java.io.File 4 | import java.io.PrintWriter 5 | import java.net.URL 6 | 7 | import scala.io.Source 8 | 9 | import edu.knowitall.common.Analysis 10 | import edu.knowitall.common.Resource.using 11 | import edu.knowitall.ollie.ScoredOllieExtractionInstance 12 | import scopt.OptionParser 13 | 14 | object OllieFeatureEvaluation { 15 | /** Settings for OpenParse. 
*/ 16 | abstract class Settings { 17 | /** source file of scored extractions */ 18 | def inputFile: File 19 | 20 | /** file to output; None means stdout */ 21 | def outputFile: Option[File] 22 | 23 | /** confidence model url */ 24 | def confidenceModelUrl: URL 25 | } 26 | 27 | def main(args: Array[String]) = { 28 | var settings = new Settings { 29 | var inputFile: File = _ 30 | var outputFile: Option[File] = None 31 | var confidenceModelUrl: URL = OllieConfidenceFunction.defaultModelUrl 32 | } 33 | 34 | val parser = new OptionParser("feature-eval") { 35 | opt(Some("c"), "confidence model", "", "confidence model file", { path: String => 36 | val file = new File(path) 37 | require(file.exists, "file does not exist: " + path) 38 | settings.confidenceModelUrl = file.toURI.toURL 39 | }) 40 | 41 | opt("o", "output", "output file (otherwise stdout)", { path => 42 | val file = new File(path) 43 | settings.outputFile = Some(file) 44 | }) 45 | 46 | arg("input", "input dependencies file", { path: String => 47 | val file = new File(path) 48 | require(file.exists, "input file does not exist: " + path) 49 | settings.inputFile = file 50 | }) 51 | } 52 | 53 | if (parser.parse(args)) { 54 | run(settings) 55 | } 56 | } 57 | 58 | def run(settings: Settings) = { 59 | val confFunc = OllieConfidenceFunction.fromUrl(OllieFeatureSet, settings.confidenceModelUrl) 60 | 61 | val extrs = using (Source.fromFile(settings.inputFile)) { source => 62 | for ( 63 | line <- source.getLines.toList; 64 | val scored = ScoredOllieExtractionInstance.tabDeserialize(line); 65 | val conf = confFunc(scored.inst) 66 | ) yield (conf, scored) 67 | } 68 | 69 | val sorted = extrs.sortBy(-_._1).toList 70 | 71 | val pyed = (sorted.head, 0, 1.0) +: Analysis.precisionYieldMeta(sorted zip sorted.map(_._2.score)) 72 | 73 | val featureNames = confFunc.featureSet.featureNames.filter(confFunc.featureWeights.get(_).isDefined).toList.sorted 74 | using { 75 | settings.outputFile match { 76 | case Some(f) => new PrintWriter(f, 
"UTF8") 77 | case None => new PrintWriter(System.out) 78 | } 79 | } { writer => 80 | writer.println((Iterable("score", "conf", "op-conf", "yield", "precision", 81 | "extr", "enabler", "attrib", "sentence", "dependencies") ++ 82 | featureNames).mkString("\t")) 83 | writer.println("\t" * 10 + featureNames.map(confFunc.featureWeights(_).toString).mkString("\t")) 84 | (pyed) foreach { case ((conf, scored), y, p) => 85 | val features = 86 | for ( 87 | featureName <- featureNames; 88 | val featureValue = confFunc.featureSet(featureName)(scored.inst) 89 | ) yield featureValue 90 | 91 | writer.println((Iterable(if (scored.score) 1 else 0, 92 | conf, 93 | scored.inst.extr.openparseConfidence, 94 | y, 95 | p, 96 | scored.inst.extr.toString, 97 | scored.inst.extr.enabler.isDefined.toString.toLowerCase, 98 | scored.inst.extr.attribution.isDefined.toString.toLowerCase, 99 | scored.inst.sent.text, 100 | scored.inst.sent.serialize) ++ features).mkString("\t")) 101 | } 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/confidence/train/CrossValidateConfidence.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie.confidence.train 2 | 3 | import java.io.File 4 | 5 | import scala.io.Source 6 | 7 | import edu.knowitall.common.Analysis 8 | import edu.knowitall.common.Resource.using 9 | import edu.knowitall.ollie.ScoredOllieExtractionInstance 10 | import edu.knowitall.ollie.confidence.OllieFeatureSet 11 | import edu.knowitall.tool.conf.BreezeLogisticRegressionTrainer 12 | import scopt.mutable.OptionParser 13 | 14 | object CrossValidateConfidence { 15 | def main(args: Array[String]) { 16 | object settings extends Settings { 17 | var inputFile: File = _ 18 | var outputFile: Option[File] = None 19 | } 20 | 21 | val parser = new OptionParser("scoreextr") { 22 | arg("labelled", "labelled extractions", { path: String => 
settings.inputFile = new File(path) }) 23 | argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) }) 24 | } 25 | 26 | if (parser.parse(args)) { 27 | run(settings) 28 | } 29 | } 30 | 31 | abstract class Settings { 32 | def inputFile: File 33 | def outputFile: Option[File] 34 | 35 | val splits = 10 36 | } 37 | 38 | 39 | def run(settings: Settings) = { 40 | val trainer = new BreezeLogisticRegressionTrainer(OllieFeatureSet) 41 | 42 | val data = 43 | using (Source.fromFile(settings.inputFile)) { source => 44 | (source.getLines map (ScoredOllieExtractionInstance.tabDeserialize)).toList 45 | } 46 | 47 | val splits = data.iterator.sliding(data.size / settings.splits, data.size / settings.splits).withPartial(false) 48 | val results = for { 49 | split <- splits.toList 50 | 51 | val test = split 52 | val training = data filterNot (test contains _) 53 | 54 | val classifier = trainer.train(training) 55 | } yield { 56 | for (example <- test) yield { 57 | val conf = classifier.apply(example.inst) 58 | val correct = 59 | if (conf >= 0.5 && example.score) true 60 | else if (conf < 0.5 && !example.score) true 61 | else false 62 | (conf, correct) 63 | } 64 | } 65 | 66 | val pys = results.map { list => 67 | val py = Analysis.precisionYield(list.sortBy(-_._1).map(_._2)) 68 | 69 | py 70 | } 71 | 72 | val aucs = pys.zipWithIndex map { case (py, i) => 73 | println("Split " + i) 74 | py foreach { case (y, p) => 75 | println(Iterable(y.toString, "%1.4f" format p).mkString("\t")) 76 | } 77 | 78 | val auc = Analysis.areaUnderCurve(py) 79 | println("auc: " + auc) 80 | 81 | println() 82 | auc 83 | } 84 | 85 | var auc = breeze.linalg.mean(aucs) 86 | println("avg auc: " + auc) 87 | } 88 | } -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/ollie/confidence/train/TrainOllieConfidence.scala: -------------------------------------------------------------------------------- 1 | 2 | package 
package edu.knowitall.ollie.confidence.train

import java.io.File

import scala.io.Source

import edu.knowitall.common.Resource.using
import edu.knowitall.ollie.ScoredOllieExtractionInstance
import edu.knowitall.ollie.confidence.OllieFeatureSet
import edu.knowitall.tool.conf.BreezeLogisticRegressionTrainer
import scopt.mutable.OptionParser

/** Trains the Ollie confidence classifier from labelled extractions.
  *
  * Reads tab-serialized, scored extraction instances from a file, trains a
  * logistic-regression classifier over OllieFeatureSet, and writes the model
  * to the output file (or stdout when no output file is given).
  */
object TrainOllieConfidence {
  def main(args: Array[String]) {
    object settings extends Settings {
      var inputFile: File = _
      var outputFile: Option[File] = None
    }

    val parser = new OptionParser("scoreextr") {
      arg("labelled", "labelled extractions", { path: String => settings.inputFile = new File(path) })
      argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  abstract class Settings {
    /** file of tab-serialized, scored extraction instances */
    def inputFile: File
    /** destination for the trained model; None means stdout */
    def outputFile: Option[File]
  }

  def run(settings: Settings) = {
    val trainer = new BreezeLogisticRegressionTrainer(OllieFeatureSet)

    // deserialize the labelled training examples, one per line
    val data =
      using(Source.fromFile(settings.inputFile)) { source =>
        source.getLines.map(ScoredOllieExtractionInstance.tabDeserialize).toList
      }

    // train and persist the classifier
    val classifier = trainer.train(data)
    settings.outputFile match {
      case Some(file) => classifier.saveFile(file)
      case None => classifier.save(System.out)
    }
  }
}
package edu.knowitall.ollie.output

import edu.knowitall.ollie.OllieExtractionInstance
import edu.knowitall.openparse.extract.Extraction
import edu.knowitall.collection.immutable.Interval
import edu.knowitall.ollie.ExtractionPart
import edu.knowitall.tool.segment.Segment
import java.io.PrintWriter

/** Writes Ollie extractions in brat standoff-annotation format.
  *
  * @param extractor function from sentence text to extraction instances
  */
class BratOutput(extractor: String => Iterable[OllieExtractionInstance]) {
  /** Run the extractor over each segment and write one annotation entry per line. */
  def process(sentences: Iterable[Segment], writer: PrintWriter) = {
    val document = new Document()
    for {
      Segment(text, offset) <- sentences
      inst <- extractor(text)
      entry <- document.annotations(inst, offset)
    } {
      writer.println(entry)
    }
  }

  /** Keeps per-document counters so entity (T) and relation (R) labels are unique. */
  class Document {
    var entityIndex = 0    // next numeric suffix for a T label
    var relationIndex = 0  // next numeric suffix for an R label

    /** Standoff entries for one extraction: three entity lines (arg1, arg2,
      * relation) plus two relation lines tying the arguments to the relation.
      * Character spans are shifted by sentenceCharacterOffset into document space. */
    def annotations(inst: OllieExtractionInstance, sentenceCharacterOffset: Int) = {
      // one entity line: "<name> <start> <end>\t<covered text>"
      def partToAnnotation(inst: OllieExtractionInstance, part: Extraction.Part, partName: String) = {
        val tokens = inst.sentence.nodes.toList.slice(part.span.start, part.span.end)
        // NOTE(review): start uses tokens.head.offset but end uses tokens.last.offsets.end
        // — presumably both address the token's character span; confirm upstream API.
        val charInterval = Interval.open(tokens.head.offset, tokens.last.offsets.end)
        partName + " " + (sentenceCharacterOffset + charInterval.start) + " " + (sentenceCharacterOffset + charInterval.end) + "\t" + inst.sentence.text.substring(charInterval.start, charInterval.end)
      }

      case class LabelledEntry(label: String, entry: String)
      def label(identifier: Char, index: Int, entry: String) = LabelledEntry(identifier.toString + index, entry)

      val entries = {
        // arguments first so their T labels precede the relation's
        val arguments = List(inst.extr.arg1, inst.extr.arg2) map { arg =>
          val labelled = label('T', entityIndex, partToAnnotation(inst, arg, "Argument"))
          entityIndex += 1
          labelled
        }
        val relation = {
          val labelled = label('T', entityIndex, partToAnnotation(inst, inst.extr.rel, "Relation"))
          entityIndex += 1
          labelled
        }

        val entities = relation :: arguments

        // R entries: "ArgN-of Arg1:<relation label> Arg2:<argument label>"
        val relations = arguments zip List("Arg1", "Arg2") map {
          case (entry, edge) =>
            val labelled = label('R', relationIndex, edge + "-of Arg1:" + relation.label + " Arg2:" + entry.label)
            relationIndex += 1
            labelled
        }

        entities ::: relations
      }

      entries map {
        case LabelledEntry(label, entry) => label + "\t" + entry
      }
    }
  }
}
package edu.knowitall.openparse

import java.io.{PrintWriter, File}

import scala.Option.option2Iterable
import scala.collection.mutable
import scala.io.Source

import edu.knowitall.collection.immutable.graph.pattern.DirectedEdgeMatcher
import edu.knowitall.common.Resource
import edu.knowitall.tool.parse.graph.{PostagNodeMatcher, LabelEdgeMatcher, DependencyPattern, DependencyGraph}
import edu.knowitall.ollie.Ollie.stemmer

/** Groups extraction rows by the pattern that produced them.
  *
  * args(0): patterned extraction file (tab-separated, pattern in column 5)
  * args(1): output file
  */
object AnalyzePatterns {
  def main(args: Array[String]) {
    val patternedFilePath = args(0)
    val outputFilePath = args(1)

    // first pass: count how often each pattern occurs
    println("Counting pattern occurrence...")
    val patterns = mutable.HashMap[String, Int]().withDefaultValue(0)
    Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
      for (line <- source.getLines) {
        val Array(_, _, _, _, pattern, _, _, _*) = line.split("\t", -1)
        patterns += pattern -> (patterns(pattern) + 1)
      }
    }

    // second pass: for each pattern seen more than 100 times, dump its rows.
    // NOTE(review): the input file is re-read once per frequent pattern, so
    // this is O(patterns x file size) — acceptable for an offline tool.
    println("Grouping patterns...")
    Resource.using(new PrintWriter(new File(outputFilePath), "UTF8")) { writer =>
      val ordered = patterns.toList.sortBy(_._2)(implicitly(Ordering[Int]).reverse)
      for ((pattern, count) <- ordered.filter(_._2 > 100)) {
        println(count + ":" + pattern)
        Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
          writer.println(pattern + "\t" + count)
          for (line <- source.getLines) {
            val Array(rel, arg1, arg2, lemmas, p, sentence, deps, _*) = line.split("\t", -1)
            if (p == pattern) {
              writer.println(Iterable(rel, arg1, arg2, lemmas).mkString("\t"))
              writer.println(sentence)
              writer.println(deps)
              writer.println()
            }
          }
        }
      }

      println()
    }
  }
}

/** Tallies edge labels and postags used by the serialized patterns in column 5. */
object CountPatternComponents {
  def main(args: Array[String]) {
    val patternedFilePath = args(0)

    val edgeCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    val postagCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
      for (line <- source.getLines) {
        val Array(_, _, _, _, pickledPattern, _, _, _*) = line.split("\t", -1)
        val pattern = new ExtractorPattern(DependencyPattern.deserialize(pickledPattern))

        // labels of directed edge matchers that wrap a LabelEdgeMatcher
        val labels = pattern.edgeMatchers.toList.flatMap {
          case e: DirectedEdgeMatcher[_] if e.matcher.isInstanceOf[LabelEdgeMatcher] =>
            Some(e.matcher.asInstanceOf[LabelEdgeMatcher].label)
          case _ => None
        }
        // postags demanded by the pattern's node matchers
        val postags = pattern.baseNodeMatchers.toList.collect {
          case m: PostagNodeMatcher => m.postag
        }

        for (l <- labels) {
          edgeCounts += l -> (edgeCounts(l) + 1)
        }
        for (postag <- postags) {
          postagCounts += postag -> (postagCounts(postag) + 1)
        }
      }
    }

    println("Postag counts: ")
    for ((k, v) <- postagCounts.toList.sortBy(_._2).reverse) {
      println(k + "\t" + v)
    }

    println()
    println("Edge counts: ")
    for ((k, v) <- edgeCounts.toList.sortBy(_._2).reverse) {
      println(k + "\t" + v)
    }
  }
}

/** Tallies edge labels, postags, and postag-edge-postag "pieces" from the
  * serialized dependency graphs in column 7. */
object CountSentenceComponents {
  def main(args: Array[String]) {
    val patternedFilePath = args(0)

    val edgeCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    val postagCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    val pieceCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
      for (line <- source.getLines) {
        val Array(_, _, _, _, _, _, pickledGraph, _*) = line.split("\t", -1)
        val graph = DependencyGraph.deserialize(pickledGraph)
        val labels = graph.graph.edges.toList.map(_.label)
        val postags = graph.graph.vertices.toList.map(_.postag)

        for (l <- labels) {
          edgeCounts += l -> (edgeCounts(l) + 1)
        }
        for (postag <- postags) {
          postagCounts += postag -> (postagCounts(postag) + 1)
        }

        // count each edge in both directions so pieces are orientation-free
        for (edge <- graph.graph.edges) {
          val piece1 = edge.source.postag + " " + edge.label + " " + edge.dest.postag
          val piece2 = edge.dest.postag + " " + edge.label + " " + edge.source.postag

          pieceCounts += piece1 -> (pieceCounts(piece1) + 1)
          pieceCounts += piece2 -> (pieceCounts(piece2) + 1)
        }
      }
    }

    println("Postag counts: ")
    for ((k, v) <- postagCounts.toList.sortBy(_._2).reverse) {
      println(k + "\t" + v)
    }

    println()
    println("Edge counts: ")
    for ((k, v) <- edgeCounts.toList.sortBy(_._2).reverse) {
      println(k + "\t" + v)
    }

    println()
    println("Piece counts: ")
    for ((k, v) <- pieceCounts.toList.sortBy(_._2).reverse) {
      println(k + "\t" + v)
    }
  }
}
package edu.knowitall.openparse

import scala.io.Source

import org.slf4j.LoggerFactory

import edu.knowitall.collection.immutable.graph.pattern.CaptureNodeMatcher
import edu.knowitall.collection.immutable.graph.pattern.EdgeMatcher
import edu.knowitall.collection.immutable.graph.pattern.Matcher
import edu.knowitall.collection.immutable.graph.pattern.NodeMatcher
import edu.knowitall.collection.immutable.graph.pattern.Pattern
import edu.knowitall.collection.immutable.graph.pattern.TrivialNodeMatcher
import edu.knowitall.ollie.Ollie.stemmer
import edu.knowitall.tool.parse.graph.DependencyNode
import edu.knowitall.tool.parse.graph.DependencyPattern
import edu.knowitall.tool.parse.graph.LabelEdgeMatcher
import edu.knowitall.tool.parse.graph.RegexNodeMatcher
import scalaz._
import scalaz.Scalaz._

/** A wrapper for a dependency pattern that adds some convenience methods
  * for working with patterns intended for extraction of binary relations.
  *
  * @author Michael Schmitz
  */
class ExtractorPattern(matchers: List[Matcher[DependencyNode]]) extends DependencyPattern(matchers) {
  val logger = LoggerFactory.getLogger(this.getClass)

  /** Lift the capture matchers of a generic pattern to the extraction-part
    * matcher classes, keyed on the first three characters of the alias. */
  def this(pattern: Pattern[DependencyNode]) = this(pattern.matchers.map { _ match {
    case m: ExtractionPartMatcher => m
    // lift extractor matchers to a more representitive class
    case m: CaptureNodeMatcher[_] => m.alias.take(3) match {
      case "arg" => new ArgumentMatcher(m.alias, m.matcher)
      case "rel" => new RelationMatcher(m.alias, m.matcher)
      case "slo" => new SlotMatcher(m.alias, m.matcher)
      case _ => throw new IllegalArgumentException("Unknown capture alias: " + m.alias)
    }
    // keep everything else the same
    case m => m
  }})

  override def canEqual(that: Any) = that.isInstanceOf[ExtractorPattern]
  override def equals(that: Any) = that match {
    case that: ExtractorPattern => (that canEqual this) && this.matchers == that.matchers
    case _ => false
  }

  /** True when the relation matcher constrains its text with a regex,
    * i.e. the pattern carries lexical (semantic) restrictions. */
  def semantic: Boolean = matchers.exists {
    case m: RelationMatcher => m.baseNodeMatchers exists { case m: RegexNodeMatcher => true case _ => false }
    case _ => false
  }

  /** Heuristic filter: reject patterns whose shape is known to produce
    * poor extractions (dep edges, conjunctions, multiple preps on short
    * patterns, slots at the ends, slots adjacent to nn edges). */
  def valid: Boolean = {
    def existsEdge(pred: LabelEdgeMatcher => Boolean) =
      this.baseEdgeMatchers.collect {
        case e: LabelEdgeMatcher => e
      } exists (pred)

    /* check for multiple prep edges */
    def multiplePreps = this.baseEdgeMatchers.collect {
      case e: LabelEdgeMatcher => e
    }.count(_.label.contains("prep")) > 1

    /* check for a conj_and edge */
    def conjAnd = existsEdge(_.label == "conj_and")

    /* check for a conj_or edge */
    def conjOr = existsEdge(_.label == "conj_or")

    /* eliminate all conj edges */
    def conj = existsEdge(_.label startsWith "conj")

    def slotBordersNN = {
      import scalaz._
      import Scalaz._

      // NOTE(review): this inspects a NODE matcher's baseNodeMatchers for a
      // LabelEdgeMatcher, which a node matcher would not normally contain —
      // so this predicate may never be true; confirm against the matcher API.
      def isNN(m: Matcher[DependencyNode]) = m match {
        case e: NodeMatcher[_] =>
          e.baseNodeMatchers exists {
            case m: LabelEdgeMatcher if m.label == "nn" => true
            case _ => false
          }
        case _ => false
      }

      def isSlot(m: Matcher[DependencyNode]) = m match {
        case m: SlotMatcher => true
        case _ => false
      }

      // walk the matcher list with a zipper, looking for a slot whose
      // neighbor (either side) is an nn matcher
      this.matchers.toZipper.map(_.positions.toStream.exists { z =>
        def focusedOnNN(z: Option[Zipper[Matcher[DependencyNode]]]) = z.map(z => isNN(z.focus)).getOrElse(false)
        isSlot(z.focus) && (focusedOnNN(z.previous) || focusedOnNN(z.next))
      }).getOrElse(false)
    }

    // "dep" is an underspecified dependency, so patterns using it are noise.
    // (The original code repeated this identical check twice; the duplicate
    // was removed — behavior is unchanged.)
    if (existsEdge(_.label == "dep")) {
      logger.debug("invalid: dep edge: " + this.toString)
      return false
    }

    /* check if ends with slot */
    def slotAtEnd = {
      def isSlot(node: NodeMatcher[_]) = node match {
        case m: CaptureNodeMatcher[_] => m.alias.startsWith("slot")
        case _ => false
      }

      !this.nodeMatchers.isEmpty && (isSlot(this.nodeMatchers.head) || isSlot(this.nodeMatchers.last))
    }

    val length = edgeMatchers.length

    if (length == 2 && multiplePreps) {
      logger.debug("invalid: multiple preps: " + this.toString)
      false
    }
    else if (conjAnd) {
      logger.debug("invalid: conj_and: " + this.toString)
      false
    }
    else if (conjOr) {
      logger.debug("invalid: conj_or: " + this.toString)
      false
    }
    else if (conj) {
      logger.debug("invalid: alt conj: " + this.toString)
      false
    }
    else if (slotAtEnd) {
      logger.debug("invalid: ends with slot: " + this.toString)
      false
    }
    else if (slotBordersNN) {
      logger.debug("invalid: slot borders nn: " + this.toString)
      false
    }
    else {
      true
    }
  }

  /* determine if the pattern is symmetric, such as:
   *   {arg1} >prep> {rel} <prep< {arg2} */
  def symmetric = {
    def compare(m1s: List[Matcher[DependencyNode]], m2s: List[Matcher[DependencyNode]]): Boolean = (m1s, m2s) match {
      // argument matchers need not be equal
      case (((m1: ArgumentMatcher) :: m1s), ((m2: ArgumentMatcher) :: m2s)) => compare(m1s, m2s)
      // edge matchers should be equals but opposite
      case (((m1: EdgeMatcher[_]) :: m1s), ((m2: EdgeMatcher[_]) :: m2s)) => m1 == m2.flip && compare(m1s, m2s)
      // edges and other nodes must be equal
      case (((m1: Matcher[_]) :: m1s), ((m2: Matcher[_]) :: m2s)) => m1 == m2 && compare(m1s, m2s)
      case (Nil, Nil) => true
      case _ => false
    }

    // a pattern is symmetric when it equals its own reverse
    compare(matchers, matchers.reverse)
  }
}

object ExtractorPattern {
  import scala.io.Source
  /** Read serialized patterns (stdin or argv) and print a validity verdict for each. */
  def main(args: Array[String]) {
    val iter = if (args.length == 0) Source.stdin.getLines else args.iterator
    for (line <- iter) {
      val pattern = DependencyPattern.deserialize(line)
      val extractor = new ExtractorPattern(pattern)
      def verdict = if (extractor.valid) "valid" else "invalid"
      println(verdict + ": " + extractor.toString)
    }
  }
}

/** A dependency node used to match an extraction part in a pattern extractor.
  *
  * @author Michael Schmitz
  */
sealed abstract class ExtractionPartMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
  extends CaptureNodeMatcher[DependencyNode](alias, matcher) {
  def this(alias: String) = this(alias, new TrivialNodeMatcher[DependencyNode])

  def withMatcher(matcher: NodeMatcher[DependencyNode]): ExtractionPartMatcher
}

/** A dependency node used to match an argument in a pattern extractor.
  *
  * @author Michael Schmitz
  */
class ArgumentMatcher(alias: String, matcher: NodeMatcher[DependencyNode]) extends ExtractionPartMatcher(alias, matcher) {
  def this(alias: String) = this(alias, new TrivialNodeMatcher[DependencyNode])
  // NOTE(review): unlike RelationMatcher/SlotMatcher, canEqual/equals accept
  // ANY ExtractionPartMatcher here. This looks like a copy-paste slip, but it
  // is preserved as-is because pattern deduplication may depend on it.
  override def canEqual(that: Any) = that.isInstanceOf[ExtractionPartMatcher]
  override def equals(that: Any) = that match {
    case that: ExtractionPartMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
    case _ => false
  }

  override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new ArgumentMatcher(this.alias, matcher)
}

/** A dependency node used to match a relation in a pattern extractor.
  *
  * @author Michael Schmitz
  */
class RelationMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
  extends ExtractionPartMatcher(alias, matcher) {
  override def canEqual(that: Any) = that.isInstanceOf[RelationMatcher]
  override def equals(that: Any) = that match {
    case that: RelationMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
    case _ => false
  }

  override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new RelationMatcher(this.alias, matcher)
}

/** A dependency node used to match a slot in a pattern extractor.
  *
  * @author Michael Schmitz
  */
class SlotMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
  extends ExtractionPartMatcher(alias, matcher) {
  override def canEqual(that: Any) = that.isInstanceOf[SlotMatcher]
  override def equals(that: Any) = that match {
    case that: SlotMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
    case _ => false
  }

  override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new SlotMatcher(this.alias, matcher)
}
package edu.knowitall.openparse

import scala.collection.Set
import scala.collection.SortedSet

import edu.knowitall.collection.immutable.graph.{Graph, DirectedEdge}
import edu.knowitall.collection.immutable.graph.Direction
import edu.knowitall.collection.immutable.Interval
import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}

/** A collection of helper methods for expanding a node in a graph
  * and/or sentence according to some metric. */
object GraphExpansions {
  /** Nodes of the sentence covered by node and its inferiors, extending left
    * and right from node until a node in `until` is hit. */
  def neighborsUntil(graph: DependencyGraph, node: DependencyNode, inferiors: List[DependencyNode], until: Set[DependencyNode]): SortedSet[DependencyNode] = {
    // inferiors to the left of node (nearest first) and to the right
    val lefts = inferiors.takeWhile(_ != node).reverse
    val rights = inferiors.dropWhile(_ != node).drop(1)

    val indices = Interval.span(node.indices :: lefts.takeWhile(!until(_)).map(_.indices) ++ rights.takeWhile(!until(_)).map(_.indices))

    // use the original dependencies nodes in case some information
    // was lost.  For example, of is collapsed into the edge prep_of
    graph.nodes.filter(node => node.indices.max >= indices.min && node.indices.max <= indices.max)
  }

  /** Expand across the given edge labels, keeping only nodes whose intervals
    * stay adjacent to the growing span and that are not in `until`. */
  def expandAdjacent(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode], labels: Set[String]) = {
    def takeAdjacent(interval: Interval, nodes: List[DependencyNode], pool: List[DependencyNode]): List[DependencyNode] = pool match {
      // can we add the top node?
      case head :: tail if (head.indices borders interval) && !until.contains(head) =>
        takeAdjacent(interval union head.indices, head :: nodes, tail)
      // otherwise abort
      case _ => nodes
    }

    // it might be possible to simply have an adjacency restriction
    // in this condition
    def cond(e: Graph.Edge[DependencyNode]) =
      labels.contains(e.label)
    val inferiors = graph.graph.inferiors(node, cond).toList.sortBy(_.indices)

    // split into nodes left and right of node
    val lefts = inferiors.takeWhile(_ != node).reverse
    val rights = inferiors.dropWhile(_ != node).drop(1)

    // take adjacent nodes from each list
    val withLefts = takeAdjacent(node.indices, List(node), lefts)
    val expanded = takeAdjacent(node.indices, withLefts, rights)

    SortedSet(expanded: _*)
  }

  /** Expand node across the given labels (plus nn-connected nodes) and return
    * its neighborhood, bounded by `until`. */
  def expand(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode], labels: Set[String]) = {
    // don't restrict to adjacent (by interval) because prep_of, etc.
    // remove some nodes that we want to expand across. In the end,
    // we get the span over the inferiors. Do go beneath until
    // nodes because we need them for neighborsUntil.
    def cond(e: Graph.Edge[DependencyNode]) =
      labels.contains(e.label)
    val inferiors = graph.graph.inferiors(node, cond)

    // get all nodes connected by an nn edge
    val nns = graph.graph.connected(node, dedge => dedge.edge.label == "nn")

    // order the nodes by their indices
    val ordered = (inferiors ++ nns).toList.sortBy(_.indices)

    // get neighbors, moving left and right, until a bad node is it
    neighborsUntil(graph, node, ordered, until)
  }

  /** Inferior sets of each successor of node matched by pred.
    * NOTE(review): the `without` parameter is not consulted here — confirm
    * whether filtering was intended. */
  def augment(graph: DependencyGraph, node: DependencyNode, without: Set[DependencyNode], pred: Graph.Edge[DependencyNode] => Boolean): List[SortedSet[DependencyNode]] = {
    // don't restrict to adjacent (by interval) because prep_of, etc.
    // remove some nodes that we want to expand across. In the end,
    // we get the span over the inferiors.
    graph.graph.successors(node, pred).map { successor =>
      SortedSet[DependencyNode]() ++ graph.graph.inferiors(successor)
    }.toList
  }

  /**
    * Find all nodes in a components next to the node.
    * @param node components will be found adjacent to this node
    * @param labels components may be connected by edges with any of these labels
    * @param without components may not include any of these nodes
    */
  def components(graph: DependencyGraph, node: DependencyNode, labels: Set[String], without: Set[DependencyNode], nested: Boolean) = {
    // nodes across an allowed label to a subcomponent
    val across = graph.graph.neighbors(node, (dedge: DirectedEdge[_]) => dedge.dir match {
      case Direction.Down if labels.contains(dedge.edge.label) => true
      case _ => false
    })

    across.flatMap { start =>
      // get inferiors without passing back to node
      val inferiors = graph.graph.inferiors(start,
        (e: Graph.Edge[DependencyNode]) =>
          // don't cross a conjunction that goes back an across node
          !((e.label startsWith "conj") && (across contains e.dest)) &&
          // make sure we don't cycle out of the component
          e.dest != node &&
          // make sure we don't descend into another component
          // i.e. "John M. Synge who came to us with his play direct
          // from the Aran Islands , where the material for most of
          // his later works was gathered" if nested is false
          (nested || !labels.contains(e.label)))

      // make sure none of the without nodes are in the component
      if (without.forall(!inferiors.contains(_))) {
        val span = Interval.span(inferiors.map(_.indices).toSeq)
        Some(graph.nodes.filter(node => span.superset(node.indices)).toList)
      } else None
    }
  }
}
package edu.knowitall.openparse.bootstrap

import scala.io.Source

import org.slf4j.LoggerFactory

/** Filter the target extractions.  We only want to keep extractions that
  * occur more than once and have a relation with more than 15 seeds.
  *
  * @author Michael Schmitz
  */
object FilterTargetExtractions {
  val logger = LoggerFactory.getLogger(this.getClass)

  final val MIN_RELATION_SEEDS = 15

  def main(args: Array[String]) {
    val inputFile = Source.fromFile(args(0), "UTF8")

    logger.info("reading lines and counting")
    var relationCounts = Map[String, Int]().withDefaultValue(0)
    var seedCounts = Map[(String, String, String, String), Int]().withDefaultValue(0)
    for (line <- inputFile.getLines) {
      val Array(rel, arg1, arg2, lemmas, _*) = line.split("\t")

      val seed = (rel, arg1, arg2, lemmas)

      // make sure the relation contains at least one of the lemmas;
      // this excludes, for example, "be in"
      if (rel.split(" ").exists(lemmas contains _)) {
        seedCounts += seed -> (seedCounts(seed) + 1)
        relationCounts += rel -> (relationCounts(rel) + 1)
      }
    }

    // keep relations with more than MIN_RELATION_SEEDS seeds
    val relations: Set[String] =
      (for {
        (rel, count) <- relationCounts;
        if (count > MIN_RELATION_SEEDS)
      } yield (rel))(scala.collection.breakOut)
    logger.info("keeping " + relations.size + "/" + relationCounts.size + " relations")

    // keep seeds that occur more than once and whose relation survived
    val seeds =
      for {
        (seed @ (rel, arg1, arg2, lemmas), count) <- seedCounts;
        if count > 1 && relations.contains(rel)
      } yield (seed)

    logger.info("keeping " + seeds.size + "/" + seedCounts.size + " seeds")

    logger.info("printing seeds to keep")
    for (seed <- seeds) {
      println(seed.productIterator.mkString("\t"))
    }
  }
}
package edu.knowitall.openparse.bootstrap

import scala.util.matching.Regex

/** Common functionality for bootstrap code.
  *
  * @author Michael Schmitz
  */
object FindCommon {
  // tags allowed in proper arguments
  val properPostags = Set("DT", "IN", "NNP", "NNPS")

  /** True when every tag is allowed in a proper argument and at least one
    * is a proper noun (NNP/NNPS).
    *
    * NOTE(review): the parameter is named `lemmas` but callers (e.g.
    * FindTargetArguments) pass POSTAGS; the name is kept for compatibility. */
  def proper(lemmas: Array[String]) =
    lemmas.forall(properPostags.contains(_)) && lemmas.exists(lemma => lemma == "NNP" || lemma == "NNPS")

  /** Drop (postag, token, lemma) triples whose postag equals target. */
  def stripPostag(target: String, part: Seq[(String, String, String)]) = {
    part.filter { case (pos, tok, lem) => target != pos }
  }

  /** Drop (postag, token, lemma) triples whose postag matches the target regex. */
  def stripPostag(target: Regex, part: Seq[(String, String, String)]) = {
    part.filter { case (pos, tok, lem) => !target.pattern.matcher(pos).matches }
  }

  /** Drop (postag, token, lemma) triples whose lemma equals target. */
  def stripLemma(target: String, part: Seq[(String, String, String)]) = {
    part.filter { case (pos, tok, lem) => target != lem }
  }

  /** Remove determiners from an argument. */
  def cleanArg(part: Seq[(String, String, String)]) = stripPostag("DT", part)

  /** Zip three parallel lists into a list of triples.
    *
    * Replaces a hand-rolled tail recursion with the standard library's zip.
    * Ragged inputs are now truncated to the shortest list (the old version
    * threw NoSuchElementException when l2 or l3 was shorter than l1). */
  def zip3(l1: List[String], l2: List[String], l3: List[String]): List[(String, String, String)] =
    l1.zip(l2).zip(l3).map { case ((a, b), c) => (a, b, c) }
}
edu.knowitall.tool.stem.MorphaStemmer 8 | 9 | import FindCommon.{zip3, proper, cleanArg} 10 | 11 | /** Detemine valid arguments of extractions for the boostrap process. 12 | * 13 | * Only frequent proper arguments are used. 14 | * 15 | * @author Michael Schmitz 16 | */ 17 | object FindTargetArguments { 18 | import FindCommon._ 19 | 20 | val blacklist = Set("inc", "ltd", "page", 21 | "vehicle", "turn", "site", "photo", "image", "gallery") 22 | 23 | def valid(lemma: String) = { 24 | lemma.length > 2 && lemma.length < 64 && !blacklist.contains(lemma) 25 | } 26 | 27 | /** Run over a file with four columns: 28 | * 29 | * string 30 | * lemma 31 | * postag 32 | * count 33 | * 34 | * Count all of the proper arguments and print any arguments that 35 | * exceed the lower bound. The lower bound is specified by the first 36 | * command-line argument. */ 37 | def main(args: Array[String]) { 38 | val source = Source.fromFile(args(0), "UTF8") 39 | val lowerBound = args(1).toInt 40 | 41 | val map = new mutable.HashMap[String, Int]().withDefaultValue(0) 42 | for (line <- source.getLines) { 43 | try { 44 | val Array(string, lem, postag, count) = line.split("\t") 45 | // do our own normalization 46 | val lemma = string.split(" ").map( 47 | MorphaStemmer.lemmatize(_)).mkString(" ") 48 | 49 | if (!string.contains("_")) { 50 | // remove DT 51 | val arg = cleanArg( 52 | zip3( 53 | postag.split("""\s+""").toList, 54 | string.split("""\s+""").toList, 55 | lemma.split("""\s+""").toList)) 56 | val cleanLemma = arg.unzip3._3.mkString(" ") 57 | 58 | // make sure lemma is valid 59 | if (proper(postag.split(" ")) && valid(cleanLemma)) { 60 | map += cleanLemma -> (map(cleanLemma)+count.toInt) 61 | } 62 | } 63 | } 64 | catch { 65 | case e: MatchError => 66 | } 67 | } 68 | 69 | source.close 70 | 71 | val keepers: List[(String, Int)] = (for ((k, v) <- map if v > lowerBound) yield { 72 | (k, v) 73 | })(scala.collection.breakOut) 74 | 75 | keepers.sortBy(_._2).reverse.foreach { case (k, v) => 
/** Determine valid extractions for the bootstrap process.
  *
  * Extractions need frequent arguments from FindTargetArguments
  * and cannot contain a negation word.
  *
  * @author Michael Schmitz
  */
object FindTargetExtractions {
  import FindCommon._

  val logger = LoggerFactory.getLogger(this.getClass)

  /** True when the unnormalized relation contains a negation word. */
  def negated(lemmas: Array[String]) =
    lemmas.contains("not") || lemmas.contains("no") || lemmas.contains("n't") || lemmas.contains("never")

  // lemmas dropped from the keyword list printed with each extraction
  val lemmaBlacklist = Set("the", "that", "of")

  def main(args: Array[String]) {

    val parser = new OptionParser("findextr") {
      var extractionFilePath: String = _
      var relationFilePath: Option[String] = None
      var argumentFilePath: String = _

      arg("extractions", "extraction file", { v: String => require(v != null); extractionFilePath = v })
      arg("arguments", "argument file", { v: String => require(v != null); argumentFilePath = v })
      opt("r", "relations", "", "relation file", { v: String => require(v != null); relationFilePath = Some(v) })
    }

    if (parser.parse(args)) {
      // read in the argument files
      val extractions = Source.fromFile(parser.extractionFilePath, "UTF8")
      logger.info("loading targets")
      val relationsRows = parser.relationFilePath.map(Source.fromFile(_, "UTF8").getLines.map(line => line.split("\t")).toList)
      val targets = relationsRows.map(_ map (_(0)))
      val relationLemmaLookup = relationsRows.map(_.map(row => (row(0), row(1).split(" "))).toMap)
      // lemmas for a relation: from the relation file when given, otherwise
      // by splitting the relation and dropping blacklisted lemmas
      def relationLemmas(relation: String): Seq[String] = {
        relationLemmaLookup match {
          case Some(lookup) => lookup(relation)
          case None => relation.split(" ") filterNot OpenParse.LEMMA_BLACKLIST
        }
      }

      targets match {
        case Some(targets) => logger.info("5 targets: " + targets.take(5).mkString(", "))
        case None => logger.info("No target restriction")
      }
      logger.info("loading arguments")
      val arguments = Source.fromFile(parser.argumentFilePath, "UTF8").getLines.map(line => line.split("\t")(0)).toSet
      logger.info("5 arguments: " + arguments.take(5).mkString(", "))

      // iterate over extractions
      logger.info("iterating over extractions")
      for (line <- extractions.getLines) {
        try {
          // 17-column TSV; only the named columns are used
          val Array(id, arg1String, relationString, arg2String, _, relationLemma, _, arg1Postag, relationPostag, arg2Postag, _, _, _, count, confidence, url, sentence) = line.split("\t", -1)
          val arg1Lemma = arg1String.split(" ").map(MorphaStemmer.lemmatize(_)).mkString(" ")
          val arg2Lemma = arg2String.split(" ").map(MorphaStemmer.lemmatize(_)).mkString(" ")
          // val rs = new RelationString(relationString, relationLemma, relationPostag)
          // rs.correctNormalization()

          val arg1 = zip3(arg1Postag.split("""\s+""").toList, arg1String.split("""\s+""").toList, arg1Lemma.split("""\s+""").toList)
          // val rel = zip3(rs.getPosPred.split("""\s+""").toList, rs.getPred.split("""\s+""").toList, rs.getNormPred.split("""\s+""").toList)
          val rel = zip3(relationPostag.split("""\s+""").toList, relationString.split("""\s+""").toList, relationLemma.split("""\s+""").toList)
          val arg2 = zip3(arg2Postag.split("""\s+""").toList, arg2String.split("""\s+""").toList, arg2Lemma.split("""\s+""").toList)

          implicit def t2mapper[A, B](t: (A, B)) = new {
            def map[R](f: A => R, g: B => R) = (f(t._1), g(t._2))
          }

          val (arg1cleanPostags, arg1cleanStrings, arg1cleanLemmas) = cleanArg(arg1).unzip3
          val (arg2cleanPostags, arg2cleanStrings, arg2cleanLemmas) = cleanArg(arg2).unzip3
          val (relcleanPostags, relcleanStrings, relcleanLemmas) = {
            // BUG FIX: "RB.*" was passed as a plain String, selecting the
            // equality overload of stripPostag, which never matches a real
            // postag; the sibling call below uses a Regex, so adverbs were
            // clearly meant to be stripped by pattern.
            val stripped = stripPostag("RB.*".r, stripPostag("DT", rel))
            val beIndex = rel.indexWhere(_._3 == "be")
            val penultimateAdjective =
              if (rel.length - beIndex >= 3 && (rel.drop(beIndex).head._3 startsWith "be") && rel.last._1 == "IN") {
                // return the penultimate if it's VERB ADJECTIVE PREPOSITION
                Some(rel.init.last)
              }
              else None

            (stripPostag("JJS?".r, stripped) ++ penultimateAdjective).unzip3
          }

          val relcleanLemmaString = relcleanLemmas.mkString(" ")
          val arg1cleanLemmaString = arg1cleanLemmas.mkString(" ")
          val arg2cleanLemmaString = arg2cleanLemmas.mkString(" ")

          // ensure the extraction parts are relatively small
          if (relationLemma.length < 64 &&
            // ensure the normalized relation string is a target
            targets.map(_ contains relcleanLemmaString).getOrElse(true) &&
            // ensure arguments are proper
            (proper(arg1Postag.split("\\s+")) ||
              proper(arg2Postag.split("\\s+"))) &&
            arg1cleanLemmaString != arg2cleanLemmaString &&
            // ensure the args are permissible
            arguments.contains(arg1cleanLemmaString) && arguments.contains(arg2cleanLemmaString) &&
            // ensure the unnormalized relation is not negated
            !negated(relationLemma.split(" "))) {

            val lemmas = (arg1cleanLemmas ++ relationLemmas(relcleanLemmaString) ++ arg2cleanLemmas) filterNot lemmaBlacklist

            // emit the row once per original count
            for (i <- 0 until count.toInt) {
              println(Iterable(
                relcleanLemmaString,
                arg1cleanLemmaString,
                arg2cleanLemmaString,
                lemmas.mkString(" "),
                arg1String, relationString, arg2String, arg1Postag, relationPostag, arg2Postag).mkString("\t"))
            }
          }
        }
        catch {
          // best-effort: malformed rows are silently skipped
          case e => // e.printStackTrace
        }
      }
    }
  }
}
/** Group scored extractions by precision and yield.
  *
  * @author Michael Schmitz
  */
object GroupScoredBy {
  def main(args: Array[String]) = {
    val parser = new OptionParser("groupscored") {
      var scoredFile: File = _
      var column: Int = 2

      arg("scored", "scored extractions", { path: String => scoredFile = new File(path) })
      intOpt("k", "column", "column", { c: Int => column = c })
    }

    if (parser.parse(args)) {
      require(parser.column >= 2, "column must be >= 2")

      val scores = Score.loadScoredFile(parser.scoredFile)
      // key each extraction by the requested extra column
      val grouped = scores.groupBy(_.extra(parser.column - 2))

      // (key, precision, members), ordered best first
      val ranked = grouped.map { case (title, members) =>
        val judgments = members.map { member =>
          member.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + member))
        }
        (title, Analysis.precision(judgments), members)
      }.toList.sortBy { case (_, precision, members) => (precision, members.mkString("\t")) }.reverse

      ranked.foreach { case (title, precision, members) =>
        println(precision + ": " + title)
        members.sortBy(member => (member.confidence, member.toRow)).iterator.map(_.toRow).foreach(println)
        println()
      }
    }
  }
}
/** Compute precision yield point from scored extractions.
  *
  * @author Michael Schmitz
  */
object PrecisionYield {
  abstract class Settings {
    def scoredFile: File
    def outputFile: Option[File]
  }

  def main(args: Array[String]) = {
    val settings = new Settings {
      var scoredFile: File = _
      var outputFile: Option[File] = None
    }

    val parser = new OptionParser("precyield") {
      arg("scored", "scored extractions file", { path: String => settings.scoredFile = new File(path) })
      argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Write (confidence, yield, precision) rows, most confident first,
    * to the output file (or stdout); the AUC goes to stdout. */
  def run(settings: Settings) = {
    val scores = Score.loadScoredFile(settings.scoredFile).sortBy(_.confidence).reverse
    val input = scores.map(scored => ("%.4f".format(scored.confidence), scored.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + scored))))

    using {
      settings.outputFile match {
        case Some(file) => new PrintWriter(file, "UTF8")
        case None => new PrintWriter(System.out)
      }
    } { writer =>
      val py = Analysis.precisionYieldMeta(input)
      val area = Analysis.areaUnderCurve(py.map { case (conf, yld, pr) => (yld, pr) })
      println("auc: " + area)
      // FIX: reuse the already-computed points instead of recomputing
      // Analysis.precisionYieldMeta(input) a second time
      for ((conf, yld, pr) <- py) {
        writer.println(conf + "\t" + yld + "\t" + pr)
      }
    }
  }
}
/** Merge precision yield points into a single file,
  * usually so they can be graphed together.
  *
  * @author Michael Schmitz
  */
object MergePYFiles {
  abstract class Settings {
    def files: List[File]
  }

  def main(args: Array[String]) {
    val settings = new Settings {
      var files: List[File] = Nil
    }

    val parser = new OptionParser("mergepy") {
      arglist("...", "input files", { file: String => settings.files = new File(file) :: settings.files })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  def run(settings: Settings) {
    // read each file into (yield, (file index, precision)) pairs,
    // skipping any preamble lines that contain no tab
    val points = settings.files.zipWithIndex.map { case (file, index) =>
      using(io.Source.fromFile(file, "UTF8")) { source =>
        val rows = source.getLines.dropWhile(line => !(line contains "\t"))
        rows.map { row =>
          val Array(_, yld, prec) = row.split("\t", -1)
          (yld.toInt, (index, prec.toDouble))
        }.toList
      }
    }

    // header row: one column per input file
    println("\t" + settings.files.map(_.getName).mkString("\t"))

    // one output row per yield value, highest first; empty cell when a
    // file has no point at that yield
    val byYield = points.flatten.sortBy(_._1).reverse.groupBy(_._1).toSeq.sortBy(_._1).reverse
    byYield.foreach { case (yld, cells) =>
      var row = Vector.fill[String](settings.files.size)("")
      cells.foreach { case (_, (index, prec)) =>
        row = row.updated(index, "%1.4f" format prec)
      }
      println(yld + "\t" + row.mkString("\t"))
    }
  }
}
/** Read a scored file and rank the patterns by their precision and frequency.
  *
  * @author Michael Schmitz
  */
object RankPatterns {
  abstract class Settings {
    def scoredFile: File
    def outputFile: Option[File]
  }

  def main(args: Array[String]) = {
    val settings = new Settings {
      var scoredFile: File = _
      var outputFile: Option[File] = None
    }

    // FIX: removed a dead `var scoredFile` field previously declared inside
    // the parser; it was never read and shadowed settings.scoredFile.
    val parser = new OptionParser("rankpat") {
      arg("scored", "scored extractions file", { path: String => settings.scoredFile = new File(path) })
      argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Group extractions by pattern (extra column 0) and write
    * pattern, precision, frequency rows, best first. */
  def run(settings: Settings) = {
    val scores = Score.loadScoredFile(settings.scoredFile).sortBy(_.confidence).reverse
    // pattern -> (precision, frequency)
    val grouped = scores.groupBy(_.extra(0))
      .mapValues { scoreds =>
        val yld = scoreds.map(scored => if (scored.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + scored))) 1 else 0).sum
        val precision = yld.toDouble / scoreds.size.toDouble
        (precision, scoreds.size)
      }

    using {
      settings.outputFile match {
        case Some(file) => new PrintWriter(file, "UTF8")
        case None => new PrintWriter(System.out)
      }
    } { writer =>
      // sort by (precision, frequency) descending
      for ((pattern, (p, y)) <- grouped.toSeq.sortBy(_._2).reverse) {
        writer.println(pattern+"\t"+p+"\t"+y)
      }
    }
  }
}
/** A main method to annotate extractions,
  * using a gold set for previously scored extractions.
  *
  * @author Michael Schmitz
  */
object Score {
  abstract class Settings {
    def extractionFile: File
    def outputFile: File
    def goldFile: Option[File]
    def goldOutputFile: Option[File]
    def confidenceThreshold: Double
    def skipAll: Boolean
    def keepSkipped: Boolean
  }

  def main(args: Array[String]) = {
    object settings extends Settings {
      var extractionFile: File = _
      var outputFile: File = _
      var goldFile: Option[File] = None
      var goldOutputFile: Option[File] = None
      var confidenceThreshold = 0.0
      var skipAll = false
      var keepSkipped = false
    }

    val parser = new OptionParser("scoreextr") {
      arg("extrs", "extractions", { path: String => settings.extractionFile = new File(path) })
      arg("output", "scored output", { path: String => settings.outputFile = new File(path) })
      opt("g", "gold", "gold set", { path: String => settings.goldFile = Some(new File(path)) })
      opt("u", "goldoutput", "output for updated gold set", { path: String => settings.goldOutputFile = Some(new File(path)) })
      doubleOpt("t", "threshold", "confidence threshold for considered extractions", { x: Double => settings.confidenceThreshold = x })
      opt("skip-all", "don't prompt for items not in the gold set", { settings.skipAll = true })
      opt("keep-skipped", "keep unannotated extractions in output file", { settings.keepSkipped = true })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Score the extraction file against the gold set (prompting for unknown
    * extractions unless skip-all) and write the scored output, plus an
    * updated gold set when requested. */
  def run(settings: Settings) {
    val gold = settings.goldFile match {
      case None => Map[String, Boolean]()
      case Some(goldFile) => GoldSet.load(goldFile)
    }

    val (scoreds, golden) = using(Source.fromFile(settings.extractionFile, "UTF8")) { source =>
      score(source.getLines, gold, settings.confidenceThreshold, !settings.skipAll)
    }

    // print the scored extractions
    using(new PrintWriter(settings.outputFile, "UTF8")) { writer =>
      for (scored <- scoreds.filter(scored => settings.keepSkipped || scored.score.isDefined)) {
        writer.println(scored.toRow)
      }
    }

    // output updated gold set
    settings.goldOutputFile match {
      case Some(file) =>
        using(new PrintWriter(file, "UTF8")) { writer =>
          golden.foreach { case (k, v) => writer.println((if (v) 1 else 0) + "\t" + k) }
        }
      case None =>
    }
  }

  /** Load a file of previously scored extractions. */
  def loadScoredFile(file: File): Seq[Scored] = {
    using(Source.fromFile(file, "UTF8")) { source =>
      source.getLines.map { line =>
        Scored.fromRow(line)
      }.toList
    }
  }

  /** Score each input line against the gold set; returns the scored
    * extractions and the (possibly extended) gold set. */
  def score(lines: Iterator[String], gold: Map[String, Boolean], confidenceThreshold: Double, prompt: Boolean) = {
    // Levenshtein distance, used to suggest near-matching gold entries
    def stringDistance(s1: String, s2: String): Int = {
      def minimum(i1: Int, i2: Int, i3: Int) = math.min(math.min(i1, i2), i3)

      val dist = Array.ofDim[Int](s1.length + 1, s2.length + 1)

      for (idx <- 0 to s1.length) dist(idx)(0) = idx
      for (jdx <- 0 to s2.length) dist(0)(jdx) = jdx

      for (idx <- 1 to s1.length; jdx <- 1 to s2.length)
        dist(idx)(jdx) = minimum (
          dist(idx-1)(jdx ) + 1,
          dist(idx )(jdx-1) + 1,
          dist(idx-1)(jdx-1) + (if (s1(idx-1) == s2(jdx-1)) 0 else 1)
        )
      dist(s1.length)(s2.length)
    }

    // gold entries within half the extraction's length in edit distance
    def suggest(extr: String) = {
      for {
        k <- gold.keys;
        if stringDistance(k, extr) < extr.length / 2
      } yield ((k, gold(k)))
    }

    // interactively ask the user to judge one extraction
    def promptScore(index: Int, extr: String, confidence: String, rest: Seq[Any]): Option[Boolean] = {
      println()
      System.out.println("Please score " + index + ": " + confidence + ":" + extr + ". (1/y/0/n/skip) ")
      if (rest.length > 0) println(rest.mkString("\t"))
      suggest(extr) foreach { case (k, v) =>
        println("suggest: " + v + "\t" + k)
      }
      // BUG FIX: "y"/"n" were previously swapped ("y" scored false and "n"
      // scored true); 1/y now mean correct, 0/n incorrect, matching the
      // gold-set convention where 1 == true.
      readLine match {
        case "1" | "y" => Some(true)
        case "0" | "n" => Some(false)
        case "s" | "skip" => None
        case _ => promptScore(index, extr, confidence, rest)
      }
    }

    var golden = gold

    val scored = for {
      (line, index) <- lines.zipWithIndex
      val Array(confidence, extr, rest @ _*) = line.split("\t")
      val conf = confidence.toDouble

      if (conf >= confidenceThreshold)

      val scoreOption = gold.get(extr) match {
        case Some(score) => Some(score)
        case None if prompt => promptScore(index, extr, confidence, rest)
        case None => None
      }
    } yield {
      scoreOption match {
        case Some(score) =>
          // update golden set
          golden += extr -> score
        case None =>
      }

      // output
      Scored(scoreOption, conf, extr, rest)
    }

    (scored.toList, golden)
  }
}
/** Compute the statistical significance of scored extractions to a baseline.
  *
  * @author Michael Schmitz
  */
object StatisticalSignificance {
  abstract class Settings {
    def iterations: Int
    def systemFile: File
    def baselineFile: File
  }

  def main(args: Array[String]) {
    object settings extends Settings {
      var systemFile: File = _
      var baselineFile: File = _
      var iterations: Int = 1000
    }

    val parser = new OptionParser("statsig") {
      arg("system", "scored extractions from the new system", { path: String => settings.systemFile = new File(path) })
      arg("baseline", "scored extractions from the baseline system", { path: String => settings.baselineFile = new File(path) })
      intOpt("i", "iterations", "number of iterations", { n: Int => settings.iterations = n })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /**
    * Uses the bootstrap test for statistical significance.
    * This is described in the following paper:
    *
    *   http://maroo.cs.umass.edu/pub/web/getpdf.php?id=744
    *
    * Note that this function is agnostic to the order of
    * `system` and `baseline`.
    *
    * @param system a metric for the system, i.e. AUC
    * @param baseline a metric for the baseline, i.e. AUC
    * @param sample a lambda that resamples the systems, returning the metric, i.e. AUC
    * @param iterations the number of iterations
    * @return the p-score: the fraction of centered resampled differences
    *         at least as large as the observed difference
    */
  def bootstrapTestWithMetric(system: Double,
      baseline: Double,
      sample: ()=>(Double, Double),
      iterations: Int) = {
    val difference = math.abs(system - baseline)
    val sampled = for (i <- 0 until iterations) yield (sample())
    val differences = sampled.map { case (sys, base) => math.abs(sys - base) }
    val average = differences.sum / differences.size.toDouble
    // center the resampled differences around zero before counting
    val normed = differences.map(_ - average)
    val pscore = normed.count(_ >= difference).toDouble / normed.size.toDouble

    pscore
  }

  /**
    * Uses the bootstrap test for statistical significance.
    * This is described in the following paper:
    *
    *   http://maroo.cs.umass.edu/pub/web/getpdf.php?id=744
    *
    * Note that this function is agnostic to the order of
    * `system` and `baseline`.
    *
    * @param system per-extraction judgments for the system
    * @param baseline per-extraction judgments for the baseline
    * @param metric folds a sequence of judgments into a score, i.e. AUC
    * @param iterations the number of iterations
    */
  def bootstrapTestWithScores(system: Seq[Boolean],
      baseline: Seq[Boolean],
      metric: Seq[Boolean]=>Double,
      iterations: Int, rand: util.Random) = {

    // resample with replacement, keeping the sample size fixed
    def sample(extrs: Seq[Boolean]) =
      metric(extrs.map(extr=>Random.choose(extrs, extrs.size, rand)))

    bootstrapTestWithMetric(metric(system), metric(baseline),
      ()=>(sample(system), sample(baseline)), iterations)
  }

  def run(settings: Settings) {
    val rand = new util.Random

    def areaUnderCurve(scoreds: Seq[Scored]) = {
      val points = Analysis.precisionYieldMeta(scoreds.map(extr => (extr.confidence, extr.score.get)))
      Analysis.areaUnderCurve(points.map { case (conf, yld, prc) => (yld, prc) })
    }

    val systemExtractionsAll: Seq[Scored] =
      Score.loadScoredFile(settings.systemFile).sortBy(-_.confidence)
    val baselineExtractionsAll: Seq[Scored] =
      Score.loadScoredFile(settings.baselineFile).sortBy(-_.confidence)

    // restrict both systems to extractions from a common set of 50 keys
    // (extra(0) presumably holds the sentence -- TODO confirm)
    val sentences = (systemExtractionsAll.map(_.extra(0)).toSet ++ baselineExtractionsAll.map(_.extra(0)).toSet).toSeq.take(50).toSet

    val systemExtractions = systemExtractionsAll.filter(extr => sentences.contains(extr.extra(0)))
    val baselineExtractions = baselineExtractionsAll.filter(extr => sentences.contains(extr.extra(0)))

    // resample sentences and gather each system's extractions for them
    def sample(): (Double, Double) = {
      def helper(extrs: Seq[Scored]) = {
        // NOTE(review): `sentences` is a Set, so this map collapses duplicate
        // resampled sentences; a textbook bootstrap would keep duplicates --
        // confirm whether that is intended.
        val sent = sentences.map(extr=>Random.choose(sentences, sentences.size, rand))
        val set = sent.flatMap(sent => extrs.filter(sent == _.extra(0))).toSeq.sortBy(_.confidence)
        val auc = areaUnderCurve(set)
        auc
      }

      (helper(systemExtractions), helper(baselineExtractions))
    }

    val pscore = bootstrapTestWithMetric(
      areaUnderCurve(systemExtractions),
      areaUnderCurve(baselineExtractions),
      sample, settings.iterations)

    println(pscore)
  }
}
129 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/openparse/extract/Extraction.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse.extract 2 | 3 | import scala.collection.{SortedSet, Set} 4 | import edu.knowitall.collection.immutable.graph.pattern.Match 5 | import edu.knowitall.collection.immutable.graph.{Direction, Graph, DirectedEdge} 6 | import edu.knowitall.collection.immutable.Interval 7 | import edu.knowitall.openparse.GraphExpansions.{expand, components, augment} 8 | import edu.knowitall.openparse.OpenParse 9 | import edu.knowitall.tool.parse.graph.{DependencyPattern, DependencyNode, DependencyGraph} 10 | import edu.knowitall.ollie.Ollie.stemmer 11 | import edu.knowitall.tool.stem.Stemmer 12 | import Extraction.{Part, ClausalComponent, AdverbialModifier} 13 | import edu.knowitall.tool.parse.graph.LabelEdgeMatcher 14 | import edu.knowitall.collection.immutable.graph.pattern.DirectedEdgeMatcher 15 | 16 | /** A representation of an OpenParse extraction. 
/** A representation of an OpenParse extraction.
  *
  * @author Michael Schmitz
  */
abstract class Extraction(val relLemmas: Set[String]) {
  /** the text of the first argument */
  def arg1Text: String
  /** the text of the relation */
  def relText: String
  /** the text of the second argument */
  def arg2Text: String

  // lemmatize the relation text and drop blacklisted lemmas
  def this(relText: String) = this(relText.split(" ").map(implicitly[Stemmer].lemmatize(_)).toSet -- OpenParse.LEMMA_BLACKLIST)

  override def equals(that: Any) = that match {
    case that: Extraction => (that canEqual this) && that.arg1Text == this.arg1Text && that.relText == this.relText && that.arg2Text == this.arg2Text
    case _ => false
  }
  def canEqual(that: Any) = that.isInstanceOf[Extraction]
  override def hashCode = arg1Text.hashCode + 39 * (relText.hashCode + 39 * arg2Text.hashCode)

  override def toString() = Iterable(arg1Text, relText, arg2Text).mkString("(", "; ", ")")

  /** A lenient match: identical relation lemmas, and each argument
    * contains (or is contained by) its counterpart. */
  def softMatch(that: Extraction) =
    (that.arg1Text.contains(this.arg1Text) || this.arg1Text.contains(that.arg1Text)) &&
    this.relLemmas == that.relLemmas &&
    (that.arg2Text.contains(this.arg2Text) || this.arg2Text.contains(that.arg2Text))
}

/** A simple representation of an OpenParse extraction.
  *
  * @author Michael Schmitz
  */
class SimpleExtraction(
  override val arg1Text: String,
  override val relText: String,
  relLemmas: Set[String],
  override val arg2Text: String)
extends Extraction(relLemmas) {

  def this(arg1Text: String, relText: String, arg2Text: String) = this(arg1Text,
    relText,
    relText.split(" ").map(implicitly[Stemmer].lemmatize(_)).toSet -- OpenParse.LEMMA_BLACKLIST,
    arg2Text)

  /** A copy of this extraction with the relation text replaced.
    * BUG FIX: previously passed `this.relText`, ignoring the `relation`
    * parameter and returning an identical copy (compare
    * DetailedExtraction.replaceRelation, which uses the new text). */
  def replaceRelation(relation: String) =
    new SimpleExtraction(this.arg1Text, relation, this.relLemmas, this.arg2Text)
}
/** A more informative representation of an OpenParse extraction.
  *
  * @author Michael Schmitz
  */
class DetailedExtraction(
  val extractor: PatternExtractor,
  val `match`: Match[DependencyNode],
  val arg1: Part,
  val rel: Part,
  val arg2: Part,
  val clausal: Option[ClausalComponent] = None,
  val modifier: Option[AdverbialModifier] = None)
extends Extraction(rel.text) {

  override def arg1Text = arg1.text
  override def relText = rel.text
  override def arg2Text = arg2.text

  // convenience constructor from raw node sets; each Part derives its
  // text from the nodes
  def this(extractor: PatternExtractor, mch: Match[DependencyNode],
      arg1Nodes: SortedSet[DependencyNode],
      relNodes: SortedSet[DependencyNode],
      arg2Nodes: SortedSet[DependencyNode]) =
    this(extractor, mch, new Part(arg1Nodes), new Part(relNodes), new Part(arg2Nodes))

  /** all the nodes in this extraction */
  def nodes = arg1.nodes ++ rel.nodes ++ arg2.nodes

  /** all the edges in this extraction */
  def edges = `match`.bipath.path

  /** A copy of this extraction with the relation text replaced
    * (the relation's nodes are kept). */
  def replaceRelation(relation: String) =
    new DetailedExtraction(extractor, `match`, this.arg1, Part(this.rel.nodes, relation), this.arg2, this.clausal, this.modifier)
}

object DetailedExtraction {
  /** Join the nodes' texts with single spaces. */
  def nodesToString(nodes: Iterable[DependencyNode]) = nodes.iterator.map(_.text).mkString(" ")
}
110 | * 111 | * @author Michael Schmitz 112 | */ 113 | case class Part(nodes: SortedSet[DependencyNode], text: String) { 114 | def this(nodes: SortedSet[DependencyNode]) = { 115 | this(nodes, DetailedExtraction.nodesToString(nodes)) 116 | } 117 | 118 | def this(nodes: Iterable[DependencyNode]) = { 119 | this(SortedSet[DependencyNode]() ++ nodes, DetailedExtraction.nodesToString(nodes)) 120 | } 121 | 122 | def span = Interval.span(nodes.map(_.indices)) 123 | } 124 | object Part { 125 | def connections(m: Match[DependencyNode], node: DependencyNode): Set[Graph.Edge[DependencyNode]] = { 126 | m.edges.filter(edge => edge.source == node || edge.dest == node).toSet 127 | } 128 | 129 | def connections(m: Match[DependencyNode], nodes: Set[DependencyNode]): Set[Graph.Edge[DependencyNode]] = { 130 | m.edges.filter(edge => nodes.contains(edge.source) || nodes.contains(edge.dest)).toSet 131 | } 132 | 133 | def connections(m: Match[DependencyNode], nodes: Seq[DependencyNode]): Set[Graph.Edge[DependencyNode]] = { 134 | m.edges.filter(edge => nodes.contains(edge.source) || nodes.contains(edge.dest)).toSet 135 | } 136 | } 137 | case class ClausalComponent(rel: Part, arg: Part) { 138 | def text = arg.text + " " + rel.text 139 | } 140 | case class AdverbialModifier(contents: Part) { 141 | def text = contents.text 142 | } 143 | 144 | private val attributionPattern = DependencyPattern.deserialize("{old} nsubj> {arg}") 145 | private val conditionalPattern = DependencyPattern.deserialize("{old} nsubj> {arg}") 146 | def fromMatch(expand: Boolean)(graph: DependencyGraph, m: Match[DependencyNode], ex: PatternExtractor): Iterable[DetailedExtraction] = { 147 | def clausalComponent(node: DependencyNode, until: Set[DependencyNode]) = { 148 | attributionPattern.apply(graph.graph, node) match { 149 | case List(m) => 150 | assume(m.nodeGroups.get("rel").isDefined) 151 | assume(m.nodeGroups.get("arg").isDefined) 152 | 153 | val rel = m.nodeGroups("rel").node 154 | val arg = 
  /** Build extractions from a pattern match over a dependency graph.
    *
    * @param expand when true, grow arguments and relation beyond the
    *               matched nodes
    * @param graph the sentence's dependency graph
    * @param m the pattern match
    * @param ex the extractor that produced the match
    * @return zero or more detailed extractions (empty when the rel-rel
    *         hack below applies or the expanded arguments overlap)
    */
  def fromMatch(expand: Boolean)(graph: DependencyGraph, m: Match[DependencyNode], ex: PatternExtractor): Iterable[DetailedExtraction] = {
    // look for an attribution clause ("X said ...") hanging off `node`
    def clausalComponent(node: DependencyNode, until: Set[DependencyNode]) = {
      attributionPattern.apply(graph.graph, node) match {
        case List(m) =>
          assume(m.nodeGroups.get("rel").isDefined)
          assume(m.nodeGroups.get("arg").isDefined)

          val rel = m.nodeGroups("rel").node
          val arg = m.nodeGroups("arg").node

          val Part(expandedRelNodes, expandedRelText) = expandRelation(graph, rel, until + arg).head
          val expandedArg = expandArgument(graph, arg, until + rel)

          Some(ClausalComponent(Part(expandedRelNodes, expandedRelText), Part(expandedArg, DetailedExtraction.nodesToString(expandedArg))))
        case _ => None
      }
    }

    // look for an adverbial clause (advcl) below `node`
    def adverbialModifier(node: DependencyNode, until: Set[DependencyNode]): Option[AdverbialModifier] = {
      val neighbors = graph.graph.neighbors(node, dedge => dedge.dir == Direction.Down && dedge.edge.label == "advcl")
      val nodes = neighbors.flatMap(graph.graph.inferiors(_))
      if (nodes.isEmpty) None
      else {
        // take every sentence node inside the clause's token span
        val span = Interval.span(nodes.map(_.indices))
        val clause = graph.nodes.filter(node => span.superset(node.indices))
        Some(AdverbialModifier(Part(clause, DetailedExtraction.nodesToString(clause))))
      }
    }

    val groups = m.nodeGroups

    // all groups named rel, rel1, rel2, ... in order
    val rels = groups.filter(_._1 startsWith "rel").toSeq.sortBy(_._1).map(_._2.node)
    if (rels.isEmpty) (throw new IllegalArgumentException("no rel: " + m))
    val arg1 = groups.get("arg1").map(_.node) getOrElse (throw new IllegalArgumentException("no arg1: " + m))
    val arg2 = groups.get("arg2").map(_.node) getOrElse (throw new IllegalArgumentException("no arg2: " + m))

    val expandedArg1 = if (expand) expandArgument(graph, arg1, rels.toSet) else SortedSet(arg1)
    val expandedArg2 = if (expand) expandArgument(graph, arg2, rels.toSet) else SortedSet(arg2)
    val expandRels =
      // hack to exclude rel rel extractions with a second nsubj
      if (rels.size > 0 && rels.tail.exists(rel => graph.graph.dedges(rel).exists(dedge => dedge.dir == Direction.Down && dedge.edge.label == "nsubj"))) {
        Set.empty
      }
      else if (expand) {
        import scalaz._
        import Scalaz._

        // expand each rel node; `sequence` yields every combination of the
        // per-node alternatives (None overall if any node has none)
        val expansions = rels.map(rel => expandRelation(graph, rel, expandedArg1 ++ expandedArg2).toList).toList.sequence

        expansions.map(expansion => Part(expansion.map(_.nodes).reduce(_ ++ _), expansion.map(_.text).mkString(" ")))
      } else {
        Set(Part(SortedSet.empty[DependencyNode] ++ rels, rels.map(_.text).mkString(" ")))
      }

    for {
      Part(expandedRelNodes, expandedRelText) <- expandRels
      val nodes = expandedArg1 ++ expandedArg2 ++ expandedRelNodes
      val clausal = rels.flatMap(rel => clausalComponent(rel, nodes)).headOption
      val modifier = rels.flatMap(rel => adverbialModifier(rel, nodes)).headOption

      // arguments don't overlap
      if (!(Interval.span(expandedArg1.map(_.indices)(scala.collection.breakOut)) intersects Interval.span(expandedArg2.map(_.indices)(scala.collection.breakOut))))
    } yield (
      new DetailedExtraction(ex, m, new Part(expandedArg1), Part(expandedRelNodes, expandedRelText), new Part(expandedArg2), clausal = clausal, modifier = modifier)
    )

  }
val span = Interval.span(flat.map(_.indices).toSeq) 233 | // take the nodes that cover all the nodes found 234 | graph.nodes.filter(node => span.superset(node.indices)) 235 | } 236 | } 237 | 238 | /** Expand the relation nodes of a match. 239 | * 240 | * Multiple parts can be returned if there are multiple dobj or iobjs. 241 | * 242 | * @return parts the part (or multiple parts) that describes the relation 243 | */ 244 | def expandRelation(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode]): Set[Part] = { 245 | // count the adjacent dobj edges. We will only expand across 246 | // dobj components if there is exactly one adjacent dobj edge. 247 | // This edge may already be used, but in that case we won't 248 | // expand over it because of the until set. 249 | val dobjCount = graph.graph.edges(node).count(_.label == "dobj") 250 | val iobjCount = graph.graph.edges(node).count(_.label == "iobj") 251 | 252 | var attachLabels = Set[String]() 253 | if (dobjCount == 1) attachLabels += "dobj" 254 | if (iobjCount == 1) attachLabels += "iobj" 255 | 256 | /* 257 | * acomp: "She looks beautiful on Thursday." 258 | */ 259 | def pred(edge: Graph.Edge[DependencyNode]) = 260 | // make sure we don't re-add the relation node 261 | edge.dest != node && ( 262 | // attach adverbs 263 | edge.label == "advmod" && edge.dest.postag == "RB" || 264 | edge.label == "aux" || edge.label == "cop" || edge.label == "auxpass" || edge.label == "prt" || edge.label == "acomp") 265 | 266 | // expand across noun label for relational nouns 267 | // i.e. "He is the *best* president of the USA" 268 | val expandNounLabels = 269 | if (node.postag startsWith "NN") expand(graph, node, until, argumentExpansionLabels) 270 | else expand(graph, node, until, Set("det", "amod", "num", "number", "nn", "poss", "quantmod", "neg")) 271 | 272 | // modifiers on copulars are stored on a different node 273 | // i.e. 
in "he *will* be the president" 274 | val cops = graph.graph.predecessors(node, (e: Graph.Edge[DependencyNode])=>e.label == "cop").headOption 275 | val expandCopLabels = cops.map(cop => augment(graph, cop, until, pred)).getOrElse(List.empty) 276 | 277 | def f(s: Set[List[DependencyNode]]): Set[List[DependencyNode]] = 278 | if (s.isEmpty) Set(List()) 279 | else s 280 | val dobjs = f(components(graph, node, Set("dobj"), until, true)) 281 | val iobjs = f(components(graph, node, Set("iobj"), until, true)) 282 | 283 | for (dobj <- dobjs; iobj <- iobjs) yield { 284 | val expansion = expandCopLabels ++ (expandNounLabels :: 285 | // make sure that we don't use a label that was 286 | // already captured by expandNounlabels. This 287 | // can happen when a verb edges goes between two 288 | // noun labels. 289 | ((augment(graph, node, until, pred).map(_ -- expandNounLabels)) :+ 290 | // add subcomponents 291 | (SortedSet[DependencyNode]() ++ dobj) :+ 292 | (SortedSet[DependencyNode]() ++ iobj)).filterNot { c => 293 | // don't add empty components 294 | c.isEmpty || 295 | // don't add components with just "who" or "whom" 296 | c.size == 1 && c.headOption.map(_.postag == "WP").getOrElse(false) 297 | }) 298 | 299 | val sorted = expansion.sortBy(nodes => Interval.span(nodes.map(_.indices))) 300 | 301 | // perform a more complicated node->text transformation 302 | val texts = sorted.map(DetailedExtraction.nodesToString(_)) 303 | Part(expansion.reduce(_ ++ _), texts.mkString(" ")) 304 | } 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/openparse/extract/GeneralExtractor.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse.extract 2 | 3 | import org.slf4j.LoggerFactory 4 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match} 5 | import edu.knowitall.collection.immutable.graph.Graph 6 | import 
/** An extractor that is purely specified by a pattern.
  *
  * @param pattern the pattern to extract
  * @param conf the confidence of this extractor
  *
  * @author Michael Schmitz
  */
class GeneralExtractor(pattern: ExtractorPattern, val conf: Double) extends PatternExtractor(pattern) {
  import GeneralExtractor._

  def this(pattern: Pattern[DependencyNode], conf: Double) =
    this(new ExtractorPattern(pattern), conf)

  /** Apply the pattern to the graph and build an extraction from each valid
    * match, keeping the match alongside the extraction. */
  protected def extractWithMatches(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor) => Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode] => Match[DependencyNode] => Boolean) = {

    // apply pattern and keep valid matches
    val matches = pattern(dgraph.graph)
    if (matches.nonEmpty && logger.isDebugEnabled) logger.debug("matches: " + matches.mkString(", "))

    val filtered = matches.filter(validMatch(dgraph.graph))
    if (filtered.nonEmpty && logger.isDebugEnabled) logger.debug("filtered: " + filtered.mkString(", "))

    for (m <- filtered; extr <- buildExtraction(dgraph, m, this)) yield {
      (extr, m)
    }
  }

  override def extract(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor) => Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode] => Match[DependencyNode] => Boolean) = {
    logger.debug("pattern: " + pattern)

    val extractions = this.extractWithMatches(dgraph).map(_._1)
    if (extractions.nonEmpty) logger.debug("extractions: " + extractions.mkString(", "))

    extractions
  }

  /** The confidence is fixed per extractor, independent of the extraction. */
  override def confidence(extr: Extraction): Double = {
    this.conf
  }

  /** A maximum confidence for any extraction from this extractor.
    * This is used for optimization. If the minimum confidence is
    * larger than the threshold, we don't need to run this extractor. */
  override def maximumConfidence: Double = this.conf
}

case object GeneralExtractor extends PatternExtractorType {
  val logger = LoggerFactory.getLogger(this.getClass)

  /** Parse extractors from tab-delimited lines of "pattern[<TAB>count]".
    *
    * @throws IllegalArgumentException on a line with more than two columns
    */
  def fromLines(lines: Iterator[String]): List[GeneralExtractor] = {
    val patterns: List[(Pattern[DependencyNode], Int)] = lines.map { line =>
      line.split("\t") match {
        // full information specified
        case Array(pat, count) => (DependencyPattern.deserialize(pat), count.toInt)
        // assume a count of 1 if nothing is specified
        case Array(pat) =>
          logger.warn("warning: pattern has no count: " + pat)
          (DependencyPattern.deserialize(pat), 1)
        case _ => throw new IllegalArgumentException("line must have one or two columns: " + line)
      }
    }.toList

    // `patterns` is already a List, so the for-comprehension yields a List
    for ((p, conf) <- patterns) yield {
      new GeneralExtractor(new ExtractorPattern(p), conf.toDouble)
    }
  }
}
/** A superclass for extractors based on patterns.
  *
  * @param pattern the pattern to extract
  *
  * @author Michael Schmitz
  */
abstract class PatternExtractor(val pattern: ExtractorPattern) {
  /** Run this extractor over a dependency graph. */
  def extract(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor) => Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode] => Match[DependencyNode] => Boolean): Iterable[DetailedExtraction]

  /** The confidence of a particular extraction from this extractor. */
  def confidence(extr: Extraction): Double

  /** A maximum confidence for any extraction from this extractor.
    * This is used for optimization. If the minimum confidence is
    * larger than the threshold, we don't need to run this extractor.
    */
  def maximumConfidence: Double

  override def toString = pattern.toString

  // subclasses that support tab-serialization override this
  def tabSerialize: String = throw new OperationNotSupportedException()

  def prepMismatch: Boolean = false
}

object PatternExtractor {
  /** Deserialize an extractor from tab-separated columns; the first column
    * names the extractor type.  Returns the extractor and the unconsumed
    * columns.
    *
    * @throws IllegalArgumentException on an unrecognized extractor type
    */
  def tabDeserialize(seq: Seq[String]): (PatternExtractor, Seq[String]) = {
    seq(0).toLowerCase match {
      case "template" => TemplateExtractor.tabDeserialize(seq.drop(1))
      // fail loudly and descriptively instead of with a bare MatchError
      case unknown => throw new IllegalArgumentException("unknown extractor type: " + unknown)
    }
  }
}

abstract class PatternExtractorType {
  /** Load extractors from a UTF-8 file, one per line. */
  def fromFile(file: File): Seq[PatternExtractor] = {
    using(Source.fromFile(file, "UTF8")) { source =>
      fromLines(source.getLines)
    }
  }

  def fromLines(lines: Iterator[String]): Seq[PatternExtractor]

  def name = this.getClass.getSimpleName
}

object PatternExtractorType {
  /** Look up an extractor factory by its short name. */
  def apply(string: String) = string match {
    case "general" => GeneralExtractor
    case "template" => TemplateExtractor
    case "specific" => SpecificExtractor
    case _ => throw new IllegalArgumentException("unknown extractor: " + string)
  }
}
/** An extractor that is specified only with a pattern
  * but only works for specific relation lemmas.
  *
  * @param relation the resulting relation string
  * @param relationLemmas the acceptible matched lemmas
  * @param pattern the pattern to extract
  * @param conf the confidence of this extractor
  *
  * @author Michael Schmitz
  */
class SpecificExtractor(val relation: String,
    val relationLemmas: List[String],
    pattern: ExtractorPattern, conf: Double)
  extends GeneralExtractor(pattern, conf) {

  def this(relation: String, relationLemmas: List[String], pattern: Pattern[DependencyNode], conf: Double) =
    this(relation, relationLemmas, new ExtractorPattern(pattern), conf)

  override def extract(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor) => Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode] => Match[DependencyNode] => Boolean) = {
    val candidates = super.extract(dgraph)

    // keep only extractions whose relation text covers every required lemma,
    // then replace the matched relation with the canonical relation string
    val lemmaMatched = candidates.withFilter { extr =>
      val extractedLemmas = extr.rel.text.split(" ").map(word => MorphaStemmer.lemmatize(word))
      relationLemmas.forall(lemma => extractedLemmas.contains(lemma))
    }

    lemmaMatched.map(_.replaceRelation(relation))
  }
}

case object SpecificExtractor extends PatternExtractorType {
  // specific extractors are constructed programmatically, never parsed
  def fromLines(lines: Iterator[String]) = throw new UnsupportedOperationException
}
-------------------------------------------------------------------------------- /core/src/main/scala/edu/knowitall/openparse/extract/TemplateExtractor.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse.extract 2 | 3 | import scala.util.matching.Regex 4 | import org.slf4j.LoggerFactory 5 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match} 6 | import edu.knowitall.collection.immutable.graph.Graph 7 | import edu.knowitall.tool.parse.graph.{DependencyPattern, DependencyNode, DependencyGraph} 8 | import Template.group 9 | import edu.knowitall.ollie.Ollie.stemmer 10 | import edu.knowitall.tool.postag.Postagger 11 | import edu.knowitall.tool.parse.graph.RegexEdgeMatcher 12 | import edu.knowitall.tool.parse.graph.LabelEdgeMatcher 13 | import edu.knowitall.openparse.ExtractorPattern 14 | 15 | /** An extractor that is specified by a pattern and a template. 16 | * the template can add a "to be" and/or preposition word around 17 | * the relation. It can also change the preposition word to another 18 | * preposition (i.e., switch "of" to "in"). 
/** An extractor that is specified by a pattern and a template.
  * the template can add a "to be" and/or preposition word around
  * the relation. It can also change the preposition word to another
  * preposition (i.e., switch "of" to "in").
  *
  * @param template a template in which to put the relation words
  * @param pattern the pattern to extract
  * @param conf the confidence of this extractor
  *
  * @author Michael Schmitz
  */
class TemplateExtractor(val template: Template, pattern: ExtractorPattern, conf: Double)
  extends GeneralExtractor(pattern, conf) {

  def this(template: Template, pattern: Pattern[DependencyNode], conf: Double) =
    this(template, new ExtractorPattern(pattern), conf)

  override def extract(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor) => Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode] => Match[DependencyNode] => Boolean) = {

    val extractions = super.extractWithMatches(dgraph)

    // rewrite each extraction's relation through the template
    extractions.map { case (extr, m) => template(extr, dgraph, m) }
  }

  override def tabSerialize = Iterable("Template", template.serialize, pattern.serialize, conf.toString).mkString("\t")

  /** Compare the template's trailing preposition with the pattern's last
    * preposition edge. */
  override def prepMismatch: Boolean = {
    val trailingPrep = TemplateExtractor.trailingPreposition.findFirstIn(template.serialize)
    val lastPatternPrep = pattern.baseEdgeMatchers.flatMap {
      // compare the regex's pattern text: Regex has no structural equals, so
      // the original `m.labelRegex == new Regex(...)` could never be true
      case m: RegexEdgeMatcher if m.labelRegex.toString == """prep_(.*)""" => Some("{prep}")
      case m: LabelEdgeMatcher if m.label startsWith "prep_" => Some(m.label.drop(5))
      case _ => None
    }.lastOption

    // NOTE(review): trailingPrep keeps its leading whitespace from the regex,
    // so equality with a bare preposition looks unreachable — confirm intent.
    trailingPrep == lastPatternPrep
  }
}

case object TemplateExtractor extends PatternExtractorType {
  val logger = LoggerFactory.getLogger(this.getClass)

  // matches a template that ends in a preposition (or the {prep} placeholder)
  private val trailingPreposition = new Regex("\\s(?:" + Postagger.prepositions.mkString("|") + "|\\{prep\\})$")

  /** Parse extractors from "template<TAB>pattern[<TAB>confidence]" lines.
    *
    * @throws IllegalArgumentException on a line with the wrong column count
    */
  override def fromLines(lines: Iterator[String]): List[PatternExtractor] = {
    val patterns: List[(Template, Pattern[DependencyNode], Double)] = lines.map { line =>
      line.split("\t") match {
        // full information specified
        case Array(template, pat, conf) =>
          (Template.deserialize(template), DependencyPattern.deserialize(pat), conf.toDouble)
        // assume a confidence of 1 if nothing is specified
        case Array(template, pat) =>
          logger.warn("warning: pattern has no confidence: " + pat)
          (Template.deserialize(template), DependencyPattern.deserialize(pat), 1.0)
        case _ => throw new IllegalArgumentException("line must have two or three columns: " + line)
      }
    }.toList

    (for ((template, pattern, conf) <- patterns) yield {
      new TemplateExtractor(template, new ExtractorPattern(pattern), conf)
    }).toList
  }

  /** Deserialize a single tab-delimited string.  Previously this method
    * split the string and discarded the result (returning Unit); it now
    * delegates to the Seq overload. */
  def tabDeserialize(string: String): (TemplateExtractor, Seq[String]) = {
    tabDeserialize(string.split("\t").toSeq)
  }

  def tabDeserialize(parts: Seq[String]): (TemplateExtractor, Seq[String]) = {
    val Seq(templateString, patternString, confString, rest @ _*) = parts

    val template = Template.deserialize(templateString)
    val pattern = new ExtractorPattern(DependencyPattern.deserialize(patternString))
    val conf = confString.toDouble

    (new TemplateExtractor(template, pattern, conf), rest)
  }
}

/** A relation template, i.e. "be {rel} of", optionally prefixed with "be".
  *
  * @param template the template text with {group} placeholders
  * @param be whether to prepend "be" to the relation
  */
case class Template(template: String, be: Boolean) {
  import Template._

  /** Apply this template to an extraction, rewriting its relation text. */
  def apply(extr: DetailedExtraction, dgraph: DependencyGraph, m: Match[DependencyNode]) = {
    // resolve a {group} placeholder to its text
    def matchGroup(name: String): String = name match {
      case "rel" => extr.relText
      case "arg1" => extr.arg1Text
      case "arg2" => extr.arg2Text
      case _ => m.groups(name).text
    }

    // don't add the be if we attach a verb using a cop, aux, or auxpass edge.
    // there are a lot of examples where adding "be" makes it very messy
    // "She has practiced law, with Foo, Bar."
    // don't want: (Bar; be has practiced with; Foo)
    // This is somewhat of a hack that makes bad patterns look less bad.
    val prefix = if (be &&
      !(dgraph.graph.neighbors(m.nodeGroups.getOrElse("rel", m.nodeGroups("rel1")).node, dedge => (dedge.edge.label startsWith "aux") || dedge.edge.label == "cop") filter (_.postag startsWith "VB") exists (neighbor => extr.rel.nodes contains neighbor))) {
      "be"
    }
    else ""

    // pull out the modals because they must preceed the prefix
    // also include "to"
    val modals = extr.rel.nodes.filter(node => (node.postag startsWith "MD") ||
      (node.postag == "TO"))

    // horrible escape is required. See JavaDoc for Match.replaceAll
    // or https://issues.scala-lang.org/browse/SI-5437
    var rel = group.replaceAllIn(template, (gm: Regex.Match) => matchGroup(gm.group(1))
      .replaceAll("_", " ")
      .replaceAll("""\\""", """\\\\""")
      .replaceAll("""\$""", """\\\$"""))

    if (!prefix.isEmpty) {
      if (modals.isEmpty) {
        rel = prefix + " " + rel
      } else {
        // insert the prefix directly after the last modal
        val regex = new Regex("(^.*\\b(?:" + modals.iterator.map(_.text).mkString("|") + "))\\b")
        rel = regex.replaceAllIn(rel, "$1 " + prefix)
      }
    }

    extr.replaceRelation(rel)
  }

  override def toString = (if (be) "be " else "") + template

  def serialize = this.toString
}

object Template {
  // lazily matches a {group} placeholder
  val group = """\{(.*?)}""".r

  /** Inverse of [[Template.serialize]]: strip an optional leading "be ". */
  def deserialize(string: String) = {
    if (string.startsWith("be ")) {
      Template(string.drop(3), true)
    }
    else {
      Template(string, false)
    }
  }
}
/** Command-line tool that rewrites a pattern file's trailing count column
  * into a confidence in (0, 1], computed as count / max-count.
  * The file's first line is treated as a header and echoed unchanged.
  */
object CountsToConfidence {
  abstract class Settings {
    def sourceFile: File
    def destFile: Option[File]
  }

  def main(args: Array[String]) = {
    object settings extends Settings {
      var sourceFile: File = _
      var destFile: Option[File] = None
    }

    val parser = new OptionParser("convertconf") {
      arg("source", "file with pattern, count pairs", { path: String => settings.sourceFile = new File(path) })
      argOpt("dest", "optional parameter to specify output to a file", { path: String => settings.destFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  def run(settings: Settings) = {
    using(Source.fromFile(settings.sourceFile)) { source =>
      // write to the destination file when given, otherwise stdout
      val sink = settings.destFile match {
        case Some(file) => new PrintWriter(file)
        case None => new PrintWriter(System.out)
      }

      using(sink) { output =>
        // echo the header line, keep the rest for conversion
        val rows = {
          val it = source.getLines
          output.println(it.next)
          it.toList
        }

        // the largest count defines confidence 1.0
        val maxCount = rows.map(_.split("\t").last.toInt).max

        for (row <- rows) {
          val columns = row.split("\t")
          val count = columns.last.toInt
          val confidence = "%1.4f" format (count.toDouble / maxCount.toDouble)
          output.println(columns.take(columns.length - 1).mkString("\t") + "\t" + confidence)
        }
      }
    }
  }
}
/** Generalize semantic restrictions to semantic classes.
  *
  * @author Michael Schmitz
  */
object GeneralizeTemplates {
  abstract class Settings {
    def sourceFile: File
    def destFile: Option[File]

    val categories = List("person", "location")
  }

  def main(args: Array[String]) = {
    object settings extends Settings {
      var sourceFile: File = null
      var destFile: Option[File] = None
    }

    val parser = new OptionParser("buildtemp") {
      arg("source", "file with source relation, pattern pairs", { path: String => settings.sourceFile = new File(path) })
      argOpt("dest", "optional parameter to specify output to a file", { path: String => settings.destFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Pull the (postag, lemmas) lexical restriction out of a part matcher,
    * or None when the matcher is not a conjunction carrying one. */
  def lexicalRestrictions(extractionPartMatcher: ExtractionPartMatcher) = {
    extractionPartMatcher.matcher match {
      case m: ConjunctiveNodeMatcher[_] =>
        val postag = (m.matchers.collect { case m: PostagNodeMatcher => m } head).postag
        val lemmas = (m.matchers.collect { case m: RegexNodeMatcher => m } head).regex.toString.split("\\|").toSeq
        Some(postag, lemmas)
      case _ => None
    }
  }

  /** A named set of lemmas, e.g. "person" or "location". */
  case class Category(name: String, elements: Set[String]) {
    override def toString = "Category(" + name + ")"
  }

  /** Load each named category's lemma set from classpath resources. */
  def loadCategories(categories: Seq[String]) = {
    def loadCategory(name: String) = {
      // NOTE(review): the source tree keeps these files under
      // edu/knowitall/openparse/categories/; confirm this classloader-relative
      // path resolves at runtime (getResourceAsStream returns null otherwise).
      val elements =
        using(this.getClass.getClassLoader.getResourceAsStream("categories/" + name + ".txt")) { stream =>
          using(Source.fromInputStream(stream)) { source =>
            source.getLines().toSet
          }
        }

      Category(name, elements)
    }

    (for (cat <- categories) yield (loadCategory(cat))).toList
  }

  def run(settings: Settings) {
    val categories = loadCategories(settings.categories)

    def generalize(matcher: NodeMatcher[DependencyNode], postag: String, lemmas: Set[String]) = {
      // fraction of the lemmas covered by the category; overlaps smaller
      // than 5 lemmas count as no overlap.
      // (a duplicated, discarded copy of the division was removed here)
      def distance(cat: Category) = {
        val intersectSize = (cat.elements intersect lemmas).size
        if (intersectSize < 5) 0.0
        else intersectSize.toDouble / lemmas.size.toDouble
      }

      // too few lemmas to generalize reliably
      if (lemmas.size < 10) matcher
      else {
        postag match {
          case "NN" | "NNS" =>
            val overlaps = categories map (cat => (cat, distance(cat))) sortBy (-_._2)
            if (overlaps.iterator.map(_._2).sum > 0.75) {
              // replace the lemma list with the matching categories' full
              // element sets, keeping any lemmas no category covered
              val categories = overlaps.filter(_._2 > 0.10).map(_._1)
              val uncategorized = lemmas -- categories.flatMap(_.elements)
              val elements = immutable.SortedSet[String]() ++ categories.flatMap(_.elements) ++ uncategorized
              new ConjunctiveNodeMatcher(new PostagNodeMatcher(postag), new RegexNodeMatcher(elements.mkString("|").r))
            } else matcher
          case m => matcher
        }
      }
    }

    var templates =
      using(Source.fromFile(settings.sourceFile, "UTF8")) { source =>
        source.getLines().map { line =>
          val Array(template, pattern, count) = line.split("\t")
          ((template, new ExtractorPattern(DependencyPattern.deserialize(pattern))), count.toInt)
        }.toList
      }

    // generalize the lexical restriction on each slot/relation matcher
    templates = templates.map {
      case ((template, pattern), count) =>
        val matchers = pattern.matchers.map { matcher =>
          matcher match {
            case m: ExtractionPartMatcher if m.isInstanceOf[SlotMatcher] || m.isInstanceOf[RelationMatcher] =>
              lexicalRestrictions(m) match {
                case Some((postag, lemmas)) => m.withMatcher(generalize(m.matcher, postag, lemmas.toSet))
                case None => m
              }
            case m => m
          }
        }

        ((template, new ExtractorPattern(matchers)), count)
    }

    using (
      settings.destFile match {
        case Some(file) => new PrintWriter(file, "UTF8")
        case None => new PrintWriter(System.out)
      })
    { writer =>
      templates map { case ((template, pattern), count) => Iterable(template, pattern, count).mkString("\t") } foreach writer.println
    }
  }
}
/** A main method for adding active and passive conversions
  * of patterns to a model file. BuiltTemplates removes
  * duplicate patterns, removing a lot of garbage but also
  * the active/passive conversions.
  *
  * @author Michael Schmitz
  */
object PassiveReflections {
  val logger = LoggerFactory.getLogger(this.getClass)

  abstract class Settings {
    def sourceFile: File
    def destFile: Option[File]
  }

  def main(args: Array[String]) {
    val settings = new Settings {
      var sourceFile: File = null
      var destFile: Option[File] = None
    }

    val parser = new OptionParser("passivemodel") {
      arg("source", "input model file", { path: String => settings.sourceFile = new File(path) })
      argOpt("dest", "output model file", { path: String => settings.destFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  def run(settings: Settings) {
    // swap the arg1 and arg2 capture nodes; throws if either is missing
    def switchArgs(pattern: ExtractorPattern) = {
      val arg1 = pattern.matchers.find { case m: CaptureNodeMatcher[_] => m.alias == "arg1" case _ => false }.get
      val arg2 = pattern.matchers.find { case m: CaptureNodeMatcher[_] => m.alias == "arg2" case _ => false }.get

      new ExtractorPattern(pattern.matchers.map {
        case m: CaptureNodeMatcher[_] if m.alias == "arg1" => arg2
        case m: CaptureNodeMatcher[_] if m.alias == "arg2" => arg1
        case m => m
      })
    }

    // NOTE(review): the first line is skipped and never echoed, so the
    // model header is absent from the output — confirm downstream readers.
    val modelRows = using(Source.fromFile(settings.sourceFile)) { source =>
      source.getLines.drop(1).map { line =>
        val Array(template, pattern, count) = line.split("\t")
        (template, new ExtractorPattern(DependencyPattern.deserialize(pattern)), count)
      }.toList
    }

    using(
      settings.destFile match {
        case Some(file) => new PrintWriter(file, "UTF8")
        case None => new PrintWriter(System.out)
      }) { output =>
      modelRows.foreach {
        case (template, pattern, count) =>
          // always keep the original row
          output.println(Iterable(template, pattern, count).mkString("\t"))

          def hasEdgeLabel(label: String) =
            pattern.baseEdgeMatchers.exists { case m: LabelEdgeMatcher => m.label == label case _ => false }

          if (hasEdgeLabel("nsubj")) {
            // active pattern: also emit the passive conversion
            if (!(template startsWith "be ")) {
              output.println(Iterable("be " + template, switchArgs(pattern), count).mkString("\t"))
            }
          } else if (hasEdgeLabel("nsubjpass")) {
            // passive pattern: also emit the active conversion
            if (template startsWith "be ") {
              output.println(Iterable(template.drop(3), switchArgs(pattern), count).mkString("\t"))
            }
          }
      }
    }
  }
}
1, 1 -> 2, 1 -> 1, 2 -> 2) 32 | val multimap = list.toListMultimap 33 | 34 | multimap must haveTheSameElementsAs(Map(1 -> List(1, 2, 1), 2 -> List(2))) 35 | 36 | val extended = (multimap.toSeq :+ (1 -> List(2, 3, 4, 5))) 37 | val merged = extended.mergeKeys(_ ++ _) 38 | 39 | merged must haveTheSameElementsAs(Map(1 -> List(1, 2, 1, 2, 3, 4, 5), 2 -> List(2))) 40 | } 41 | 42 | "set multimaps works fine" in { 43 | val list = List(1 -> 1, 1 -> 2, 1 -> 1, 2 -> 2) 44 | val multimap = list.toSetMultimap 45 | 46 | multimap must haveTheSameElementsAs(Map(1 -> Set(1, 2), 2 -> Set(2))) 47 | 48 | val extended = (multimap.toSeq :+ (1 -> Set(2, 3, 4, 5))) 49 | val merged = extended.mergeKeys(_ ++ _) 50 | 51 | merged must haveTheSameElementsAs(Map(1 -> Set(1, 2, 3, 4, 5), 2 -> Set(2))) 52 | } 53 | 54 | "bag multimaps works fine" in { 55 | val list = List(1 -> 1, 1 -> 2, 1 -> 1, 2 -> 2) 56 | val multimap = list.toBagMultimap 57 | 58 | multimap must haveTheSameElementsAs(Map(1 -> Bag(1, 1, 2), 2 -> Bag(2))) 59 | 60 | val extended = (multimap.toSeq :+ (1 -> Bag(2, 3, 4, 5))) 61 | val merged = extended.mergeKeys(_ ++ _) 62 | 63 | merged must haveTheSameElementsAs(Map(1 -> Bag(1, 1, 2, 2, 3, 4, 5), 2 -> Bag(2))) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /core/src/test/scala/edu/knowitall/ollie/DependencyGraphExtrasSpec.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie 2 | 3 | import org.junit.runner.RunWith 4 | import org.specs2.mutable.Specification 5 | import org.specs2.runner.JUnitRunner 6 | import edu.knowitall.tool.parse.graph.DependencyGraph 7 | 8 | @RunWith(classOf[JUnitRunner]) 9 | object DependencyGraphExtrasTest extends Specification { 10 | "switch to passive voice works" in { 11 | val graph = DependencyGraph.deserialize("nsubj(hit_VBD_1_8, Michael_NNP_0_0); dobj(hit_VBD_1_8, ball_NN_3_16); punct(hit_VBD_1_8, ._._4_20); det(ball_NN_3_16, 
the_DT_2_12)") 12 | val extras = new DependencyGraphExtras(graph) 13 | 14 | val switched = extras.switchVoice 15 | 16 | switched.size must_== 1 17 | switched.head.serialize must_== "det(ball_NN_1_4, the_DT_0_0); auxpass(hit_VBD_2_13, was_VBD_1_9); nsubjpass(hit_VBD_2_13, ball_NN_1_4); prep(hit_VBD_2_13, by_IN_3_17); punct(hit_VBD_2_13, ._._6_28); pobj(by_IN_3_17, Michael_NNP_4_20)" 18 | } 19 | 20 | "switch to active voice works" in { 21 | val graph = DependencyGraph.deserialize("det(ball_NN_1_4, The_DT_0_0); nsubjpass(hit_VBN_3_13, ball_NN_1_4); auxpass(hit_VBN_3_13, was_VBD_2_9); prep(hit_VBN_3_13, by_IN_4_17); punct(hit_VBN_3_13, ._._6_27); pobj(by_IN_4_17, Michael_NNP_5_20)") 22 | val extras = new DependencyGraphExtras(graph) 23 | 24 | val switched = extras.switchVoice 25 | 26 | switched.size must_== 1 27 | switched.head.serialize must_== "nsubj(hit_VBN_1_8, Michael_NNP_0_0); dobj(hit_VBN_1_8, ball_NN_3_16); punct(hit_VBN_1_8, ._._4_21); det(ball_NN_3_16, The_DT_2_12)" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/test/scala/edu/knowitall/ollie/confidence/OllieFeatureSetSpec.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.ollie.confidence 2 | 3 | import org.junit._ 4 | import org.junit.Assert._ 5 | import org.specs2.mutable.Specification 6 | import org.junit.runner.RunWith 7 | import org.specs2.runner.JUnitRunner 8 | import edu.knowitall.tool.parse.graph.DependencyGraph 9 | import edu.knowitall.ollie.Ollie 10 | import edu.knowitall.ollie.OllieExtractionInstance 11 | import edu.knowitall.ollie.ScoredOllieExtractionInstance 12 | import edu.knowitall.openparse.OpenParse 13 | import org.junit.runner.RunWith 14 | import org.specs2.runner.JUnitRunner 15 | 16 | @RunWith(classOf[JUnitRunner]) 17 | object OllieFeatureSetSpec extends Specification { 18 | val ollie = new Ollie(OpenParse.withDefaultModel()) 19 | 20 | "if right before arg1" in { 
21 | val graph = DependencyGraph.deserialize("poss(father_NN_2_12, his_PRP$_1_8); punct(father_NN_2_12, ,_,_3_19); appos(father_NN_2_12, Whitechapel_NNP_4_21); punct(father_NN_2_12, ,_,_5_33); advmod(betrays_VBZ_6_35, However_RB_0_0); nsubj(betrays_VBZ_6_35, father_NN_2_12); dobj(betrays_VBZ_6_35, whereabouts_NN_8_47); punct(betrays_VBZ_6_35, ,_,_9_59); xcomp(betrays_VBZ_6_35, fearing_VBG_10_61); punct(betrays_VBZ_6_35, ._._27_149); poss(whereabouts_NN_8_47, his_PRP$_7_43); ccomp(fearing_VBG_10_61, die_VB_15_87); poss(son_NN_13_78, his_PRP$_12_74); complm(die_VB_15_87, that_IN_11_69); nsubj(die_VB_15_87, son_NN_13_78); aux(die_VB_15_87, will_MD_14_82); advcl(die_VB_15_87, captured_VBN_20_104); mark(captured_VBN_20_104, if_IN_16_91); nsubjpass(captured_VBN_20_104, he_PRP_17_94); auxpass(captured_VBN_20_104, is_VBZ_18_97); neg(captured_VBN_20_104, not_RB_19_100); cc(captured_VBN_20_104, and_CC_21_113); conj(captured_VBN_20_104, returned_VBN_22_117); dobj(captured_VBN_20_104, home_NN_23_126); prep(captured_VBN_20_104, to_TO_24_131); pobj(to_TO_24_131, plantation_NN_26_138); det(plantation_NN_26_138, the_DT_25_134)") 22 | val extrs = ollie.extract(graph) 23 | 24 | val extr = extrs.toSeq(2) 25 | OllieFeatures.ifRightBeforeArg1(extr) must_== 1.0 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /core/src/test/scala/edu/knowitall/openparse/BuildPatternsSpec.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse 2 | 3 | import org.junit._ 4 | import org.junit.Assert._ 5 | import org.specs2.mutable.Specification 6 | import org.junit.runner.RunWith 7 | import org.specs2.runner.JUnitRunner 8 | 9 | import edu.knowitall.tool.parse.graph.DependencyGraph 10 | import edu.knowitall.tool.stem.MorphaStemmer 11 | 12 | @RunWith(classOf[JUnitRunner]) 13 | object BuildPatternsSpecTest extends Specification { 14 | def findPatterns(row: (String, String, String, String, 
String), maxLength: Option[Int] = None) = { 15 | val (rel, arg1, arg2, lemmasString, pickled) = row 16 | val lemmas = lemmasString.split("\\s+").toSet 17 | val graph = DependencyGraph.deserialize(pickled).map(_.lemmatize(MorphaStemmer)).normalize 18 | BuildPatterns.findRelationPatterns(graph, rel, arg1, arg2, lemmas, maxLength) 19 | } 20 | 21 | "A pattern is found when the argument overlap" in { 22 | val row, (arg1, rel, arg2, lemmas, pickled) = ("be marry to", "hillary clinton", "bill clinton", "hillary clinton marry bill", "cc(married_VBN_11_0, And_CC_0_0); nn(Clinton_NNP_2_0, Hillary_NNP_1_0); nsubjpass(married_VBN_11_0, Clinton_NNP_2_0); punct(Clinton_NNP_2_0, _,_3_0); dep(know_VBP_8_0, who_WP_4_0); punct(know_VBP_8_0, _,_5_0); mark(know_VBP_8_0, as_IN_6_0); nsubj(know_VBP_8_0, we_PRP_7_0); rcmod(Clinton_NNP_2_0, know_VBP_8_0); punct(Clinton_NNP_2_0, _,_9_0); auxpass(married_VBN_11_0, is_VBZ_10_0); nn(Clinton_NNP_14_0, Bill_NNP_13_0); prep_to(married_VBN_11_0, Clinton_NNP_14_0); punct(married_VBN_11_0, ._._15_0)") 23 | val patterns = findPatterns(row) 24 | patterns.size must_== 1 25 | patterns.head._1.toString must_== "{arg1} prep_to> {arg2}" 26 | } 27 | 28 | "A pattern is found with exactly one slot" in { 29 | val row = ("arrive in", "barack obama", "afghanistan", "barack obama arrive afghanistan", "(to_TO_4_0), (in_IN_12_0), (on_IN_14_0), (or_CC_16_0), (for_IN_20_0), (to_TO_23_0), (and_CC_27_0), (in_IN_29_0), (of_IN_34_0), (from_IN_38_0), poss(trip_NN_3_0, his_PRP$_1_0); amod(trip_NN_3_0, two-day_JJ_2_0); pobj(After_IN_0_0, trip_NN_3_0); prep_to(trip_NN_3_0, Afghanistan_NNP_5_0); punct(trip_NN_3_0, ,_,_6_0); nn(Obama_NNP_10_0, U.S._NNP_7_0); nn(Obama_NNP_10_0, Senator_NNP_8_0); nn(Obama_NNP_10_0, Barack_NNP_9_0); nsubj(arrived_VBD_11_0, Obama_NNP_10_0); rcmod(trip_NN_3_0, arrived_VBD_11_0); prep_in(arrived_VBD_11_0, Iraq_NNP_13_0); prep_on(arrived_VBD_11_0, Monday_NNP_15_0); prep_on(arrived_VBD_11_0, July_NNP_17_0); conj_or(Monday_NNP_15_0, July_NNP_17_0); 
num(July_NNP_17_0, 21_CD_18_0); punct(trip_NN_3_0, ,_,_19_0); det(visit_NN_22_0, a_DT_21_0); prep_for(trip_NN_3_0, visit_NN_22_0); det(East_NNP_26_0, the_DT_24_0); nn(East_NNP_26_0, Middle_NNP_25_0); prep_to(visit_NN_22_0, East_NNP_26_0); prep_to(visit_NN_22_0, Europe_NNP_28_0); conj_and(East_NNP_26_0, Europe_NNP_28_0); poss(capacity_NN_31_0, his_PRP$_30_0); prep_in(visit_NN_22_0, capacity_NN_31_0); det(member_NN_33_0, a_DT_32_0); dep(capacity_NN_31_0, member_NN_33_0); det(Senate_NNP_37_0, the_DT_35_0); nn(Senate_NNP_37_0, U.S._NNP_36_0); prep_of(member_NN_33_0, Senate_NNP_37_0); prep_from(member_NN_33_0, Illinois_NNP_39_0); punct(After_IN_0_0, ._._40_0)") 30 | val patterns = findPatterns(row) 31 | patterns.size must_== 1 32 | patterns.head._1.toString must_== "{arg1} prep_to> {arg2}" 33 | } 34 | 35 | "A pattern is NOT found because of a length restriction" in { 36 | val row = ("arrive in", "barack obama", "afghanistan", "barack obama arrive afghanistan", "(to_TO_4_0), (in_IN_12_0), (on_IN_14_0), (or_CC_16_0), (for_IN_20_0), (to_TO_23_0), (and_CC_27_0), (in_IN_29_0), (of_IN_34_0), (from_IN_38_0), poss(trip_NN_3_0, his_PRP$_1_0); amod(trip_NN_3_0, two-day_JJ_2_0); pobj(After_IN_0_0, trip_NN_3_0); prep_to(trip_NN_3_0, Afghanistan_NNP_5_0); punct(trip_NN_3_0, ,_,_6_0); nn(Obama_NNP_10_0, U.S._NNP_7_0); nn(Obama_NNP_10_0, Senator_NNP_8_0); nn(Obama_NNP_10_0, Barack_NNP_9_0); nsubj(arrived_VBD_11_0, Obama_NNP_10_0); rcmod(trip_NN_3_0, arrived_VBD_11_0); prep_in(arrived_VBD_11_0, Iraq_NNP_13_0); prep_on(arrived_VBD_11_0, Monday_NNP_15_0); prep_on(arrived_VBD_11_0, July_NNP_17_0); conj_or(Monday_NNP_15_0, July_NNP_17_0); num(July_NNP_17_0, 21_CD_18_0); punct(trip_NN_3_0, ,_,_19_0); det(visit_NN_22_0, a_DT_21_0); prep_for(trip_NN_3_0, visit_NN_22_0); det(East_NNP_26_0, the_DT_24_0); nn(East_NNP_26_0, Middle_NNP_25_0); prep_to(visit_NN_22_0, East_NNP_26_0); prep_to(visit_NN_22_0, Europe_NNP_28_0); conj_and(East_NNP_26_0, Europe_NNP_28_0); poss(capacity_NN_31_0, 
his_PRP$_30_0); prep_in(visit_NN_22_0, capacity_NN_31_0); det(member_NN_33_0, a_DT_32_0); dep(capacity_NN_31_0, member_NN_33_0); det(Senate_NNP_37_0, the_DT_35_0); nn(Senate_NNP_37_0, U.S._NNP_36_0); prep_of(member_NN_33_0, Senate_NNP_37_0); prep_from(member_NN_33_0, Illinois_NNP_39_0); punct(After_IN_0_0, ._._40_0)") 37 | val patterns = findPatterns(row, Some(2)) 38 | patterns.size must_== 0 39 | } 40 | 41 | // rel rel 42 | "A pattern is found" in { 43 | val row = ("be bear a", "queequag", "slave", "bear queequag slave", "(in_IN_5_0), (._._7_0), nsubjpass(born_VBN_2_0, Queequag_NNP_0_0); auxpass(born_VBN_2_0, was_VBD_1_0); dobj(born_VBN_2_0, slave_NN_4_0); det(slave_NN_4_0, a_DT_3_0); prep_in(slave_NN_4_0, Africa_NNP_6_0)") 44 | val patterns = findPatterns(row, Some(2)) 45 | patterns.size must_== 1 46 | patterns.head._1.toString must_== "{arg1} dobj> {arg2}" 47 | } 48 | 49 | "A single pattern is found with a slot instead of a rel rel" in { 50 | val row = ("be elect president of", "barack obama", "unite state", "barack obama unite state elect president", "(of_IN_5_0), (._._9_0), nn(Obama_NNP_1_0, Barack_NNP_0_0); nsubjpass(elected_VBN_3_0, Obama_NNP_1_0); auxpass(elected_VBN_3_0, was_VBD_2_0); dobj(elected_VBN_3_0, president_NN_4_0); prep_of(president_NN_4_0, States_NNPS_8_0); det(States_NNPS_8_0, the_DT_6_0); nn(States_NNPS_8_0, United_NNP_7_0)") 51 | val patterns = findPatterns(row) 52 | patterns.size must_== 1 53 | patterns.head._1.toString must_== "{arg1} dobj> {rel1:postag=NN} >prep_of> {arg2}" 54 | } 55 | 56 | "A single pattern is found with a slot instead of a rel rel" in { 57 | val row = ("be team locate in", "mariner", "seattle", "mariner team locate seattle", "(in_IN_6_0), (._._8_0), det(Mariners_NNPS_1_0, The_DT_0_0); nsubj(team_NN_4_0, Mariners_NNPS_1_0); cop(team_NN_4_0, are_VBP_2_0); det(team_NN_4_0, a_DT_3_0); partmod(team_NN_4_0, located_VBN_5_0); prep_in(located_VBN_5_0, Seattle_NNP_7_0)") 58 | val patterns = findPatterns(row) 59 | 
patterns.head._1.toString must_== "{arg1} partmod> {rel1:postag=VBN} >prep_in> {arg2}" 60 | } 61 | 62 | "A single pattern is found with a slot instead of a rel rel" in { 63 | val row = ("be going populate", "human", "earth", "human go populate earth", "(._._7_0), nsubj(going_VBG_2_0, Humans_NNS_0_0); aux(going_VBG_2_0, are_VBP_1_0); xcomp(going_VBG_2_0, populate_VB_4_0); aux(populate_VB_4_0, to_TO_3_0); dobj(populate_VB_4_0, earth_NN_6_0); det(earth_NN_6_0, the_DT_5_0)") 64 | val patterns = findPatterns(row) 65 | patterns.size must_== 1 66 | patterns.head._1.toString must_== "{arg1} xcomp> {rel:postag=VB} >dobj> {arg2}" 67 | } 68 | 69 | "A single pattern is found with a slot instead of a rel rel" in { 70 | val row = ("have crush on", "juliette", "romeo", "juliette have crush romeo", "(on_IN_4_0), (._._6_0), nsubj(has_VBZ_1_0, Juliette_NNP_0_0); dobj(has_VBZ_1_0, crush_NN_3_0); det(crush_NN_3_0, a_DT_2_0); prep_on(crush_NN_3_0, Romeo_NNP_5_0)") 71 | val patterns = findPatterns(row) 72 | patterns.size must_== 1 73 | patterns.head._1.toString must_== "{arg1} dobj> {rel1:postag=NN} >prep_on> {arg2}" 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /core/src/test/scala/edu/knowitall/openparse/ExtractorPatternSpec.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse 2 | 3 | import org.junit._ 4 | import org.junit.Assert._ 5 | import org.specs2.mutable.Specification 6 | import org.junit.runner.RunWith 7 | import org.specs2.runner.JUnitRunner 8 | 9 | import edu.knowitall.tool.parse.graph.DependencyPattern 10 | import edu.knowitall.ollie.Ollie.stemmer 11 | 12 | @RunWith(classOf[JUnitRunner]) 13 | object ExtractorPatternSpecTest extends Specification { 14 | def testSymmetric(pattern: String, symmetric: Boolean) { 15 | (pattern + " is " + (if (symmetric) "symmetric" else "not symmetric")) in { 16 | new 
ExtractorPattern(DependencyPattern.deserialize(pattern)).symmetric must be_==(symmetric) 17 | } 18 | } 19 | 20 | testSymmetric("{arg1} dobj> {arg2}", false) 21 | testSymmetric("{arg1} nsubj> {arg2}", true) 22 | testSymmetric("{arg1} prep_of> {arg2}", true) 23 | testSymmetric("{rel:postag=NN} nn> {arg2}", false) 24 | } 25 | -------------------------------------------------------------------------------- /core/src/test/scala/edu/knowitall/openparse/OllieSpec.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse 2 | 3 | import org.junit._ 4 | import org.junit.Assert._ 5 | import org.specs2.mutable.Specification 6 | import org.junit.runner.RunWith 7 | import org.specs2.runner.JUnitRunner 8 | import edu.knowitall.tool.parse.graph.DependencyGraph 9 | import edu.knowitall.tool.stem.MorphaStemmer 10 | import edu.knowitall.ollie.Ollie 11 | import edu.knowitall.ollie.OllieExtractionInstance 12 | import edu.knowitall.ollie.ScoredOllieExtractionInstance 13 | import edu.knowitall.ollie.confidence.OllieConfidenceFunction 14 | 15 | @RunWith(classOf[JUnitRunner]) 16 | object OllieSpecTest extends Specification { 17 | val ollie = new Ollie(OpenParse.withDefaultModel()) 18 | val conf = OllieConfidenceFunction.loadDefaultClassifier() 19 | 20 | "Ollie finds an example extraction" in { 21 | val graph = DependencyGraph.deserialize("(._._5_37), nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)") 22 | val extrs = ollie.extract(graph) 23 | 24 | val extr = extrs.head 25 | extr must_== OllieExtractionInstance.tabDeserialize(extr.tabSerialize) 26 | 27 | val scored = new ScoredOllieExtractionInstance(true, extr) 28 | scored must_== ScoredOllieExtractionInstance.tabDeserialize(scored.tabSerialize) 29 | } 30 | 31 | "Ollie confidence function executes" in { 32 | val graph = DependencyGraph.deserialize("(._._5_37), 
nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)") 33 | val extrs = ollie.extract(graph) 34 | extrs map conf must not(throwA[Exception]) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /core/src/test/scala/edu/knowitall/openparse/OpenParseSpec.scala: -------------------------------------------------------------------------------- 1 | package edu.knowitall.openparse 2 | 3 | import org.junit._ 4 | import org.junit.Assert._ 5 | import org.specs2.mutable.Specification 6 | import org.junit.runner.RunWith 7 | import org.specs2.runner.JUnitRunner 8 | 9 | import edu.knowitall.tool.parse.graph.DependencyGraph 10 | import edu.knowitall.tool.stem.MorphaStemmer 11 | 12 | @RunWith(classOf[JUnitRunner]) 13 | object OpenParseSpecTest extends Specification { 14 | val openparse = OpenParse.withDefaultModel() 15 | 16 | "OpenParse finds an example extraction" in { 17 | val graph = DependencyGraph.deserialize("(._._5_37), nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)") 18 | val extrs = openparse.extract(graph) 19 | 20 | extrs.size must_== 1 21 | extrs.head._2.toString must_== "(OpenParse; finds; an example extraction)" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | edu.washington.cs.knowitall.ollie 5 | ollie-example 6 | ollie-example 7 | 1.0.0-SNAPSHOT 8 | 9 | edu.washington.cs.knowitall 10 | knowitall-oss 11 | 1.0.2 12 | 13 | 14 | UTF-8 15 | 2.4.0 16 | 17 | 18 | 19 | edu.washington.cs.knowitall.ollie 20 | ollie-core_2.9.2 21 | 1.0.2 22 | 23 | 24 | edu.washington.cs.knowitall.nlptools 25 | nlptools-parse-malt_2.9.2 26 | ${nlptools.version} 
27 | 28 | 29 | 30 | ch.qos.logback 31 | logback-classic 32 | 1.0.7 33 | 34 | 35 | ch.qos.logback 36 | logback-core 37 | 1.0.7 38 | 39 | 40 | 41 | src/main/scala 42 | src/test/scala 43 | 44 | 45 | src/main/resources 46 | 47 | 48 | 49 | 50 | net.alchim31.maven 51 | scala-maven-plugin 52 | 3.1.0 53 | 54 | 55 | -deprecation 56 | -unchecked 57 | 58 | 59 | 60 | 61 | 62 | compile 63 | testCompile 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /example/src/main/java/example/JavaOllieWrapper.java: -------------------------------------------------------------------------------- 1 | package example; 2 | 3 | import java.io.File; 4 | import java.net.MalformedURLException; 5 | 6 | import edu.knowitall.ollie.Ollie; 7 | import edu.knowitall.ollie.OllieExtraction; 8 | import edu.knowitall.ollie.OllieExtractionInstance; 9 | import edu.knowitall.tool.parse.MaltParser; 10 | import edu.knowitall.tool.parse.graph.DependencyGraph; 11 | 12 | /** This is an example class that shows one way of using Ollie from Java. */ 13 | public class JavaOllieWrapper { 14 | // the extractor itself 15 | private Ollie ollie; 16 | 17 | // the parser--a step required before the extractor 18 | private MaltParser maltParser; 19 | 20 | // the path of the malt parser model file 21 | private static final String MALT_PARSER_FILENAME = "engmalt.linear-1.7.mco"; 22 | 23 | public JavaOllieWrapper() throws MalformedURLException { 24 | // initialize MaltParser 25 | scala.Option nullOption = scala.Option.apply(null); 26 | maltParser = new MaltParser(new File(MALT_PARSER_FILENAME).toURI().toURL(), nullOption); 27 | 28 | // initialize Ollie 29 | ollie = new Ollie(); 30 | } 31 | 32 | /** 33 | * Gets Ollie extractions from a single sentence. 
34 | * @param sentence 35 | * @return the set of ollie extractions 36 | */ 37 | public Iterable extract(String sentence) { 38 | // parse the sentence 39 | DependencyGraph graph = maltParser.dependencyGraph(sentence); 40 | 41 | // run Ollie over the sentence and convert to a Java collection 42 | Iterable extrs = scala.collection.JavaConversions.asJavaIterable(ollie.extract(graph)); 43 | return extrs; 44 | } 45 | 46 | public static void main(String args[]) throws MalformedURLException { 47 | System.out.println(JavaOllieWrapper.class.getResource("/logback.xml")); 48 | // initialize 49 | JavaOllieWrapper ollieWrapper = new JavaOllieWrapper(); 50 | 51 | // extract from a single sentence. 52 | String sentence = "President Obama will meet with Congressional leaders on Friday, and House Republicans summoned lawmakers back for a Sunday session, in a last-ditch effort to avert a fiscal crisis brought on by automatic tax increases and spending cuts scheduled to hit next week."; 53 | Iterable extrs = ollieWrapper.extract(sentence); 54 | 55 | // print the extractions. 
56 | for (OllieExtractionInstance inst : extrs) { 57 | OllieExtraction extr = inst.extr(); 58 | System.out.println(extr.arg1().text()+"\t"+extr.rel().text()+"\t"+extr.arg2().text()); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /example/src/main/resouces/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /example/src/main/scala/ollie/Example.scala: -------------------------------------------------------------------------------- 1 | package ollie 2 | 3 | import edu.knowitall.ollie.Ollie 4 | import edu.knowitall.tool.parse.MaltParser 5 | import scala.io.Source 6 | import edu.knowitall.ollie.confidence.OllieConfidenceFunction 7 | 8 | /** This is an example project that takes lines as input from stdin, 9 | * parses them, runs the Ollie extractor on them, scores the 10 | * extractions with a confidence function, and then prints the results. 11 | * 12 | * You can run this project with the following command: 13 | * mvn clean compile exec:java -Dexec.mainClass=ollie.Example 14 | * 15 | * You will need to have engmalt.linear-1.7.mco in the base directory 16 | * of this example for the program to work. 
You can download this 17 | * file from the MaltParser website: 18 | * 19 | * http://www.maltparser.org/mco/english_parser/engmalt.html 20 | */ 21 | object Example extends App { 22 | val parser = new MaltParser 23 | val ollie = new Ollie 24 | val confidence = OllieConfidenceFunction.loadDefaultClassifier() 25 | for (line <- Source.stdin.getLines; if !line.trim.isEmpty) { 26 | val parsed = parser.dependencyGraph(line) 27 | val extractionInstances = ollie.extract(parsed) 28 | 29 | println("Extractions:") 30 | for (inst <- extractionInstances) { 31 | val conf = confidence(inst) 32 | println(("%.2f" format conf) + "\t" + inst.extraction) 33 | } 34 | println("Waiting for next input...") 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | ollie 5 | ollie 6 | 1.0.0-SNAPSHOT 7 | pom 8 | edu.washington.cs.knowitall.ollie 9 | 10 | UTF-8 11 | 12 | 13 | core 14 | app 15 | 16 | 17 | --------------------------------------------------------------------------------