├── .dockerignore
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── README.md
├── app
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── resources
    │       │   └── logback.xml
    │       └── scala
    │       │   └── edu
    │       │       └── knowitall
    │       │           ├── ollie
    │       │               ├── OllieCli.scala
    │       │               └── SentenceIterator.scala
    │       │           └── openparse
    │       │               ├── OpenParseCli.scala
    │       │               ├── OpenParseGui.scala
    │       │               └── gui
    │       │                   ├── Dot.scala
    │       │                   ├── ExtractionEntry.scala
    │       │                   ├── Parser.scala
    │       │                   └── Sentence.scala
    │   └── test
    │       └── resources
    │           └── logback-test.xml
├── core
    ├── build.sbt
    ├── here.txt
    ├── pom.xml
    ├── project
    │   └── plugins.sbt
    ├── scripts
    │   ├── applypatterns.sh
    │   ├── build_templates.sh
    │   ├── create_patterns.sh
    │   ├── create_test_train.sh
    │   ├── extractor.sh
    │   └── keep_common_patterns.sh
    └── src
    │   ├── main
    │       ├── resources
    │       │   └── edu
    │       │   │   └── knowitall
    │       │   │       ├── ollie
    │       │   │           ├── cognitiveWords.txt
    │       │   │           ├── communicationWords.txt
    │       │   │           ├── confidence
    │       │   │           │   └── default-classifier.txt
    │       │   │           └── prefixWords.txt
    │       │   │       └── openparse
    │       │   │           ├── categories
    │       │   │               ├── location.txt
    │       │   │               └── person.txt
    │       │   │           └── openparse.model
    │       └── scala
    │       │   └── edu
    │       │       └── knowitall
    │       │           ├── common
    │       │               └── enrich
    │       │               │   └── Traversable.scala
    │       │           ├── ollie
    │       │               ├── DependencyGraphExtras.scala
    │       │               ├── NaryExtraction.scala
    │       │               ├── Ollie.scala
    │       │               ├── OllieExtraction.scala
    │       │               ├── OllieExtractionInstance.scala
    │       │               ├── ScoredOllieExtractionInstance.scala
    │       │               ├── confidence
    │       │               │   ├── OllieConfidenceFunction.scala
    │       │               │   ├── OllieFeatureEvaluation.scala
    │       │               │   ├── OllieFeatureSet.scala
    │       │               │   └── train
    │       │               │   │   ├── CrossValidateConfidence.scala
    │       │               │   │   └── TrainOllieConfidence.scala
    │       │               └── output
    │       │               │   └── BratOutput.scala
    │       │           └── openparse
    │       │               ├── AnalyzePatterns.scala
    │       │               ├── BuildPatterns.scala
    │       │               ├── ExtractorPattern.scala
    │       │               ├── GraphExpansions.scala
    │       │               ├── OpenParse.scala
    │       │               ├── bootstrap
    │       │                   ├── FilterTargetExtractions.scala
    │       │                   ├── FindCommon.scala
    │       │                   ├── FindTargetArguments.scala
    │       │                   └── FindTargetExtractions.scala
    │       │               ├── eval
    │       │                   ├── GroupScoredBy.scala
    │       │                   ├── PrecisionYield.scala
    │       │                   ├── RankPatterns.scala
    │       │                   ├── Score.scala
    │       │                   └── StatisticalSignificance.scala
    │       │               ├── extract
    │       │                   ├── Extraction.scala
    │       │                   ├── GeneralExtractor.scala
    │       │                   ├── PatternExtractor.scala
    │       │                   ├── SpecificExtractor.scala
    │       │                   └── TemplateExtractor.scala
    │       │               └── template
    │       │                   ├── BuildTemplates.scala
    │       │                   ├── CountsToConfidence.scala
    │       │                   ├── GeneralizeTemplate.scala
    │       │                   └── PassiveReflections.scala
    │   └── test
    │       ├── resources
    │           └── logback-test.xml
    │       └── scala
    │           └── edu
    │               └── knowitall
    │                   ├── common
    │                       └── enrich
    │                       │   └── TraversableSpecTest.scala
    │                   ├── ollie
    │                       ├── DependencyGraphExtrasSpec.scala
    │                       └── confidence
    │                       │   └── OllieFeatureSetSpec.scala
    │                   └── openparse
    │                       ├── BuildPatternsSpec.scala
    │                       ├── ExtractorPatternSpec.scala
    │                       ├── OllieSpec.scala
    │                       ├── OpenParseSpec.scala
    │                       └── PatternExtractorSpec.scala
├── data
    └── training.tsv
├── example
    ├── pom.xml
    └── src
    │   └── main
    │       ├── java
    │           └── example
    │           │   └── JavaOllieWrapper.java
    │       ├── resouces
    │           └── logback.xml
    │       └── scala
    │           └── ollie
    │               └── Example.scala
└── pom.xml


/.dockerignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | .dockerignore
3 | .gitignore
4 | .git
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | .cache
3 | .classpath
4 | .project
5 | .settings
6 | engmalt.linear.mco
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 |   - "2.9.2"
4 | jdk:
5 |   - oraclejdk7
6 |   - openjdk7
7 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM maven:3.5.2-jdk-7
 2 | 
 3 | WORKDIR /stage
 4 | 
 5 | COPY ./ /stage/
 6 | RUN curl http://www.maltparser.org/mco/english_parser/engmalt.linear-1.7.mco > /stage/engmalt.linear-1.7.mco
 7 | RUN mvn clean package
 8 | 
 9 | CMD ["java", "-Xmx512m", "-jar", "ollie-app-1.0.1-SNAPSHOT.jar"]
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Ollie Software License Agreement
 2 |  
 3 | Ollie Software
 4 | (C) 2011-2012, University of Washington.  All rights reserved.
 5 | US patent number 7,877,343 and 12/970,155 patent pending
 6 |  
 7 | The University of Washington (UW), Professor Mausam, Michael Schmitz, Robert
 8 | Bart, and Stephen Soderland, (Developers) give permission for you and your
 9 | laboratory (University) to use Ollie. Ollie is a system that extracts
10 | relational triples from text. Ollie is protected by a United States copyright
11 | and patents. The National Science Foundation supported work on Ollie. Under
12 | University of Washington's patents 7,877,343 (issued) and 12/970,155 (patent
13 | pending), the UW grants to you the non-exclusive right to use patent claims
14 | practiced by the University of Washington's Ollie software solely for
15 | non-commercial purposes and as long as you comply with the terms of this Ollie
16 | Software License Agreement.  UW and the Developers allow you to copy and modify
17 | Ollie for non-commercial purposes, and to distribute modifications through
18 | GitHub or directly to the University of Washington, on the following
19 | conditions:
20 | 
21 |  
22 | 1.  Ollie is not used for any commercial purposes, or as part of a system
23 | which has commercial purposes.
24 | 
25 |  
26 | 2.  Any software derived from Ollie must carry prominent notices stating that
27 | you modified it along with the date modified.  The derivative must also carry
28 | prominent notices stating that it is released under this Ollie Software
29 | License Agreement
30 |  
31 | If you wish to obtain Ollie or to obtain any patent rights for any commercial
32 | purposes, you will need to contact the University of Washington to see if
33 | rights are available and to negotiate a commercial license and pay a fee. This
34 | includes, but is not limited to, using Ollie to provide services to outside
35 | parties for a fee. In that case please contact:
36 |  
37 |     UW Center for Commercialization 
38 |     University of Washington
39 |     4311 11th Ave. NE,
40 |     Suite 500 Seattle, WA 98105-4608
41 | 
42 |     Phone: (206) 543-3970
43 |     Email: license@u.washington.edu
44 | 
45 |  
46 | 3. You retain in Ollie and any modifications to Ollie, the copyright,
47 | trademark, patent or other notices pertaining to Ollie as provided by UW.
48 | 
49 |  
50 | 4. You provide the Developers with feedback on the use of the Ollie software
51 | in your research, and that the Developers and UW are permitted to use any
52 | information you provide in making changes to the Ollie software. All bug
53 | reports and technical questions shall be sent to: afader@cs.washington.edu.
54 | Modifications may be communicated through GitHub pull requests at:
55 | 
56 |     https://github.com/knowitall/
57 | 
58 |  
59 | 5. You acknowledge that the Developers, UW and its licensees may develop
60 | modifications to Ollie that may be substantially similar to your modifications
61 | of Ollie, and that the Developers, UW and its licensees shall not be
62 | constrained in any way by you in UW's or its licensees' use or management of
63 | such modifications. You acknowledge the right of the Developers and UW to
64 | prepare and publish modifications to Ollie that may be substantially similar
65 | or functionally equivalent to your modifications and improvements, and if you
66 | obtain patent protection for any modification or improvement to Ollie you
67 | agree not to allege or enjoin infringement of your patent by the Developers,
68 | the UW or by any of UW's licensees obtaining modifications or improvements to
69 | Ollie from the University of Washington or the Developers.
70 | 
71 |  
72 | 6. If utilization of the Ollie software results in outcomes which will be
73 | published, please specify the version of Ollie you used and cite the UW
74 | Developers.
75 | 
76 |   @inproceedings{ollie-emnlp12,
77 |       author = {Mausam and Michael Schmitz and Robert Bart and 
78 |           Stephen Soderland and Oren Etzioni},
79 |       title = {Open Language Learning for Information Extraction},
80 |       booktitle = {Proceedings of Conference on Empirical Methods in 
81 |           Natural Language Processing and Computational Natural 
82 |           Language Learning (EMNLP-CONLL)},
83 |       year = {2012}
84 |   }
85 | 
86 |  
87 | 7. Any risk associated with using the Ollie software at your organization is
88 | with you and your organization. Ollie is experimental in nature and is made
89 | available as a research courtesy "AS IS," without obligation by UW to provide
90 | accompanying services or support.
91 | 
92 |  
93 | UW AND THE AUTHORS EXPRESSLY DISCLAIM ANY AND ALL WARRANTIES REGARDING THE 
94 | SOFTWARE, WHETHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES 
95 | PERTAINING TO MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
96 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Ollie
  2 | 
  3 | Ollie is a program that automatically identifies and extracts binary
  4 | relationships from English sentences.  Ollie is designed for Web-scale
  5 | information extraction, where target relations are not specified in advance.
  6 | 
  7 | Ollie is our second-generation information extraction system .  Whereas <a
  8 | href="http://reverb.cs.washington.edu/">ReVerb</a> operates on flat sequences
  9 | of tokens, Ollie works with the tree-like (graph with only small cycles)
 10 | representation using Stanford's compression of the dependencies.  This allows
 11 | Ollie to capture expression that ReVerb misses, such as long-range relations.
 12 | 
 13 | Ollie also captures context that modifies a binary relation.  Presently Ollie
 14 | handles attribution (He said/she believes) and enabling conditions (if X
 15 | then).
 16 | 
 17 | ## Quick Start
 18 | 
 19 | ### Docker
 20 | 
 21 | You can now run Ollie with a single Docker command.
 22 | 
 23 | ```
 24 | docker run -it schmmd/ollie:latest
 25 | ```
 26 | 
 27 | To configure Ollie, you can drop into a bash shell with `docker run -it schmmd/ollie:latest /bin/bash` 
 28 | and run Ollie from the command line.
 29 | 
 30 | ### Local Machine
 31 | 
 32 | If you want to run Ollie on a small amount of text without modifying the source
 33 | code, you can use an executable file that can be run from the command line.
 34 | Please note that Ollie was built using Scala 2.9 and so it requires Java 7.
 35 | Follow these steps to get started:
 36 | 
 37 | 1.  Download the latest Ollie binary from
 38 |     http://knowitall.cs.washington.edu/ollie/ollie-app-latest.jar.
 39 | 
 40 | 2.  Download the linear English MaltParser model (engmalt.linear-1.7.mco) from
 41 |     http://www.maltparser.org/mco/english_parser/engmalt.html
 42 |     and place it in the same directory as Ollie.
 43 | 
 44 | 3.  Run `java -Xmx512m -jar ollie-app-latest.jar yourfile.txt`.  The input file
 45 |     should contain one sentence per line unless `--split` is specified.  Omit
 46 |     the input file for an interactive console.
 47 | 
 48 | ## Examples
 49 | 
 50 | ### Enabling Condition
 51 | 
 52 | An enabling condition is a condition that needs to be met for the extraction to
 53 | be true.  Certain words demark an enabling condition, such as "if" and "when".
 54 | Ollie captures enabling conditions if they are present.
 55 | 
 56 |     sentence: If I slept past noon, I'd be late for work.
 57 |     extraction: (I; 'd be late for; work)[enabler=If I slept past noon]
 58 | 
 59 | ### Attribution
 60 | 
 61 | An attribution clause specifies an entity that asserted an extraction and a
 62 | verb that specifies the expression.  Ollie captures attributions if they are
 63 | present.
 64 | 
 65 |     sentence: Some people say Barack Obama was not born in the United States.
 66 |     extraction: (Barack Obama; was not born in; the United States)[attrib=Some people say]
 67 | 
 68 |     sentence: Early astronomers believe that the earth is the center of the universe.
 69 |     extraction: (the earth; is the center of; the universe)[attrib=Early astronomers believe]
 70 | 
 71 | ### Relational noun
 72 | 
 73 | Some relations are expressed without verbs.  Ollie can capture these as well as
 74 | verb-mediated relations.
 75 | 
 76 |     sentence: Microsoft co-founder Bill Gates spoke at a conference on Monday.
 77 |     extraction: (Bill Gates; be co-founder of; Microsoft)
 78 | 
 79 | 
 80 | ### N-ary extractions
 81 | 
 82 | Often times similar relations will specify different aspects of the same event.
 83 | Since Ollie captures long-range relations it can capture N-ary extractions by
 84 | collapsing extractions where the relation phrase only differs by the
 85 | preposition.
 86 | 
 87 |     sentence: I learned that the 2012 Sasquatch music festival is scheduled for May 25th until May 28th.
 88 |     extraction: (the 2012 Sasquatch music festival; is scheduled for; May 25th)
 89 |     extraction: (the 2012 Sasquatch music festival; is scheduled until; May 28th)
 90 |     nary: (the 2012 Sasquatch music festival; is scheduled; [for May 25th; to May 28th])
 91 | 
 92 | ## Building
 93 | 
 94 | Building Ollie from source requires Apache Maven (<http://maven.apache.org>).
 95 | First, clone or download the Ollie source from GitHub.  Run this command in the
 96 | top-level source folder to download the required dependencies, compile, and
 97 | create a single jar file.
 98 | 
 99 |     mvn clean package
100 | 
101 | The compiled class files will be put in the base directory.  The single
102 | executable jar file will be written to `ollie-app-VERSION.jar` where `VERSION`
103 | is the version number.
104 | 
105 | ## Command Line Interface
106 | 
107 | Once you have built Ollie, you can run it from the command line.
108 | 
109 |     java -Xmx512m -jar ollie-app-VERSION.jar yourfile.txt
110 | 
111 | Omit the input file for an interactive console.
112 | 
113 | Ollie takes sentences, one-per-line as input or splits text into sentences if
114 | `--split` is specified.  Run Ollie with `--usage` to see full usage.
115 | 
116 | The Ollie command line tool has a few output formats.  The output format is
117 | specified by `--output-format` and a valid format:
118 | 
119 | 1. The `interactive` format that is meant to be easily human readable.
120 | 2. The `tabbed` format is mean to be easily parsable.  A header will be output
121 |    as the first row to label the columns.
122 | 3. `tabbedsingle` is similar to `tabbed` but the extraction is output as (arg1; relation;
123 |    arg2) in a single column.
124 | 4. The `serialized` is meant to be fully deserialized into an
125 |    `OllieExtractionInstance` class.
126 | 
127 | ## Graphical Interface
128 | 
129 | Ollie works ontop of a subcomponent called OpenParse.  The distinction is
130 | largely technical; OpenParse does not handle attribution and enabling condition
131 | and uses a coarser confidence metric.  You can use a GUI application to
132 | visualize the OpenParse extractions in a parse tree.  To use it, you will need
133 | to have [graphviz](http://www.graphviz.org/) installed.  You can run the GUI
134 | with:
135 | 
136 |     java -Xms512M -Xmx1g -cp ollie-app-VERSION.jar edu.knowitall.openparse.OpenParseGui
137 | 
138 | By default, this application will look for graphviz's `dot` program at
139 | `/usr/bin/dot`.  You can specify a location with the `--graphviz` parameter.
140 | 
141 | You can try out your own models with `Options->Load Model...`.  To see an
142 | example model, look at `openparse.model` in `src/main/resources`. Your model
143 | may have one or more patterns in it.  If you want to see pattern matches
144 | (without node expansion) instead of triple extractions, you can choose to show
145 | the raw match with `Options->Raw Matches`.  This will allow you to use patterns
146 | that do not capture an arg1, rel, and arg2.
147 | 
148 | ## Parsers
149 | 
150 | Ollie is packaged to use Malt Parser, one of the fastest dependency parsers
151 | available.  You will need the model file (`engmalt.linear-1.7.mco`) in the
152 | directory the application is run from or you will need to specify its location
153 | with the `--malt-model` parameter.  Malt Parser models are available online.
154 | 
155 |   http://www.maltparser.org/mco/english_parser/engmalt.html
156 | 
157 | Ollie works with any other parser in the `nlptools` project.  For example, it
158 | is easy to swap out Malt for Stanford's parser.  Stanford's parser is not a
159 | part of the Ollie distribution by default because of licensing conflicts, but
160 | the Stanford parser was used as the execution parser for the results in the
161 | paper.  Malt Parser was used to bootstrap the patterns.  We are interested
162 | in Clear parser as an alternative, but it's not a trivial change because Clear
163 | uses a slightly different dependency representation.
164 | 
165 | ## Using Eclipse
166 | 
167 | To modify the Ollie source code in Eclipse, use the [M2Eclipse
168 | plugin](http://www.sonatype.org/m2eclipse/) along with
169 | [ScalaIDE](http://scala-ide.org/).  You can then import the project using 
170 | the following.
171 | 
172 |     File > Import > Existing Maven Projects
173 | 
174 | ## Including Ollie as a Dependency
175 | 
176 | Add the following as a Maven dependency.
177 | 
178 |     <groupId>edu.washington.cs.knowitall.ollie</groupId>
179 |     <artifactId>ollie-core_2.9.2</artifactId>
180 |     <version>[1.0.0, )</version>
181 | 
182 | The best way to find the latest version is to browse [Maven Central](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22edu.washington.cs.knowitall%22).
183 | 
184 | `ollie-core` does not include a way to parse sentences.  You will need to use a
185 | parser supplied by the [nlptools](https://github.com/knowitall/nlptools)
186 | project.  The source for for `ollie-app` is an excellent example of a project
187 | using `ollie-core` as a dependency.  `ollie-app` supplies a parser from
188 | [nlptools](https://github.com/knowitall/nlptools).
189 | 
190 | There is an example project that uses Ollie in the `example` folder of the
191 | source distribution.
192 | 
193 | ## Training the Confidence Function
194 | 
195 | While Ollie comes with a trained confidence function, it is possible to retrain
196 | the confidence function.  First, you need to run Ollie over a set of sentences
197 | and store the output in the *serialized* format.
198 | 
199 |     echo "Michael rolled down the hill." | java -jar ollie-app-1.0.0-SNAPSHOT.jar --serialized --output toannotate.tsv
200 | 
201 | Next you need to annotate the extractions.  Modify the output file and
202 | **change** the first column to a binary annotation--`1` for correct and `0` for
203 | wrong.  Your final file will look similar to `ollie/data/training.tsv`.  Now
204 | run the logistic regression trainer.
205 | 
206 |     java -cp ollie-app-1.0.0-SNAPSHOT.jar edu.washington.cs.knowitall.ollie.confidence.train.TrainOllieConfidence toannotate.tsv
207 | 
208 | ## Concurrency
209 | 
210 | When operating at web scale, parallelism is essential.  While the base Ollie
211 | extractor is immutable and thread safe, the parser may not be thread safe.  I
212 | do not know whether Malt parser is thread safe.
213 | 
214 | ## FAQ
215 | 
216 | 1.  How fast is Ollie?
217 | 
218 |     You should really benchmark Ollie yourself, but on my computer (a new computer in 2011), Ollie processed 5000 high-quality web sentences in 56 seconds, or 89 sentences per second, in a single thread.  Ollie is easily parallelizable and the Ollie extractor itself is threadsafe (see Concurrency section).
219 | 
220 | ## Contact
221 | 
222 | To contact the UW about Ollie, email knowit-ollie@cs.washington.edu.
223 | 
224 | ## Citing Ollie
225 | If you use Ollie in your academic work, please cite Ollie with the following 
226 | BibTeX citation:
227 | 
228 |     @inproceedings{ollie-emnlp12,
229 |       author = {Mausam and Michael Schmitz and Robert Bart and Stephen Soderland and Oren Etzioni},
230 |       title = {Open Language Learning for Information Extraction},
231 |       booktitle = {Proceedings of Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CONLL)},
232 |       year = {2012}
233 |     }
234 | 


--------------------------------------------------------------------------------
/app/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 |   <groupId>edu.washington.cs.knowitall.ollie</groupId>
  5 |   <artifactId>ollie-app</artifactId>
  6 |   <name>ollie-app</name>
  7 |   <version>1.0.1-SNAPSHOT</version>
  8 |   <parent>
  9 |     <groupId>edu.washington.cs.knowitall</groupId>
 10 |     <artifactId>knowitall-oss</artifactId>
 11 |     <version>1.0.2</version>
 12 |   </parent>
 13 |   <properties>
 14 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 15 |     <nlptools.version>2.4.0</nlptools.version>
 16 |   </properties>
 17 |   <dependencies>
 18 |     <dependency>
 19 |       <groupId>org.scala-lang</groupId>
 20 |       <artifactId>scala-swing</artifactId>
 21 |       <version>2.9.2</version>
 22 |     </dependency>
 23 |     <dependency>
 24 |       <groupId>edu.washington.cs.knowitall.ollie</groupId>
 25 |       <artifactId>ollie-core_2.9.2</artifactId>
 26 |       <version>1.0.2</version>
 27 |     </dependency>
 28 |     <dependency>
 29 |       <groupId>edu.washington.cs.knowitall.nlptools</groupId>
 30 |       <artifactId>nlptools-parse-malt_2.9.2</artifactId>
 31 |       <version>${nlptools.version}</version>
 32 |     </dependency>
 33 |     <dependency>
 34 |       <groupId>edu.washington.cs.knowitall.nlptools</groupId>
 35 |       <artifactId>nlptools-parse-stanford_2.9.2</artifactId>
 36 |       <version>${nlptools.version}</version>
 37 |     </dependency>
 38 |     <dependency>
 39 |       <groupId>edu.washington.cs.knowitall.nlptools</groupId>
 40 |       <artifactId>nlptools-sentence-opennlp_2.9.2</artifactId>
 41 |       <version>${nlptools.version}</version>
 42 |     </dependency>
 43 |     <dependency>
 44 |       <groupId>junit</groupId>
 45 |       <artifactId>junit</artifactId>
 46 |       <version>4.11</version>
 47 |       <scope>test</scope>
 48 |     </dependency>
 49 |     <dependency>
 50 |       <groupId>batik</groupId>
 51 |       <artifactId>batik-swing</artifactId>
 52 |       <version>1.6-1</version>
 53 |     </dependency>
 54 |     <dependency>
 55 |       <groupId>org.specs2</groupId>
 56 |       <artifactId>specs2_2.9.2</artifactId>
 57 |       <version>1.12.3</version>
 58 |       <scope>test</scope>
 59 |     </dependency>
 60 |     <!-- Logging -->
 61 |     <dependency>
 62 |       <groupId>ch.qos.logback</groupId>
 63 |       <artifactId>logback-classic</artifactId>
 64 |       <version>1.0.9</version>
 65 |     </dependency>
 66 |     <dependency>
 67 |       <groupId>ch.qos.logback</groupId>
 68 |       <artifactId>logback-core</artifactId>
 69 |       <version>1.0.9</version>
 70 |     </dependency>
 71 |   </dependencies>
 72 |   <build>
 73 |     <sourceDirectory>src/main/scala</sourceDirectory>
 74 |     <testSourceDirectory>src/test/scala</testSourceDirectory>
 75 |     <!-- build scala code -->
 76 |     <plugins>
 77 |       <plugin>
 78 |         <groupId>net.alchim31.maven</groupId>
 79 |         <artifactId>scala-maven-plugin</artifactId>
 80 |         <version>3.1.1</version>
 81 |         <configuration>
 82 |           <args>
 83 |             <arg>-deprecation</arg>
 84 |             <arg>-unchecked</arg>
 85 |           </args>
 86 |         </configuration>
 87 |         <executions>
 88 |           <execution>
 89 |             <goals>
 90 |               <goal>compile</goal>
 91 |               <goal>testCompile</goal>
 92 |             </goals>
 93 |           </execution>
 94 |         </executions>
 95 |       </plugin>
 96 |       <plugin>
 97 |         <artifactId>maven-assembly-plugin</artifactId>
 98 |         <configuration>
 99 |           <outputDirectory>${project.build.directory}/../..</outputDirectory>
100 |           <appendAssemblyId>false</appendAssemblyId>
101 |           <archive>
102 |             <manifest>
103 |               <mainClass>edu.knowitall.ollie.OllieCli</mainClass>
104 |             </manifest>
105 |           </archive>
106 |         </configuration>
107 |         <executions>
108 |           <execution>
109 |             <id>distro-assembly</id>
110 |             <phase>package</phase>
111 |             <goals>
112 |               <goal>single</goal>
113 |             </goals>
114 |           </execution>
115 |         </executions>
116 |       </plugin>
117 |     </plugins>
118 |   </build>
119 | </project>
120 | 


--------------------------------------------------------------------------------
/app/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <configuration>
 3 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <!-- encoders are assigned the type
 5 |          ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 |   <root level="info">
11 |     <appender-ref ref="STDOUT"/>
12 |   </root>
13 | </configuration>
14 | 


--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/ollie/SentenceIterator.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie
 2 | 
 3 | import edu.knowitall.tool.segment.Segmenter
 4 | 
 5 | class SentenceIterator(sentencer: Segmenter, private var lines: BufferedIterator[String]) extends Iterator[String] {
 6 |   var sentences: Iterator[String] = Iterator.empty
 7 |       
 8 |   lines.dropWhile(_.trim.isEmpty)
 9 | 
10 |   def nextSentences = {
11 |     val (paragraph, rest) = lines.span(!_.trim.isEmpty)
12 |     lines = rest.dropWhile(_.trim.isEmpty).buffered
13 |     sentencer.segmentTexts(paragraph.mkString(" ")).iterator.buffered
14 |   }
15 |   
16 |   def hasNext: Boolean = {
17 |     if (sentences.hasNext) {
18 |       true
19 |     }
20 |     else if (!lines.hasNext) {
21 |       false
22 |     }
23 |     else {
24 |       sentences = nextSentences
25 |       sentences.hasNext
26 |     }
27 |   }
28 |   
29 |   def next: String = {
30 |     if (sentences.hasNext) {
31 |       sentences.next()
32 |     }
33 |     else {
34 |       sentences = nextSentences
35 | 	  sentences.next()
36 |     }
37 |   }
38 | }


--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/OpenParseCli.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse
  2 | 
  3 | import java.io.{PrintWriter, File}
  4 | import java.net.URL
  5 | 
  6 | import scala.collection.Set
  7 | import scala.io.Source
  8 | 
  9 | import org.slf4j.LoggerFactory
 10 | 
 11 | import edu.knowitall.collection.immutable.graph.pattern.Match
 12 | import edu.knowitall.collection.immutable.graph.Graph
 13 | import edu.knowitall.common.Resource.using
 14 | import edu.knowitall.common.Timing
 15 | import edu.knowitall.tool.parse.MaltParser
 16 | import edu.knowitall.openparse.OpenParse.validMatch
 17 | import edu.knowitall.openparse.extract.{TemplateExtractor, PatternExtractorType, PatternExtractor, GeneralExtractor, Extraction, DetailedExtraction}
 18 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}
 19 | 
 20 | import scopt.OptionParser
 21 | 
 22 | object OpenParseCli {
 23 |   val logger = LoggerFactory.getLogger(this.getClass)
 24 | 
 25 |   abstract class Settings {
 26 |     def modelUrl: URL
 27 |     def outputFile: Option[File]
 28 |     def sentenceFile: File
 29 | 
 30 |     def confidenceThreshold: Double
 31 |     def expandArguments: Boolean
 32 |     def verbose: Boolean
 33 | 
 34 |     def parallel: Boolean
 35 |     def invincible: Boolean
 36 |   }
 37 | 
 38 |   def main(args: Array[String]) {
 39 |     object settings extends Settings {
 40 |       var modelUrl: URL = OpenParse.defaultModelUrl
 41 |       var outputFile: Option[File] = None
 42 |       var sentenceFile: File = null
 43 | 
 44 |       var confidenceThreshold = 0.0;
 45 |       var expandArguments: Boolean = true
 46 |       var verbose: Boolean = false
 47 | 
 48 |       var parallel: Boolean = false
 49 |       var invincible: Boolean = false
 50 |     }
 51 | 
 52 |     val parser = new OptionParser("openparse-cli") {
 53 |       arg("sentences", "sentence file", { path: String => 
 54 |         val file = new File(path)
 55 |         require(file.exists, "file does not exist: " + path)
 56 |         settings.sentenceFile = file
 57 |       })
 58 |       opt(Some("m"), "model", "<file>", "model file", { path: String => 
 59 |         val file = new File(path)
 60 |         require(file.exists, "file does not exist: " + path)
 61 |         settings.modelUrl = file.toURI.toURL 
 62 |       })
 63 |       doubleOpt(Some("t"), "threshold", "<threshold>", "confident threshold for shown extractions", { t: Double => settings.confidenceThreshold = t })
 64 |       opt("o", "output", "output file (otherwise stdout)", { path => settings.outputFile = Some(new File(path)) })
 65 | 
 66 |       opt("x", "expand-arguments", "expand extraction arguments", { settings.expandArguments = true })
 67 |       opt("v", "verbose", "", { settings.verbose = true })
 68 | 
 69 |       opt("p", "parallel", "", { settings.parallel = true })
 70 |       opt("invincible", "", { settings.invincible = true })
 71 |     }
 72 | 
 73 |     if (parser.parse(args)) {
 74 |       logger.info("args: " + args.mkString(" "))
 75 |       run(settings)
 76 |     }
 77 |   }
 78 | 
 79 |   def run(settings: Settings) {
 80 |     val parser = new MaltParser
 81 |     def parse(line: String): Option[DependencyGraph] = {
 82 |       Some(parser.dependencyGraph(line))
 83 |     }
 84 | 
 85 |     val other = new OpenParse.Settings {
 86 |       var modelUrl = settings.modelUrl
 87 |       var outputFile = settings.outputFile
 88 |       var sentenceFile = settings.sentenceFile 
 89 |       var confidenceThreshold = settings.confidenceThreshold 
 90 |       val duplicates = false
 91 |       var expandArguments = settings.expandArguments 
 92 |       val showAll = false
 93 |       var verbose = settings.verbose 
 94 |       val collapseVB = false
 95 |       var parallel = settings.parallel 
 96 |       var invincible = settings.invincible 
 97 |     }
 98 | 
 99 |     OpenParse.run(other, parse)
100 |   }
101 | }
102 | 


--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/gui/Dot.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse.gui
  2 | 
  3 | import edu.knowitall.openparse.extract.TemplateExtractor
  4 | import edu.knowitall.common.Resource.using
  5 | import edu.knowitall.tool.parse.graph.DependencyGraph
  6 | import edu.knowitall.tool.parse.graph.DependencyNode
  7 | import java.io.IOException
  8 | import scala.swing.Dialog
  9 | import scala.io.Source
 10 | import java.io.InputStream
 11 | import java.io.OutputStream
 12 | import java.io.PrintWriter
 13 | import java.io.File
 14 | 
 15 | /** Code pertaining to rendering and converting DOT graphs. */
 16 | object Dot {
 17 |   def dot2svg(graphvizFile: Option[File], dotgraph: String) = {
 18 |     import sys.process.ProcessIO
 19 | 
 20 |     trait InputHandler[A] {
 21 |       def handle(a: A)(input: OutputStream)
 22 |     }
 23 | 
 24 |     trait OutputHandler[A] {
 25 |       def handle(output: InputStream)
 26 |       def value: A
 27 |     }
 28 | 
 29 |     val errHandler = new OutputHandler[String] {
 30 |       var value: String = null
 31 | 
 32 |       def handle(out: InputStream) {
 33 |         value = Source.fromInputStream(out).mkString
 34 |         out.close()
 35 |       }
 36 |     }
 37 | 
 38 |     val inputHandler = new InputHandler[String] {
 39 |       def handle(a: String)(os: OutputStream) {
 40 |         val pw = new PrintWriter(os)
 41 |         pw write a
 42 |         pw.close()
 43 |       }
 44 |     }
 45 | 
 46 |     val outputHandler = new OutputHandler[String] {
 47 |       var value: String = null
 48 | 
 49 |       def handle(out: InputStream) {
 50 |         value = Source.fromInputStream(out).mkString
 51 |         out.close()
 52 |       }
 53 |     }
 54 |     val io = new ProcessIO(inputHandler.handle(dotgraph), outputHandler.handle, errHandler.handle, false)
 55 | 
 56 |     val process = graphvizFile match {
 57 |       case Some(file) => sys.process.Process(file.getAbsolutePath, Seq("-T", "svg"))
 58 |       case None => sys.process.Process("dot", Seq("-T", "svg"))
 59 |     }
 60 | 
 61 |     val proc = try (process run io)
 62 |     catch {
 63 |       case e: IOException =>
 64 |         Dialog.showMessage(message = e.getMessage() + ". You may need to install graphviz and add it to the PATH variable, or specify the path to the dot program using the '--graphviz' argument.", messageType = Dialog.Message.Error)
 65 |         throw e
 66 |     }
 67 | 
 68 |     proc.exitValue() match {
 69 |       case 0 => outputHandler.value
 70 |       case x => sys.error("Dot exited with error code: " + x + " with output:\n" + errHandler.value)
 71 |     }
 72 |   }
 73 | 
 74 |   def svg2xml(svgString: String, nodeClickEvent: String=>Unit) = {
 75 |     import org.apache.batik.dom.svg.SVGDOMImplementation;
 76 |     import org.apache.batik.util.XMLResourceDescriptor
 77 |     import org.apache.batik.dom.svg.SAXSVGDocumentFactory
 78 | 
 79 |     val uri = SVGDOMImplementation.SVG_NAMESPACE_URI;
 80 | 
 81 |     val doc = using(new java.io.StringReader(svgString)) { reader =>
 82 |       val parser = XMLResourceDescriptor.getXMLParserClassName();
 83 |       val f = new SAXSVGDocumentFactory(parser);
 84 |       f.createSVGDocument(uri, reader);
 85 |     }
 86 | 
 87 |     val gs = doc.getElementsByTagNameNS(uri, "g")
 88 |     for (i <- 0 until gs.getLength) {
 89 |       val g = gs.item(i)
 90 |       val attributes = g.getAttributes
 91 |       val clazz = attributes.getNamedItem("class").getNodeValue
 92 | 
 93 |       if (clazz == "node") {
 94 |         val children = g.getChildNodes
 95 |         for (j <- 0 until children.getLength) {
 96 |           val child = children.item(j)
 97 |           if (child.getNodeName == "title") {
 98 |             val text = child.getFirstChild.getNodeValue
 99 | 
100 |             import org.w3c.dom.events._
101 |             g.asInstanceOf[EventTarget].addEventListener("click",
102 |               new EventListener() {
103 |                 def handleEvent(e: Event) { nodeClickEvent(text) }
104 |               },
105 |               true);
106 |           }
107 |         }
108 |       }
109 |     }
110 | 
111 |     doc
112 |   }
113 | 
114 |   def dotgraph(dgraph: DependencyGraph, nodes: Set[DependencyNode]) = {
115 |     val nodeStyle = nodes.map((_, "style=filled,color=lightblue"))
116 |     dgraph.dot(dgraph.text, nodeStyle.toMap, Map.empty)
117 |   }
118 | 
119 |   def dotgraph(dgraph: DependencyGraph, extraction: ExtractionEntry) = {
120 |     def originalNodes(nodes: Iterable[DependencyNode]) = nodes.map { node =>
121 |       dgraph.nodes.find(_.indices == node.indices).get
122 |     }
123 | 
124 |     val title = "\\n" + dgraph.text + "\\n" + extraction.toString + "\\n" + extraction.`match`.pattern.toStringF((s: String) => if (s.length < 60) s else s.take(20) + "...") +
125 |       (extraction.extractor match { case ex: TemplateExtractor => "\\n" + ex.template case _ => "" })
126 | 
127 |     // nodes
128 |     val darkNodes = extraction.`match`.nodeGroups
129 |     val lightNodes = originalNodes(extraction.nodes).toSet -- originalNodes(darkNodes.map(_._2.node))
130 |     val filledNodes = (lightNodes zip Stream.continually("style=filled,fillcolor=lightgray")) ++
131 |       (darkNodes.map { nodeGroup =>
132 |         val style = "style=filled,fillcolor=" + (nodeGroup._1 match {
133 |           case "rel" => "salmon1"
134 |           case "arg1" | "arg2" => "lightblue"
135 |           case "slot0" | "slot1" | "slot2" | "slot3" => "seashell"
136 |           case _ => "yellow"
137 |         })
138 | 
139 |         (nodeGroup._2.node, style)
140 |       })
141 | 
142 |     // edges
143 |     val solidEdges = extraction.edges.toSet
144 | 
145 |     val nodeStyle = filledNodes
146 |     val edgeStyle = (solidEdges zip Stream.continually("style=filled")) ++
147 |       ((dgraph.graph.edges.toSet -- solidEdges.toSet) zip Stream.continually("style=dotted,color=gray"))
148 | 
149 |     dgraph.dot(title, nodeStyle.toMap, edgeStyle.toMap)
150 |   }
151 | }


--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/gui/ExtractionEntry.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.gui
 2 | 
 3 | import edu.knowitall.collection.immutable.graph.pattern.Match
 4 | import edu.knowitall.tool.parse.graph.DependencyNode
 5 | import edu.knowitall.openparse.extract.PatternExtractor
 6 | import edu.knowitall.openparse.extract.DetailedExtraction
 7 | 
 8 | /**
 9 |   * A more generic representation of an extraction.
10 |   *
11 |   * This is needed to allow for raw matches, which do
12 |   * not have an arg1, rel, etc.
13 |   */
14 | case class ExtractionEntry(
15 |   confidence: Option[Double],
16 |   `match`: Match[DependencyNode],
17 |   nodes: Set[DependencyNode],
18 |   extractor: PatternExtractor,
19 |   parser: Parser.ParserEnum,
20 |   string: String = "",
21 |   correct: Option[Boolean]) {
22 | 
23 |   /**
24 |     * Convenient constructor for instantiating from
25 |     * an OpenParse extraction.
26 |     */
27 |   def this(confidence: Double, extraction: DetailedExtraction, parser: Parser.ParserEnum, correct: Option[Boolean] = None) = this(Some(confidence), extraction.`match`, extraction.nodes.toSet, extraction.extractor, parser, extraction.toString, correct)
28 | 
29 |   def edges = `match`.edges
30 | 
31 |   def annotate(correct: Boolean) = this.copy(correct = Some(correct))
32 |   def unannotate = this.copy(correct = None)
33 | 
34 |   private def goldString = {
35 |     correct match {
36 |       case Some(true) => "+ "
37 |       case Some(false) => "- "
38 |       case None => ""
39 |     }
40 |   }
41 | 
42 |   override def toString = confidence.map("%1.4f:" format _).getOrElse("") + goldString + string
43 | }


--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/gui/Parser.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.gui
 2 | 
 3 | import edu.knowitall.tool.parse.DependencyParser
 4 | import edu.knowitall.tool.parse.MaltParser
 5 | import edu.knowitall.tool.parse.graph.Dependencies
 6 | import edu.knowitall.tool.parse.graph.DependencyGraph
 7 | 
 8 | /** An enumerator for parser options */
 9 | object Parser extends Enumeration {
10 |   type ParserEnum = Value
11 | 
12 |   val Deserialize = Value("Deserialize")
13 |   val Stanford = Value("Stanford")
14 |   val MaltL = Value("Malt (Linear)")
15 |   val MaltPoly = Value("Malt (Poly)")
16 | 
17 |   def default = MaltL
18 | 
19 |   def load(parserType: ParserEnum): (ParserEnum, DependencyParser) = parserType match {
20 |     case Parser.Stanford => (parserType, new edu.knowitall.tool.parse.StanfordParser)
21 |     case Parser.MaltL => (parserType, new MaltParser())
22 |     case Parser.MaltPoly => (parserType, new MaltParser(modelUrl = new java.io.File("engmalt.poly-1.7.mco").toURI.toURL))
23 |     case Parser.Deserialize => (parserType, new DependencyParser() {
24 |       override def dependencies(input: String) = Dependencies.deserialize(input)
25 |       override def dependencyGraph(input: String) = DependencyGraph.deserialize(input)
26 |     })
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/gui/Sentence.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.gui
 2 | 
 3 | import scala.util.control.Exception.catching
 4 | 
 5 | import edu.knowitall.tool.parse.graph.DependencyGraph
 6 | import edu.knowitall.tool.parse.graph.DependencyGraph.SerializationException
 7 | import edu.knowitall.tool.parse.graph.DependencyGraph.deserialize
 8 | 
 9 | /** A representation of the input sentence. */
10 | sealed abstract class Sentence
11 | object Sentence {
12 |   case class Text(text: String) extends Sentence {
13 |     override def toString = text
14 |   }
15 |   case class Graph(dgraph: DependencyGraph) extends Sentence {
16 |     override def toString = dgraph.serialize
17 |   }
18 | 
19 |   def apply(string: String): Sentence = {
20 |     import DependencyGraph._
21 | 
22 |     catching(classOf[SerializationException]).opt {
23 |       deserialize(string)
24 |     } match {
25 |       case Some(dgraph) => Graph(dgraph)
26 |       case None => Text(string)
27 |     }
28 |   }
29 | }


--------------------------------------------------------------------------------
/app/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <configuration>
 3 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <!-- encoders are assigned the type
 5 |          ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 |   <root level="info">
11 |     <appender-ref ref="STDOUT"/>
12 |   </root>
13 | </configuration>
14 | 


--------------------------------------------------------------------------------
/core/build.sbt:
--------------------------------------------------------------------------------
 1 | organization := "edu.washington.cs.knowitall.ollie"
 2 | 
 3 | name := "ollie-core"
 4 | 
 5 | description := "Wrapper and implementation for extractors of chunked sentences."
 6 | 
 7 | version := "1.0.4-SNAPSHOT"
 8 | 
 9 | crossScalaVersions := Seq("2.9.2", "2.10.1")
10 | 
11 | scalaVersion <<= crossScalaVersions { (vs: Seq[String]) => vs.head }
12 | 
13 | libraryDependencies ++= Seq(
14 |     "edu.washington.cs.knowitall.nlptools" %% "nlptools-core" % "2.4.1",
15 |     "edu.washington.cs.knowitall.nlptools" %% "nlptools-conf-breeze" % "2.4.1",
16 |     "edu.washington.cs.knowitall.nlptools" %% "nlptools-stem-morpha" % "2.4.1",
17 |     "org.slf4j" % "slf4j-api" % "1.7.2",
18 |     "org.scalaz" %% "scalaz-core" % "7.0.0",
19 |     "ch.qos.logback" % "logback-classic" % "1.0.9" % "test",
20 |     "ch.qos.logback" % "logback-core" % "1.0.9" % "test",
21 |     "junit" % "junit" % "4.11" % "test",
22 |     "org.specs2" %% "specs2" % "1.12.3" % "test")
23 | 
24 | scalacOptions ++= Seq("-unchecked", "-deprecation")
25 | 
26 | licenses := Seq("Ollie Software License Agreement" -> url("https://raw.github.com/knowitall/ollie/master/LICENSE"))
27 | 
28 | homepage := Some(url("http://ollie.cs.washington.edu"))
29 | 
30 | publishMavenStyle := true
31 | 
32 | resolvers += "Sonatype OSS Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots"
33 | 
34 | publishTo <<= version { (v: String) =>
35 |   val nexus = "https://oss.sonatype.org/"
36 |   if (v.trim.endsWith("SNAPSHOT"))
37 |     Some("snapshots" at nexus + "content/repositories/snapshots")
38 |   else
39 |     Some("releases"  at nexus + "service/local/staging/deploy/maven2")
40 | }
41 | 
42 | pomExtra := (
43 |   <scm>
44 |     <url>https://github.com/knowitall/ollie</url>
45 |     <connection>scm:git://github.com/knowitall/ollie.git</connection>
46 |     <developerConnection>scm:git:git@github.com:knowitall/ollie.git</developerConnection>
47 |     <tag>HEAD</tag>
48 |   </scm>
49 |   <developers>
50 |    <developer>
51 |       <name>Michael Schmitz</name>
52 |     </developer>
53 |     <developer>
54 |       <name>Robert Bart</name>
55 |     </developer>
56 |   </developers>)
57 | 


--------------------------------------------------------------------------------
/core/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 |   <parent>
  5 |     <groupId>edu.washington.cs.knowitall</groupId>
  6 |     <artifactId>knowitall-oss</artifactId>
  7 |     <version>1.0.2</version>
  8 |   </parent>
  9 |   <groupId>edu.washington.cs.knowitall.ollie</groupId>
 10 |   <artifactId>ollie-core_2.9.2</artifactId>
 11 |   <version>1.0.4-SNAPSHOT</version>
 12 |   <name>ollie-core</name>
 13 |   <description>Ollie is an open information extractor for binary relations.</description>
 14 |   <scm>
 15 |     <url>https://github.com/knowitall/ollie</url>
 16 |     <connection>scm:git://github.com/knowitall/ollie.git</connection>
 17 |     <developerConnection>scm:git:git@github.com:knowitall/ollie.git</developerConnection>
 18 |     <tag>HEAD</tag>
 19 |   </scm>
 20 |   <licenses>
 21 |     <license>
 22 |       <name>Ollie Software License Agreement</name>
 23 |       <url>https://raw.github.com/knowitall/ollie/master/LICENSE</url>
 24 |       <distribution>repo</distribution>
 25 |     </license>
 26 |   </licenses>
 27 |   <organization>
 28 |     <name>University of Washington CSE</name>
 29 |     <url>http://cs.washington.edu/</url>
 30 |   </organization>
 31 |   <developers>
 32 |     <developer>
 33 |       <name>Michael Schmitz</name>
 34 |     </developer>
 35 |     <developer>
 36 |       <name>Robert Bart</name>
 37 |     </developer>
 38 |   </developers>
 39 |   <inceptionYear>2012</inceptionYear>
 40 |   <properties>
 41 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 42 |     <nlptools.version>2.4.1</nlptools.version>
 43 |   </properties>
 44 |   <dependencies>
 45 |     <dependency>
 46 |       <groupId>edu.washington.cs.knowitall.nlptools</groupId>
 47 |       <artifactId>nlptools-core_2.9.2</artifactId>
 48 |       <version>${nlptools.version}</version>
 49 |     </dependency>
 50 |     <dependency>
 51 |       <groupId>edu.washington.cs.knowitall.nlptools</groupId>
 52 |       <artifactId>nlptools-stem-morpha_2.9.2</artifactId>
 53 |       <version>${nlptools.version}</version>
 54 |     </dependency>
 55 |     <dependency>
 56 |       <groupId>edu.washington.cs.knowitall.nlptools</groupId>
 57 |       <artifactId>nlptools-conf-breeze_2.9.2</artifactId>
 58 |       <version>${nlptools.version}</version>
 59 |     </dependency>
 60 |     <dependency>
 61 |       <groupId>org.scalaz</groupId>
 62 |       <artifactId>scalaz-core_2.9.2</artifactId>
 63 |       <version>7.0.0</version>
 64 |     </dependency>
 65 |     <!-- Logging -->
 66 |     <dependency>
 67 |       <groupId>org.slf4j</groupId>
 68 |       <artifactId>slf4j-api</artifactId>
 69 |       <version>1.7.2</version>
 70 |     </dependency>
 71 |     <dependency>
 72 |       <groupId>ch.qos.logback</groupId>
 73 |       <artifactId>logback-classic</artifactId>
 74 |       <version>1.0.9</version>
 75 |       <scope>test</scope>
 76 |     </dependency>
 77 |     <dependency>
 78 |       <groupId>ch.qos.logback</groupId>
 79 |       <artifactId>logback-core</artifactId>
 80 |       <version>1.0.9</version>
 81 |       <scope>test</scope>
 82 |     </dependency>
 83 |     <!-- Test -->
 84 |     <dependency>
 85 |       <groupId>junit</groupId>
 86 |       <artifactId>junit</artifactId>
 87 |       <version>4.11</version>
 88 |       <scope>test</scope>
 89 |     </dependency>
 90 |     <dependency>
 91 |       <groupId>org.specs2</groupId>
 92 |       <artifactId>specs2_2.9.2</artifactId>
 93 |       <version>1.12.3</version>
 94 |       <scope>test</scope>
 95 |     </dependency>
 96 |   </dependencies>
 97 |   <build>
 98 |     <sourceDirectory>src/main/scala</sourceDirectory>
 99 |     <testSourceDirectory>src/test/scala</testSourceDirectory>
100 |     <plugins>
101 |       <plugin>
102 |         <groupId>net.alchim31.maven</groupId>
103 |         <artifactId>scala-maven-plugin</artifactId>
104 |         <version>3.1.1</version>
105 |         <executions>
106 |           <execution>
107 |             <goals>
108 |               <goal>compile</goal>
109 |               <goal>testCompile</goal>
110 |               <goal>doc-jar</goal>
111 |             </goals>
112 |           </execution>
113 |         </executions>
114 |         <configuration>
115 |           <args>
116 |             <arg>-deprecation</arg>
117 |             <arg>-unchecked</arg>
118 |           </args>
119 |           <jvmArgs>
120 |             <jvmArg>-Xms128m</jvmArg>
121 |             <jvmArg>-Xmx1024m</jvmArg>
122 |           </jvmArgs>
123 |         </configuration>
124 |       </plugin>
125 |     </plugins>
126 |   </build>
127 | </project>
128 | 


--------------------------------------------------------------------------------
/core/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers += Resolver.url("sbt-plugin-releases", new URL("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases")) (Resolver.ivyStylePatterns)
2 | 
3 | addSbtPlugin("com.jsuereth" % "xsbt-gpg-plugin" % "0.6")
4 | 


--------------------------------------------------------------------------------
/core/scripts/applypatterns.sh:
--------------------------------------------------------------------------------
1 | # 1 -- patterns
2 | # 2 -- sentences
3 | mvn -q -e -f ../pom.xml compile exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.PatternExtractor -Dexec.args="--patterns $1 --sentences $2"
4 | 


--------------------------------------------------------------------------------
/core/scripts/build_templates.sh:
--------------------------------------------------------------------------------
1 | mkdir "$1/templates/"
2 | mvn exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.BuildTemplates -Dexec.args="$1/raw/patterned.txt $1/templates/templates.txt --reltemplates $HOME/public/read/reltemplates.txt --debug $1/templates/"
3 | 


--------------------------------------------------------------------------------
/core/scripts/create_patterns.sh:
--------------------------------------------------------------------------------
1 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.BuildTreePatterns -Dexec.args="$1/raw/parsed.txt $1/raw/patterned-all.txt -p --length 3" 2> $1/raw/patterned-all.log
2 | 


--------------------------------------------------------------------------------
/core/scripts/create_test_train.sh:
--------------------------------------------------------------------------------
1 | # 1 -- lda directory
2 | ROWS="$1/raw/patterned.txt"
3 | TEST="$1/raw/test.txt"
4 | TRAIN="$1/raw/train.txt"
5 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.lda.CreateTestSet -Dexec.args="$ROWS $TEST $TRAIN"
6 | 
7 | 


--------------------------------------------------------------------------------
/core/scripts/extractor.sh:
--------------------------------------------------------------------------------
1 | echo "$*"
2 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.OpenParse -Dexec.args="$*"
3 | 


--------------------------------------------------------------------------------
/core/scripts/keep_common_patterns.sh:
--------------------------------------------------------------------------------
1 | # 1 -- lda directory
2 | cut -f5 "$1/raw/patterned-all.txt" | sort | uniq -c | sort -nr > "$1/raw/patterns.txt"
3 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.KeepCommonPatterns -Dexec.args="$1/raw/patterned-all.txt 10" > "$1/raw/patterned.txt"
4 | 


--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/ollie/cognitiveWords.txt:
--------------------------------------------------------------------------------
 1 | accept
 2 | admit
 3 | affirm
 4 | aim
 5 | allow
 6 | apprehend
 7 | assert
 8 | attest
 9 | aver
10 | avouch
11 | avow
12 | believe
13 | claim
14 | comprehend
15 | confirm
16 | conjecture
17 | consider
18 | contend
19 | define
20 | deny
21 | describe
22 | discover
23 | doubt
24 | dream
25 | envisage
26 | expect
27 | fathom
28 | feel
29 | follow
30 | foreknow
31 | foresee
32 | foretell
33 | grant
34 | grasp
35 | guarantee
36 | guess
37 | hold
38 | hope
39 | identify
40 | imagine
41 | infer
42 | intend
43 | know
44 | maintain
45 | mean
46 | misapprehend
47 | misconstrue
48 | misinterpret
49 | misunderstand
50 | observe
51 | plan
52 | portray
53 | presume
54 | prophesy
55 | propose
56 | reaffirm
57 | realize
58 | recognize
59 | recollect
60 | remember
61 | report
62 | represent
63 | repute
64 | reveal
65 | see
66 | show
67 | speculate
68 | suppose
69 | surmise
70 | suspect
71 | swear
72 | think
73 | trust
74 | understand
75 | vaticinate
76 | visualize
77 | wish
78 | yen


--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/ollie/communicationWords.txt:
--------------------------------------------------------------------------------
 1 | acknowledge
 2 | acquaint
 3 | add
 4 | advise
 5 | affirm
 6 | allege
 7 | announce
 8 | apprise
 9 | articulate
10 | believe
11 | blab
12 | blurt
13 | claim
14 | comment
15 | communicate
16 | confess
17 | confide
18 | confirm
19 | consider
20 | convey
21 | corroborate
22 | declare
23 | deem
24 | demonstrate
25 | disclose
26 | divulge
27 | elaborate
28 | elucidate
29 | establish
30 | esteem
31 | exclaim
32 | explain
33 | explicate
34 | expound
35 | feel
36 | illustrate
37 | imagine
38 | inform
39 | insinuate
40 | insist
41 | intimate
42 | justify
43 | know
44 | leak
45 | lecture
46 | mention
47 | moralize
48 | narrate
49 | note
50 | notify
51 | observe
52 | pose
53 | preach
54 | proclaim
55 | promulgate
56 | propose
57 | prove
58 | rant
59 | rate
60 | read
61 | reaffirm
62 | recite
63 | reckon
64 | recount
65 | reiterate
66 | relate
67 | relay
68 | remark
69 | remember
70 | remind
71 | repeat
72 | reply
73 | report
74 | respond
75 | retort
76 | reveal
77 | say
78 | see
79 | show
80 | sniff
81 | speak
82 | state
83 | suppose
84 | suspect
85 | talk
86 | teach
87 | tell
88 | testify
89 | theorize
90 | think
91 | update
92 | utter
93 | venture
94 | verify
95 | view
96 | voice
97 | write


--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/ollie/confidence/default-classifier.txt:
--------------------------------------------------------------------------------
 1 | args start and end with noun	0.030924657084179144
 2 | rel ends with of	0.1013506657501542
 3 | arg1 contains pronoun	0.19630801348782667
 4 | arg2 contains pronoun	-0.13341646099789348
 5 | long relation	-0.3547145229191737
 6 | gap of 10 in rel	-0.34306426484946456
 7 | vacuous extraction	-0.6389807893982924
 8 | nn edges in pattern	0.9130032848389
 9 | arg1 is proper	0.07933280909554899
10 | Intercept	0.0
11 | sentence begins with arg1	-0.1588407285556643
12 | if right before arg1	-1.2206208992816086
13 | arg2 is proper	-0.04306420146120506
14 | arg2 borders appositive	-0.0017006187220647805
15 | rel contains gerund	-0.26200297625650837
16 | arg1 borders appositive	-0.13448972417475485
17 | noun-verb-noun in arg1	0.0
18 | prep right after arg2	0.19212879336967245
19 | prep in arg2	0.16539493294341892
20 | arg2 contains infinitive	-0.0
21 | prep mismatch in pattern	-0.20092201136389673
22 | sentence is imperative	0.11745202578145564
23 | hyp words in rel	-0.1449927441123399
24 | sentence ends with arg2	0.11610654106632967
25 | noun-verb-noun in arg2	0.07217080739835992
26 | rel is contiguous	0.12562188545360878
27 | non-contiguous rel	-0.1849662870655201
28 | semantic constraints in pattern	-0.4343558913425681
29 | openparse confidence	0.43411514029724824
30 | arg1 bad characters	-0.40339032821185783
31 | sentence starts with extraction	0.18854224217974247
32 | arg2 bad characters	-0.009939551407472108
33 | rel contains verb	0.4757113580400253
34 | rel starts with be	0.0
35 | prep right before arg1	-0.2350155331052106
36 | sentence has question mark	0.0
37 | arg2 before arg1	-0.35791735399208685
38 | arg2 before rel	-0.023882392179128745
39 | rel bad characters	-0.11794120943690224
40 | 


--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/ollie/prefixWords.txt:
--------------------------------------------------------------------------------
 1 | after
 2 | although
 3 | because
 4 | before
 5 | but
 6 | however
 7 | if
 8 | once
 9 | that
10 | though
11 | when
12 | whenever
13 | whether
14 | where
15 | while
16 | would


--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/openparse/categories/location.txt:
--------------------------------------------------------------------------------
  1 | abbacy
  2 | abode
  3 | abutment
  4 | abysm
  5 | abyss
  6 | acme
  7 | addition
  8 | address
  9 | aerie
 10 | aerospace
 11 | aery
 12 | aim
 13 | air
 14 | airhead
 15 | airspace
 16 | airway
 17 | ambiance
 18 | ambience
 19 | anchorage
 20 | angle
 21 | anomaly
 22 | antapex
 23 | antinode
 24 | antipodes
 25 | aperture
 26 | apex
 27 | aphelion
 28 | apoapsis
 29 | apogee
 30 | apojove
 31 | apolune
 32 | aposelene
 33 | approach
 34 | apron
 35 | archbishopric
 36 | archdeaconry
 37 | archdiocese
 38 | archduchy
 39 | area
 40 | arena
 41 | arrowhead
 42 | ashram
 43 | asthenosphere
 44 | atelier
 45 | atmosphere
 46 | axil
 47 | axis
 48 | azimuth
 49 | back
 50 | backside
 51 | backwater
 52 | backwoods
 53 | backyard
 54 | bailiwick
 55 | bakehouse
 56 | bakery
 57 | bakeshop
 58 | barb
 59 | barony
 60 | barren
 61 | barrio
 62 | barycenter
 63 | base
 64 | basin
 65 | battlefield
 66 | battlefront
 67 | battleground
 68 | beachhead
 69 | beak
 70 | bearing
 71 | beat
 72 | bed
 73 | bedground
 74 | bedside
 75 | beehive
 76 | beeline
 77 | beginning
 78 | belly
 79 | bellybutton
 80 | belt
 81 | bent
 82 | benthos
 83 | berm
 84 | berth
 85 | bight
 86 | bilge
 87 | bilges
 88 | bilocation
 89 | bindery
 90 | biosphere
 91 | birthplace
 92 | bishopric
 93 | bitthead
 94 | bivouac
 95 | block
 96 | boatyard
 97 | bookbindery
 98 | boondocks
 99 | border
100 | borderland
101 | borderline
102 | borough
103 | bottom
104 | bound
105 | boundary
106 | bounds
107 | bourn
108 | bourne
109 | bowels
110 | breach
111 | breadbasket
112 | break
113 | brickfield
114 | brickyard
115 | bridgehead
116 | brink
117 | brokerage
118 | brow
119 | buffer
120 | bull
121 | burg
122 | bush
123 | cabstand
124 | caliphate
125 | cambium
126 | camp
127 | campground
128 | campong
129 | campsite
130 | campus
131 | canthus
132 | canton
133 | cap
134 | capital
135 | capitulum
136 | carrefour
137 | casbah
138 | cavern
139 | cavity
140 | cell
141 | cemetery
142 | center
143 | centerfield
144 | central
145 | centre
146 | centrex
147 | centroid
148 | chap
149 | chapiter
150 | charnel
151 | chasm
152 | checkpoint
153 | chink
154 | chokepoint
155 | chromosphere
156 | churchyard
157 | circle
158 | circuit
159 | circumference
160 | circus
161 | city
162 | clear
163 | clearing
164 | cleft
165 | cloverleaf
166 | coalfield
167 | coastline
168 | col
169 | colliery
170 | colony
171 | columbarium
172 | common
173 | commons
174 | commonwealth
175 | commune
176 | community
177 | compartment
178 | confluence
179 | conurbation
180 | core
181 | corium
182 | corncob
183 | corner
184 | corneum
185 | cornfield
186 | country
187 | countryside
188 | county
189 | course
190 | court
191 | cowtown
192 | crack
193 | cradle
194 | cranny
195 | crawlspace
196 | creamery
197 | crenel
198 | crenelle
199 | crest
200 | crevasse
201 | crevice
202 | crinion
203 | croft
204 | crosscut
205 | crossing
206 | crossroad
207 | crossway
208 | crotch
209 | crown
210 | crud
211 | crust
212 | crypt
213 | cubbyhole
214 | culmination
215 | curtilage
216 | cusp
217 | cuticle
218 | dairy
219 | danger
220 | dark
221 | darkness
222 | dateline
223 | dec
224 | declination
225 | defile
226 | delimitation
227 | demarcation
228 | demesne
229 | den
230 | department
231 | dependency
232 | depth
233 | derivation
234 | derma
235 | dermis
236 | desert
237 | desktop
238 | destination
239 | determinant
240 | development
241 | diamond
242 | diastema
243 | dig
244 | diocese
245 | dip
246 | direction
247 | distance
248 | district
249 | divide
250 | dockside
251 | dockyard
252 | dogleg
253 | domain
254 | domicile
255 | dominion
256 | dooryard
257 | downtown
258 | drop
259 | duchy
260 | dukedom
261 | dump
262 | dumpsite
263 | earldom
264 | earreach
265 | earshot
266 | earth
267 | east
268 | ecliptic
269 | edge
270 | edging
271 | element
272 | emirate
273 | empire
274 | emptiness
275 | empyrean
276 | encampment
277 | enclave
278 | enclosure
279 | end
280 | endpoint
281 | entrepot
282 | environment
283 | environs
284 | eparchy
285 | epicenter
286 | epicentre
287 | epidermis
288 | episcopate
289 | epitope
290 | equator
291 | equinoctial
292 | equinox
293 | exaltation
294 | exarchate
295 | excavation
296 | exchange
297 | exosphere
298 | expanse
299 | exterior
300 | extreme
301 | extremity
302 | extremum
303 | exurbia
304 | eye
305 | eyeshot
306 | eyrie
307 | eyry
308 | face
309 | fairground
310 | fairway
311 | farm
312 | farmland
313 | farmplace
314 | farmstead
315 | fatherland
316 | faubourg
317 | fault
318 | faulting
319 | fiefdom
320 | field
321 | fingertip
322 | finish
323 | firebreak
324 | fireguard
325 | fireside
326 | firmament
327 | fishery
328 | fissure
329 | flies
330 | floor
331 | flowerbed
332 | fluke
333 | flyway
334 | focus
335 | foot
336 | foothold
337 | foramen
338 | forefront
339 | forepart
340 | forge
341 | fork
342 | fountainhead
343 | fracture
344 | fringe
345 | front
346 | frontier
347 | funfair
348 | gaff
349 | gap
350 | garden
351 | gasfield
352 | gasworks
353 | geosphere
354 | ghetto
355 | glade
356 | glassworks
357 | goal
358 | goldfield
359 | gorge
360 | grainfield
361 | grange
362 | grassland
363 | grave
364 | graveyard
365 | green
366 | greenbelt
367 | greenway
368 | gridiron
369 | ground
370 | grounds
371 | grove
372 | gulf
373 | habitat
374 | habitation
375 | hairline
376 | hamlet
377 | hand
378 | hangout
379 | harbor
380 | harborage
381 | harbour
382 | harbourage
383 | hatchery
384 | haunt
385 | haven
386 | hayfield
387 | head
388 | heading
389 | headspring
390 | headwater
391 | hearing
392 | heart
393 | hearth
394 | heartland
395 | heath
396 | heathland
397 | heaven
398 | heavens
399 | heel
400 | heights
401 | heliopause
402 | heliosphere
403 | hell
404 | hellhole
405 | hem
406 | hemisphere
407 | hemline
408 | here
409 | heronry
410 | hiatus
411 | hideaway
412 | hideout
413 | high
414 | hilltop
415 | hilum
416 | hinterland
417 | hip
418 | hipline
419 | hole
420 | hollow
421 | holy
422 | home
423 | homeland
424 | hometown
425 | horizon
426 | horst
427 | hotbed
428 | hotspot
429 | house
430 | hub
431 | hydathode
432 | hydrosphere
433 | imperium
434 | inclination
435 | inferno
436 | infield
437 | innersole
438 | inside
439 | insole
440 | interchange
441 | interface
442 | interior
443 | intersection
444 | ionosphere
445 | ironworks
446 | irredenta
447 | irridenta
448 | isarithm
449 | island
450 | isobar
451 | isochrone
452 | isoclinal
453 | isogone
454 | isogram
455 | isohel
456 | isopleth
457 | isotherm
458 | itinerary
459 | job
460 | junction
461 | jungle
462 | junkyard
463 | jurisdiction
464 | justiciary
465 | juxtaposition
466 | kampong
467 | kasbah
468 | key
469 | khanate
470 | kingdom
471 | knothole
472 | kraal
473 | lab
474 | laboratory
475 | lair
476 | land
477 | landmark
478 | landscape
479 | landscaping
480 | latitude
481 | launderette
482 | laundry
483 | lawn
484 | layer
485 | lea
486 | lead
487 | leak
488 | lee
489 | leeward
490 | left
491 | leftfield
492 | lenticel
493 | ley
494 | lie
495 | light
496 | limb
497 | limit
498 | line
499 | lineation
500 | lithosphere
501 | locale
502 | locality
503 | location
504 | locus
505 | longitude
506 | lookout
507 | lot
508 | loxodrome
509 | luff
510 | lumberyard
511 | mandate
512 | mandatory
513 | mansion
514 | mantle
515 | march
516 | marchland
517 | mare
518 | maria
519 | mastaba
520 | mastabah
521 | masthead
522 | matrix
523 | mausoleum
524 | maximum
525 | meadow
526 | mecca
527 | medina
528 | medium
529 | meeting
530 | megalopolis
531 | meridian
532 | mesosphere
533 | mete
534 | metropolis
535 | micropyle
536 | midair
537 | midden
538 | middle
539 | midfield
540 | midland
541 | midpoint
542 | midst
543 | midstream
544 | midway
545 | minefield
546 | minimum
547 | molding
548 | monument
549 | moorage
550 | mooring
551 | motherland
552 | moulding
553 | mouth
554 | municipality
555 | nadir
556 | nape
557 | navel
558 | necropolis
559 | neighborhood
560 | neighbourhood
561 | nest
562 | nib
563 | nidus
564 | nirvana
565 | node
566 | nombril
567 | nook
568 | north
569 | northeast
570 | northland
571 | northwest
572 | notch
573 | nucha
574 | nucleus
575 | oasis
576 | occident
577 | oilfield
578 | omphalos
579 | omphalus
580 | open
581 | opening
582 | orbit
583 | orchard
584 | orient
585 | origin
586 | orphrey
587 | outback
588 | outdoors
589 | outfield
590 | outline
591 | outport
592 | outpost
593 | outside
594 | outskirt
595 | outskirts
596 | outsole
597 | outstation
598 | overhead
599 | overlook
600 | ozonosphere
601 | paddy
602 | paint
603 | palaestra
604 | palate
605 | palatinate
606 | palestra
607 | pallium
608 | pampas
609 | panhandle
610 | paradise
611 | parallel
612 | parcel
613 | paries
614 | parish
615 | park
616 | parkland
617 | part
618 | parterre
619 | parting
620 | parts
621 | pass
622 | pasture
623 | pastureland
624 | patch
625 | patchboard
626 | pate
627 | path
628 | patisserie
629 | patriarchate
630 | pattern
631 | peak
632 | penetralia
633 | perch
634 | perforation
635 | periapsis
636 | perigee
637 | perigon
638 | perihelion
639 | perijove
640 | perilune
641 | periselene
642 | pesthole
643 | photosphere
644 | piazza
645 | pigeonhole
646 | piggery
647 | pike
648 | pinnacle
649 | pinpoint
650 | piscary
651 | piste
652 | pit
653 | pitch
654 | place
655 | plantation
656 | plate
657 | playground
658 | plaza
659 | pleasance
660 | plot
661 | plugboard
662 | pocket
663 | point
664 | pole
665 | poll
666 | polls
667 | pool
668 | pore
669 | port
670 | position
671 | possession
672 | post
673 | pottery
674 | pouch
675 | prairie
676 | precinct
677 | prefecture
678 | premises
679 | presence
680 | preserve
681 | princedom
682 | principality
683 | property
684 | proprioceptor
685 | protectorate
686 | provenance
687 | provenience
688 | province
689 | proximity
690 | puddle
691 | pueblo
692 | punctum
693 | pupil
694 | purlieu
695 | qibla
696 | quadrant
697 | quarter
698 | radius
699 | railhead
700 | railyard
701 | ranch
702 | range
703 | rathole
704 | reach
705 | realm
706 | rear
707 | rearward
708 | refuge
709 | region
710 | rendezvous
711 | rent
712 | repair
713 | repository
714 | reservation
715 | reserve
716 | residence
717 | resort
718 | retreat
719 | rhumb
720 | rift
721 | right
722 | rightfield
723 | rip
724 | roads
725 | roadside
726 | roadstead
727 | rockery
728 | rooftop
729 | rookery
730 | root
731 | rootage
732 | ropewalk
733 | rotary
734 | rough
735 | round
736 | roundabout
737 | roundhouse
738 | route
739 | sac
740 | sack
741 | saddle
742 | saddleback
743 | saddlery
744 | safety
745 | sanctuary
746 | sanctum
747 | sandlot
748 | savanna
749 | savannah
750 | scenario
751 | scene
752 | scenery
753 | schoolyard
754 | scissure
755 | scour
756 | scrapheap
757 | scrubland
758 | scruff
759 | seafront
760 | seam
761 | seaport
762 | seascape
763 | seat
764 | section
765 | sector
766 | see
767 | seedbed
768 | selvage
769 | selvedge
770 | semidesert
771 | semitropics
772 | separation
773 | sepulcher
774 | sepulchre
775 | sepulture
776 | setting
777 | settlement
778 | shadow
779 | shantytown
780 | sheeprun
781 | sheepwalk
782 | sheet
783 | sheikdom
784 | sheikhdom
785 | shift
786 | shipside
787 | shipyard
788 | shire
789 | shop
790 | shoreline
791 | short
792 | shoulder
793 | showplace
794 | shrubbery
795 | side
796 | sign
797 | silhouette
798 | site
799 | situation
800 | skyline
801 | skyway
802 | slack
803 | slip
804 | slit
805 | slot
806 | slum
807 | smithy
808 | snag
809 | snow
810 | sodom
811 | soil
812 | sole
813 | solitude
814 | somewhere
815 | source
816 | south
817 | southeast
818 | southland
819 | southwest
820 | spa
821 | space
822 | spearhead
823 | spearpoint
824 | sphere
825 | spike
826 | split
827 | spoor
828 | spot
829 | sprawl
830 | spread
831 | spring
832 | square
833 | stage
834 | stand
835 | state
836 | station
837 | steps
838 | stoma
839 | stomate
840 | stop
841 | stopover
842 | stratosphere
843 | stratum
844 | stretch
845 | studio
846 | subdivision
847 | substrate
848 | substratum
849 | subtopia
850 | subtropics
851 | suburb
852 | suburbia
853 | sultanate
854 | summit
855 | superstrate
856 | superstratum
857 | surface
858 | surround
859 | surroundings
860 | suzerainty
861 | swath
862 | switchboard
863 | tack
864 | tannery
865 | tape
866 | target
867 | taxistand
868 | tear
869 | tee
870 | telomere
871 | tendency
872 | tenderloin
873 | terminal
874 | termination
875 | terminus
876 | terrain
877 | terreplein
878 | territory
879 | theater
880 | theatre
881 | there
882 | thermosphere
883 | thick
884 | tiltyard
885 | timberline
886 | tip
887 | tiptoe
888 | tiptop
889 | tomb
890 | tonsure
891 | top
892 | topiary
893 | town
894 | township
895 | track
896 | tract
897 | trail
898 | trailhead
899 | treetop
900 | trend
901 | trichion
902 | tropic
903 | tropics
904 | tropopause
905 | troposphere
906 | trusteeship
907 | turf
908 | turnery
909 | umbilicus
910 | underbelly
911 | underside
912 | undersurface
913 | unknown
914 | upside
915 | uptown
916 | vacancy
917 | vacuity
918 | vacuum
919 | vantage
920 | variation
921 | vault
922 | veld
923 | veldt
924 | vent
925 | venue
926 | verge
927 | vertex
928 | viceroyalty
929 | vicinity
930 | view
931 | viewpoint
932 | village
933 | vinery
934 | vineyard
935 | viscounty
936 | void
937 | volcano
938 | wall
939 | ward
940 | warren
941 | washhouse
942 | waste
943 | wasteland
944 | wasteyard
945 | waterfront
946 | waterline
947 | watermark
948 | watershed
949 | waterworks
950 | wavefront
951 | way
952 | wayside
953 | weald
954 | wedge
955 | welkin
956 | wellhead
957 | wellspring
958 | west
959 | wheatfield
960 | whereabouts
961 | wild
962 | wilderness
963 | window
964 | windward
965 | wing
966 | wire
967 | wold
968 | woodlet
969 | work
970 | workplace
971 | workshop
972 | workspace
973 | yard
974 | yardarm
975 | zenith
976 | zodiac
977 | zone
978 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/common/enrich/Traversable.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall
 2 | package common
 3 | package enrich
 4 | 
 5 | import edu.knowitall.collection.immutable.Bag
 6 | 
 7 | import scalaz._
 8 | import Scalaz._
 9 | import Monoid._
10 | 
11 | /**
12 |  * Enrichments for traversables.
13 |  *
14 |  * @author  Michael Schmitz
15 |  */
16 | object Traversables {
17 |   implicit def traversableOnceTo[T](as: TraversableOnce[T]): SuperTraversableOnce[T] = new SuperTraversableOnce[T](as)
18 | 
19 |   implicit def traversableOncePairIntTo[T](as: TraversableOnce[(T, Int)]): SuperTraversableOncePairInt[T] = new SuperTraversableOncePairInt[T](as)
20 | 
21 |   implicit def traversableOncePairTo[T, U](as: TraversableOnce[(T, U)]): SuperTraversableOncePair[T, U] = new SuperTraversableOncePair[T, U](as)
22 | }
23 | 
24 | sealed class SuperTraversableOnce[T](value: TraversableOnce[T]) {
25 |   def histogram: Map[T, Int] = {
26 |     value.foldLeft(Map[T, Int]()) { (m, c) =>
27 |       m.updated(c, m.getOrElse(c, 0) + 1)
28 |     }
29 |   }
30 | }
31 | 
32 | sealed class SuperTraversableOncePairInt[T](value: TraversableOnce[(T, Int)]) {
33 |   import Traversables._
34 |   def mergeHistograms: Map[T, Int] = value.mergeKeys(_ + _)
35 | }
36 | 
37 | sealed class SuperTraversableOncePair[T, U](value: TraversableOnce[(T, U)]) {
38 |   def mergeKeys(implicit mon: Semigroup[U]): Map[T, U] = {
39 |     value.foldLeft(Map[T, U]()) {
40 |       case (map, (k, v)) =>
41 |         map + (k -> (map.get(k).map(_ |+| v).getOrElse(v)))
42 |     }
43 |   }
44 | 
45 |   def mergeKeys[F[_]](implicit monoid: Monoid[F[U]]): Map[T, F[U]] = {
46 |     value.foldLeft(Map[T, F[U]]()) {
47 |       case (map, (k, v)) =>
48 |         val pure = monoid.zero
49 |         map + (k -> (map.get(k).map(_ |+| pure).getOrElse(pure)))
50 |     }
51 |   }
52 | 
53 |   def mergeKeys(merge: (U, U) => U): Map[T, U] = {
54 |     value.foldLeft(Map[T, U]()) {
55 |       case (map, (k, v)) =>
56 |         map + (k -> map.get(k).map(merge(_, v)).getOrElse(v))
57 |     }
58 |   }
59 | 
60 |   def toListMultimap: Map[T, List[U]] = {
61 |     value.foldLeft(Map[T, List[U]]().withDefaultValue(List.empty[U])) {
62 |       case (map, (k, v)) =>
63 |         map + (k -> (v :: map(k)))
64 |     }
65 |   }
66 | 
67 |   def toSetMultimap: Map[T, Set[U]] = {
68 |     value.foldLeft(Map[T, Set[U]]().withDefaultValue(Set.empty[U])) {
69 |       case (map, (k, v)) =>
70 |         map + (k -> (map(k) + v))
71 |     }
72 |   }
73 | 
74 |   def toBagMultimap: Map[T, Bag[U]] = {
75 |     value.foldLeft(Map[T, Bag[U]]().withDefaultValue(Bag.empty[U])) {
76 |       case (map, (k, v)) =>
77 |         val bag = map(k)
78 |         map + (k -> (bag + v))
79 |     }
80 |   }
81 | }
82 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/DependencyGraphExtras.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.ollie
  2 | 
  3 | import edu.knowitall.tool.parse.graph.DependencyGraph
  4 | import edu.knowitall.tool.parse.graph.Dependency
  5 | import edu.knowitall.tool.parse.graph.DependencyNode
  6 | import edu.knowitall.collection.immutable.Interval
  7 | import edu.knowitall.tool.tokenize.Tokenizer
  8 | import edu.knowitall.collection.immutable.graph.Graph
  9 | 
 10 | class DependencyGraphExtras(dgraph: DependencyGraph) {
 11 |   private def graph = dgraph.graph
 12 | 
 13 |   def passiveVoice: Iterable[DependencyGraph] = {
 14 |     require(dgraph.nodes forall (_.indices.length == 1))
 15 | 
 16 |     // look for passive constructions
 17 |     val activeVoices = this.graph.vertices.filter { v =>
 18 |       (v.postag startsWith "VB") &&
 19 |       (dgraph.dependencies exists {edge => edge.label == "nsubj" && edge.source == v}) &&
 20 |       (dgraph.dependencies exists {edge => edge.label == "dobj" && edge.source == v})
 21 |     }
 22 | 
 23 |     activeVoices map { v =>
 24 |       val nsubj = dgraph.dependencies.find(edge => edge.label == "nsubj" && edge.source == v).get
 25 |       val dobj = dgraph.dependencies.find(edge => edge.label == "dobj" && edge.source == v).get
 26 |       val nsubjInterval = Interval.span(dgraph.graph.inferiors(nsubj.dest).map(_.indices))
 27 |       val dobjInterval = Interval.span(dgraph.graph.inferiors(dobj.dest).map(_.indices))
 28 | 
 29 |       val nsubjpass = new Dependency(v, dobj.dest, "nsubjpass")
 30 | 
 31 |       val by = new DependencyNode("by", "IN", dobjInterval.start, -1)
 32 |       val prep = new Dependency(v, by, "prep")
 33 |       val pobj = new Dependency(by, nsubj.dest, "pobj")
 34 |       val was = new DependencyNode("was", "VBD", v.indices.start, -1)
 35 |       val auxpass = new Dependency(nsubj.source, was, "auxpass")
 36 | 
 37 |       // adjust the edges
 38 |       var edges: Iterable[Dependency] = dgraph.dependencies
 39 |       edges = edges.toSet - nsubj - dobj + prep + pobj + auxpass + nsubjpass
 40 |       // adjust for the "by" node
 41 |       def nodeMap = { (v: DependencyNode) =>
 42 |         var interval = v.indices
 43 |         if (v.indices.start >= by.indices.start && v != by) interval = DependencyGraphExtras.shift(interval, 1)
 44 |         if (v.indices.start >= was.indices.start && v != was) interval = DependencyGraphExtras.shift(interval, 1)
 45 |         new DependencyNode(v.text, v.postag, interval, v.offset)
 46 |       }
 47 |       edges = edges.map { e => e mapNodes nodeMap }
 48 | 
 49 |       edges = DependencyGraphExtras.swapOrders(edges, graph.inferiors(nsubj.dest) map nodeMap, graph.inferiors(dobj.dest) map nodeMap)
 50 | 
 51 |       // create the new graph
 52 |       val newGraph = new DependencyGraph(edges.flatMap(_.vertices), edges)
 53 |       val text = newGraph.nodes.iterator.map(_.text).mkString(" ")
 54 | 
 55 |       // compute the correct offsets
 56 |       val offsets = Tokenizer.computeOffsets(newGraph.nodes.iterator.map(_.text).toList, text)
 57 |       val nodeOffsetTransformation =
 58 |         ((newGraph.graph.vertices.iterator zip offsets.iterator) map {case (node, token) => node -> new DependencyNode(node.text, node.postag, node.indices, token.offset)}).toMap
 59 | 
 60 |       newGraph map nodeOffsetTransformation
 61 |     }
 62 |   }
 63 | 
 64 |   def activeVoice: Iterable[DependencyGraph] = {
 65 |     require(dgraph.nodes forall (_.indices.length == 1))
 66 | 
 67 |     // look for active constructions
 68 |     val passiveVoices = this.graph.vertices.filter { v =>
 69 |       if (!(v.postag startsWith "VB") &&
 70 |           (dgraph.dependencies exists {edge => edge.label == "nsubjpass" && edge.source == v}) &&
 71 |           (dgraph.dependencies exists (edge => edge.label == "auxpass" && edge.source == v)))
 72 |         false
 73 |       else {
 74 |         dgraph.dependencies.find(e => e.label == "prep" && e.source == v && e.dest.text == "by") match {
 75 |           case None => false
 76 |           case Some(prep) => dgraph.dependencies.exists(e => e.source == prep.dest && e.label == "pobj")
 77 |         }
 78 |       }
 79 |       (dgraph.dependencies exists {edge => edge.label == "prep" && edge.source == v})
 80 |     }
 81 | 
 82 |     passiveVoices map { v =>
 83 |       val nsubjpass = dgraph.dependencies.find(edge => edge.label == "nsubjpass" && edge.source == v).get
 84 |       val prep = dgraph.dependencies.find(edge => edge.label == "prep" && edge.source == v && edge.dest.text == "by" && dgraph.dependencies.exists(e => e.source == edge.dest && e.label == "pobj")).get
 85 |       val pobj = dgraph.dependencies.find(edge => edge.label == "pobj" && edge.source == prep.dest).get
 86 |       val auxpass = dgraph.dependencies.find(edge => edge.label == "auxpass" && edge.source == v).get
 87 | 
 88 |       val nsubj = new Dependency(v, pobj.dest, "nsubj")
 89 |       val dobj = new Dependency(v, nsubjpass.dest, "dobj")
 90 | 
 91 |       // adjust the edges
 92 |       var edges: Iterable[Dependency] = dgraph.dependencies
 93 |       edges = edges.toSet - nsubjpass - auxpass - prep - pobj + nsubj + dobj
 94 |       edges = DependencyGraphExtras.swapOrders(edges, graph.inferiors(nsubjpass.dest), graph.inferiors(pobj.dest))
 95 | 
 96 |       val nodes = scala.collection.immutable.SortedSet.empty[DependencyNode] ++ edges.flatMap(_.nodes)
 97 |       val nodeMap = nodes.iterator.zipWithIndex.map{case (node, i) => node -> new DependencyNode(node.text, node.postag, Interval.singleton(i), -1)}.toMap
 98 |       edges = edges.map(_ mapNodes nodeMap)
 99 | 
100 |       // create the new graph
101 |       val newGraph = new DependencyGraph(edges.flatMap(_.vertices), edges)
102 |       val text = newGraph.nodes.iterator.map(_.text).mkString(" ")
103 | 
104 |       // compute the correct offsets
105 |       val offsets = Tokenizer.computeOffsets(newGraph.nodes.iterator.map(_.text).toList, text)
106 |       val nodeOffsetTransformation =
107 |         ((newGraph.graph.vertices.iterator zip offsets.iterator) map {case (node, token) => node -> new DependencyNode(node.text, node.postag, node.indices, token.offset)}).toMap
108 | 
109 |       newGraph map nodeOffsetTransformation
110 |     }
111 |   }
112 | 
113 |   def switchVoice: Iterable[DependencyGraph] = {
114 |     passiveVoice ++ activeVoice
115 |   }
116 | }
117 | 
118 | object DependencyGraphExtras {
119 |   private def shift(interval: Interval, by: Int) = Interval.open(interval.start + by, interval.end + by)
120 | 
121 |   private def swapOrders(edges: Iterable[Dependency], left: scala.collection.Set[DependencyNode], right: scala.collection.Set[DependencyNode]) = {
122 |     val leftInterval = Interval.span(left.map(_.indices))
123 |     val rightInterval = Interval.span(right.map(_.indices))
124 | 
125 |     require(leftInterval.end <= rightInterval.start)
126 | 
127 |     val leftOffset = left.iterator.map(_.offset).max
128 |     val rightOffset = right.iterator.map(_.offset).min
129 | 
130 |     val tokensBetween = rightInterval.start - leftInterval.end + 1
131 |     val charsBetween = rightOffset - leftOffset
132 | 
133 |     edges.map(e => e.mapNodes(v =>
134 |       if (left contains v) new DependencyNode(v.text, v.postag, DependencyGraphExtras.shift(v.indices, tokensBetween), v.offset + charsBetween)
135 |       else if (right contains v) new DependencyNode(v.text, v.postag, DependencyGraphExtras.shift(v.indices, -tokensBetween), v.offset - charsBetween)
136 |       else v))
137 |   }
138 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/NaryExtraction.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.ollie
  2 | 
  3 | import scala.Option.option2Iterable
  4 | import scala.collection.SortedSet
  5 | import scala.collection.immutable
  6 | 
  7 | import edu.knowitall.collection.immutable.Interval
  8 | import edu.knowitall.openparse.extract.DetailedExtraction
  9 | import edu.knowitall.openparse.extract.Extraction
 10 | import edu.knowitall.openparse.extract.Extraction.AdverbialModifier
 11 | import edu.knowitall.openparse.extract.Extraction.ClausalComponent
 12 | import edu.knowitall.tool.parse.graph.DependencyNode
 13 | import edu.knowitall.tool.postag.Postagger
 14 | 
 15 | /**
 16 |  * Represents a part {arg1, rel, arg2} of an extraction.
 17 |  *
 18 |  * @param  string  the representation of the part
 19 |  * @param  interval  the interval of the part in the source sentence
 20 |  */
 21 | class ExtractionPart(val string: String, val interval: Interval) extends Ordered[ExtractionPart] {
 22 |   override def compare(that: ExtractionPart) =
 23 |     this.interval compare that.interval
 24 | 
 25 |   override def toString = string.replaceAll("/", "")
 26 | }
 27 | 
 28 | /**
 29 |  * Represents a possible suffix for an extended extraction.
 30 |  * For example, in the sentence "He ate from 7 until 10."
 31 |  * there are two suffixes: "from 7" and "until 10".
 32 |  *
 33 |  * @param  string  the text of the suffix
 34 |  * @param  interval  the interval of the suffix in the source sentence
 35 |  * @param  confidence  the confidence of the suffix
 36 |  */
 37 | class Suffix(
 38 |   text: String,
 39 |   nodes: SortedSet[DependencyNode],
 40 |   val confidence: Double)
 41 |   extends Extraction.Part(nodes, text) {
 42 |   override def toString = ("%1.4f" format confidence) + "/\"" + super.toString + "\""
 43 | 
 44 |   /** Annote the suffix with a type. */
 45 |   def annotate(string: String) =
 46 |     new AnnotatedSuffix(this, string)
 47 | }
 48 | 
 49 | /**
 50 |  * Represents a possible suffix for an extended extraction
 51 |  * along with an annotation.
 52 |  *
 53 |  * For example, in the sentence "He ate from 7 until 10."
 54 |  * there are two suffixes: "from 7" and "until 10".
 55 |  *
 56 |  * @param  string  the text of the suffix
 57 |  * @param  interval  the interval of the suffix in the source sentence
 58 |  * @param  confidence  the confidence of the suffix
 59 |  * @param  annotation  an annotation for the suffix
 60 |  */
 61 | class AnnotatedSuffix(
 62 |   text: String,
 63 |   nodes: SortedSet[DependencyNode],
 64 |   confidence: Double,
 65 |   val annotation: String)
 66 |   extends Suffix(text, nodes, confidence) {
 67 |   def this(suffix: Suffix, annotation: String) =
 68 |     this(suffix.text, suffix.nodes, suffix.confidence, annotation)
 69 |   override def toString = annotation + "/" + super.toString
 70 | }
 71 | 
 72 | /**
 73 |  * A representaiton of an n-ary extraction, i.e.
 74 |  *
 75 |  *   (Michael, ran, to the store, on Monday, at 2 PM)
 76 |  *
 77 |  * N-ary extractions have multiple secondary arguments (objects)
 78 |  * and these arguments include the preposition.
 79 |  *
 80 |  * @param  arg1  the first argument
 81 |  * @param  rel  the relation
 82 |  * @param  suffixes  the suffixes
 83 |  * @param  clausals  a clause restricting this extraction to a context
 84 |  * @param  modifier  a modifier for this extraction (i.e. attribution)
 85 |  *
 86 |  * @author Michael Schmitz
 87 |  */
 88 | class NaryExtraction(val arg1: Extraction.Part, val rel: Extraction.Part, val suffixes: Seq[Suffix], val attributions: Seq[Attribution] = Seq.empty, val enablers: Seq[EnablingCondition] = Seq.empty) {
 89 |   override def toString =
 90 |     "(" + arg1.text + ", " + rel.text + ", " + suffixes.map(_.text).mkString(", ") + ")"
 91 | }
 92 | 
 93 | object NaryExtraction {
 94 |   implicit object SuffixOrdering extends Ordering[Suffix] {
 95 |     def compare(x: Suffix, y: Suffix) = x.span.compare(y.span)
 96 |   }
 97 | 
 98 |   /**
 99 |    * Create extended extractions from a collection of extractions
100 |    * from the same sentence.
101 |    */
102 |   def from(extrs: Iterable[(Double, OllieExtractionInstance)]): Iterable[NaryExtraction] = {
103 |     // keep extractions that end with a one-word preposition
104 |     val prepositionEnding = extrs.filter {
105 |       case (conf, inst) =>
106 |         Postagger.simplePrepositions(inst.extr.rel.text drop (1 + inst.extr.rel.text lastIndexOf ' '))
107 |     }
108 | 
109 |     // break off the preposition
110 |     case class BrokenExtraction(rel: String, preposition: String, extr: (Double, OllieExtraction))
111 |     val split: Iterable[BrokenExtraction] = prepositionEnding.map {
112 |       case (conf, inst) =>
113 |         val preps = Postagger.prepositions.filter(inst.extr.rel.text endsWith _)
114 |         val longest = preps.maxBy(_.length)
115 |         BrokenExtraction(inst.extr.rel.text.dropRight(longest.length + 1), longest, (conf, inst.extr))
116 |     }
117 | 
118 |     // group by the arg1 and text
119 |     split groupBy {
120 |       case BrokenExtraction(rel, preposition, (conf, extr)) =>
121 |         (extr.arg1.text, rel)
122 |     } filter (_._2.size > 1) map {
123 |       case ((arg1, rel), extrs) =>
124 |         val suffixes: immutable.SortedSet[Suffix] = extrs.map {
125 |           case BrokenExtraction(rel, prep, (conf, extr)) =>
126 |             new Suffix(prep + " " + extr.arg2.text, extr.arg2.nodes, conf)
127 |         }(scala.collection.breakOut)
128 | 
129 |         val first = extrs.head.extr._2
130 |         val argument1 = new Extraction.Part(first.arg1.nodes, arg1)
131 |         val relation = new Extraction.Part(first.rel.nodes, rel)
132 | 
133 |         val attributions = extrs.flatMap(_.extr._2.attribution).toSet.toSeq
134 |         val enablers = extrs.flatMap(_.extr._2.enabler).toSet.toSeq
135 | 
136 |         new NaryExtraction(argument1, relation, suffixes.toSeq, enablers = enablers, attributions = attributions)
137 |     }
138 |   }
139 | }
140 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/Ollie.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie
 2 | 
 3 | import scala.io.Source
 4 | import edu.knowitall.collection.immutable.Interval
 5 | import edu.knowitall.common.Resource.using
 6 | import edu.knowitall.openparse.OpenParse
 7 | import edu.knowitall.openparse.extract.DetailedExtraction
 8 | import edu.knowitall.tool.parse.graph.DependencyGraph
 9 | import edu.knowitall.tool.stem.MorphaStemmer
10 | import edu.knowitall.tool.stem.Stemmer
11 | 
12 | /** Ollie is an Open Information Extractor that produces binary extractions
13 |   * with context.  The constructor takes an OpenParse instance.  Ollie extends
14 |   * OpenParse's extractions with enabling conditions and attributions.  There
15 |   * is also a trained confidence function for OllieExtractions.
16 |   *
17 |   * @author Michael Schmitz
18 |   */
19 | class Ollie(val openparse: OpenParse) {
20 |   val stemmer = new MorphaStemmer
21 | 
22 |   /** Construct with the default model. */
23 |   def this() = this(OpenParse.withDefaultModel(OpenParse.Configuration(confidenceThreshold = 0.005)))
24 | 
25 |   def apply(dgraph: DependencyGraph): Iterable[OllieExtractionInstance] =
26 |     extract(dgraph)
27 | 
28 |   /**
29 |     * primary method for getting extractions
30 |     */
31 |   def extract(dgraph: DependencyGraph): Iterable[OllieExtractionInstance] = {
32 |     val openparseExtrs = openparse.extract(dgraph)
33 | 
34 |     for {
35 |       (conf, extr) <- openparseExtrs
36 |       enabler = enablingAdverbialClauseHelper(extr)
37 |       attribution = attribClausalComponentHelper(extr)
38 |     } yield new OllieExtractionInstance(
39 |         new OllieExtraction(extr.arg1, extr.rel, extr.arg2, conf, enabler, attribution), dgraph, extr.extractor)
40 |   }
41 | 
42 |   /** Identify enabling condition, i.e. "if it's raining..." */
43 |   private def enablingAdverbialClauseHelper(extr: DetailedExtraction): Option[EnablingCondition] = {
44 |     extr.modifier map { modifier =>
45 |       val prefix = modifier.contents.nodes.head.text
46 |       val phrase = modifier.contents.nodes.iterator.drop(1).map(_.text).mkString(" ")
47 | 
48 |       new EnablingCondition(prefix, phrase, modifier.contents.span)
49 |     }
50 |   }
51 | 
52 |   /** Identify attributions from clausal components, i.e. "He said..." */
53 |   private def attribClausalComponentHelper(extr: DetailedExtraction): Option[Attribution] = {
54 |     extr.clausal flatMap { clausal =>
55 |       // find the first verb in the clausal rel
56 |       clausal.rel.nodes.find(_.postag.startsWith("VB")).flatMap { node =>
57 |         val normalized = stemmer.stem(node.text.toLowerCase())
58 |         if (Ollie.communicationWords.contains(normalized) || Ollie.cognitiveWords.contains(normalized)) {
59 |           val clausalArgInterval = Interval.span(clausal.arg.nodes.map(_.indices))
60 |           val clausalRelInterval = Interval.span(clausal.rel.nodes.map(_.indices))
61 |           Some(new Attribution(
62 |             clausal.arg.text,
63 |             clausal.arg.span,
64 |             clausal.rel.text,
65 |             clausal.rel.span))
66 |         } else None
67 |       }
68 |     }
69 |   }
70 | }
71 | 
72 | object Ollie {
73 |   implicit def stemmer: Stemmer = MorphaStemmer
74 | 
75 |   /** A collection of verbs used for communication, i.e. "said" */
76 |   val communicationWords = using(Source.fromInputStream(classOf[Ollie].getResource("communicationWords.txt").openStream())) { source =>
77 |     source.getLines.toSet
78 |   }
79 | 
80 |   /** A collection of verbs used for beliefs, i.e. "think" */
81 |   val cognitiveWords = using(Source.fromInputStream(classOf[Ollie].getResource("cognitiveWords.txt").openStream())) { source =>
82 |     source.getLines.toSet
83 |   }
84 | 
85 |   /** A collection of prefixes used for enabling conditions, i.e. "if" and "when" */
86 |   val enablerPrefixes = using(Source.fromInputStream(classOf[Ollie].getResource("prefixWords.txt").openStream())) { source =>
87 |     source.getLines.toSet
88 |   }
89 | }
90 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/OllieExtraction.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.ollie
  2 | 
  3 | import scala.Option.option2Iterable
  4 | import scala.collection.breakOut
  5 | 
  6 | import edu.knowitall.collection.immutable.Interval
  7 | import edu.knowitall.common.HashCodeHelper
  8 | import edu.knowitall.openparse.extract.Extraction.Part
  9 | import edu.knowitall.tool.parse.graph.DependencyNode
 10 | 
 11 | /** A base representation for additional context around an extraction. */
 12 | sealed abstract class Context {
 13 |   def text: String
 14 |   def interval: Interval
 15 | }
 16 | 
 17 | /** A representation for an enabling condition.
 18 |   * An example of an enabling condition is "if it's raining".
 19 |   */
 20 | case class EnablingCondition(
 21 |     /** The enabling condition word, i.e. "if" */
 22 |     val prefix: String,
 23 |     /** The rest of the enabling condition, i.e. "it's raining" */
 24 |     val phrase: String,
 25 |     /** The token interval of the enabling condition */
 26 |     override val interval: Interval) extends Context {
 27 |   override def text = prefix + " " + phrase
 28 | 
 29 |   def serialize: String = Seq(prefix, phrase, interval.start.toString, interval.last.toString).map(_.replaceAll("_", "_UNSC_")).mkString("_")
 30 | }
 31 | 
 32 | object EnablingCondition {
 33 |   def deserialize(string: String) = {
 34 |     val Array(prefix, phrase, intervalStart, intervalLast) = try (string.split("_"))
 35 |     catch {
 36 |       case e => throw new RuntimeException("could not deserialize EnablingCondition: " + string, e);
 37 |     }
 38 |     new EnablingCondition(prefix, phrase, Interval.closed(intervalStart.toInt, intervalLast.toInt))
 39 |   }
 40 | }
 41 | 
 42 | /** A representation for an attribution.
 43 |   * An example of an is "Obama believes".
 44 |   */
 45 | case class Attribution(
 46 |     /** The argument of the attribution, i.e. "Obama" */
 47 |     val arg: String,
 48 |     /** The token interval of the argument of the attribution */
 49 |     val argInterval: Interval,
 50 |     /** The relation of the attribution, i.e. "believes" */
 51 |     val rel: String,
 52 |     /** The token interval of the relation of the attribution */
 53 |     override val interval: Interval) extends Context {
 54 |   override def text = arg + " " + rel
 55 | 
 56 |   def serialize: String = {
 57 |     val fields = Seq(arg, rel, argInterval.start.toString, argInterval.last.toString, interval.start.toString, interval.last.toString)
 58 |     fields.map(_.replaceAll("_", "_UNSC_")).mkString("_")
 59 |   }
 60 | }
 61 | 
 62 | object Attribution {
 63 |   def deserialize(string: String) = {
 64 |     val Array(arg, rel, argIntervalStart, argIntervalLast, relIntervalStart, relIntervalLast) = try (string.split("_"))
 65 |     catch {
 66 |       case e => throw new RuntimeException("could not deserialize Attribution: " + string, e);
 67 |     }
 68 |     val argInterval = Interval.closed(argIntervalStart.toInt, argIntervalLast.toInt)
 69 |     val relInterval = Interval.closed(relIntervalStart.toInt, relIntervalLast.toInt)
 70 | 
 71 |     new Attribution(arg, argInterval, rel, relInterval)
 72 |   }
 73 | }
 74 | 
 75 | /** A representation of an Ollie extraction, i.e. we could get the following
 76 |   * extraction from the example sentence.
 77 |   *
 78 |   * {{{
 79 |   * When I'm dreaming David Bowie sings that Ziggy sucked up into his mind.
 80 |   * (Ziggy, sucked up, into his mind)[attribution = "David Bowie")
 81 |   * }}}
 82 |   */
 83 | class OllieExtraction(
 84 |   /** The first argument (subject) of the extraction, i.e. "Ziggy" */
 85 |   val arg1: Part,
 86 |   /** The relation of the extraction, i.e. "sucked up" */
 87 |   val rel: Part,
 88 |   /** The second argument (object) of the extraction, i.e. "into his mind" */
 89 |   val arg2: Part,
 90 |   /** The confidence value from OpenParse. */
 91 |   private[ollie] val openparseConfidence: Double,
 92 |   /** The enabling condition, if any.  I.e. "When I'm dreaming" */
 93 |   val enabler: Option[EnablingCondition],
 94 |   /** The attribution, if any.  I.e. "David Bowie sings that" */
 95 |   val attribution: Option[Attribution]) {
 96 | 
 97 |   import OllieExtraction.{serializePart, deserializePart}
 98 | 
 99 |   override def equals(that: Any) = that match {
100 |     case that: OllieExtraction =>
101 |       this.arg1 == that.arg1 &&
102 |       this.rel == that.rel &&
103 |       this.arg2 == that.arg2 &&
104 |       this.enabler == that.enabler &&
105 |       this.attribution == that.attribution &&
106 |       this.openparseConfidence == that.openparseConfidence
107 |     case _ => false
108 |   }
109 | 
110 |   override def hashCode = HashCodeHelper(
111 |       this.arg1,
112 |       this.rel,
113 |       this.arg2,
114 |       this.enabler,
115 |       this.attribution,
116 |       this.openparseConfidence)
117 | 
118 |   def tabSerialize: String = {
119 |     val enablerString = enabler match {
120 |       case Some(enablingCondition) => enablingCondition.serialize
121 |       case None => "None"
122 |     }
123 |     val attrString = attribution match {
124 |       case Some(attr) => attr.serialize
125 |       case None => "None"
126 |     }
127 | 
128 |     val fieldStrings = Seq(arg1, rel, arg2).map(serializePart(_)) ++ Seq("%.05f".format(openparseConfidence), enablerString, attrString)
129 |     fieldStrings.map(_.replaceAll("\t", "_TAB_")).mkString("\t")
130 |   }
131 | 
132 |   /** The full text of this extraction. */
133 |   def text = Iterable(arg1.text, rel.text, arg2.text).mkString(" ")
134 | 
135 |   /** All the nodes in this extraction. */
136 |   def nodes = arg1.nodes ++ rel.nodes ++ arg2.nodes
137 | 
138 |   /** The spanning interval of the nodes in this extraction. */
139 |   def span = Interval.span(nodes.map(_.indices))
140 | 
141 |   override def toString = {
142 |     val extentions = Iterable(
143 |         enabler.map("enabler="+_.text),
144 |         attribution.map("attrib="+_.text)).flatten match {
145 |       case Nil => ""
146 |       case list => list.mkString("[", ";", "]")
147 |     }
148 |     "(%s; %s; %s)".format(arg1.text, rel.text, arg2.text) + extentions
149 |   }
150 | }
151 | 
152 | object OllieExtraction {
153 |   def tabDelimitedColumns = Seq("Arg1Part", "RelPart", "Arg2Part", "Confidence", "Enabler", "Attribution").mkString("\t")
154 | 
155 |   def tabDeserialize(array: Seq[String]): (OllieExtraction, Seq[String]) = {
156 |     array match {
157 |       case Seq(arg1Part, relPart, arg2Part, openparseConfString, enablerString, attrString, rest @ _*) => {
158 |         val parts = Seq(arg1Part, relPart, arg2Part) map deserializePart
159 |         val enabler = if (enablerString.equals("None")) None else Some(EnablingCondition.deserialize(enablerString))
160 |         val attribution = if (attrString.equals("None")) None else Some(Attribution.deserialize(attrString))
161 |         val extr = new OllieExtraction(parts(0), parts(1), parts(2), openparseConfString.toDouble, enabler, attribution)
162 |         (extr, rest)
163 |       }
164 |     }
165 |   }
166 | 
167 |   def tabDeserialize(s: String): OllieExtraction = {
168 |     val (extr, rest) = tabDeserialize(s.split("\t"))
169 |     require(rest.isEmpty)
170 |     extr
171 |   }
172 | 
173 |   def serializePart(part: Part): String = {
174 |     val serializedNodes = part.nodes.iterator.map(_.serialize).mkString("; ")
175 |     Iterable(part.text, serializedNodes).mkString(" ;;; ")
176 |   }
177 | 
178 |   def deserializePart(string: String): Part = {
179 |     val Array(partText, partNodes) = try (string.split("\\s*;;;\\s*"))
180 |       catch {
181 |         case e => throw new RuntimeException("could not deserialize Extraction.Part: " + string, e);
182 |       }
183 | 
184 |     val nodesSortedSet: scala.collection.SortedSet[DependencyNode] =
185 |       try (partNodes.split("\\s*;\\s*").map(DependencyNode.deserialize(_))(breakOut))
186 |       catch {
187 |         case e => throw new RuntimeException("could not deserialize Extraction.Part: " + string, e);
188 |       }
189 | 
190 |     new Part(nodesSortedSet, partText)
191 |   }
192 | }
193 | 
194 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/OllieExtractionInstance.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie
 2 | 
 3 | import edu.knowitall.common.HashCodeHelper
 4 | import edu.knowitall.openparse.extract.PatternExtractor
 5 | import edu.knowitall.tool.parse.graph.DependencyGraph
 6 | import scala.util.matching.Regex
 7 | 
 8 | /** OllieExtractionInstance represents an extraction coupled with
 9 |   * its source sentence.
10 |   */
11 | class OllieExtractionInstance(
12 |     /** The associated extraction. */
13 |     val extr: OllieExtraction,
14 |     /** The associated sentence. */
15 |     val sent: DependencyGraph,
16 |     /** The extractor used. */
17 |     val pat: PatternExtractor) {
18 | 
19 |   override def equals(that: Any) = that match {
20 |     case that: OllieExtractionInstance => this.extr == that.extr && this.sent == that.sent
21 |     case _ => false
22 |   }
23 |   override def hashCode = HashCodeHelper(extr, sent)
24 | 
25 |   def extraction = extr
26 |   def sentence = sent
27 |   def pattern = pat
28 | 
29 |   private val passivePatternRegex = new Regex("""^\{arg1:?\w*\} <nsubjpass<.*""")
30 |   /** Report if this extraction is an passive construction.
31 |     * This is a crude measure so false should not be taken to mean
32 |     * that it is not active.
33 |     *
34 |     * An extraction is passive if it has a valid active formulation.
35 |     */
36 |   def passive: Boolean =
37 |     passivePatternRegex.pattern.matcher(pat.pattern.serialize).matches() && (extr.rel.text.endsWith(" by"))
38 | 
39 |   private val activePatternRegex = new Regex("""^\{arg1:?\w*\} <nsubj<.*>dobj> \{arg2:?\w*\}""")
40 |   /** Report if this extraction is an active construction.
41 |     * This is a crude measure so false should not be taken to mean
42 |     * that it is not active.
43 |     *
44 |     * An extraction is active if it has a valid passive formulation
45 |     * by swapping the arguments and modifying the relation (adding "be"
46 |     * and "by").
47 |     */
48 |   def active: Boolean =
49 |     activePatternRegex.pattern.matcher(pat.pattern.serialize).matches()
50 | 
51 |   def tabSerialize: String = {
52 |     val serializedGraph = sent.serialize
53 |     val serializedExtr = extr.tabSerialize
54 |     Seq(serializedGraph, pat.tabSerialize, serializedExtr).mkString("\t")
55 |   }
56 | }
57 | 
58 | object OllieExtractionInstance {
59 |   def tabDeserialize(string: String): OllieExtractionInstance = {
60 |     val array = string.split('\t')
61 | 
62 |     val (extr, rest) = tabDeserialize(array)
63 |     require(rest.isEmpty)
64 | 
65 |     extr
66 |   }
67 | 
68 |   def tabDeserialize(array: Seq[String]): (OllieExtractionInstance, Seq[String]) = {
69 |     try {
70 |       val Seq(serializedGraph, r0 @ _*) = array
71 | 
72 |       val graph = DependencyGraph.deserialize(serializedGraph)
73 |       val (pat, r1) = PatternExtractor.tabDeserialize(r0)
74 |       val (extr, r2) = OllieExtraction.tabDeserialize(r1)
75 | 
76 |       (new OllieExtractionInstance(extr, graph, pat), r2)
77 |     } catch {
78 |       case e => throw new IllegalArgumentException("Could not tab deserialize: " + array.mkString("\t"), e)
79 |     }
80 |   }
81 | 
82 |   val numFinder = "[0-9]+".r
83 | }
84 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/ScoredOllieExtractionInstance.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie
 2 | 
 3 | import edu.knowitall.tool.conf.Labelled
 4 | 
 5 | /** OllieExtractionInstance represents a boolean score coupled with
 6 |   * an extraciton instance.
 7 |   *
 8 |   * @param  score  the label for this extraction
 9 |   * @param  inst  the extraction instance labelled
10 |   */
11 | class ScoredOllieExtractionInstance(
12 |     val score: Boolean,
13 |     val inst: OllieExtractionInstance) extends Labelled[OllieExtractionInstance](score, inst) {
14 | 
15 |   override def toString = score + ":" + inst.extr
16 | 
17 |   def tabSerialize: String = {
18 |     Iterable(if (score) 1 else 0, inst.extr.toString, inst.tabSerialize).mkString("\t")
19 |   }
20 | }
21 | 
22 | object ScoredOllieExtractionInstance {
23 |   def tabDeserialize(string: String): ScoredOllieExtractionInstance = {
24 |     try {
25 |       val Array(scoreString, _, rest @ _*) = string.split('\t')
26 | 
27 |       val score =
28 |         if (scoreString == "1") true
29 |         else if (scoreString == "0") false
30 |         else throw new IllegalArgumentException("bad score: " + scoreString)
31 |       val (inst, r2) = OllieExtractionInstance.tabDeserialize(rest)
32 | 
33 |       require(r2.isEmpty)
34 | 
35 |       new ScoredOllieExtractionInstance(score, inst)
36 |     } catch {
37 |       case e => throw new IllegalArgumentException("could not tab deserialize: " + string, e)
38 |     }
39 |   }
40 | 
41 |   val numFinder = "[0-9]+".r
42 | }
43 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/confidence/OllieConfidenceFunction.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie.confidence
 2 | 
 3 | import java.io.InputStream
 4 | import java.net.URL
 5 | import java.util.Scanner
 6 | 
 7 | import scala.collection.mutable
 8 | 
 9 | import org.slf4j.LoggerFactory
10 | 
11 | import edu.knowitall.common.Resource.using
12 | import edu.knowitall.ollie.OllieExtractionInstance
13 | import edu.knowitall.tool.conf.FeatureSet
14 | import edu.knowitall.tool.conf.impl.LogisticRegression
15 | 
16 | /** An implementation of logistic regression of features that can be
17 |   * represented as a double. */
18 | 
19 | object OllieConfidenceFunction {
20 |   val logger = LoggerFactory.getLogger(classOf[OllieIndependentConfFunction])
21 |   
22 |   type OllieIndependentConfFunction = LogisticRegression[OllieExtractionInstance]
23 | 
24 |   val defaultModelUrl = Option(this.getClass.getResource("default-classifier.txt")).getOrElse {
25 |     throw new IllegalArgumentException("Could not load confidence function resource.")
26 |   }
27 | 
28 |   def loadDefaultClassifier(): OllieIndependentConfFunction = {
29 |     fromUrl(OllieFeatureSet, defaultModelUrl)
30 |   }
31 | 
32 |   def fromUrl(featureSet: FeatureSet[OllieExtractionInstance, Double], url: URL): OllieIndependentConfFunction = {
33 |     LogisticRegression.fromUrl(featureSet, url)
34 |   }
35 | }
36 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/confidence/OllieFeatureEvaluation.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.ollie.confidence
  2 | 
  3 | import java.io.File
  4 | import java.io.PrintWriter
  5 | import java.net.URL
  6 | 
  7 | import scala.io.Source
  8 | 
  9 | import edu.knowitall.common.Analysis
 10 | import edu.knowitall.common.Resource.using
 11 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
 12 | import scopt.OptionParser
 13 | 
 14 | object OllieFeatureEvaluation {
 15 |     /** Settings for OpenParse. */
 16 |   abstract class Settings {
 17 |     /** source file of scored extractions */
 18 |     def inputFile: File
 19 | 
 20 |     /** file to output; None means stdout */
 21 |     def outputFile: Option[File]
 22 | 
 23 |     /** confidence model url */
 24 |     def confidenceModelUrl: URL
 25 |   }
 26 | 
 27 |   def main(args: Array[String]) = {
 28 |     var settings = new Settings {
 29 |       var inputFile: File = _
 30 |       var outputFile: Option[File] = None
 31 |       var confidenceModelUrl: URL = OllieConfidenceFunction.defaultModelUrl
 32 |     }
 33 | 
 34 |     val parser = new OptionParser("feature-eval") {
 35 |       opt(Some("c"), "confidence model", "<file>", "confidence model file", { path: String =>
 36 |         val file = new File(path)
 37 |         require(file.exists, "file does not exist: " + path)
 38 |         settings.confidenceModelUrl = file.toURI.toURL
 39 |       })
 40 | 
 41 |       opt("o", "output", "output file (otherwise stdout)", { path =>
 42 |         val file = new File(path)
 43 |         settings.outputFile = Some(file)
 44 |       })
 45 | 
 46 |       arg("input", "input dependencies file", { path: String =>
 47 |         val file = new File(path)
 48 |         require(file.exists, "input file does not exist: " + path)
 49 |         settings.inputFile = file
 50 |       })
 51 |     }
 52 | 
 53 |     if (parser.parse(args)) {
 54 |       run(settings)
 55 |     }
 56 |   }
 57 | 
 58 |   def run(settings: Settings) = {
 59 |     val confFunc = OllieConfidenceFunction.fromUrl(OllieFeatureSet, settings.confidenceModelUrl)
 60 | 
 61 |     val extrs = using (Source.fromFile(settings.inputFile)) { source =>
 62 |       for (
 63 |         line <- source.getLines.toList;
 64 |         val scored = ScoredOllieExtractionInstance.tabDeserialize(line);
 65 |         val conf = confFunc(scored.inst)
 66 |       ) yield (conf, scored)
 67 |     }
 68 | 
 69 |     val sorted = extrs.sortBy(-_._1).toList
 70 | 
 71 |     val pyed = (sorted.head, 0, 1.0) +: Analysis.precisionYieldMeta(sorted zip sorted.map(_._2.score))
 72 | 
 73 |     val featureNames = confFunc.featureSet.featureNames.filter(confFunc.featureWeights.get(_).isDefined).toList.sorted
 74 |     using {
 75 |       settings.outputFile match {
 76 |         case Some(f) => new PrintWriter(f, "UTF8")
 77 |         case None => new PrintWriter(System.out)
 78 |       }
 79 |     } { writer =>
 80 |       writer.println((Iterable("score", "conf", "op-conf", "yield", "precision",
 81 |         "extr", "enabler", "attrib", "sentence", "dependencies") ++
 82 |         featureNames).mkString("\t"))
 83 |       writer.println("\t" * 10 + featureNames.map(confFunc.featureWeights(_).toString).mkString("\t"))
 84 |     (pyed) foreach { case ((conf, scored), y, p) =>
 85 |       val features =
 86 |         for (
 87 |           featureName <- featureNames;
 88 |           val featureValue = confFunc.featureSet(featureName)(scored.inst)
 89 |         ) yield featureValue
 90 | 
 91 |       writer.println((Iterable(if (scored.score) 1 else 0,
 92 |           conf,
 93 |           scored.inst.extr.openparseConfidence,
 94 |           y,
 95 |           p,
 96 |           scored.inst.extr.toString,
 97 |           scored.inst.extr.enabler.isDefined.toString.toLowerCase,
 98 |           scored.inst.extr.attribution.isDefined.toString.toLowerCase,
 99 |           scored.inst.sent.text,
100 |           scored.inst.sent.serialize) ++ features).mkString("\t"))
101 |       }
102 |     }
103 |   }
104 | }
105 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/confidence/train/CrossValidateConfidence.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie.confidence.train
 2 | 
 3 | import java.io.File
 4 | 
 5 | import scala.io.Source
 6 | 
 7 | import edu.knowitall.common.Analysis
 8 | import edu.knowitall.common.Resource.using
 9 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
10 | import edu.knowitall.ollie.confidence.OllieFeatureSet
11 | import edu.knowitall.tool.conf.BreezeLogisticRegressionTrainer
12 | import scopt.mutable.OptionParser
13 | 
14 | object CrossValidateConfidence {
15 |   def main(args: Array[String]) {
16 |     object settings extends Settings {
17 |       var inputFile: File = _
18 |       var outputFile: Option[File] = None
19 |     }
20 | 
21 |     val parser = new OptionParser("scoreextr") {
22 |       arg("labelled", "labelled extractions", { path: String => settings.inputFile = new File(path) })
23 |       argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
24 |     }
25 | 
26 |     if (parser.parse(args)) {
27 |       run(settings)
28 |     }
29 |   }
30 | 
31 |    abstract class Settings {
32 |      def inputFile: File
33 |      def outputFile: Option[File]
34 | 
35 |      val splits = 10
36 |    }
37 | 
38 | 
39 |   def run(settings: Settings) = {
40 |     val trainer = new BreezeLogisticRegressionTrainer(OllieFeatureSet)
41 | 
42 |     val data =
43 |       using (Source.fromFile(settings.inputFile)) { source =>
44 |         (source.getLines map (ScoredOllieExtractionInstance.tabDeserialize)).toList
45 |       }
46 | 
47 |     val splits = data.iterator.sliding(data.size / settings.splits, data.size / settings.splits).withPartial(false)
48 |     val results = for {
49 |       split <- splits.toList
50 | 
51 |       val test = split
52 |       val training = data filterNot (test contains _)
53 | 
54 |       val classifier = trainer.train(training)
55 |     } yield {
56 |       for (example <- test) yield {
57 |         val conf = classifier.apply(example.inst)
58 |         val correct =
59 |           if (conf >= 0.5 && example.score) true
60 |           else if (conf < 0.5 && !example.score) true
61 |           else false
62 |         (conf, correct)
63 |       }
64 |     }
65 | 
66 |     val pys = results.map { list =>
67 |       val py = Analysis.precisionYield(list.sortBy(-_._1).map(_._2))
68 | 
69 |       py
70 |     }
71 | 
72 |     val aucs = pys.zipWithIndex map { case (py, i) =>
73 |       println("Split " + i)
74 |       py foreach { case (y, p) =>
75 |         println(Iterable(y.toString, "%1.4f" format p).mkString("\t"))
76 |       }
77 | 
78 |       val auc = Analysis.areaUnderCurve(py)
79 |       println("auc: " + auc)
80 | 
81 |       println()
82 |       auc
83 |     }
84 | 
85 |     var auc = breeze.linalg.mean(aucs)
86 |     println("avg auc: " + auc)
87 |   }
88 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/confidence/train/TrainOllieConfidence.scala:
--------------------------------------------------------------------------------
 1 | 
 2 | package edu.knowitall.ollie.confidence.train
 3 | 
 4 | import java.io.File
 5 | 
 6 | import scala.io.Source
 7 | 
 8 | import edu.knowitall.common.Resource.using
 9 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
10 | import edu.knowitall.ollie.confidence.OllieFeatureSet
11 | import edu.knowitall.tool.conf.BreezeLogisticRegressionTrainer
12 | import scopt.mutable.OptionParser
13 | 
14 | object TrainOllieConfidence {
15 |   def main(args: Array[String]) {
16 |     object settings extends Settings {
17 |       var inputFile: File = _
18 |       var outputFile: Option[File] = None
19 |     }
20 | 
21 |     val parser = new OptionParser("scoreextr") {
22 |       arg("labelled", "labelled extractions", { path: String => settings.inputFile = new File(path) })
23 |       argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
24 |     }
25 | 
26 |     if (parser.parse(args)) {
27 |       run(settings)
28 |     }
29 |   }
30 | 
31 |    abstract class Settings {
32 |      def inputFile: File
33 |      def outputFile: Option[File]
34 |    }
35 | 
36 |   def run(settings: Settings) = {
37 |     val trainer = new BreezeLogisticRegressionTrainer(OllieFeatureSet)
38 | 
39 |     val data =
40 |       using (Source.fromFile(settings.inputFile)) { source =>
41 |         (source.getLines map (ScoredOllieExtractionInstance.tabDeserialize)).toList
42 |       }
43 | 
44 |     val classifier = trainer.train(data)
45 |     settings.outputFile match {
46 |       case Some(file) => classifier.saveFile(file)
47 |       case None =>
48 |         classifier.save(System.out)
49 |     }
50 |   }
51 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/output/BratOutput.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie.output
 2 | 
 3 | import edu.knowitall.ollie.OllieExtractionInstance
 4 | import edu.knowitall.openparse.extract.Extraction
 5 | import edu.knowitall.collection.immutable.Interval
 6 | import edu.knowitall.ollie.ExtractionPart
 7 | import edu.knowitall.tool.segment.Segment
 8 | import java.io.PrintWriter
 9 | 
10 | class BratOutput(extractor: String => Iterable[OllieExtractionInstance]) {
11 |   def process(sentences: Iterable[Segment], writer: PrintWriter) = {
12 |     val document = new Document()
13 |     for {
14 |       Segment(text, offset) <- sentences 
15 |       inst <- extractor(text)
16 |       entry <- document.annotations(inst, offset)
17 |     } {
18 |       writer.println(entry)
19 |     }
20 |   }
21 | 
22 |   class Document {
23 |     var entityIndex = 0
24 |     var relationIndex = 0
25 | 
26 |     def annotations(inst: OllieExtractionInstance, sentenceCharacterOffset: Int) = {
27 |       def partToAnnotation(inst: OllieExtractionInstance, part: Extraction.Part, partName: String) = {
28 |         val tokens = inst.sentence.nodes.toList.slice(part.span.start, part.span.end)
29 |         val charInterval = Interval.open(tokens.head.offset, tokens.last.offsets.end)
30 |         partName + " " + (sentenceCharacterOffset + charInterval.start) + " " + (sentenceCharacterOffset + charInterval.end) + "\t" + inst.sentence.text.substring(charInterval.start, charInterval.end)
31 |       }
32 | 
33 |       case class LabelledEntry(label: String, entry: String)
34 |       def label(identifier: Char, index: Int, entry: String) = LabelledEntry(identifier.toString + index, entry)
35 | 
36 |       val entries = {
37 |         val arguments = List(inst.extr.arg1, inst.extr.arg2) map { arg =>
38 |           val labelled = label('T', entityIndex, partToAnnotation(inst, arg, "Argument"))
39 |           entityIndex += 1
40 |           labelled
41 |         }
42 |         val relation = {
43 |           val labelled = label('T', entityIndex, partToAnnotation(inst, inst.extr.rel, "Relation"))
44 |           entityIndex += 1
45 |           labelled
46 |         }
47 | 
48 |         val entities = relation :: arguments
49 | 
50 |         val relations = arguments zip List("Arg1", "Arg2") map {
51 |           case (entry, edge) =>
52 |             val labelled = label('R', relationIndex, edge + "-of Arg1:" + relation.label + " Arg2:" + entry.label)
53 |             relationIndex += 1
54 |             labelled
55 |         }
56 | 
57 |         entities ::: relations
58 |       }
59 | 
60 |       entries map {
61 |         case LabelledEntry(label, entry) => label + "\t" + entry
62 |       }
63 |     }
64 |   }
65 | }
66 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/AnalyzePatterns.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse
  2 | 
  3 | import java.io.{PrintWriter, File}
  4 | 
  5 | import scala.Option.option2Iterable
  6 | import scala.collection.mutable
  7 | import scala.io.Source
  8 | 
  9 | import edu.knowitall.collection.immutable.graph.pattern.DirectedEdgeMatcher
 10 | import edu.knowitall.common.Resource
 11 | import edu.knowitall.tool.parse.graph.{PostagNodeMatcher, LabelEdgeMatcher, DependencyPattern, DependencyGraph}
 12 | import edu.knowitall.ollie.Ollie.stemmer
 13 | 
 14 | object AnalyzePatterns {
 15 |   def main(args: Array[String]) {
 16 |     val patternedFilePath = args(0)
 17 |     val outputFilePath = args(1)
 18 | 
 19 |     println("Counting pattern occurrence...")
 20 |     val patterns = mutable.HashMap[String, Int]().withDefaultValue(0)
 21 |     Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
 22 |       for (line <- source.getLines) {
 23 |         val Array(_, _, _, _, pattern, _, _, _*) = line.split("\t", -1)
 24 |         patterns += pattern -> (patterns(pattern) + 1)
 25 |       }
 26 |     }
 27 | 
 28 |     println("Grouping patterns...")
 29 |     Resource.using(new PrintWriter(new File(outputFilePath), "UTF8")) { writer =>
 30 |       val ordered = patterns.toList.sortBy(_._2)(implicitly(Ordering[Int]).reverse)
 31 |       for ((pattern, count) <- ordered.filter(_._2 > 100)) {
 32 |         println(count + ":" + pattern)
 33 |         Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
 34 |           writer.println(pattern + "\t" + count)
 35 |           for (line <- source.getLines) {
 36 |             val Array(rel, arg1, arg2, lemmas, p, sentence, deps, _*) = line.split("\t", -1)
 37 |             if (p == pattern) {
 38 |               writer.println(Iterable(rel, arg1, arg2, lemmas).mkString("\t"))
 39 |               writer.println(sentence)
 40 |               writer.println(deps)
 41 |               writer.println()
 42 |             }
 43 |           }
 44 |         }
 45 |       }
 46 | 
 47 |       println()
 48 |     }
 49 |   }
 50 | }
 51 | 
 52 | object CountPatternComponents {
 53 |   def main(args: Array[String]) {
 54 |     val patternedFilePath = args(0)
 55 | 
 56 |     val edgeCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
 57 |     val postagCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
 58 |     Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
 59 |       for (line <- source.getLines) {
 60 |         val Array(_, _, _, _, pickledPattern, _, _, _*) = line.split("\t", -1)
 61 |         val pattern = new ExtractorPattern(DependencyPattern.deserialize(pickledPattern))
 62 |         val labels = (pattern.edgeMatchers.toList).flatMap { _ match {
 63 |               case e: DirectedEdgeMatcher[_] if e.matcher.isInstanceOf[LabelEdgeMatcher] =>
 64 |                 Some(e.matcher.asInstanceOf[LabelEdgeMatcher].label)
 65 |               case _ => None
 66 |             }
 67 |         }
 68 |         val postags = (pattern.baseNodeMatchers.toList).collect {
 69 |           case m: PostagNodeMatcher => m.postag
 70 |         }
 71 | 
 72 |         for (l <- labels) {
 73 |           edgeCounts += l -> (edgeCounts(l)+1)
 74 |         }
 75 | 
 76 |         for (postag <- postags) {
 77 |           postagCounts += postag -> (postagCounts(postag)+1)
 78 |         }
 79 |       }
 80 |     }
 81 | 
 82 |     println("Postag counts: ")
 83 |     for ((k, v) <- postagCounts.toList.sortBy(_._2).reverse) {
 84 |       println(k + "\t" + v)
 85 |     }
 86 | 
 87 |     println()
 88 |     println("Edge counts: ")
 89 |     for ((k, v) <- edgeCounts.toList.sortBy(_._2).reverse) {
 90 |       println(k + "\t" + v)
 91 |     }
 92 |   }
 93 | }
 94 | 
 95 | object CountSentenceComponents {
 96 |   def main(args: Array[String]) {
 97 |     val patternedFilePath = args(0)
 98 | 
 99 |     val edgeCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
100 |     val postagCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
101 |     val pieceCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
102 |     Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
103 |       for (line <- source.getLines) {
104 |         val Array(_, _, _, _, _, _, pickledGraph, _*) = line.split("\t", -1)
105 |         val graph = DependencyGraph.deserialize(pickledGraph)
106 |         val labels = (graph.graph.edges).toList.map(_.label )
107 |         val postags = (graph.graph.vertices).toList.map(_.postag)
108 | 
109 |         for (l <- labels) {
110 |           edgeCounts += l -> (edgeCounts(l)+1)
111 |         }
112 | 
113 |         for (postag <- postags) {
114 |           postagCounts += postag -> (postagCounts(postag)+1)
115 |         }
116 | 
117 |         for (edge <- graph.graph.edges) {
118 |           val piece1 = edge.source.postag + " " + edge.label + " " + edge.dest.postag
119 |           val piece2 = edge.dest.postag + " " + edge.label + " " + edge.source.postag
120 | 
121 |           pieceCounts += piece1 -> (pieceCounts(piece1)+1)
122 |           pieceCounts += piece2 -> (pieceCounts(piece2)+1)
123 |         }
124 |       }
125 |     }
126 | 
127 |     println("Postag counts: ")
128 |     for ((k, v) <- postagCounts.toList.sortBy(_._2).reverse) {
129 |       println(k + "\t" + v)
130 |     }
131 | 
132 |     println()
133 |     println("Edge counts: ")
134 |     for ((k, v) <- edgeCounts.toList.sortBy(_._2).reverse) {
135 |       println(k + "\t" + v)
136 |     }
137 | 
138 |     println()
139 |     println("Piece counts: ")
140 |     for ((k, v) <- pieceCounts.toList.sortBy(_._2).reverse) {
141 |       println(k + "\t" + v)
142 |     }
143 |   }
144 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/ExtractorPattern.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse
  2 | 
  3 | import scala.io.Source
  4 | 
  5 | import org.slf4j.LoggerFactory
  6 | 
  7 | import edu.knowitall.collection.immutable.graph.pattern.CaptureNodeMatcher
  8 | import edu.knowitall.collection.immutable.graph.pattern.EdgeMatcher
  9 | import edu.knowitall.collection.immutable.graph.pattern.Matcher
 10 | import edu.knowitall.collection.immutable.graph.pattern.NodeMatcher
 11 | import edu.knowitall.collection.immutable.graph.pattern.Pattern
 12 | import edu.knowitall.collection.immutable.graph.pattern.TrivialNodeMatcher
 13 | import edu.knowitall.ollie.Ollie.stemmer
 14 | import edu.knowitall.tool.parse.graph.DependencyNode
 15 | import edu.knowitall.tool.parse.graph.DependencyPattern
 16 | import edu.knowitall.tool.parse.graph.LabelEdgeMatcher
 17 | import edu.knowitall.tool.parse.graph.RegexNodeMatcher
 18 | import scalaz._
 19 | import scalaz.Scalaz._
 20 | 
 21 | /** A wrapper for a dependency pattern that adds some convenience methods
 22 |   * for working with patterns intended for extraction of binary relations.
 23 |   *
 24 |   * @author Michael Schmitz
 25 |   */
 26 | class ExtractorPattern(matchers: List[Matcher[DependencyNode]]) extends DependencyPattern(matchers) {
 27 |   val logger = LoggerFactory.getLogger(this.getClass)
 28 | 
 29 |   def this(pattern: Pattern[DependencyNode]) = this(pattern.matchers.map { _ match {
 30 |     case m: ExtractionPartMatcher => m
 31 |     // lift extractor matchers to a more representitive class
 32 |     case m: CaptureNodeMatcher[_] => m.alias.take(3) match {
 33 |       case "arg" => new ArgumentMatcher(m.alias, m.matcher)
 34 |       case "rel" => new RelationMatcher(m.alias, m.matcher)
 35 |       case "slo" => new SlotMatcher(m.alias, m.matcher)
 36 |       case _ => throw new IllegalArgumentException("Unknown capture alias: " + m.alias)
 37 |     }
 38 |     // keep everything else the same
 39 |     case m => m
 40 |   }})
 41 | 
 42 |   override def canEqual(that: Any) = that.isInstanceOf[ExtractorPattern]
 43 |   override def equals(that: Any) = that match {
 44 |     case that: ExtractorPattern => (that canEqual this) && this.matchers == that.matchers
 45 |     case _ => false
 46 |   }
 47 | 
 48 |   def semantic: Boolean = matchers.exists {
 49 |     case m: RelationMatcher => m.baseNodeMatchers exists { case m: RegexNodeMatcher => true case _ => false }
 50 |     case _ => false
 51 |   }
 52 | 
 53 |   def valid: Boolean = {
 54 |     def existsEdge(pred: LabelEdgeMatcher=>Boolean) =
 55 |       this.baseEdgeMatchers.collect {
 56 |         case e: LabelEdgeMatcher => e
 57 |       }exists(pred)
 58 | 
 59 |     /* check for multiple prep edges */
 60 |     def multiplePreps = this.baseEdgeMatchers.collect {
 61 |       case e: LabelEdgeMatcher => e
 62 |     }.count(_.label.contains("prep")) > 1
 63 | 
 64 |     /* check for a conj_and edge */
 65 |     def conjAnd = existsEdge(_.label == "conj_and")
 66 | 
 67 |     /* check for a conj_and edge */
 68 |     def conjOr = existsEdge(_.label == "conj_or")
 69 | 
 70 |     /* eliminate all conj edges */
 71 |     def conj = existsEdge(_.label startsWith "conj")
 72 | 
 73 |     def slotBordersNN = {
 74 |       import scalaz._
 75 |       import Scalaz._
 76 | 
 77 |       def isNN(m: Matcher[DependencyNode]) = m match {
 78 |         case e: NodeMatcher[_] =>
 79 |           e.baseNodeMatchers exists {
 80 |             case m: LabelEdgeMatcher if m.label == "nn" => true
 81 |             case _ => false
 82 |           }
 83 |         case _ => false
 84 |       }
 85 | 
 86 |       def isSlot(m: Matcher[DependencyNode]) = m match {
 87 |         case m: SlotMatcher => true
 88 |         case _ => false
 89 |       }
 90 | 
 91 |       this.matchers.toZipper.map(_.positions.toStream.exists { z =>
 92 |         def focusedOnNN(z: Option[Zipper[Matcher[DependencyNode]]]) = z.map(z => isNN(z.focus)).getOrElse(false)
 93 |         isSlot(z.focus) && (focusedOnNN(z.previous) || focusedOnNN(z.next))
 94 |       }).getOrElse(false)
 95 |     }
 96 | 
 97 |     if (existsEdge(_.label == "dep")) {
 98 |       logger.debug("invalid: dep edge: " + this.toString)
 99 |       return false
100 |     }
101 | 
102 |     if (existsEdge(_.label == "dep")) {
103 |       logger.debug("invalid: dep edge: " + this.toString)
104 |       return false
105 |     }
106 | 
107 |     /* check if ends with slot */
108 |     def slotAtEnd = {
109 |       def isSlot(node: NodeMatcher[_]) = node match {
110 |         case m: CaptureNodeMatcher[_] => m.alias.startsWith("slot")
111 |         case _ => false
112 |       }
113 | 
114 |       !this.nodeMatchers.isEmpty && (isSlot(this.nodeMatchers.head) || isSlot(this.nodeMatchers.last))
115 |     }
116 | 
117 |     val length = edgeMatchers.length
118 | 
119 |     if (length == 2 && multiplePreps) {
120 |       logger.debug("invalid: multiple preps: " + this.toString)
121 |       false
122 |     }
123 |     else if (conjAnd) {
124 |       logger.debug("invalid: conj_and: " + this.toString)
125 |       false
126 |     }
127 |     else if (conjOr) {
128 |       logger.debug("invalid: conj_or: " + this.toString)
129 |       false
130 |     }
131 |     else if (conj) {
132 |       logger.debug("invalid: alt conj: " + this.toString)
133 |       false
134 |     }
135 |     else if (slotAtEnd) {
136 |       logger.debug("invalid: ends with slot: " + this.toString)
137 |       false
138 |     }
139 |     else if (slotBordersNN) {
140 |       logger.debug("invalid: slot borders nn: " + this.toString)
141 |       false
142 |     }
143 |     else {
144 |       true
145 |     }
146 |   }
147 | 
148 |   /* determine if the pattern is symmetric, such as:
149 |    *   {arg1} >prep> {rel} <prep< {arg2}
150 |    */
151 |   def symmetric = {
152 |     def compare(m1: List[Matcher[DependencyNode]], m2: List[Matcher[DependencyNode]]): Boolean = (m1, m2) match {
153 |       // argument matchers need not equal (in fact, they should be opposites)
154 |       case (((c1: ArgumentMatcher) :: m1s), ((c2: ArgumentMatcher) :: m2s)) => compare(m1s, m2s)
155 |       // edge matchers should be equals but opposite
156 |       case (((m1: EdgeMatcher[_]) :: m1s), ((m2: EdgeMatcher[_]) :: m2s)) => m1 == m2.flip && compare(m1s, m2s)
157 |       // edges and other nodes must be equal
158 |       case (((m1: Matcher[_]) :: m1s), ((m2: Matcher[_]) :: m2s)) => m1 == m2 && compare(m1s, m2s)
159 |       case (Nil, Nil) => true
160 |       case _ => false
161 |     }
162 | 
163 |     compare(matchers, matchers.reverse)
164 |   }
165 | }
166 | 
167 | object ExtractorPattern {
168 |   import scala.io.Source
169 |   def main(args: Array[String]) {
170 |     val iter = if (args.length == 0) Source.stdin.getLines else args.iterator
171 |     for (line <- iter) {
172 |       val pattern = DependencyPattern.deserialize(line)
173 |       val extractor = new ExtractorPattern(pattern)
174 |       def verdict = if (extractor.valid) "valid" else "invalid"
175 |       println(verdict + ": " + extractor.toString)
176 |     }
177 |   }
178 | }
179 | 
180 | /** A dependency node used to match an extraction part in a pattern extractor.
181 |   *
182 |   * @author Michael Schmitz
183 |   */
184 | sealed abstract class ExtractionPartMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
185 | extends CaptureNodeMatcher[DependencyNode](alias, matcher) {
186 |   def this(alias: String) = this(alias, new TrivialNodeMatcher[DependencyNode])
187 | 
188 |   def withMatcher(matcher: NodeMatcher[DependencyNode]): ExtractionPartMatcher
189 | }
190 | 
191 | /** A dependency node used to match an argument in a pattern extractor.
192 |   *
193 |   * @author Michael Schmitz
194 |   */
195 | class ArgumentMatcher(alias: String, matcher: NodeMatcher[DependencyNode]) extends ExtractionPartMatcher(alias, matcher) {
196 |   def this(alias: String) = this(alias, new TrivialNodeMatcher[DependencyNode])
197 |   override def canEqual(that: Any) = that.isInstanceOf[ExtractionPartMatcher]
198 |   override def equals(that: Any) = that match {
199 |     case that: ExtractionPartMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
200 |     case _ => false
201 |   }
202 | 
203 |   override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new ArgumentMatcher(this.alias, matcher)
204 | }
205 | 
206 | /** A dependency node used to match a relation in a pattern extractor.
207 |   *
208 |   * @author Michael Schmitz
209 |   */
210 | class RelationMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
211 | extends ExtractionPartMatcher(alias, matcher) {
212 |   override def canEqual(that: Any) = that.isInstanceOf[RelationMatcher]
213 |   override def equals(that: Any) = that match {
214 |     case that: RelationMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
215 |     case _ => false
216 |   }
217 | 
218 |   override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new RelationMatcher(this.alias, matcher)
219 | }
220 | 
221 | /** A dependency node used to match a slot in a pattern extractor.
222 |   *
223 |   * @author Michael Schmitz
224 |   */
225 | class SlotMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
226 | extends ExtractionPartMatcher(alias, matcher) {
227 |   override def canEqual(that: Any) = that.isInstanceOf[SlotMatcher]
228 |   override def equals(that: Any) = that match {
229 |     case that: SlotMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
230 |     case _ => false
231 |   }
232 | 
233 |   override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new SlotMatcher(this.alias, matcher)
234 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/GraphExpansions.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse
  2 | 
  3 | import scala.collection.Set
  4 | import scala.collection.SortedSet
  5 | 
  6 | import edu.knowitall.collection.immutable.graph.{Graph, DirectedEdge}
  7 | import edu.knowitall.collection.immutable.graph.Direction
  8 | import edu.knowitall.collection.immutable.Interval
  9 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}
 10 | 
 11 | /** A collection of helper methods for expanding a node in a graph
 12 |   * and/or sentence according to some metric. */
 13 | object GraphExpansions {
 14 |   def neighborsUntil(graph: DependencyGraph, node: DependencyNode, inferiors: List[DependencyNode], until: Set[DependencyNode]): SortedSet[DependencyNode] = {
 15 |     val lefts = inferiors.takeWhile(_ != node).reverse
 16 |     val rights = inferiors.dropWhile(_ != node).drop(1)
 17 | 
 18 |     val indices = Interval.span(node.indices :: lefts.takeWhile(!until(_)).map(_.indices) ++ rights.takeWhile(!until(_)).map(_.indices))
 19 | 
 20 |     // use the original dependencies nodes in case some information
 21 |     // was lost.  For example, of is collapsed into the edge prep_of
 22 |     graph.nodes.filter(node => node.indices.max >= indices.min && node.indices.max <= indices.max)
 23 |   }
 24 | 
 25 |   def expandAdjacent(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode], labels: Set[String]) = {
 26 |     def takeAdjacent(interval: Interval, nodes: List[DependencyNode], pool: List[DependencyNode]): List[DependencyNode] = pool match {
 27 |       // can we add the top node?
 28 |       case head :: tail if (head.indices borders interval) && !until.contains(head) =>
 29 |         takeAdjacent(interval union head.indices, head :: nodes, tail)
 30 |       // otherwise abort
 31 |       case _ => nodes
 32 |     }
 33 | 
 34 |     // it might be possible to simply have an adjacency restriction
 35 |     // in this condition
 36 |     def cond(e: Graph.Edge[DependencyNode]) =
 37 |       labels.contains(e.label)
 38 |     val inferiors = graph.graph.inferiors(node, cond).toList.sortBy(_.indices)
 39 | 
 40 |     // split into nodes left and right of node
 41 |     val lefts = inferiors.takeWhile(_ != node).reverse
 42 |     val rights = inferiors.dropWhile(_ != node).drop(1)
 43 | 
 44 |     // take adjacent nodes from each list
 45 |     val withLefts = takeAdjacent(node.indices, List(node), lefts)
 46 |     val expanded = takeAdjacent(node.indices, withLefts, rights)
 47 | 
 48 |     SortedSet(expanded: _*)
 49 |   }
 50 | 
 51 |   def expand(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode], labels: Set[String]) = {
 52 |     // don't restrict to adjacent (by interval) because prep_of, etc.
 53 |     // remove some nodes that we want to expand across.  In the end,
 54 |     // we get the span over the inferiors.  Do go beneath until
 55 |     // nodes because we need them for neighborsUntil.
 56 |     def cond(e: Graph.Edge[DependencyNode]) =
 57 |       labels.contains(e.label)
 58 |     val inferiors = graph.graph.inferiors(node, cond)
 59 | 
 60 |     // get all nodes connected by an nn edge
 61 |     val nns = graph.graph.connected(node, dedge => dedge.edge.label == "nn")
 62 | 
 63 |     // order the nodes by their indices
 64 |     val ordered = (inferiors ++ nns).toList.sortBy(_.indices)
 65 | 
 66 |     // get neighbors, moving left and right, until a bad node is it
 67 |     neighborsUntil(graph, node, ordered, until)
 68 |   }
 69 | 
 70 |   def augment(graph: DependencyGraph, node: DependencyNode, without: Set[DependencyNode], pred: Graph.Edge[DependencyNode] => Boolean): List[SortedSet[DependencyNode]] = {
 71 |     // don't restrict to adjacent (by interval) because prep_of, etc.
 72 |     // remove some nodes that we want to expand across.  In the end,
 73 |     // we get the span over the inferiors.
 74 |     graph.graph.successors(node, pred).map { successor =>
 75 |       SortedSet[DependencyNode]() ++ graph.graph.inferiors(successor)
 76 |     }.toList
 77 |   }
 78 | 
 79 |   /**
 80 |     *  Find all nodes in a components next to the node.
 81 |     *  @param  node  components will be found adjacent to this node
 82 |     *  @param  labels  components may be connected by edges with any of these labels
 83 |     *  @param  without  components may not include any of these nodes
 84 |     */
 85 |   def components(graph: DependencyGraph, node: DependencyNode, labels: Set[String], without: Set[DependencyNode], nested: Boolean) = {
 86 |     // nodes across an allowed label to a subcomponent
 87 |     val across = graph.graph.neighbors(node, (dedge: DirectedEdge[_]) => dedge.dir match {
 88 |       case Direction.Down if labels.contains(dedge.edge.label) => true
 89 |       case _ => false
 90 |     })
 91 | 
 92 |     across.flatMap { start =>
 93 |       // get inferiors without passing back to node
 94 |       val inferiors = graph.graph.inferiors(start,
 95 |         (e: Graph.Edge[DependencyNode]) =>
 96 |           // don't cross a conjunction that goes back an across node
 97 |           !((e.label startsWith "conj") && (across contains e.dest)) &&
 98 |             // make sure we don't cycle out of the component
 99 |             e.dest != node &&
100 |             // make sure we don't descend into another component
101 |             // i.e. "John M. Synge who came to us with his play direct
102 |             // from the Aran Islands , where the material for most of
103 |             // his later works was gathered" if nested is false
104 |             (nested || !labels.contains(e.label)))
105 | 
106 |       // make sure none of the without nodes are in the component
107 |       if (without.forall(!inferiors.contains(_))) {
108 |         val span = Interval.span(inferiors.map(_.indices).toSeq)
109 |         Some(graph.nodes.filter(node => span.superset(node.indices)).toList)
110 |       } else None
111 |     }
112 |   }
113 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/bootstrap/FilterTargetExtractions.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.bootstrap
 2 | 
 3 | import scala.io.Source
 4 | 
 5 | import org.slf4j.LoggerFactory
 6 | 
 7 | /** Filter the target extractions.  We only want to keep extractions that
 8 |   * occur more than once and have a relation with more than 15 seeds.
 9 |   * 
10 |   * @author Michael Schmitz
11 |   */
12 | object FilterTargetExtractions {
13 |   val logger = LoggerFactory.getLogger(this.getClass)
14 |   
15 |   final val MIN_RELATION_SEEDS = 15
16 | 
17 |   def main(args: Array[String]) {
18 |     val inputFile = Source.fromFile(args(0), "UTF8")
19 | 
20 |     logger.info("reading lines and counting")
21 |     var relationCounts = Map[String, Int]().withDefaultValue(0)
22 |     var seedCounts = Map[(String, String, String, String), Int]().withDefaultValue(0)
23 |     for (line <- inputFile.getLines) {
24 |       val Array(rel, arg1, arg2, lemmas, _*) = line.split("\t")
25 | 
26 |       val seed = (rel, arg1, arg2, lemmas)
27 | 
28 |       // make sure the relation contains at least on of the lemmas
29 |       // this excludes, for example, "be in"
30 |       if (rel.split(" ").exists (lemmas contains _)) {
31 |         seedCounts += seed -> (seedCounts(seed) + 1)
32 |         relationCounts += rel -> (relationCounts(rel) + 1)
33 |       }
34 |     }
35 | 
36 |     // keep relations with more than 15 seeds
37 |     // and more than 0 lemmas
38 |     val relations: Set[String] =
39 |       (for {
40 |         (rel, count) <- relationCounts;
41 |         if (count > MIN_RELATION_SEEDS)
42 |       } yield (rel))(scala.collection.breakOut)
43 |     logger.info("keeping " + relations.size + "/" + relationCounts.size + " relations")
44 | 
45 |     // keep seeds that occur more than once
46 |     val seeds =
47 |       for {
48 |         (seed @ (rel, arg1, arg2, lemmas), count) <- seedCounts;
49 |         if count > 1 && relations.contains(rel)
50 |       } yield (seed)
51 | 
52 |     logger.info("keeping " + seeds.size + "/" + seedCounts.size + " seeds")
53 | 
54 |     logger.info("printing seeds to keep")
55 |     for (seed <- seeds) {
56 |       println(seed.productIterator.mkString("\t"))
57 |     }
58 |   }
59 | }
60 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/bootstrap/FindCommon.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.bootstrap
 2 | 
 3 | import scala.util.matching.Regex
 4 | 
 5 | /** Common functionality for bootstrap code.
 6 |   * 
 7 |   * @author Michael Schmitz
 8 |   */
 9 | object FindCommon {
10 |   // tags allowed in proper arguments
11 |   val properPostags = Set("DT", "IN", "NNP", "NNPS")
12 |   def proper(lemmas: Array[String]) =
13 |     lemmas.forall(properPostags.contains(_)) && lemmas.exists(lemma => lemma == "NNP" || lemma == "NNPS")
14 | 
15 |   def stripPostag(target: String, part: Seq[(String, String, String)]) = {
16 |     part.filter { case (pos, tok, lem) => target != pos }
17 |   }
18 |   def stripPostag(target: Regex, part: Seq[(String, String, String)]) = {
19 |     part.filter { case (pos, tok, lem) => !target.pattern.matcher(pos).matches}
20 |   }
21 |   def stripLemma(target: String, part: Seq[(String, String, String)]) = {
22 |     part.filter { case (pos, tok, lem) => target != lem }
23 |   }
24 |   
25 |   def cleanArg(part: Seq[(String, String, String)]) = stripPostag("DT", part)
26 |   
27 |   def zip3(l1 : List[String], l2 : List[String],l3 : List[String]) : List[(String, String, String)] =
28 |   {
29 |     def zip3$ (l1$ : List[String], l2$ : List[String], l3$ : List[String], acc : List[(String, String, String)]) : List[(String, String, String)] = l1$ match
30 |     {
31 |       case Nil => acc reverse
32 |       case l1$head :: l1$tail => zip3$(l1$tail, l2$.tail, l3$.tail, (l1$head, l2$.head, l3$.head) :: acc)
33 |     }
34 | 
35 |     zip3$(l1, l2, l3, List[(String,String,String)]())
36 |   }
37 | }
38 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/bootstrap/FindTargetArguments.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.bootstrap
 2 | 
 3 | import scala.Array.canBuildFrom
 4 | import scala.collection.mutable
 5 | import scala.io.Source
 6 | 
 7 | import edu.knowitall.tool.stem.MorphaStemmer
 8 | 
 9 | import FindCommon.{zip3, proper, cleanArg}
10 | 
11 | /** Detemine valid arguments of extractions for the boostrap process. 
12 |   * 
13 |   * Only frequent proper arguments are used.
14 |   * 
15 |   * @author Michael Schmitz
16 |   */
17 | object FindTargetArguments {
18 |   import FindCommon._
19 | 
20 |   val blacklist = Set("inc", "ltd", "page", 
21 |     "vehicle", "turn", "site", "photo", "image", "gallery")
22 | 
23 |   def valid(lemma: String) = {
24 |     lemma.length > 2 && lemma.length < 64 && !blacklist.contains(lemma)
25 |   }
26 | 
27 |   /** Run over a file with four columns:
28 |     * 
29 |     *   string
30 |     *   lemma
31 |     *   postag
32 |     *   count
33 |     *
34 |     * Count all of the proper arguments and print any arguments that
35 |     * exceed the lower bound.  The lower bound is specified by the first
36 |     * command-line argument. */
37 |   def main(args: Array[String]) {
38 |     val source = Source.fromFile(args(0), "UTF8")
39 |     val lowerBound = args(1).toInt
40 |     
41 |     val map = new mutable.HashMap[String, Int]().withDefaultValue(0)
42 |     for (line <- source.getLines) {
43 |       try {
44 |         val Array(string, lem, postag, count) = line.split("\t")
45 |         // do our own normalization
46 |         val lemma = string.split(" ").map(
47 |           MorphaStemmer.lemmatize(_)).mkString(" ")
48 |         
49 |         if (!string.contains("_")) {
50 |           // remove DT
51 |           val arg = cleanArg(
52 |             zip3(
53 |               postag.split("""\s+""").toList, 
54 |               string.split("""\s+""").toList, 
55 |               lemma.split("""\s+""").toList))
56 |           val cleanLemma = arg.unzip3._3.mkString(" ")
57 |           
58 |           // make sure lemma is valid
59 |           if (proper(postag.split(" ")) && valid(cleanLemma)) {
60 |             map += cleanLemma -> (map(cleanLemma)+count.toInt)
61 |           }
62 |         }
63 |       }
64 |       catch {
65 |         case e: MatchError =>
66 |       }
67 |     }
68 |     
69 |     source.close
70 | 
71 |     val keepers: List[(String, Int)] = (for ((k, v) <- map if v > lowerBound) yield {
72 |       (k, v)
73 |     })(scala.collection.breakOut)
74 | 
75 |     keepers.sortBy(_._2).reverse.foreach { case (k, v) => println(k + "\t" + v) }
76 |   }
77 | }
78 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/bootstrap/FindTargetExtractions.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse.bootstrap
  2 | 
  3 | import scala.Array.canBuildFrom
  4 | import scala.Option.option2Iterable
  5 | import scala.io.Source
  6 | 
  7 | import org.slf4j.LoggerFactory
  8 | 
  9 | import edu.knowitall.openparse.OpenParse
 10 | import edu.knowitall.tool.stem.MorphaStemmer
 11 | 
 12 | import FindCommon.{zip3, stripPostag, proper, cleanArg}
 13 | import scopt.OptionParser
 14 | 
 15 | /** Detemine valid extractions for the boostrap process. 
 16 |   * 
 17 |   * Extractions need frequent arguments from FindTargetArguments
 18 |   * and cannot contain a negation word.
 19 |   * 
 20 |   * @author Michael Schmitz
 21 |   */
 22 | object FindTargetExtractions {
 23 |   import FindCommon._
 24 | 
 25 |   val logger = LoggerFactory.getLogger(this.getClass)
 26 | 
 27 |   def negated(lemmas: Array[String]) =
 28 |     lemmas.contains("not") || lemmas.contains("no") || lemmas.contains("n't") || lemmas.contains("never")
 29 | 
 30 |   val lemmaBlacklist = Set("the", "that", "of")
 31 | 
 32 |   def main(args: Array[String]) {
 33 | 
 34 |     val parser = new OptionParser("findextr") {
 35 |       var extractionFilePath: String = _
 36 |       var relationFilePath: Option[String] = None
 37 |       var argumentFilePath: String = _
 38 | 
 39 |       arg("extractions", "extraction file", { v: String => require(v != null); extractionFilePath = v })
 40 |       arg("arguments", "argument file", { v: String => require(v != null); argumentFilePath = v })
 41 |       opt("r", "relations", "<file>", "relation file", { v: String => require(v != null); relationFilePath = Some(v) })
 42 |     }
 43 | 
 44 |     if (parser.parse(args)) {
 45 |       // read in the argument files
 46 |       val extractions = Source.fromFile(parser.extractionFilePath, "UTF8")
 47 |       logger.info("loading targets")
 48 |       val relationsRows = parser.relationFilePath.map(Source.fromFile(_, "UTF8").getLines.map(line => line.split("\t")).toList)
 49 |       val targets = relationsRows.map(_ map (_(0)))
 50 |       val relationLemmaLookup = relationsRows.map(_.map(row => (row(0), row(1).split(" "))).toMap)
 51 |       def relationLemmas(relation: String): Seq[String] = {
 52 |         relationLemmaLookup match {
 53 |           case Some(lookup) => lookup(relation)
 54 |           case None => relation.split(" ") filterNot OpenParse.LEMMA_BLACKLIST
 55 |         }
 56 |       }
 57 | 
 58 |       targets match {
 59 |         case Some(targets) => logger.info("5 targets: " + targets.take(5).mkString(", "))
 60 |         case None => logger.info("No target restriction")
 61 |       }
 62 |       logger.info("loading arguments")
 63 |       val arguments = Source.fromFile(parser.argumentFilePath, "UTF8").getLines.map(line => line.split("\t")(0)).toSet
 64 |       logger.info("5 arguments: " + arguments.take(5).mkString(", "))
 65 | 
 66 |       // iterate over extractions
 67 |       logger.info("iterating over extractions")
 68 |       for (line <- extractions.getLines) {
 69 |         try {
 70 |           val Array(id, arg1String, relationString, arg2String, _, relationLemma, _, arg1Postag, relationPostag, arg2Postag, _, _, _, count, confidence, url, sentence) = line.split("\t", -1)
 71 |           val arg1Lemma = arg1String.split(" ").map(MorphaStemmer.lemmatize(_)).mkString(" ")
 72 |           val arg2Lemma = arg2String.split(" ").map(MorphaStemmer.lemmatize(_)).mkString(" ")
 73 |           // val rs = new RelationString(relationString, relationLemma, relationPostag)
 74 |           // rs.correctNormalization()
 75 | 
 76 |           val arg1 = zip3(arg1Postag.split("""\s+""").toList, arg1String.split("""\s+""").toList, arg1Lemma.split("""\s+""").toList)
 77 |           // val rel = zip3(rs.getPosPred.split("""\s+""").toList, rs.getPred.split("""\s+""").toList, rs.getNormPred.split("""\s+""").toList)
 78 |           val rel = zip3(relationPostag.split("""\s+""").toList, relationString.split("""\s+""").toList, relationLemma.split("""\s+""").toList)
 79 |           val arg2 = zip3(arg2Postag.split("""\s+""").toList, arg2String.split("""\s+""").toList, arg2Lemma.split("""\s+""").toList)
 80 | 
 81 |           implicit def t2mapper[A, B](t: (A, B)) = new { 
 82 |             def map[R](f: A => R, g: B => R) = (f(t._1), g(t._2)) 
 83 |           }
 84 | 
 85 |           val (arg1cleanPostags, arg1cleanStrings, arg1cleanLemmas) = cleanArg(arg1).unzip3
 86 |           val (arg2cleanPostags, arg2cleanStrings, arg2cleanLemmas) = cleanArg(arg2).unzip3
 87 |           val (relcleanPostags, relcleanStrings, relcleanLemmas) = {
 88 |             val stripped = stripPostag("RB.*", stripPostag("DT", rel))
 89 |             val beIndex = rel.indexWhere(_._3 == "be")
 90 |             val penultimateAdjective = 
 91 |               if (rel.length - beIndex >= 3 && (rel.drop(beIndex).head._3 startsWith "be") && rel.last._1 == "IN") {
 92 |                 // return the penultimate if it's VERB ADJECTIVE PREPOSITION
 93 |                 Some(rel.init.last)
 94 |               }
 95 |               else None
 96 |             
 97 |             (stripPostag("JJS?".r, stripped) ++ penultimateAdjective).unzip3
 98 |           }
 99 | 
100 |           val relcleanLemmaString = relcleanLemmas.mkString(" ")
101 |           val arg1cleanLemmaString = arg1cleanLemmas.mkString(" ")
102 |           val arg2cleanLemmaString = arg2cleanLemmas.mkString(" ")
103 | 
104 |           // ensure the extraction parts are relatively small
105 |           if (relationLemma.length < 64 && 
106 |             // ensure the normalized relation string is a target
107 |             targets.map(_ contains relcleanLemmaString).getOrElse(true) &&
108 |             // ensure arguments are proper
109 |             (proper(arg1Postag.split("\\s+")) ||
110 |             proper(arg2Postag.split("\\s+"))) &&
111 |             arg1cleanLemmaString != arg2cleanLemmaString &&
112 |             // ensure the args are permissible
113 |             arguments.contains(arg1cleanLemmaString) && arguments.contains(arg2cleanLemmaString) &&
114 |             // ensure the unnormalized relation is not negated
115 |             !negated(relationLemma.split(" "))) {
116 | 
117 |             val lemmas = (arg1cleanLemmas ++ relationLemmas(relcleanLemmaString) ++ arg2cleanLemmas) filterNot lemmaBlacklist
118 | 
119 |             for (i <- 0 until count.toInt) {
120 |               println(Iterable(
121 |                 relcleanLemmaString, 
122 |                 arg1cleanLemmaString, 
123 |                 arg2cleanLemmaString,
124 |                 lemmas.mkString(" "), 
125 |                 arg1String, relationString, arg2String, arg1Postag, relationPostag, arg2Postag).mkString("\t"))
126 |             }
127 |           }
128 |         }
129 |         catch {
130 |           case e => // e.printStackTrace
131 |         }
132 |       }
133 |     }
134 |   }
135 | }
136 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/GroupScoredBy.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.eval
 2 | 
 3 | import java.io.File
 4 | 
 5 | import edu.knowitall.common.Analysis
 6 | 
 7 | import scopt.OptionParser
 8 | 
 9 | /** Group scored extractions by precision and yield. 
10 |   * 
11 |   * @author Michael Schmitz
12 |   */
13 | object GroupScoredBy {
14 |   def main(args: Array[String]) = {
15 |     val parser = new OptionParser("groupscored") {
16 |       var scoredFile: File = _
17 |       var column: Int = 2
18 | 
19 |       arg("scored", "scored extractions", { path: String => scoredFile = new File(path) })
20 |       intOpt("k", "column", "column", { c: Int => column = c })
21 |     }
22 | 
23 |     if (parser.parse(args)) {
24 |       require(parser.column >= 2, "column must be >= 2")
25 | 
26 |       val scores = Score.loadScoredFile(parser.scoredFile)
27 |       val grouped = scores.groupBy(scored => scored.extra(parser.column - 2))
28 | 
29 |       val scored = (for (group <- grouped) yield {
30 |         val title = group._1
31 |         val scoreds = group._2
32 | 
33 |         (group._1, Analysis.precision(scoreds.map(scored => scored.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + scored)))), group._2)
34 |       }).toList.sortBy(tuple => (tuple._2, tuple._3.mkString("\t"))).reverse
35 |       
36 |       scored.foreach { item => 
37 |         println(item._2 + ": " + item._1)
38 |         item._3.sortBy(scored => (scored.confidence, scored.toRow)).iterator.map(_.toRow).foreach(println)
39 |         println()
40 |       }
41 |     }
42 |   }
43 | }
44 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/PrecisionYield.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse.eval
  2 | 
  3 | import java.io.{PrintWriter, File}
  4 | 
  5 | import edu.knowitall.common.Resource.using
  6 | import edu.knowitall.common.Analysis
  7 | 
  8 | import scopt.OptionParser
  9 | 
 10 | /** Compute precision yield point from scored extractions.
 11 |   * 
 12 |   * @author Michael Schmitz
 13 |   */
 14 | object PrecisionYield {
 15 |   abstract class Settings {
 16 |     def scoredFile: File
 17 |     def outputFile: Option[File]
 18 |   }
 19 |   
 20 |   def main(args: Array[String]) = {
 21 |     val settings = new Settings {
 22 |       var scoredFile: File = _
 23 |       var outputFile: Option[File] = None 
 24 |     }
 25 |     
 26 |     val parser = new OptionParser("precyield") {
 27 |       arg("scored", "scored extractions file", { path: String => settings.scoredFile = new File(path) })
 28 |       argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
 29 |     }
 30 | 
 31 |     if (parser.parse(args)) {
 32 |       run(settings)
 33 |     }
 34 |   }
 35 |   
 36 |   def run(settings: Settings) = {
 37 |     val scores = Score.loadScoredFile(settings.scoredFile).sortBy(_.confidence).reverse
 38 |     val input = scores.map(scored => ("%.4f".format(scored.confidence), scored.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + scored))))
 39 | 
 40 |     using {
 41 |       settings.outputFile match {
 42 |         case Some(file) => new PrintWriter(file, "UTF8")
 43 |         case None => new PrintWriter(System.out)
 44 |       }
 45 |     } { writer =>
 46 |       val py = Analysis.precisionYieldMeta(input)
 47 |       val area = Analysis.areaUnderCurve(py.map { case (conf, yld, pr) => (yld, pr) })
 48 |       println("auc: " + area)
 49 |       for ((conf, yld, pr) <- Analysis.precisionYieldMeta(input)) {
 50 |         writer.println(conf + "\t" + yld + "\t" + pr)
 51 |       }
 52 |     }
 53 |   }
 54 | }
 55 | 
 56 | /** Merge precision yield points into a single file, 
 57 |   * usually so they can be graphed together.
 58 |   * 
 59 |   * @author Michael Schmitz
 60 |   */
 61 | object MergePYFiles {
 62 |   abstract class Settings {
 63 |     def files: List[File]
 64 |   }
 65 |   
 66 |   def main(args: Array[String]) {
 67 |     val settings = new Settings {
 68 |       var files: List[File] = Nil
 69 |     }
 70 |     
 71 |     val parser = new OptionParser("mergepy") {
 72 |       arglist("<file>...", "input files", { file: String => settings.files = new File(file) :: settings.files })
 73 |     }
 74 |     
 75 |     if (parser.parse(args)) {
 76 |       run(settings)
 77 |     }
 78 |   }
 79 | 
 80 |   def run(settings: Settings) {
 81 |     val points = for ((file, i) <- settings.files.zipWithIndex) yield {
 82 |       using(io.Source.fromFile(file, "UTF8")) { source =>
 83 |         source.getLines.dropWhile(line => !(line contains "\t")).map { line =>
 84 |           val Array(_, yld, prec) = line.split("\t", -1)
 85 |           (yld.toInt, (i, prec.toDouble))
 86 |         }.toList
 87 |       }
 88 |     }
 89 |     
 90 |     println("\t" + settings.files.map(_.getName).mkString("\t"))
 91 |     points.flatten.sortBy(_._1).reverse.groupBy(_._1).toSeq.sortBy(_._1).reverse foreach { case (grp, seq) => 
 92 |       var vec = Vector.fill[String](settings.files.size)("")
 93 |       seq.foreach { 
 94 |         case (k, (i, v)) => vec = vec updated (i, "%1.4f" format v)
 95 |       }
 96 |       println(grp+"\t"+vec.mkString("\t")) 
 97 |     }
 98 |   }
 99 | }
100 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/RankPatterns.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.eval
 2 | 
 3 | import java.io.{PrintWriter, File}
 4 | 
 5 | import edu.knowitall.common.Resource.using
 6 | 
 7 | import scopt.OptionParser
 8 | 
 9 | /** Read a scored file and rank the patterns by their precision and frequency.
10 |   * 
11 |   * @author Michael Schmitz
12 |   */
13 | object RankPatterns {
14 |   abstract class Settings {
15 |     def scoredFile: File
16 |     def outputFile: Option[File]
17 |   }
18 |   
19 |   def main(args: Array[String]) = {
20 |     val settings = new Settings {
21 |       var scoredFile: File = _
22 |       var outputFile: Option[File] = None 
23 |     }
24 |     
25 |     val parser = new OptionParser("rankpat") {
26 |       var scoredFile: File = _
27 | 
28 |       arg("scored", "scored extractions file", { path: String => settings.scoredFile = new File(path) })
29 |       argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
30 |     }
31 | 
32 |     if (parser.parse(args)) {
33 |       run(settings)
34 |     }
35 |   }
36 |   
37 |   def run(settings: Settings) = {
38 |     val scores = Score.loadScoredFile(settings.scoredFile).sortBy(_.confidence).reverse
39 |     val grouped = scores.groupBy(_.extra(0))
40 |       .mapValues { scoreds =>
41 |         val yld = scoreds.map(scored => if (scored.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + scored))) 1 else 0).sum
42 |         val precision = yld.toDouble / scoreds.size.toDouble
43 |         (precision, scoreds.size)
44 |       }
45 | 
46 |     using {
47 |       settings.outputFile match {
48 |         case Some(file) => new PrintWriter(file, "UTF8")
49 |         case None => new PrintWriter(System.out)
50 |       }
51 |     } { writer =>
52 |       for ((pattern, (p, y)) <- grouped.toSeq.sortBy(_._2).reverse) {
53 |         writer.println(pattern+"\t"+p+"\t"+y)
54 |       }
55 |     }
56 |   }
57 | }
58 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/Score.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse.eval
  2 | 
  3 | import java.io.{PrintWriter, File}
  4 | 
  5 | import scala.io.Source
  6 | 
  7 | import edu.knowitall.common.Resource.using
  8 | 
  9 | import scopt.OptionParser
 10 | 
 11 | /** A main method to annotate extractions,
 12 |   * using a gold set for previously scored extractions.
 13 |   *
 14 |   * @author Michael Schmitz
 15 |   */
 16 | object Score {
 17 |   abstract class Settings {
 18 |     def extractionFile: File
 19 |     def outputFile: File
 20 |     def goldFile: Option[File]
 21 |     def goldOutputFile: Option[File]
 22 |     def confidenceThreshold: Double
 23 |     def skipAll: Boolean
 24 |     def keepSkipped: Boolean
 25 |   }
 26 | 
 27 |   def main(args: Array[String]) = {
 28 |     object settings extends Settings {
 29 |       var extractionFile: File = _
 30 |       var outputFile: File = _
 31 |       var goldFile: Option[File] = None
 32 |       var goldOutputFile: Option[File] = None
 33 |       var confidenceThreshold = 0.0
 34 |       var skipAll = false
 35 |       var keepSkipped = false
 36 |     }
 37 | 
 38 |     val parser = new OptionParser("scoreextr") {
 39 |       arg("extrs", "extractions", { path: String => settings.extractionFile = new File(path) })
 40 |       arg("output", "scored output", { path: String => settings.outputFile = new File(path) })
 41 |       opt("g", "gold", "gold set", { path: String => settings.goldFile = Some(new File(path)) })
 42 |       opt("u", "goldoutput", "output for updated gold set", { path: String => settings.goldOutputFile = Some(new File(path)) })
 43 |       doubleOpt("t", "threshold", "confidence threshold for considered extractions", { x: Double => settings.confidenceThreshold = x })
 44 |       opt("skip-all", "don't prompt for items not in the gold set", { settings.skipAll = true })
 45 |       opt("keep-skipped", "keep unannotated extractions in output file", { settings.keepSkipped = true })
 46 |     }
 47 | 
 48 |     if (parser.parse(args)) {
 49 |       run(settings)
 50 |     }
 51 |   }
 52 | 
 53 |   def run(settings: Settings) {
 54 |     val gold = settings.goldFile match {
 55 |       case None => Map[String, Boolean]()
 56 |       case Some(goldFile) => GoldSet.load(goldFile)
 57 |     }
 58 | 
 59 |     val (scoreds, golden) = using(Source.fromFile(settings.extractionFile, "UTF8")) { source =>
 60 |       score(source.getLines, gold, settings.confidenceThreshold, !settings.skipAll)
 61 |     }
 62 | 
 63 |     // print the scored extractions
 64 |     using(new PrintWriter(settings.outputFile, "UTF8")) { writer =>
 65 |       for (scored <- scoreds.filter(scored => settings.keepSkipped || scored.score.isDefined)) {
 66 |         writer.println(scored.toRow)
 67 |       }
 68 |     }
 69 | 
 70 |     // output updated gold set
 71 |     settings.goldOutputFile match {
 72 |       case Some(file) =>
 73 |         using(new PrintWriter(file, "UTF8")) { writer =>
 74 |           golden.foreach { case (k, v) => writer.println((if (v) 1 else 0) + "\t" + k) }
 75 |         }
 76 |       case None =>
 77 |     }
 78 |   }
 79 | 
 80 |   def loadScoredFile(file: File): Seq[Scored] = {
 81 |     using(Source.fromFile(file, "UTF8")) { source =>
 82 |       source.getLines.map { line =>
 83 |         Scored.fromRow(line)
 84 |       }.toList
 85 |     }
 86 |   }
 87 | 
 88 |   def score(lines: Iterator[String], gold: Map[String, Boolean], confidenceThreshold: Double, prompt: Boolean) = {
 89 |     def stringDistance(s1: String, s2: String): Int = {
 90 |       def minimum(i1: Int, i2: Int, i3: Int) = math.min(math.min(i1, i2), i3)
 91 | 
 92 |       val dist = Array.ofDim[Int](s1.length + 1, s2.length + 1)
 93 | 
 94 |       for (idx <- 0 to s1.length) dist(idx)(0) = idx
 95 |       for (jdx <- 0 to s2.length) dist(0)(jdx) = jdx
 96 | 
 97 |       for (idx <- 1 to s1.length; jdx <- 1 to s2.length)
 98 |         dist(idx)(jdx) = minimum (
 99 |           dist(idx-1)(jdx  ) + 1,
100 |           dist(idx  )(jdx-1) + 1,
101 |           dist(idx-1)(jdx-1) + (if (s1(idx-1) == s2(jdx-1)) 0 else 1)
102 |         )
103 |       dist(s1.length)(s2.length)
104 |     }
105 | 
106 |     def suggest(extr: String) = {
107 |       for {
108 |         k <- gold.keys;
109 |         if stringDistance(k, extr) < extr.length / 2
110 |       } yield ((k, gold(k)))
111 |     }
112 | 
113 |     def promptScore(index: Int, extr: String, confidence: String, rest: Seq[Any]): Option[Boolean] = {
114 |       println()
115 |       System.out.println("Please score " + index + ": " + confidence + ":" + extr + ". (1/y/0/n/skip) ")
116 |       if (rest.length > 0) println(rest.mkString("\t"))
117 |       suggest(extr) foreach { case (k, v) =>
118 |         println("suggest: " + v + "\t" + k)
119 |       }
120 |       readLine match {
121 |         case "0" | "y" => Some(false)
122 |         case "1" | "n" => Some(true)
123 |         case "s" | "skip" => None
124 |         case _ => promptScore(index, extr, confidence, rest)
125 |       }
126 |     }
127 | 
128 |     var golden = gold
129 | 
130 |     val scored = for {
131 |       (line, index) <- lines.zipWithIndex
132 |       val Array(confidence, extr, rest @ _*) = line.split("\t")
133 |       val conf = confidence.toDouble
134 | 
135 |       if (conf >= confidenceThreshold)
136 | 
137 |       val scoreOption = gold.get(extr) match {
138 |         case Some(score) => Some(score)
139 |         case None if prompt => promptScore(index, extr, confidence, rest)
140 |         case None => None
141 |       }
142 |     } yield {
143 |       scoreOption match {
144 |         case Some(score) =>
145 |           // update golden set
146 |           golden += extr -> score
147 |         case None =>
148 |       }
149 | 
150 |       // output
151 |       Scored(scoreOption, conf, extr, rest)
152 |     }
153 | 
154 |     (scored.toList, golden)
155 |   }
156 | }
157 | 
158 | case class Scored(score: Option[Boolean], confidence: Double, extraction: String, extra: Seq[String]) {
159 |   def toRow = (if (!score.isDefined) "" else if (score.get == true) "1" else "0")+"\t"+confidence+"\t"+extraction+"\t"+extra.mkString("\t")
160 | }
161 | 
162 | object Scored {
163 |   def fromRow(row: String) = {
164 |     val parts = row.split("\t")
165 |     val score = parts(0) match {
166 |       case "1" => true
167 |       case "0" => false
168 |       case _ => throw new IllegalArgumentException("must be 1 or 0: " + parts(0))
169 |     }
170 |     val confidence = parts(1).toDouble
171 |     val extraction = parts(2)
172 |     val extra = parts.drop(3)
173 | 
174 |     Scored(Some(score), confidence, extraction, extra)
175 |   }
176 | }
177 | 
178 | object GoldSet {
179 |   def load(file: File) = {
180 |     using(Source.fromFile(file, "UTF8")) { source =>
181 |       source.getLines.map { line =>
182 |         val parts = line.split("\t")
183 |         parts(1) -> (if (parts(0) == "1") true else false)
184 |       }.toMap
185 |     }
186 |   }
187 | 
188 |   def save(gold: Map[String, Boolean], file: File) = {
189 |     using(new PrintWriter(file, "UTF8")) { writer =>
190 |       gold.foreach { case (extr, correct) => writer.println((if (correct) 1 else 0) + "\t" + extr) }
191 |     }
192 |   }
193 | }
194 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/StatisticalSignificance.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse.eval
  2 | 
  3 | import java.io.File
  4 | 
  5 | import edu.knowitall.common.{Random, Analysis}
  6 | 
  7 | import scopt.OptionParser
  8 | 
  9 | /** Compute the statistical significance of scored extractions to a baseline.
 10 |   * 
 11 |   * @author Michael Schmitz
 12 |   */
 13 | object StatisticalSignificance {
 14 |   abstract class Settings {
 15 |     def iterations: Int
 16 |     def systemFile: File
 17 |     def baselineFile: File
 18 |   }
 19 | 
 20 |   def main(args: Array[String]) {
 21 |     object settings extends Settings {
 22 |       var systemFile: File = _
 23 |       var baselineFile: File = _
 24 |       var iterations: Int = 1000
 25 |     }
 26 | 
 27 |     val parser = new OptionParser("statsig") {
 28 |       arg("system", "scored extractions from the new system", { path: String => settings.systemFile = new File(path) })
 29 |       arg("baseline", "scored extractions from the baseline system", { path: String => settings.baselineFile = new File(path) })
 30 |       intOpt("i", "iterations", "number of iterations", { n: Int => settings.iterations = n })
 31 |     }
 32 | 
 33 |     if (parser.parse(args)) {
 34 |       run(settings)
 35 |     }
 36 |   }
 37 | 
 38 |   /**
 39 |     * Uses the bootstrap test for statistical significance.  
 40 |     * This is described in the following paper:
 41 |     *
 42 |     *    http://maroo.cs.umass.edu/pub/web/getpdf.php?id=744
 43 |     *
 44 |     * Note that this function is agnostic to the order of 
 45 |     * `system` and `baseline`.
 46 |     * 
 47 |     * @param  system  a metric for the system, i.e. AUC
 48 |     * @param  baseline  a metric for the baseline, i.e. AUC
 49 |     * @param  sample  a lambda that resamples the systems, returning the metric, i.e. AUC
 50 |     * @param  iterations  the number of iterations
 51 |     */
 52 |   def bootstrapTestWithMetric(system: Double, 
 53 |       baseline: Double, 
 54 |       sample: ()=>(Double, Double), 
 55 |       iterations: Int) = {
 56 |     val difference = math.abs(system - baseline)
 57 |     val sampled = for (i <- 0 until iterations) yield (sample())
 58 |     val differences = sampled.map { case (sys, base) => math.abs(sys - base) }
 59 |     val average = differences.sum / differences.size.toDouble
 60 |     val normed = differences.map(_ - average)
 61 |     val pscore = normed.count(_ >= difference).toDouble / normed.size.toDouble
 62 | 
 63 |     pscore
 64 |   }
 65 | 
 66 |   /**
 67 |     * Uses the bootstrap test for statistical significance.  
 68 |     * This is described in the following paper:
 69 |     *
 70 |     *    http://maroo.cs.umass.edu/pub/web/getpdf.php?id=744
 71 |     *
 72 |     * Note that this function is agnostic to the order of 
 73 |     * `system` and `baseline`.
 74 |     * 
 75 |     * @param  system  a metric for the system, i.e. AUC
 76 |     * @param  baseline  a metric for the baseline, i.e. AUC
 77 |     * @param  sample  a lambda that resamples the systems, returning the metric, i.e. AUC
 78 |     * @param  iterations  the number of iterations
 79 |     */
 80 |   def bootstrapTestWithScores(system: Seq[Boolean], 
 81 |     baseline: Seq[Boolean], 
 82 |     metric: Seq[Boolean]=>Double, 
 83 |     iterations: Int, rand: util.Random) = {
 84 | 
 85 |     def sample(extrs: Seq[Boolean]) = 
 86 |       metric(extrs.map(extr=>Random.choose(extrs, extrs.size, rand)))
 87 | 
 88 |     bootstrapTestWithMetric(metric(system), metric(baseline), 
 89 |         ()=>(sample(system), sample(baseline)), iterations)
 90 |   }
 91 | 
 92 |   def run(settings: Settings) {
 93 |     val rand = new util.Random
 94 | 
 95 |     def areaUnderCurve(scoreds: Seq[Scored]) = {
 96 |       val points = Analysis.precisionYieldMeta(scoreds.map(extr => (extr.confidence, extr.score.get)))
 97 |       Analysis.areaUnderCurve(points.map { case (conf, yld, prc) => (yld, prc) })
 98 |     }
 99 | 
100 |     val systemExtractionsAll: Seq[Scored] =
101 |       Score.loadScoredFile(settings.systemFile).sortBy(-_.confidence)
102 |     val baselineExtractionsAll: Seq[Scored] = 
103 |       Score.loadScoredFile(settings.baselineFile).sortBy(-_.confidence)
104 | 
105 |     val sentences = (systemExtractionsAll.map(_.extra(0)).toSet ++ baselineExtractionsAll.map(_.extra(0)).toSet).toSeq.take(50).toSet
106 | 
107 |     val systemExtractions = systemExtractionsAll.filter(extr => sentences.contains(extr.extra(0)))
108 |     val baselineExtractions = baselineExtractionsAll.filter(extr => sentences.contains(extr.extra(0)))
109 | 
110 |     def sample(): (Double, Double) = {
111 |       def helper(extrs: Seq[Scored]) = {
112 |         val sent = sentences.map(extr=>Random.choose(sentences, sentences.size, rand))
113 |         val set = sent.flatMap(sent => extrs.filter(sent == _.extra(0))).toSeq.sortBy(_.confidence)
114 |         val auc = areaUnderCurve(set)
115 |         auc
116 |       }
117 | 
118 |       (helper(systemExtractions), helper(baselineExtractions))
119 |     }
120 | 
121 |     val pscore = bootstrapTestWithMetric(
122 |       areaUnderCurve(systemExtractions),
123 |       areaUnderCurve(baselineExtractions),
124 |       sample, settings.iterations)
125 | 
126 |     println(pscore)
127 |   }
128 | }
129 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/Extraction.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse.extract
  2 | 
  3 | import scala.collection.{SortedSet, Set}
  4 | import edu.knowitall.collection.immutable.graph.pattern.Match
  5 | import edu.knowitall.collection.immutable.graph.{Direction, Graph, DirectedEdge}
  6 | import edu.knowitall.collection.immutable.Interval
  7 | import edu.knowitall.openparse.GraphExpansions.{expand, components, augment}
  8 | import edu.knowitall.openparse.OpenParse
  9 | import edu.knowitall.tool.parse.graph.{DependencyPattern, DependencyNode, DependencyGraph}
 10 | import edu.knowitall.ollie.Ollie.stemmer
 11 | import edu.knowitall.tool.stem.Stemmer
 12 | import Extraction.{Part, ClausalComponent, AdverbialModifier}
 13 | import edu.knowitall.tool.parse.graph.LabelEdgeMatcher
 14 | import edu.knowitall.collection.immutable.graph.pattern.DirectedEdgeMatcher
 15 | 
 16 | /** A representation of an OpenParse extraction.
 17 |   *
 18 |   * @author Michael Schmitz
 19 |   */
 20 | abstract class Extraction(val relLemmas: Set[String]) {
 21 |   /** the text of the first argument */
 22 |   def arg1Text: String
 23 |   /** the text of the relation */
 24 |   def relText: String
 25 |   /** the text of the second argument */
 26 |   def arg2Text: String
 27 | 
 28 |   def this(relText: String) = this(relText.split(" ").map(implicitly[Stemmer].lemmatize(_)).toSet -- OpenParse.LEMMA_BLACKLIST)
 29 | 
 30 |   override def equals(that: Any) = that match {
 31 |     case that: Extraction => (that canEqual this) && that.arg1Text == this.arg1Text && that.relText == this.relText && that.arg2Text == this.arg2Text
 32 |     case _ => false
 33 |   }
 34 |   def canEqual(that: Any) = that.isInstanceOf[Extraction]
 35 |   override def hashCode = arg1Text.hashCode + 39 * (relText.hashCode + 39 * arg2Text.hashCode)
 36 | 
 37 |   override def toString() = Iterable(arg1Text, relText, arg2Text).mkString("(", "; ", ")")
 38 | 
 39 |   def softMatch(that: Extraction) =
 40 |     (that.arg1Text.contains(this.arg1Text) || this.arg1Text.contains(that.arg1Text)) &&
 41 |       this.relLemmas == that.relLemmas &&
 42 |       (that.arg2Text.contains(this.arg2Text) || this.arg2Text.contains(that.arg2Text))
 43 | }
 44 | 
 45 | /** A simple representation of an OpenParse extraction.
 46 |   *
 47 |   * @author Michael Schmitz
 48 |   */
 49 | class SimpleExtraction(
 50 |   override val arg1Text: String,
 51 |   override val relText: String,
 52 |   relLemmas: Set[String],
 53 |   override val arg2Text: String)
 54 |   extends Extraction(relLemmas) {
 55 | 
 56 |   def this(arg1Text: String, relText: String, arg2Text: String) = this(arg1Text,
 57 |     relText,
 58 |     relText.split(" ").map(implicitly[Stemmer].lemmatize(_)).toSet -- OpenParse.LEMMA_BLACKLIST,
 59 |     arg2Text)
 60 | 
 61 |   def replaceRelation(relation: String) =
 62 |     new SimpleExtraction(this.arg1Text, this.relText, this.relLemmas, this.arg2Text)
 63 | }
 64 | 
 65 | /** A more informative representation of an OpenParse extraction.
 66 |   *
 67 |   * @author Michael Schmitz
 68 |   */
 69 | class DetailedExtraction(
 70 |   val extractor: PatternExtractor,
 71 |   val `match`: Match[DependencyNode],
 72 |   val arg1: Part,
 73 |   val rel: Part,
 74 |   val arg2: Part,
 75 |   val clausal: Option[ClausalComponent] = None,
 76 |   val modifier: Option[AdverbialModifier] = None)
 77 |   extends Extraction(rel.text) {
 78 | 
 79 |   override def arg1Text = arg1.text
 80 |   override def relText = rel.text
 81 |   override def arg2Text = arg2.text
 82 | 
 83 |   def this(extractor: PatternExtractor, mch: Match[DependencyNode],
 84 |     arg1Nodes: SortedSet[DependencyNode],
 85 |     relNodes: SortedSet[DependencyNode],
 86 |     arg2Nodes: SortedSet[DependencyNode]) =
 87 |     this(extractor, mch, new Part(arg1Nodes), new Part(relNodes), new Part(arg2Nodes))
 88 | 
 89 |   /** all the nodes in this extraction */
 90 |   def nodes = arg1.nodes ++ rel.nodes ++ arg2.nodes
 91 | 
 92 |   /** all the edges in this extraction */
 93 |   def edges = `match`.bipath.path
 94 | 
 95 |   def replaceRelation(relation: String) =
 96 |     new DetailedExtraction(extractor, `match`, this.arg1, Part(this.rel.nodes, relation), this.arg2, this.clausal, this.modifier)
 97 | }
 98 | 
 99 | object DetailedExtraction {
100 |   def nodesToString(nodes: Iterable[DependencyNode]) = nodes.iterator.map(_.text).mkString(" ")
101 | }
102 | 
103 | 
104 | /** Includes logic for expanding relations and arguments.
105 |   *
106 |   * @author Michael Schmitz
107 |   */
108 | object Extraction {
109 |   /** Representation of a part of an extraction.
110 |     *
111 |     * @author Michael Schmitz
112 |     */
113 |   case class Part(nodes: SortedSet[DependencyNode], text: String) {
114 |     def this(nodes: SortedSet[DependencyNode]) = {
115 |       this(nodes, DetailedExtraction.nodesToString(nodes))
116 |     }
117 | 
118 |     def this(nodes: Iterable[DependencyNode]) = {
119 |       this(SortedSet[DependencyNode]() ++ nodes, DetailedExtraction.nodesToString(nodes))
120 |     }
121 | 
122 |     def span = Interval.span(nodes.map(_.indices))
123 |   }
124 |   object Part {
125 |     def connections(m: Match[DependencyNode], node: DependencyNode): Set[Graph.Edge[DependencyNode]] = {
126 |       m.edges.filter(edge => edge.source == node || edge.dest == node).toSet
127 |     }
128 | 
129 |     def connections(m: Match[DependencyNode], nodes: Set[DependencyNode]): Set[Graph.Edge[DependencyNode]] = {
130 |       m.edges.filter(edge => nodes.contains(edge.source) || nodes.contains(edge.dest)).toSet
131 |     }
132 | 
133 |     def connections(m: Match[DependencyNode], nodes: Seq[DependencyNode]): Set[Graph.Edge[DependencyNode]] = {
134 |       m.edges.filter(edge => nodes.contains(edge.source) || nodes.contains(edge.dest)).toSet
135 |     }
136 |   }
137 |   case class ClausalComponent(rel: Part, arg: Part) {
138 |     def text = arg.text + " " + rel.text
139 |   }
140 |   case class AdverbialModifier(contents: Part) {
141 |     def text = contents.text
142 |   }
143 | 
144 |   private val attributionPattern = DependencyPattern.deserialize("{old} <ccomp< {rel} >nsubj> {arg}")
145 |   private val conditionalPattern = DependencyPattern.deserialize("{old} <ccomp< {rel} >nsubj> {arg}")
146 |   def fromMatch(expand: Boolean)(graph: DependencyGraph, m: Match[DependencyNode], ex: PatternExtractor): Iterable[DetailedExtraction] = {
147 |     def clausalComponent(node: DependencyNode, until: Set[DependencyNode]) = {
148 |       attributionPattern.apply(graph.graph, node) match {
149 |         case List(m) =>
150 |           assume(m.nodeGroups.get("rel").isDefined)
151 |           assume(m.nodeGroups.get("arg").isDefined)
152 | 
153 |           val rel = m.nodeGroups("rel").node
154 |           val arg = m.nodeGroups("arg").node
155 | 
156 |           val Part(expandedRelNodes, expandedRelText) = expandRelation(graph, rel, until + arg).head
157 |           val expandedArg = expandArgument(graph, arg, until + rel)
158 | 
159 |           Some(ClausalComponent(Part(expandedRelNodes, expandedRelText), Part(expandedArg, DetailedExtraction.nodesToString(expandedArg))))
160 |         case _ => None
161 |       }
162 |     }
163 | 
164 |     def adverbialModifier(node: DependencyNode, until: Set[DependencyNode]): Option[AdverbialModifier] = {
165 |       val neighbors = graph.graph.neighbors(node, dedge => dedge.dir == Direction.Down && dedge.edge.label == "advcl")
166 |       val nodes = neighbors.flatMap(graph.graph.inferiors(_))
167 |       if (nodes.isEmpty) None
168 |       else {
169 |         val span = Interval.span(nodes.map(_.indices))
170 |         val clause = graph.nodes.filter(node => span.superset(node.indices))
171 |         Some(AdverbialModifier(Part(clause, DetailedExtraction.nodesToString(clause))))
172 |       }
173 |     }
174 | 
175 |     val groups = m.nodeGroups
176 | 
177 |     val rels = groups.filter(_._1 startsWith "rel").toSeq.sortBy(_._1).map(_._2.node)
178 |     if (rels.isEmpty) (throw new IllegalArgumentException("no rel: " + m))
179 |     val arg1 = groups.get("arg1").map(_.node) getOrElse (throw new IllegalArgumentException("no arg1: " + m))
180 |     val arg2 = groups.get("arg2").map(_.node) getOrElse (throw new IllegalArgumentException("no arg2: " + m))
181 | 
182 |     val expandedArg1 = if (expand) expandArgument(graph, arg1, rels.toSet) else SortedSet(arg1)
183 |     val expandedArg2 = if (expand) expandArgument(graph, arg2, rels.toSet) else SortedSet(arg2)
184 |     val expandRels =
185 |       // hack to exclude rel rel extractions with a second nsubj
186 |       if (rels.size > 0 && rels.tail.exists(rel => graph.graph.dedges(rel).exists(dedge => dedge.dir == Direction.Down && dedge.edge.label == "nsubj"))) {
187 |         Set.empty
188 |       }
189 |       else if (expand) {
190 |         import scalaz._
191 |         import Scalaz._
192 | 
193 |         val expansions = rels.map(rel => expandRelation(graph, rel, expandedArg1 ++ expandedArg2).toList).toList.sequence
194 | 
195 |         expansions.map(expansion => Part(expansion.map(_.nodes).reduce(_ ++ _), expansion.map(_.text).mkString(" ")))
196 |       } else {
197 |         Set(Part(SortedSet.empty[DependencyNode] ++ rels, rels.map(_.text).mkString(" ")))
198 |       }
199 | 
200 |     for {
201 |       Part(expandedRelNodes, expandedRelText) <- expandRels
202 |       val nodes = expandedArg1 ++ expandedArg2 ++ expandedRelNodes
203 |       val clausal = rels.flatMap(rel => clausalComponent(rel, nodes)).headOption
204 |       val modifier = rels.flatMap(rel => adverbialModifier(rel, nodes)).headOption
205 | 
206 |       // arguments don't overlap
207 |       if (!(Interval.span(expandedArg1.map(_.indices)(scala.collection.breakOut)) intersects Interval.span(expandedArg2.map(_.indices)(scala.collection.breakOut))))
208 |     } yield (
209 |       new DetailedExtraction(ex, m, new Part(expandedArg1), Part(expandedRelNodes, expandedRelText), new Part(expandedArg2), clausal = clausal, modifier = modifier)
210 |     )
211 | 
212 |   }
213 | 
214 |   private val argumentExpansionLabels = Set("det", "prep_of", "amod", "num", "number", "nn", "poss", "quantmod", "neg")
215 |   def expandArgument(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode]): SortedSet[DependencyNode] = {
216 |     def expandNode(node: DependencyNode) = {
217 |       val expansion = expand(graph, node, until, argumentExpansionLabels)
218 |       if (expansion.exists(_.isProperNoun)) expansion
219 |       else expansion ++ components(graph, node, Set("rcmod", "infmod", "partmod", "ref", "prepc_of"), until, false).flatten
220 |     }
221 | 
222 |     // expand over any conjunction/disjunction edges to non-verbs
223 |     val nodes = graph.graph.connected(node, (dedge: DirectedEdge[DependencyNode]) =>
224 |       !(dedge.end.postag startsWith "VB") && (dedge.edge.label == "conj_and" || dedge.edge.label == "conj_or"))
225 | 
226 |     if (nodes.size == 1) {
227 |       // there are no conjunctive edges
228 |       expandNode(node)
229 |     }
230 |     else {
231 |       val flat = nodes.map(expandNode).flatten
232 |       val span = Interval.span(flat.map(_.indices).toSeq)
233 |       // take the nodes that cover all the nodes found
234 |       graph.nodes.filter(node => span.superset(node.indices))
235 |     }
236 |   }
237 | 
238 |   /** Expand the relation nodes of a match.
239 |     *
240 |     * Multiple parts can be returned if there are multiple dobj or iobjs.
241 |     *
242 |     * @return  parts  the part (or multiple parts) that describes the relation
243 |     */
244 |   def expandRelation(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode]): Set[Part] = {
245 |     // count the adjacent dobj edges.  We will only expand across
246 |     // dobj components if there is exactly one adjacent dobj edge.
247 |     // This edge may already be used, but in that case we won't
248 |     // expand over it because of the until set.
249 |     val dobjCount = graph.graph.edges(node).count(_.label == "dobj")
250 |     val iobjCount = graph.graph.edges(node).count(_.label == "iobj")
251 | 
252 |     var attachLabels = Set[String]()
253 |     if (dobjCount == 1) attachLabels += "dobj"
254 |     if (iobjCount == 1) attachLabels += "iobj"
255 | 
256 |     /*
257 |      * acomp: "She looks beautiful on Thursday."
258 |      */
259 |     def pred(edge: Graph.Edge[DependencyNode]) =
260 |       // make sure we don't re-add the relation node
261 |       edge.dest != node && (
262 |           // attach adverbs
263 |           edge.label == "advmod" && edge.dest.postag == "RB" ||
264 |           edge.label == "aux" || edge.label == "cop" || edge.label == "auxpass" || edge.label == "prt" || edge.label == "acomp")
265 | 
266 |     // expand across noun label for relational nouns
267 |     // i.e. "He is the *best* president of the USA"
268 |     val expandNounLabels =
269 |       if (node.postag startsWith "NN") expand(graph, node, until, argumentExpansionLabels)
270 |       else expand(graph, node, until, Set("det", "amod", "num", "number", "nn", "poss", "quantmod", "neg"))
271 | 
272 |     // modifiers on copulars are stored on a different node
273 |     // i.e. in "he *will* be the president"
274 |     val cops = graph.graph.predecessors(node, (e: Graph.Edge[DependencyNode])=>e.label == "cop").headOption
275 |     val expandCopLabels = cops.map(cop => augment(graph, cop, until, pred)).getOrElse(List.empty)
276 | 
277 |     def f(s: Set[List[DependencyNode]]): Set[List[DependencyNode]] =
278 |       if (s.isEmpty) Set(List())
279 |       else s
280 |     val dobjs = f(components(graph, node, Set("dobj"), until, true))
281 |     val iobjs = f(components(graph, node, Set("iobj"), until, true))
282 | 
283 |     for (dobj <- dobjs; iobj <- iobjs) yield {
284 |       val expansion = expandCopLabels ++ (expandNounLabels ::
285 |         // make sure that we don't use a label that was
286 |         // already captured by expandNounlabels.  This
287 |         // can happen when a verb edges goes between two
288 |         // noun labels.
289 |         ((augment(graph, node, until, pred).map(_ -- expandNounLabels)) :+
290 |           // add subcomponents
291 |           (SortedSet[DependencyNode]() ++ dobj) :+
292 |           (SortedSet[DependencyNode]() ++ iobj)).filterNot { c =>
293 |             // don't add empty components
294 |             c.isEmpty ||
295 |               // don't add components with just "who" or "whom"
296 |               c.size == 1 && c.headOption.map(_.postag == "WP").getOrElse(false)
297 |           })
298 | 
299 |       val sorted = expansion.sortBy(nodes => Interval.span(nodes.map(_.indices)))
300 | 
301 |       // perform a more complicated node->text transformation
302 |       val texts = sorted.map(DetailedExtraction.nodesToString(_))
303 |       Part(expansion.reduce(_ ++ _), texts.mkString(" "))
304 |     }
305 |   }
306 | }
307 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/GeneralExtractor.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.extract
 2 | 
 3 | import org.slf4j.LoggerFactory
 4 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match}
 5 | import edu.knowitall.collection.immutable.graph.Graph
 6 | import edu.knowitall.tool.parse.graph.{DependencyPattern, DependencyNode, DependencyGraph}
 7 | import edu.knowitall.ollie.Ollie.stemmer
 8 | import GeneralExtractor.logger
 9 | import edu.knowitall.openparse.ExtractorPattern
10 | 
11 | /** An extractor that is purely specified by a pattern.
12 |   *
13 |   * @param  pattern  the pattern to extract
14 |   * @param  conf  the confidence of this extractor
15 |   *
16 |   * @author Michael Schmitz
17 |   */
18 | class GeneralExtractor(pattern: ExtractorPattern, val conf: Double) extends PatternExtractor(pattern) {
19 |   import GeneralExtractor._
20 | 
21 |   def this(pattern: Pattern[DependencyNode], conf: Double) =
22 |     this(new ExtractorPattern(pattern), conf)
23 | 
24 |   protected def extractWithMatches(dgraph: DependencyGraph)(implicit
25 |     buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
26 |     validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean) = {
27 | 
28 |     // apply pattern and keep valid matches
29 |     val matches = pattern(dgraph.graph)
30 |     if (!matches.isEmpty && logger.isDebugEnabled) logger.debug("matches: " + matches.mkString(", "))
31 | 
32 |     val filtered = matches.filter(validMatch(dgraph.graph))
33 |     if (!filtered.isEmpty && logger.isDebugEnabled) logger.debug("filtered: " + filtered.mkString(", "))
34 | 
35 |     for (m <- filtered; extr <- buildExtraction(dgraph, m, this)) yield {
36 |       (extr, m)
37 |     }
38 |   }
39 | 
40 |   override def extract(dgraph: DependencyGraph)(implicit
41 |     buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
42 |     validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean) = {
43 |     logger.debug("pattern: " + pattern)
44 | 
45 |     val extractions = this.extractWithMatches(dgraph).map(_._1)
46 |     if (!extractions.isEmpty) logger.debug("extractions: " + extractions.mkString(", "))
47 | 
48 |     extractions
49 |   }
50 | 
51 |   override def confidence(extr: Extraction): Double = {
52 |     this.conf
53 |   }
54 | 
55 |   /** A maximum confidence for any extraction from this extractor.
56 |     * This is used for optimization.  If the minimum confidence is
57 |     * larger than the threshold, we don't need to run this extractor. */
58 |   override def maximumConfidence: Double = this.conf
59 | }
60 | 
61 | case object GeneralExtractor extends PatternExtractorType {
62 |   val logger = LoggerFactory.getLogger(this.getClass)
63 | 
64 |   def fromLines(lines: Iterator[String]): List[GeneralExtractor] = {
65 |     val patterns: List[(Pattern[DependencyNode], Int)] = lines.map { line =>
66 |         line.split("\t") match {
67 |           // full information specified
68 |           case Array(pat, count) => (DependencyPattern.deserialize(pat), count.toInt)
69 |           // assume a count of 1 if nothing is specified
70 |           case Array(pat) => logger.warn("warning: pattern has no count: " + pat); (DependencyPattern.deserialize(pat), 1)
71 |           case _ => throw new IllegalArgumentException("line must have one or two columns: " + line)
72 |         }
73 |       }.toList
74 | 
75 |     (for ((p, conf) <- patterns) yield {
76 |       new GeneralExtractor(new ExtractorPattern(p), conf.toDouble)
77 |     }).toList
78 |   }
79 | }
80 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/PatternExtractor.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.extract
 2 | 
 3 | import java.io.File
 4 | import scala.io.Source
 5 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match}
 6 | import edu.knowitall.collection.immutable.graph.Graph
 7 | import edu.knowitall.common.Resource.using
 8 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}
 9 | import javax.naming.OperationNotSupportedException
10 | import edu.knowitall.collection.immutable.graph.pattern.CaptureNodeMatcher
11 | import edu.knowitall.openparse.ExtractorPattern
12 | 
13 | /** An superclass for extractors based on patterns.
14 |   *
15 |   * @param  pattern  the pattern to extract
16 |   *
17 |   * @author Michael Schmitz
18 |   */
19 | abstract class PatternExtractor(val pattern: ExtractorPattern) {
20 |   def extract(dgraph: DependencyGraph)(implicit
21 |     buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
22 |     validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean): Iterable[DetailedExtraction]
23 |   def confidence(extr: Extraction): Double
24 | 
25 |   /** A maximum confidence for any extraction from this extractor.
26 |     * This is used for optimization.  If the minimum confidence is
27 |     * larger than the threshold, we don't need to run this extractor.
28 |     */
29 |   def maximumConfidence: Double
30 | 
31 |   override def toString = pattern.toString
32 | 
33 |   def tabSerialize: String = throw new OperationNotSupportedException()
34 | 
35 |   def prepMismatch: Boolean = false
36 | }
37 | 
38 | object PatternExtractor {
39 |   def tabDeserialize(seq: Seq[String]): (PatternExtractor, Seq[String]) = {
40 |     seq(0).toLowerCase match {
41 |       case "template" => TemplateExtractor.tabDeserialize(seq.drop(1))
42 |     }
43 |   }
44 | }
45 | 
46 | abstract class PatternExtractorType {
47 |   def fromFile(file: File): Seq[PatternExtractor] = {
48 |     using (Source.fromFile(file, "UTF8")) { source =>
49 |       fromLines(source.getLines)
50 |     }
51 | 
52 |   }
53 |   def fromLines(lines: Iterator[String]): Seq[PatternExtractor]
54 | 
55 |   def name = this.getClass.getSimpleName
56 | }
57 | 
58 | object PatternExtractorType {
59 |   def apply(string: String) = string match {
60 |     case "general" => GeneralExtractor
61 |     case "template" => TemplateExtractor
62 |     case "specific" => SpecificExtractor
63 |     case _ => throw new IllegalArgumentException("unknown extractor: " + string)
64 |   }
65 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/SpecificExtractor.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.extract
 2 | 
 3 | import scala.Array.canBuildFrom
 4 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match}
 5 | import edu.knowitall.collection.immutable.graph.Graph
 6 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}
 7 | import edu.knowitall.tool.stem.MorphaStemmer
 8 | import edu.knowitall.openparse.ExtractorPattern
 9 | 
10 | /** An extractor that is specified only with a pattern
11 |   * but only works for specific relation lemmas.
12 |   *
13 |   * @param  relation  the resulting relation string
14 |   * @param  relationLemmas  the acceptible matched lemmas
15 |   * @param  pattern  the pattern to extract
16 |   * @param  conf  the confidence of this extractor
17 |   *
18 |   * @author Michael Schmitz
19 |   */
20 | class SpecificExtractor(val relation: String,
21 |   val relationLemmas: List[String],
22 |   pattern: ExtractorPattern, conf: Double)
23 | extends GeneralExtractor(pattern, conf) {
24 | 
25 |   def this(relation: String, relationLemmas: List[String], pattern: Pattern[DependencyNode], conf: Double) =
26 |     this(relation, relationLemmas, new ExtractorPattern(pattern), conf)
27 | 
28 |   override def extract(dgraph: DependencyGraph)(implicit
29 |     buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
30 |     validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean) = {
31 |     val extractions = super.extract(dgraph)
32 |     extractions.withFilter{ extr =>
33 |       val extrRelationLemmas = extr.rel.text.split(" ").map(MorphaStemmer.lemmatize(_))
34 |       relationLemmas.forall(extrRelationLemmas.contains(_))
35 |     }.map(_.replaceRelation(relation))
36 |   }
37 | }
38 | 
39 | case object SpecificExtractor extends PatternExtractorType {
40 |   def fromLines(lines: Iterator[String]) = throw new UnsupportedOperationException
41 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/TemplateExtractor.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse.extract
  2 | 
  3 | import scala.util.matching.Regex
  4 | import org.slf4j.LoggerFactory
  5 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match}
  6 | import edu.knowitall.collection.immutable.graph.Graph
  7 | import edu.knowitall.tool.parse.graph.{DependencyPattern, DependencyNode, DependencyGraph}
  8 | import Template.group
  9 | import edu.knowitall.ollie.Ollie.stemmer
 10 | import edu.knowitall.tool.postag.Postagger
 11 | import edu.knowitall.tool.parse.graph.RegexEdgeMatcher
 12 | import edu.knowitall.tool.parse.graph.LabelEdgeMatcher
 13 | import edu.knowitall.openparse.ExtractorPattern
 14 | 
 15 | /** An extractor that is specified by a pattern and a template.
 16 |   * the template can add a "to be" and/or preposition word around
 17 |   * the relation.  It can also change the preposition word to another
 18 |   * preposition (i.e., switch "of" to "in").
 19 |   *
 20 |   * @param  template  a template in which to put the relation words
 21 |   * @param  pattern  the pattern to extract
 22 |   * @param  conf  the confidence of this extractor
 23 |   *
 24 |   * @author Michael Schmitz
 25 |   */
 26 | class TemplateExtractor(val template: Template, pattern: ExtractorPattern, conf: Double)
 27 | extends GeneralExtractor(pattern, conf) {
 28 | 
 29 |   def this(template: Template, pattern: Pattern[DependencyNode], conf: Double) =
 30 |     this(template, new ExtractorPattern(pattern), conf)
 31 | 
 32 |   override def extract(dgraph: DependencyGraph)(implicit
 33 |     buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
 34 |     validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean) = {
 35 | 
 36 |     val extractions = super.extractWithMatches(dgraph)
 37 | 
 38 |     extractions.map{ case (extr, m) => template(extr, dgraph, m) }
 39 |   }
 40 | 
 41 |   override def tabSerialize = Iterable("Template", template.serialize, pattern.serialize, conf.toString).mkString("\t")
 42 | 
 43 |   override def prepMismatch: Boolean = {
 44 |     val trailingPrep = TemplateExtractor.trailingPreposition.findFirstIn(template.serialize)
 45 |     val lastPatternPrep = pattern.baseEdgeMatchers.flatMap {
 46 |       case m: RegexEdgeMatcher if m.labelRegex == new Regex("""prep_(.*)""") => Some("{prep}")
 47 |       case m: LabelEdgeMatcher if m.label startsWith "prep_" => Some(m.label.drop(5))
 48 |       case _ => None
 49 |     }.lastOption
 50 | 
 51 |     trailingPrep == lastPatternPrep
 52 |   }
 53 | }
 54 | 
 55 | case object TemplateExtractor extends PatternExtractorType {
 56 |   val logger = LoggerFactory.getLogger(this.getClass)
 57 | 
 58 |   private val trailingPreposition = new Regex("\\s(?:" + Postagger.prepositions.mkString("|") + "|\\{prep\\})$")
 59 | 
 60 |   override def fromLines(lines: Iterator[String]): List[PatternExtractor] = {
 61 |     val patterns: List[(Template, Pattern[DependencyNode], Double)] = lines.map { line =>
 62 |       line.split("\t") match {
 63 |         // full information specified
 64 |         case Array(template, pat, conf) =>
 65 |           (Template.deserialize(template), DependencyPattern.deserialize(pat), conf.toDouble)
 66 |         // assume a count of 1 if nothing is specified
 67 |         case Array(template, pat) =>
 68 |           logger.warn("warning: pattern has no confidence: " + pat);
 69 |           (Template.deserialize(template), DependencyPattern.deserialize(pat), 1.0)
 70 |         case _ => throw new IllegalArgumentException("line must have two or three columns: " +line)
 71 |       }
 72 |     }.toList
 73 | 
 74 |     val maxCount = patterns.maxBy(_._3)._3
 75 |     (for ((template, pattern, conf) <- patterns) yield {
 76 |       new TemplateExtractor(template, new ExtractorPattern(pattern), conf)
 77 |     }).toList
 78 |   }
 79 | 
 80 |   def tabDeserialize(string: String) = {
 81 |     val parts = string.split("\t")
 82 |   }
 83 | 
 84 |   def tabDeserialize(parts: Seq[String]): (TemplateExtractor, Seq[String]) = {
 85 |     val Seq(templateString, patternString, confString, rest @ _*) = parts
 86 | 
 87 |     val template = Template.deserialize(templateString)
 88 |     val pattern = new ExtractorPattern(DependencyPattern.deserialize(patternString))
 89 |     val conf = confString.toDouble
 90 | 
 91 |     (new TemplateExtractor(template, pattern, conf), rest)
 92 |   }
 93 | }
 94 | 
 95 | case class Template(template: String, be: Boolean) {
 96 |   import Template._
 97 |   def apply(extr: DetailedExtraction, dgraph: DependencyGraph, m: Match[DependencyNode]) = {
 98 |     def matchGroup(name: String): String = name match {
 99 |       case "rel" => extr.relText
100 |       case "arg1" => extr.arg1Text
101 |       case "arg2" => extr.arg2Text
102 |       case _ => m.groups(name).text
103 |     }
104 | 
105 |     // don't add the be if we attach a verb using a cop, aux, or auxpass edge.
106 |     // there are a lot of examples where adding "be" makes it very messy
107 |     //     "She has practiced law, with Foo, Bar."
108 |     //     don't want: (Bar; be has practiced with; Foo)
109 |     // This is somewhat of a hack that makes bad patterns look less bad.
110 |     val prefix = if (be &&
111 |         !(dgraph.graph.neighbors(m.nodeGroups.getOrElse("rel", m.nodeGroups("rel1")).node, dedge => (dedge.edge.label startsWith "aux") || dedge.edge.label == "cop") filter (_.postag startsWith "VB") exists (neighbor => extr.rel.nodes contains neighbor))) {
112 |       "be"
113 |     }
114 |     else ""
115 | 
116 |     // pull out the modals because they must preceed the prefix
117 |     // also include "to"
118 |     val modals = extr.rel.nodes.filter(node => (node.postag startsWith "MD") ||
119 |         (node.postag == "TO"))
120 | 
121 |     // horrible escape is required.  See JavaDoc for Match.replaceAll
122 |     // or https://issues.scala-lang.org/browse/SI-5437
123 |     var rel = group.replaceAllIn(template, (gm: Regex.Match) => matchGroup(gm.group(1))
124 |       .replaceAll("_", " ")
125 |       .replaceAll("""\\""", """\\\\""")
126 |       .replaceAll("""\$""", """\\\$"""))
127 | 
128 |     if (!prefix.isEmpty) {
129 |       if (modals.isEmpty) {
130 |         rel = prefix + " " + rel
131 |       } else {
132 |         val regex = new Regex("(^.*\\b(?:" + modals.iterator.map(_.text).mkString("|") + "))\\b")
133 |         rel = regex.replaceAllIn(rel, "$1 " + prefix)
134 |       }
135 |     }
136 | 
137 |     extr.replaceRelation(rel)
138 |   }
139 | 
140 |   override def toString = (if (be) "be " else "") + template
141 | 
142 |   def serialize = this.toString
143 | }
144 | 
145 | object Template {
146 |   val group = """\{(.*?)}""".r
147 |   def deserialize(string: String) = {
148 |     if (string.startsWith("be ")) {
149 |       Template(string.drop(3), true)
150 |     }
151 |     else {
152 |       Template(string, false)
153 |     }
154 |   }
155 | }
156 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/template/CountsToConfidence.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.template
 2 | 
 3 | import scopt.mutable.OptionParser
 4 | import java.io.File
 5 | import edu.knowitall.common.Resource.using
 6 | import scala.io.Source
 7 | import java.io.PrintWriter
 8 | 
 9 | object CountsToConfidence {
10 |   abstract class Settings {
11 |     def sourceFile: File
12 |     def destFile: Option[File]
13 |   }
14 |   
15 |   def main(args: Array[String]) = {
16 |     object settings extends Settings {
17 |       var sourceFile: File = _
18 |       var destFile: Option[File] = None
19 |     }
20 |     
21 |     val parser = new OptionParser("convertconf") {
22 |       arg("source", "file with pattern, count pairs", { path: String => settings.sourceFile = new File(path) })
23 |       argOpt("dest", "optional parameter to specify output to a file", { path: String => settings.destFile = Some(new File(path)) })
24 |     }
25 |     
26 |     if (parser.parse(args)) {
27 |       run(settings)
28 |     }
29 |   }
30 |   
31 |   def run(settings: Settings) = {
32 |     using (Source.fromFile(settings.sourceFile)) { source =>
33 |       using (
34 |         settings.destFile match {
35 |           case Some(file) => new PrintWriter(file)
36 |           case None => new PrintWriter(System.out)
37 |         }
38 |       ) { output =>
39 |         val lines = {
40 |           val it = source.getLines
41 |           val first = it.next
42 |           output.println(first)
43 |           it.toList
44 |         }
45 |         
46 |         val max = lines.map(_.split("\t").last.toInt).max
47 | 
48 |         for (line <- lines) {
49 |           val parts = line.split("\t")
50 |           val count = parts.last.toInt
51 |           output.println(parts.take(parts.length - 1).mkString("\t") + "\t" + ("%1.4f" format (count.toDouble / max.toDouble)))
52 |         }
53 |       }
54 |     }
55 |   }
56 | }


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/template/GeneralizeTemplate.scala:
--------------------------------------------------------------------------------
  1 | package edu.knowitall.openparse.template
  2 | 
  3 | import java.io.{PrintWriter, File}
  4 | 
  5 | import scala.collection.immutable
  6 | import scala.io.Source
  7 | 
  8 | import edu.knowitall.collection.immutable.graph.pattern.{NodeMatcher, ConjunctiveNodeMatcher}
  9 | import edu.knowitall.common.Resource.using
 10 | import edu.knowitall.openparse.{SlotMatcher, RelationMatcher, ExtractorPattern, ExtractionPartMatcher}
 11 | import edu.knowitall.tool.parse.graph.{RegexNodeMatcher, PostagNodeMatcher, DependencyPattern, DependencyNode}
 12 | import edu.knowitall.ollie.Ollie.stemmer
 13 | 
 14 | import scopt.OptionParser
 15 | 
 16 | /** Generalize semantic restrictions to semantic classes.
 17 |   * 
 18 |   * @author Michael Schmitz
 19 |   */
 20 | object GeneralizeTemplates {
 21 |   abstract class Settings {
 22 |     def sourceFile: File
 23 |     def destFile: Option[File]
 24 | 
 25 |     val categories = List("person", "location")
 26 |   }
 27 | 
 28 |   def main(args: Array[String]) = {
 29 |     object settings extends Settings {
 30 |       var sourceFile: File = null
 31 |       var destFile: Option[File] = None
 32 |     }
 33 | 
 34 |     val parser = new OptionParser("buildtemp") {
 35 |       arg("source", "file with source relation, pattern pairs", { path: String => settings.sourceFile = new File(path) })
 36 |       argOpt("dest", "optional parameter to specify output to a file", { path: String => settings.destFile = Some(new File(path)) })
 37 |     }
 38 | 
 39 |     if (parser.parse(args)) {
 40 |       run(settings)
 41 |     }
 42 |   }
 43 | 
 44 |   def lexicalRestrictions(extractionPartMatcher: ExtractionPartMatcher) = {
 45 |     extractionPartMatcher.matcher match {
 46 |       case m: ConjunctiveNodeMatcher[_] =>
 47 |         val postag = (m.matchers.collect { case m: PostagNodeMatcher => m } head).postag
 48 |         val lemmas = (m.matchers.collect { case m: RegexNodeMatcher => m } head).regex.toString.split("\\|").toSeq
 49 |         Some(postag, lemmas)
 50 |       case _ => None
 51 |     }
 52 |   }
 53 | 
 54 |   case class Category(name: String, elements: Set[String]) {
 55 |     override def toString = "Category(" + name + ")"
 56 |   }
 57 | 
 58 |   def loadCategories(categories: Seq[String]) = {
 59 |     def loadCategory(name: String) = {
 60 |       val elements =
 61 |         using(this.getClass.getClassLoader.getResourceAsStream("categories/" + name + ".txt")) { stream =>
 62 |           using(Source.fromInputStream(stream)) { source =>
 63 |             source.getLines().toSet
 64 |           }
 65 |         }
 66 | 
 67 |       Category(name, elements)
 68 |     }
 69 | 
 70 |     (for (cat <- categories) yield (loadCategory(cat))).toList
 71 |   }
 72 | 
 73 |   def run(settings: Settings) {
 74 |     val categories = loadCategories(settings.categories)
 75 | 
 76 |     def generalize(matcher: NodeMatcher[DependencyNode], postag: String, lemmas: Set[String]) = {
 77 |       def distance(cat: Category) = {
 78 |         val intersectSize = (cat.elements intersect lemmas).size
 79 |         intersectSize.toDouble / lemmas.size.toDouble
 80 |         if (intersectSize < 5) 0.0
 81 |         else intersectSize.toDouble / lemmas.size.toDouble
 82 |       }
 83 |       if (lemmas.size < 10) matcher
 84 |       else {
 85 |         postag match {
 86 |           case "NN" | "NNS" =>
 87 |             val overlaps = categories map (cat => (cat, distance(cat))) sortBy (-_._2)
 88 |             if (overlaps.iterator.map(_._2).sum > 0.75) {
 89 |               val categories = overlaps.filter(_._2 > 0.10).map(_._1)
 90 |               val uncategorized = lemmas -- categories.flatMap(_.elements)
 91 |               val elements = immutable.SortedSet[String]() ++ categories.flatMap(_.elements) ++ uncategorized
 92 |               new ConjunctiveNodeMatcher(new PostagNodeMatcher(postag), new RegexNodeMatcher(elements.mkString("|").r))
 93 |             } else matcher
 94 |           case m => matcher
 95 |         }
 96 |       }
 97 |     }
 98 | 
 99 |     var templates =
100 |       using(Source.fromFile(settings.sourceFile, "UTF8")) { source =>
101 |         source.getLines().map { line =>
102 |           val Array(template, pattern, count) = line.split("\t")
103 |           ((template, new ExtractorPattern(DependencyPattern.deserialize(pattern))), count.toInt)
104 |         }.toList
105 |       }
106 | 
107 |     templates = templates.map {
108 |       case ((template, pattern), count) =>
109 |         val matchers = pattern.matchers.map { matcher =>
110 |           matcher match {
111 |             case m: ExtractionPartMatcher if m.isInstanceOf[SlotMatcher] || m.isInstanceOf[RelationMatcher] =>
112 |               lexicalRestrictions(m) match {
113 |                 case Some((postag, lemmas)) => m.withMatcher(generalize(m.matcher, postag, lemmas.toSet))
114 |                 case None => m
115 |               }
116 |             case m => m
117 |           }
118 |         }
119 | 
120 |         ((template, new ExtractorPattern(matchers)), count)
121 |     }
122 | 
123 |     using (
124 |       settings.destFile match {
125 |         case Some(file) => new PrintWriter(file, "UTF8")
126 |         case None => new PrintWriter(System.out)
127 |       })
128 |     { writer =>
129 |       templates map { case ((template, pattern), count) => Iterable(template, pattern, count).mkString("\t") } foreach writer.println
130 |     }
131 |   }
132 | }
133 | 


--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/template/PassiveReflections.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse.template
 2 | 
 3 | import java.io.{PrintWriter, File}
 4 | import scala.Option.option2Iterable
 5 | import scala.annotation.elidable
 6 | import scala.collection.immutable
 7 | import scala.io.Source
 8 | import scala.util.matching.Regex
 9 | import org.slf4j.LoggerFactory
10 | import edu.knowitall.collection.immutable.graph.pattern.{Matcher, ConjunctiveNodeMatcher, DirectedEdgeMatcher, CaptureEdgeMatcher}
11 | import edu.knowitall.collection.immutable.Bag
12 | import edu.knowitall.common.Resource.using
13 | import edu.knowitall.common.enrich.Traversables.traversableOncePairTo
14 | import edu.knowitall.openparse.{SlotMatcher, RelationMatcher, ExtractorPattern, ArgumentMatcher}
15 | import edu.knowitall.tool.parse.graph.{RegexNodeMatcher, RegexEdgeMatcher, PostagNodeMatcher, LabelEdgeMatcher, DependencyPattern, DependencyNode}
16 | import edu.knowitall.tool.postag.Postagger
17 | import edu.knowitall.ollie.Ollie.stemmer
18 | import scalaz.Scalaz._
19 | import scalaz._
20 | import scopt.OptionParser
21 | import edu.knowitall.collection.immutable.graph.pattern.CaptureNodeMatcher
22 | 
23 | /** A main method for adding active and passive conversions
24 |   * of patterns to a model file.  BuiltTemplates removes
25 |   * duplicate patterns, removing a lot of garbage but also
26 |   * the active/passive conversions.
27 |   *
28 |   * @author Michael Schmitz
29 |   */
30 | object PassiveReflections {
31 |   val logger = LoggerFactory.getLogger(this.getClass)
32 | 
33 |   abstract class Settings {
34 |     def sourceFile: File
35 |     def destFile: Option[File]
36 |   }
37 | 
38 |   def main(args: Array[String]) {
39 |     val settings = new Settings {
40 |       var sourceFile: File = null
41 |       var destFile: Option[File] = None
42 |     }
43 | 
44 |     val parser = new OptionParser("passivemodel") {
45 |       arg("source", "input model file", { path: String => settings.sourceFile = new File(path) })
46 |       argOpt("dest", "output model file", { path: String => settings.destFile = Some(new File(path)) })
47 |     }
48 | 
49 |     if (parser.parse(args)) {
50 |       run(settings)
51 |     }
52 |   }
53 | 
54 |   def run(settings: Settings) {
55 |     def switchArgs(pattern: ExtractorPattern) = {
56 |       val arg1 = pattern.matchers.find { case m: CaptureNodeMatcher[_] => m.alias == "arg1" case _ => false } get
57 |       val arg2 = pattern.matchers.find { case m: CaptureNodeMatcher[_] => m.alias == "arg2" case _ => false } get
58 | 
59 |       new ExtractorPattern(pattern.matchers.map {
60 |         case m: CaptureNodeMatcher[_] if m.alias == "arg1" => arg2
61 |         case m: CaptureNodeMatcher[_] if m.alias == "arg2" => arg1
62 |         case m => m
63 |       })
64 |     }
65 | 
66 |     val patterns = using {
67 |       Source.fromFile(settings.sourceFile)
68 |     } { source =>
69 |       source.getLines.drop(1).map { line =>
70 |         val Array(template, pattern, count) = line.split("\t")
71 |         (template, new ExtractorPattern(DependencyPattern.deserialize(pattern)), count)
72 |       }.toList
73 |     }
74 | 
75 |     using(
76 |       settings.destFile match {
77 |         case Some(file) => new PrintWriter(file, "UTF8")
78 |         case None => new PrintWriter(System.out)
79 |       }) { output =>
80 |         patterns.foreach {
81 |           case (template, pattern, count) =>
82 |             output.println(Iterable(template, pattern, count).mkString("\t"))
83 | 
84 |             if (pattern.baseEdgeMatchers.exists { case m: LabelEdgeMatcher => m.label == "nsubj" case _ => false }) {
85 |               // print the passive conversion
86 | 
87 |               if (!(template startsWith "be ")) {
88 |                 output.println(Iterable("be " + template, switchArgs(pattern), count).mkString("\t"))
89 |               }
90 |             } else if (pattern.baseEdgeMatchers.exists { case m: LabelEdgeMatcher => m.label == "nsubjpass" case _ => false }) {
91 |               if (template startsWith "be ") {
92 |                 output.println(Iterable(template.drop(3), switchArgs(pattern), count).mkString("\t"))
93 |               }
94 |             }
95 |         }
96 |       }
97 |   }
98 | }


--------------------------------------------------------------------------------
/core/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <configuration>
 3 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <!-- encoders are assigned the type
 5 |          ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
 6 |     <encoder>
 7 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |     </encoder>
 9 |   </appender>
10 |   <root level="info">
11 |     <appender-ref ref="STDOUT"/>
12 |   </root>
13 | </configuration>
14 | 


--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/common/enrich/TraversableSpecTest.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.common.enrich
 2 | 
 3 | import edu.knowitall.collection.immutable.Bag
 4 | 
 5 | import org.junit.runner.RunWith
 6 | import org.specs2.mutable.Specification
 7 | import org.specs2.runner.JUnitRunner
 8 | 
 9 | @RunWith(classOf[JUnitRunner])
10 | object TraversableSpecTest extends Specification {
11 |   import Traversables._
12 | 
13 |   "simple histogram works fine" in {
14 |     val h1 = List(1, 2, 2, 3, 3, 3).histogram
15 |     val h2 = List(3, 2, 1, 3, 2, 3).histogram
16 |     h1 must_== h2
17 |     h1 must haveTheSameElementsAs(List((1, 1), (2, 2), (3, 3)))
18 |   }
19 | 
20 |   "histogram from partials works fine" in {
21 |     val list = List((1, 1), (2, 2), (2, 2), (3, 3), (3, 3), (3, 3))
22 |     val h1 = list.mergeHistograms
23 |     val h2 = list.reverse.mergeHistograms
24 |     val h3 = list.mergeKeys(_ + _)
25 |     h1 must_== h2
26 |     h1 must_== h3
27 |     h1 must haveTheSameElementsAs(List((1, 1), (2, 4), (3, 9)))
28 |   }
29 | 
30 |   "list multimaps works fine" in {
31 |     val list = List(1 -> 1, 1 -> 2, 1 -> 1, 2 -> 2)
32 |     val multimap = list.toListMultimap
33 | 
34 |     multimap must haveTheSameElementsAs(Map(1 -> List(1, 2, 1), 2 -> List(2)))
35 | 
36 |     val extended = (multimap.toSeq :+ (1 -> List(2, 3, 4, 5)))
37 |     val merged = extended.mergeKeys(_ ++ _)
38 | 
39 |     merged must haveTheSameElementsAs(Map(1 -> List(1, 2, 1, 2, 3, 4, 5), 2 -> List(2)))
40 |   }
41 | 
42 |   "set multimaps works fine" in {
43 |     val list = List(1 -> 1, 1 -> 2, 1 -> 1, 2 -> 2)
44 |     val multimap = list.toSetMultimap
45 | 
46 |     multimap must haveTheSameElementsAs(Map(1 -> Set(1, 2), 2 -> Set(2)))
47 | 
48 |     val extended = (multimap.toSeq :+ (1 -> Set(2, 3, 4, 5)))
49 |     val merged = extended.mergeKeys(_ ++ _)
50 | 
51 |     merged must haveTheSameElementsAs(Map(1 -> Set(1, 2, 3, 4, 5), 2 -> Set(2)))
52 |   }
53 | 
54 |   "bag multimaps works fine" in {
55 |     val list = List(1 -> 1, 1 -> 2, 1 -> 1, 2 -> 2)
56 |     val multimap = list.toBagMultimap
57 | 
58 |     multimap must haveTheSameElementsAs(Map(1 -> Bag(1, 1, 2), 2 -> Bag(2)))
59 | 
60 |     val extended = (multimap.toSeq :+ (1 -> Bag(2, 3, 4, 5)))
61 |     val merged = extended.mergeKeys(_ ++ _)
62 | 
63 |     merged must haveTheSameElementsAs(Map(1 -> Bag(1, 1, 2, 2, 3, 4, 5), 2 -> Bag(2)))
64 |   }
65 | }
66 | 


--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/ollie/DependencyGraphExtrasSpec.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie
 2 | 
 3 | import org.junit.runner.RunWith
 4 | import org.specs2.mutable.Specification
 5 | import org.specs2.runner.JUnitRunner
 6 | import edu.knowitall.tool.parse.graph.DependencyGraph
 7 | 
 8 | @RunWith(classOf[JUnitRunner])
 9 | object DependencyGraphExtrasTest extends Specification {
10 |   "switch to passive voice works" in {
11 |     val graph = DependencyGraph.deserialize("nsubj(hit_VBD_1_8, Michael_NNP_0_0); dobj(hit_VBD_1_8, ball_NN_3_16); punct(hit_VBD_1_8, ._._4_20); det(ball_NN_3_16, the_DT_2_12)")
12 |     val extras = new DependencyGraphExtras(graph)
13 | 
14 |     val switched = extras.switchVoice
15 | 
16 |     switched.size must_== 1
17 |     switched.head.serialize must_== "det(ball_NN_1_4, the_DT_0_0); auxpass(hit_VBD_2_13, was_VBD_1_9); nsubjpass(hit_VBD_2_13, ball_NN_1_4); prep(hit_VBD_2_13, by_IN_3_17); punct(hit_VBD_2_13, ._._6_28); pobj(by_IN_3_17, Michael_NNP_4_20)"
18 |   }
19 | 
20 |   "switch to active voice works" in {
21 |     val graph = DependencyGraph.deserialize("det(ball_NN_1_4, The_DT_0_0); nsubjpass(hit_VBN_3_13, ball_NN_1_4); auxpass(hit_VBN_3_13, was_VBD_2_9); prep(hit_VBN_3_13, by_IN_4_17); punct(hit_VBN_3_13, ._._6_27); pobj(by_IN_4_17, Michael_NNP_5_20)")
22 |     val extras = new DependencyGraphExtras(graph)
23 | 
24 |     val switched = extras.switchVoice
25 | 
26 |     switched.size must_== 1
27 |     switched.head.serialize must_== "nsubj(hit_VBN_1_8, Michael_NNP_0_0); dobj(hit_VBN_1_8, ball_NN_3_16); punct(hit_VBN_1_8, ._._4_21); det(ball_NN_3_16, The_DT_2_12)"
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/ollie/confidence/OllieFeatureSetSpec.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.ollie.confidence
 2 | 
 3 | import org.junit._
 4 | import org.junit.Assert._
 5 | import org.specs2.mutable.Specification
 6 | import org.junit.runner.RunWith
 7 | import org.specs2.runner.JUnitRunner
 8 | import edu.knowitall.tool.parse.graph.DependencyGraph
 9 | import edu.knowitall.ollie.Ollie
10 | import edu.knowitall.ollie.OllieExtractionInstance
11 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
12 | import edu.knowitall.openparse.OpenParse
13 | import org.junit.runner.RunWith
14 | import org.specs2.runner.JUnitRunner
15 | 
16 | @RunWith(classOf[JUnitRunner])
17 | object OllieFeatureSetSpec extends Specification {
18 |   val ollie = new Ollie(OpenParse.withDefaultModel())
19 | 
20 |   "if right before arg1" in {
21 |     val graph = DependencyGraph.deserialize("poss(father_NN_2_12, his_PRP$_1_8); punct(father_NN_2_12, ,_,_3_19); appos(father_NN_2_12, Whitechapel_NNP_4_21); punct(father_NN_2_12, ,_,_5_33); advmod(betrays_VBZ_6_35, However_RB_0_0); nsubj(betrays_VBZ_6_35, father_NN_2_12); dobj(betrays_VBZ_6_35, whereabouts_NN_8_47); punct(betrays_VBZ_6_35, ,_,_9_59); xcomp(betrays_VBZ_6_35, fearing_VBG_10_61); punct(betrays_VBZ_6_35, ._._27_149); poss(whereabouts_NN_8_47, his_PRP$_7_43); ccomp(fearing_VBG_10_61, die_VB_15_87); poss(son_NN_13_78, his_PRP$_12_74); complm(die_VB_15_87, that_IN_11_69); nsubj(die_VB_15_87, son_NN_13_78); aux(die_VB_15_87, will_MD_14_82); advcl(die_VB_15_87, captured_VBN_20_104); mark(captured_VBN_20_104, if_IN_16_91); nsubjpass(captured_VBN_20_104, he_PRP_17_94); auxpass(captured_VBN_20_104, is_VBZ_18_97); neg(captured_VBN_20_104, not_RB_19_100); cc(captured_VBN_20_104, and_CC_21_113); conj(captured_VBN_20_104, returned_VBN_22_117); dobj(captured_VBN_20_104, home_NN_23_126); prep(captured_VBN_20_104, to_TO_24_131); pobj(to_TO_24_131, plantation_NN_26_138); det(plantation_NN_26_138, the_DT_25_134)")
22 |     val extrs = ollie.extract(graph)
23 | 
24 |     val extr = extrs.toSeq(2)
25 |     OllieFeatures.ifRightBeforeArg1(extr) must_== 1.0
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/openparse/BuildPatternsSpec.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse
 2 | 
 3 | import org.junit._
 4 | import org.junit.Assert._
 5 | import org.specs2.mutable.Specification
 6 | import org.junit.runner.RunWith
 7 | import org.specs2.runner.JUnitRunner
 8 | 
 9 | import edu.knowitall.tool.parse.graph.DependencyGraph
10 | import edu.knowitall.tool.stem.MorphaStemmer
11 | 
12 | @RunWith(classOf[JUnitRunner])
13 | object BuildPatternsSpecTest extends Specification {
14 |   def findPatterns(row: (String, String, String, String, String), maxLength: Option[Int] = None) = {
15 |     val (rel, arg1, arg2, lemmasString, pickled) = row
16 |     val lemmas = lemmasString.split("\\s+").toSet
17 |     val graph = DependencyGraph.deserialize(pickled).map(_.lemmatize(MorphaStemmer)).normalize
18 |     BuildPatterns.findRelationPatterns(graph, rel, arg1, arg2, lemmas, maxLength)
19 |   }
20 | 
21 |   "A pattern is found when the argument overlap" in {
22 |     val row, (arg1, rel, arg2, lemmas, pickled) = ("be marry to", "hillary clinton", "bill clinton", "hillary clinton marry bill", "cc(married_VBN_11_0, And_CC_0_0); nn(Clinton_NNP_2_0, Hillary_NNP_1_0); nsubjpass(married_VBN_11_0, Clinton_NNP_2_0); punct(Clinton_NNP_2_0, _,_3_0); dep(know_VBP_8_0, who_WP_4_0); punct(know_VBP_8_0, _,_5_0); mark(know_VBP_8_0, as_IN_6_0); nsubj(know_VBP_8_0, we_PRP_7_0); rcmod(Clinton_NNP_2_0, know_VBP_8_0); punct(Clinton_NNP_2_0, _,_9_0); auxpass(married_VBN_11_0, is_VBZ_10_0); nn(Clinton_NNP_14_0, Bill_NNP_13_0); prep_to(married_VBN_11_0, Clinton_NNP_14_0); punct(married_VBN_11_0, ._._15_0)")
23 |     val patterns = findPatterns(row)
24 |     patterns.size must_== 1
25 |     patterns.head._1.toString must_== "{arg1} <nsubjpass< {rel:postag=VBN} >prep_to> {arg2}"
26 |   }
27 | 
28 |   "A pattern is found with exactly one slot" in {
29 |     val row = ("arrive in", "barack obama", "afghanistan", "barack obama arrive afghanistan", "(to_TO_4_0), (in_IN_12_0), (on_IN_14_0), (or_CC_16_0), (for_IN_20_0), (to_TO_23_0), (and_CC_27_0), (in_IN_29_0), (of_IN_34_0), (from_IN_38_0), poss(trip_NN_3_0, his_PRP$_1_0); amod(trip_NN_3_0, two-day_JJ_2_0); pobj(After_IN_0_0, trip_NN_3_0); prep_to(trip_NN_3_0, Afghanistan_NNP_5_0); punct(trip_NN_3_0, ,_,_6_0); nn(Obama_NNP_10_0, U.S._NNP_7_0); nn(Obama_NNP_10_0, Senator_NNP_8_0); nn(Obama_NNP_10_0, Barack_NNP_9_0); nsubj(arrived_VBD_11_0, Obama_NNP_10_0); rcmod(trip_NN_3_0, arrived_VBD_11_0); prep_in(arrived_VBD_11_0, Iraq_NNP_13_0); prep_on(arrived_VBD_11_0, Monday_NNP_15_0); prep_on(arrived_VBD_11_0, July_NNP_17_0); conj_or(Monday_NNP_15_0, July_NNP_17_0); num(July_NNP_17_0, 21_CD_18_0); punct(trip_NN_3_0, ,_,_19_0); det(visit_NN_22_0, a_DT_21_0); prep_for(trip_NN_3_0, visit_NN_22_0); det(East_NNP_26_0, the_DT_24_0); nn(East_NNP_26_0, Middle_NNP_25_0); prep_to(visit_NN_22_0, East_NNP_26_0); prep_to(visit_NN_22_0, Europe_NNP_28_0); conj_and(East_NNP_26_0, Europe_NNP_28_0); poss(capacity_NN_31_0, his_PRP$_30_0); prep_in(visit_NN_22_0, capacity_NN_31_0); det(member_NN_33_0, a_DT_32_0); dep(capacity_NN_31_0, member_NN_33_0); det(Senate_NNP_37_0, the_DT_35_0); nn(Senate_NNP_37_0, U.S._NNP_36_0); prep_of(member_NN_33_0, Senate_NNP_37_0); prep_from(member_NN_33_0, Illinois_NNP_39_0); punct(After_IN_0_0, ._._40_0)")
30 |     val patterns = findPatterns(row)
31 |     patterns.size must_== 1
32 |     patterns.head._1.toString must_== "{arg1} <nsubj< {rel:postag=VBD} <rcmod< {slot0:postag=NN} >prep_to> {arg2}"
33 |   }
34 | 
35 |   "A pattern is NOT found because of a length restriction" in {
36 |     val row = ("arrive in", "barack obama", "afghanistan", "barack obama arrive afghanistan", "(to_TO_4_0), (in_IN_12_0), (on_IN_14_0), (or_CC_16_0), (for_IN_20_0), (to_TO_23_0), (and_CC_27_0), (in_IN_29_0), (of_IN_34_0), (from_IN_38_0), poss(trip_NN_3_0, his_PRP$_1_0); amod(trip_NN_3_0, two-day_JJ_2_0); pobj(After_IN_0_0, trip_NN_3_0); prep_to(trip_NN_3_0, Afghanistan_NNP_5_0); punct(trip_NN_3_0, ,_,_6_0); nn(Obama_NNP_10_0, U.S._NNP_7_0); nn(Obama_NNP_10_0, Senator_NNP_8_0); nn(Obama_NNP_10_0, Barack_NNP_9_0); nsubj(arrived_VBD_11_0, Obama_NNP_10_0); rcmod(trip_NN_3_0, arrived_VBD_11_0); prep_in(arrived_VBD_11_0, Iraq_NNP_13_0); prep_on(arrived_VBD_11_0, Monday_NNP_15_0); prep_on(arrived_VBD_11_0, July_NNP_17_0); conj_or(Monday_NNP_15_0, July_NNP_17_0); num(July_NNP_17_0, 21_CD_18_0); punct(trip_NN_3_0, ,_,_19_0); det(visit_NN_22_0, a_DT_21_0); prep_for(trip_NN_3_0, visit_NN_22_0); det(East_NNP_26_0, the_DT_24_0); nn(East_NNP_26_0, Middle_NNP_25_0); prep_to(visit_NN_22_0, East_NNP_26_0); prep_to(visit_NN_22_0, Europe_NNP_28_0); conj_and(East_NNP_26_0, Europe_NNP_28_0); poss(capacity_NN_31_0, his_PRP$_30_0); prep_in(visit_NN_22_0, capacity_NN_31_0); det(member_NN_33_0, a_DT_32_0); dep(capacity_NN_31_0, member_NN_33_0); det(Senate_NNP_37_0, the_DT_35_0); nn(Senate_NNP_37_0, U.S._NNP_36_0); prep_of(member_NN_33_0, Senate_NNP_37_0); prep_from(member_NN_33_0, Illinois_NNP_39_0); punct(After_IN_0_0, ._._40_0)")
37 |     val patterns = findPatterns(row, Some(2))
38 |     patterns.size must_== 0
39 |   }
40 | 
41 |   // rel rel
42 |   "A pattern is found" in {
43 |     val row = ("be bear a", "queequag", "slave", "bear queequag slave", "(in_IN_5_0), (._._7_0), nsubjpass(born_VBN_2_0, Queequag_NNP_0_0); auxpass(born_VBN_2_0, was_VBD_1_0); dobj(born_VBN_2_0, slave_NN_4_0); det(slave_NN_4_0, a_DT_3_0); prep_in(slave_NN_4_0, Africa_NNP_6_0)")
44 |     val patterns = findPatterns(row, Some(2))
45 |     patterns.size must_== 1
46 |     patterns.head._1.toString must_== "{arg1} <nsubjpass< {rel:postag=VBN} >dobj> {arg2}"
47 |   }
48 | 
49 |   "A single pattern is found with a slot instead of a rel rel" in {
50 |     val row = ("be elect president of", "barack obama", "unite state", "barack obama unite state elect president", "(of_IN_5_0), (._._9_0), nn(Obama_NNP_1_0, Barack_NNP_0_0); nsubjpass(elected_VBN_3_0, Obama_NNP_1_0); auxpass(elected_VBN_3_0, was_VBD_2_0); dobj(elected_VBN_3_0, president_NN_4_0); prep_of(president_NN_4_0, States_NNPS_8_0); det(States_NNPS_8_0, the_DT_6_0); nn(States_NNPS_8_0, United_NNP_7_0)")
51 |     val patterns = findPatterns(row)
52 |     patterns.size must_== 1
53 |     patterns.head._1.toString must_== "{arg1} <nsubjpass< {rel0:postag=VBN} >dobj> {rel1:postag=NN} >prep_of> {arg2}"
54 |   }
55 | 
56 |   "A single pattern is found with a slot instead of a rel rel" in {
57 |     val row = ("be team locate in", "mariner", "seattle", "mariner team locate seattle", "(in_IN_6_0), (._._8_0), det(Mariners_NNPS_1_0, The_DT_0_0); nsubj(team_NN_4_0, Mariners_NNPS_1_0); cop(team_NN_4_0, are_VBP_2_0); det(team_NN_4_0, a_DT_3_0); partmod(team_NN_4_0, located_VBN_5_0); prep_in(located_VBN_5_0, Seattle_NNP_7_0)")
58 |     val patterns = findPatterns(row)
59 |     patterns.head._1.toString must_== "{arg1} <nsubj< {rel0:postag=NN} >partmod> {rel1:postag=VBN} >prep_in> {arg2}"
60 |   }
61 | 
62 |   "A single pattern is found with a slot instead of a rel rel" in {
63 |     val row = ("be going populate", "human", "earth", "human go populate earth", "(._._7_0), nsubj(going_VBG_2_0, Humans_NNS_0_0); aux(going_VBG_2_0, are_VBP_1_0); xcomp(going_VBG_2_0, populate_VB_4_0); aux(populate_VB_4_0, to_TO_3_0); dobj(populate_VB_4_0, earth_NN_6_0); det(earth_NN_6_0, the_DT_5_0)")
64 |     val patterns = findPatterns(row)
65 |     patterns.size must_== 1
66 |     patterns.head._1.toString must_== "{arg1} <nsubj< {slot0:postag=VBG} >xcomp> {rel:postag=VB} >dobj> {arg2}"
67 |   }
68 | 
69 |   "A single pattern is found with a slot instead of a rel rel" in {
70 |     val row = ("have crush on", "juliette", "romeo", "juliette have crush romeo", "(on_IN_4_0), (._._6_0), nsubj(has_VBZ_1_0, Juliette_NNP_0_0); dobj(has_VBZ_1_0, crush_NN_3_0); det(crush_NN_3_0, a_DT_2_0); prep_on(crush_NN_3_0, Romeo_NNP_5_0)")
71 |     val patterns = findPatterns(row)
72 |     patterns.size must_== 1
73 |     patterns.head._1.toString must_== "{arg1} <nsubj< {rel0:postag=VBZ} >dobj> {rel1:postag=NN} >prep_on> {arg2}"
74 |   }
75 | }
76 | 


--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/openparse/ExtractorPatternSpec.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse
 2 | 
 3 | import org.junit._
 4 | import org.junit.Assert._
 5 | import org.specs2.mutable.Specification
 6 | import org.junit.runner.RunWith
 7 | import org.specs2.runner.JUnitRunner
 8 | 
 9 | import edu.knowitall.tool.parse.graph.DependencyPattern
10 | import edu.knowitall.ollie.Ollie.stemmer
11 | 
12 | @RunWith(classOf[JUnitRunner])
13 | object ExtractorPatternSpecTest extends Specification {
14 |   def testSymmetric(pattern: String, symmetric: Boolean) {
15 |     (pattern + " is " + (if (symmetric) "symmetric" else "not symmetric")) in {
16 |       new ExtractorPattern(DependencyPattern.deserialize(pattern)).symmetric must be_==(symmetric)
17 |     }
18 |   }
19 | 
20 |   testSymmetric("{arg1} <nsubj< {rel:postag=VBZ} >dobj> {arg2}", false)
21 |   testSymmetric("{arg1} <nsubj< {rel:postag=VBD} >nsubj> {arg2}", true)
22 |   testSymmetric("{arg1} <prep_of< {rel:postag=NN} >prep_of> {arg2}", true)
23 |   testSymmetric("{rel:postag=NN} <nn< {arg1} >nn> {arg2}", false)
24 | }
25 | 


--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/openparse/OllieSpec.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse
 2 | 
 3 | import org.junit._
 4 | import org.junit.Assert._
 5 | import org.specs2.mutable.Specification
 6 | import org.junit.runner.RunWith
 7 | import org.specs2.runner.JUnitRunner
 8 | import edu.knowitall.tool.parse.graph.DependencyGraph
 9 | import edu.knowitall.tool.stem.MorphaStemmer
10 | import edu.knowitall.ollie.Ollie
11 | import edu.knowitall.ollie.OllieExtractionInstance
12 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
13 | import edu.knowitall.ollie.confidence.OllieConfidenceFunction
14 | 
15 | @RunWith(classOf[JUnitRunner])
16 | object OllieSpecTest extends Specification {
17 |   val ollie = new Ollie(OpenParse.withDefaultModel())
18 |   val conf = OllieConfidenceFunction.loadDefaultClassifier()
19 | 
20 |   "Ollie finds an example extraction" in {
21 |     val graph = DependencyGraph.deserialize("(._._5_37), nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)")
22 |     val extrs = ollie.extract(graph)
23 | 
24 |     val extr = extrs.head
25 |     extr must_== OllieExtractionInstance.tabDeserialize(extr.tabSerialize)
26 | 
27 |     val scored = new ScoredOllieExtractionInstance(true, extr)
28 |     scored must_== ScoredOllieExtractionInstance.tabDeserialize(scored.tabSerialize)
29 |   }
30 | 
31 |   "Ollie confidence function executes" in {
32 |     val graph = DependencyGraph.deserialize("(._._5_37), nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)")
33 |     val extrs = ollie.extract(graph)
34 |     extrs map conf must not(throwA[Exception])
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/openparse/OpenParseSpec.scala:
--------------------------------------------------------------------------------
 1 | package edu.knowitall.openparse
 2 | 
 3 | import org.junit._
 4 | import org.junit.Assert._
 5 | import org.specs2.mutable.Specification
 6 | import org.junit.runner.RunWith
 7 | import org.specs2.runner.JUnitRunner
 8 | 
 9 | import edu.knowitall.tool.parse.graph.DependencyGraph
10 | import edu.knowitall.tool.stem.MorphaStemmer
11 | 
12 | @RunWith(classOf[JUnitRunner])
13 | object OpenParseSpecTest extends Specification {
14 |   val openparse = OpenParse.withDefaultModel()
15 | 
16 |   "OpenParse finds an example extraction" in {
17 |     val graph = DependencyGraph.deserialize("(._._5_37), nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)")
18 |     val extrs = openparse.extract(graph)
19 |     
20 |     extrs.size must_== 1
21 |     extrs.head._2.toString must_== "(OpenParse; finds; an example extraction)"
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/example/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |   <groupId>edu.washington.cs.knowitall.ollie</groupId>
 5 |   <artifactId>ollie-example</artifactId>
 6 |   <name>ollie-example</name>
 7 |   <version>1.0.0-SNAPSHOT</version>
 8 |   <parent>
 9 |     <groupId>edu.washington.cs.knowitall</groupId>
10 |     <artifactId>knowitall-oss</artifactId>
11 |     <version>1.0.2</version>
12 |   </parent>
13 |   <properties>
14 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15 |     <nlptools.version>2.4.0</nlptools.version>
16 |   </properties>
17 |   <dependencies>
18 |     <dependency>
19 |       <groupId>edu.washington.cs.knowitall.ollie</groupId>
20 |       <artifactId>ollie-core_2.9.2</artifactId>
21 |       <version>1.0.2</version>
22 |     </dependency>
23 |     <dependency>
24 |       <groupId>edu.washington.cs.knowitall.nlptools</groupId>
25 |       <artifactId>nlptools-parse-malt_2.9.2</artifactId>
26 |       <version>${nlptools.version}</version>
27 |     </dependency>
28 |     <!-- Logging -->
29 |     <dependency>
30 |       <groupId>ch.qos.logback</groupId>
31 |       <artifactId>logback-classic</artifactId>
32 |       <version>1.0.7</version>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>ch.qos.logback</groupId>
36 |       <artifactId>logback-core</artifactId>
37 |       <version>1.0.7</version>
38 |     </dependency>
39 |   </dependencies>
40 |   <build>
41 |     <sourceDirectory>src/main/scala</sourceDirectory>
42 |     <testSourceDirectory>src/test/scala</testSourceDirectory>
43 |     <resources>
44 |       <resource>
45 |         <directory>src/main/resources</directory>
46 |       </resource>
47 |     </resources>
48 |     <plugins>
49 |       <plugin>
50 |         <groupId>net.alchim31.maven</groupId>
51 |         <artifactId>scala-maven-plugin</artifactId>
52 |         <version>3.1.0</version>
53 |         <configuration>
54 |           <args>
55 |             <arg>-deprecation</arg>
56 |             <arg>-unchecked</arg>
57 |           </args>
58 |         </configuration>
59 |         <executions>
60 |           <execution>
61 |             <goals>
62 |               <goal>compile</goal>
63 |               <goal>testCompile</goal>
64 |             </goals>
65 |           </execution>
66 |         </executions>
67 |       </plugin>
68 |     </plugins>
69 |   </build>
70 | </project>
71 | 


--------------------------------------------------------------------------------
/example/src/main/java/example/JavaOllieWrapper.java:
--------------------------------------------------------------------------------
 1 | package example;
 2 | 
 3 | import java.io.File;
 4 | import java.net.MalformedURLException;
 5 | 
 6 | import edu.knowitall.ollie.Ollie;
 7 | import edu.knowitall.ollie.OllieExtraction;
 8 | import edu.knowitall.ollie.OllieExtractionInstance;
 9 | import edu.knowitall.tool.parse.MaltParser;
10 | import edu.knowitall.tool.parse.graph.DependencyGraph;
11 | 
12 | /** This is an example class that shows one way of using Ollie from Java. */
13 | public class JavaOllieWrapper {
14 |     // the extractor itself
15 |     private Ollie ollie;
16 | 
17 |     // the parser--a step required before the extractor
18 |     private MaltParser maltParser;
19 | 
20 |     // the path of the malt parser model file
21 |     private static final String MALT_PARSER_FILENAME = "engmalt.linear-1.7.mco";
22 | 
23 |     public JavaOllieWrapper() throws MalformedURLException {
24 |         // initialize MaltParser
25 |         scala.Option<File> nullOption = scala.Option.apply(null);
26 |         maltParser = new MaltParser(new File(MALT_PARSER_FILENAME).toURI().toURL(), nullOption);
27 | 
28 |         // initialize Ollie
29 |         ollie = new Ollie();
30 |     }
31 | 
32 |     /**
33 |      * Gets Ollie extractions from a single sentence.
34 |      * @param sentence
35 |      * @return the set of ollie extractions
36 |      */
37 |     public Iterable<OllieExtractionInstance> extract(String sentence) {
38 |         // parse the sentence
39 |         DependencyGraph graph = maltParser.dependencyGraph(sentence);
40 | 
41 |         // run Ollie over the sentence and convert to a Java collection
42 |         Iterable<OllieExtractionInstance> extrs = scala.collection.JavaConversions.asJavaIterable(ollie.extract(graph));
43 |         return extrs;
44 |     }
45 | 
46 |     public static void main(String args[]) throws MalformedURLException {
47 |         System.out.println(JavaOllieWrapper.class.getResource("/logback.xml"));
48 |         // initialize
49 |         JavaOllieWrapper ollieWrapper = new JavaOllieWrapper();
50 | 
51 |         // extract from a single sentence.
52 |         String sentence = "President Obama will meet with Congressional leaders on Friday, and House Republicans summoned lawmakers back for a Sunday session, in a last-ditch effort to avert a fiscal crisis brought on by automatic tax increases and spending cuts scheduled to hit next week.";
53 |         Iterable<OllieExtractionInstance> extrs = ollieWrapper.extract(sentence);
54 | 
55 |         // print the extractions.
56 |         for (OllieExtractionInstance inst : extrs) {
57 |             OllieExtraction extr = inst.extr();
58 |             System.out.println(extr.arg1().text()+"\t"+extr.rel().text()+"\t"+extr.arg2().text());
59 |         }
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/example/src/main/resouces/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <configuration>
 3 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 4 |     <encoder>
 5 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 6 |     </encoder>
 7 |   </appender>
 8 |   <root level="info">
 9 |     <appender-ref ref="STDOUT"/>
10 |   </root>
11 | </configuration>
12 | 


--------------------------------------------------------------------------------
/example/src/main/scala/ollie/Example.scala:
--------------------------------------------------------------------------------
 1 | package ollie
 2 | 
 3 | import edu.knowitall.ollie.Ollie
 4 | import edu.knowitall.tool.parse.MaltParser
 5 | import scala.io.Source
 6 | import edu.knowitall.ollie.confidence.OllieConfidenceFunction
 7 | 
 8 | /** This is an example project that takes lines as input from stdin,
 9 |   * parses them, runs the Ollie extractor on them, scores the
10 |   * extractions with a confidence function, and then prints the results.
11 |   *
12 |   * You can run this project with the following command:
13 |   *   mvn clean compile exec:java -Dexec.mainClass=ollie.Example
14 |   *
15 |   * You will need to have engmalt.linear-1.7.mco in the base directory
16 |   * of this example for the program to work.  You can download this
17 |   * file from the MaltParser website:
18 |   *
19 |   *   http://www.maltparser.org/mco/english_parser/engmalt.html
20 |   */
21 | object Example extends App {
22 |   val parser = new MaltParser
23 |   val ollie = new Ollie
24 |   val confidence = OllieConfidenceFunction.loadDefaultClassifier()
25 |   for (line <- Source.stdin.getLines; if !line.trim.isEmpty) {
26 |     val parsed = parser.dependencyGraph(line)
27 |     val extractionInstances = ollie.extract(parsed)
28 | 
29 |     println("Extractions:")
30 |     for (inst <- extractionInstances) {
31 |       val conf = confidence(inst)
32 |       println(("%.2f" format conf) + "\t" + inst.extraction)
33 |     }
34 |     println("Waiting for next input...")
35 |   }
36 | }
37 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |   <artifactId>ollie</artifactId>
 5 |   <name>ollie</name>
 6 |   <version>1.0.0-SNAPSHOT</version>
 7 |   <packaging>pom</packaging>
 8 |   <groupId>edu.washington.cs.knowitall.ollie</groupId>
 9 |   <properties>
10 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
11 |   </properties>
12 |   <modules>
13 |     <module>core</module>
14 |     <module>app</module>
15 |   </modules>
16 | </project>
17 | 


--------------------------------------------------------------------------------